summaryrefslogtreecommitdiff
path: root/neozip/arch/loongarch/compare256_lsx.c
diff options
context:
space:
mode:
Diffstat (limited to 'neozip/arch/loongarch/compare256_lsx.c')
-rw-r--r--neozip/arch/loongarch/compare256_lsx.c88
1 files changed, 88 insertions, 0 deletions
diff --git a/neozip/arch/loongarch/compare256_lsx.c b/neozip/arch/loongarch/compare256_lsx.c
new file mode 100644
index 0000000000..4afd261e76
--- /dev/null
+++ b/neozip/arch/loongarch/compare256_lsx.c
@@ -0,0 +1,88 @@
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef LOONGARCH_LSX
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned, than all subsequent ones we have at least
+ * one aligned load. Sadly aligning both loads is probably unrealistic */
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ for (int i = 0; i < 15; i++) {
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ if (align_offset) {
+ xmm_src0 = __lsx_vld(last0, 0);
+ xmm_src1 = __lsx_vld(last1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif