diff options
Diffstat (limited to 'neozip/arch/loongarch/compare256_lsx.c')
| -rw-r--r-- | neozip/arch/loongarch/compare256_lsx.c | 88 |
1 file changed, 88 insertions, 0 deletions
/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
 * Copyright Adam Stylinski <kungfujesus06@gmail.com>
 * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"

#ifdef LOONGARCH_LSX

#include <lsxintrin.h>
#include "lsxintrin_ext.h"

/* Return the length (0..256) of the common prefix of the two 256-byte
 * buffers src0 and src1, comparing 16 bytes at a time with LSX vectors.
 *
 * Per 16-byte chunk: __lsx_vseq_b produces 0xFF per equal byte lane, and
 * lsx_movemask_b (from lsxintrin_ext.h) collapses that into a 16-bit mask,
 * so mask == 0xFFFF means "all 16 bytes equal".  On a mismatch, ~mask has
 * its lowest set bit at the first differing byte, and zng_ctz32 (from
 * fallback_builtins.h) converts that into a byte index within the chunk. */
static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then all subsequent ones we have at least
     * one aligned load. Sadly aligning both loads is probably unrealistic */
    xmm_src0 = __lsx_vld(src0, 0);
    xmm_src1 = __lsx_vld(src1, 0);
    xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);

    unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF)
        return zng_ctz32(~mask);

    /* Start of the final 16-byte chunk (offset 240), saved before src0/src1
     * are advanced — used only when src0 was not 16-byte aligned. */
    const uint8_t *last0 = src0 + 240;
    const uint8_t *last1 = src1 + 240;

    /* Advance to the next 16-byte boundary of src0.  Bytes
     * [align_adv, 16) were already covered by the unaligned head compare
     * above, so re-checking nothing is lost; from here on every src0 load
     * is aligned.  When src0 is already aligned, align_adv is 16 and the
     * loop below covers bytes [16, 256) exactly. */
    int align_offset = ((uintptr_t)src0) & 15;
    int align_adv = 16 - align_offset;
    uint32_t len = align_adv;

    src0 += align_adv;
    src1 += align_adv;

    /* 15 aligned 16-byte compares: bytes [align_adv, align_adv + 240). */
    for (int i = 0; i < 15; i++) {
        xmm_src0 = __lsx_vld(src0, 0);
        xmm_src1 = __lsx_vld(src1, 0);
        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);

        mask = (unsigned)lsx_movemask_b(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF)
            return len + zng_ctz32(~mask);

        len += 16, src0 += 16, src1 += 16;
    }

    /* If src0 was unaligned, the loop stopped short of byte 256; one final
     * (overlapping) compare at offset 240 covers the tail.  Any bytes of
     * this chunk already verified by the loop compare equal, so the first
     * zero bit of mask still indexes the true first mismatch. */
    if (align_offset) {
        xmm_src0 = __lsx_vld(last0, 0);
        xmm_src1 = __lsx_vld(last1, 0);
        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);

        mask = (unsigned)lsx_movemask_b(xmm_cmp);

        if (mask != 0xFFFF)
            return 240 + zng_ctz32(~mask);
    }

    /* All 256 bytes matched. */
    return 256;
}

/* Exported entry point: thin wrapper so the static inline body above can
 * also be instantiated by match_tpl.h below. */
Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
    return compare256_lsx_static(src0, src1);
}

/* Instantiate longest_match_lsx from the shared match template, using the
 * LSX compare as its inner loop. */
#define LONGEST_MATCH longest_match_lsx
#define COMPARE256 compare256_lsx_static

#include "match_tpl.h"

/* Instantiate the "slow" variant (template switches on LONGEST_MATCH_SLOW). */
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_lsx
#define COMPARE256 compare256_lsx_static

#include "match_tpl.h"

#endif
