Diffstat (limited to 'neozip/arch/loongarch/adler32_lasx.c')
| -rw-r--r-- | neozip/arch/loongarch/adler32_lasx.c | 154 |
1 file changed, 154 insertions, 0 deletions
diff --git a/neozip/arch/loongarch/adler32_lasx.c b/neozip/arch/loongarch/adler32_lasx.c
new file mode 100644
index 0000000000..a7268e73ff
--- /dev/null
+++ b/neozip/arch/loongarch/adler32_lasx.c
@@ -0,0 +1,154 @@
+/* adler32_lasx.c -- compute the Adler-32 checksum of a data stream, based on Intel AVX2 implementation
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+
+/* 32 bit horizontal sum */
+static inline uint32_t hsum256(__m256i x) {
+    __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+    __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+    __m256i sum3 = __lasx_xvadd_w(sum2, __lasx_xvbsrl_v(sum2, 4));
+    return (uint32_t)__lasx_xvpickve2gr_wu(sum3, 0);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+    __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+    __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+    return (uint32_t)__lasx_xvpickve2gr_wu(sum2, 0);
+}
+
+extern uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+    } else if (len < 32) {
+        if (COPY) {
+            return adler32_copy_lsx(adler, dst, src, len);
+        } else {
+            return adler32_lsx(adler, src, len);
+        }
+    }
+
+    __m256i vs1, vs2, vs2_0;
+
+    const __m256i dot2v = (__m256i)((v32i8){ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+                                             46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33 });
+    const __m256i dot2v_0 = (__m256i)((v32i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                               14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 });
+    const __m256i dot3v = __lasx_xvreplgr2vr_h(1);
+    const __m256i zero = __lasx_xvldi(0);
+
+    while (len >= 32) {
+        vs1 = __lasx_xvinsgr2vr_w(zero, adler0, 0);
+        vs2 = __lasx_xvinsgr2vr_w(zero, adler1, 0);
+
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = __lasx_xvldi(0);
+        vs2_0 = vs3;
+
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+        len -= k;
+
+        while (k >= 64) {
+            __m256i vbuf = __lasx_xvld(src, 0);
+            __m256i vbuf_0 = __lasx_xvld(src, 32);
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = lasx_sad_bu(vbuf, zero);
+            __m256i vs1_sad2 = lasx_sad_bu(vbuf_0, zero);
+
+            if (COPY) {
+                __lasx_xvst(vbuf, dst, 0);
+                __lasx_xvst(vbuf_0, dst, 32);
+                dst += 64;
+            }
+
+            vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+            vs3 = __lasx_xvadd_w(vs3, vs1_0);
+            __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+            __m256i v_short_sum2_0 = lasx_maddubs_w_h(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            __m256i vsum2_0 = lasx_madd_w_h(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+            vs1 = __lasx_xvadd_w(vs1_sad2, vs1);
+            vs2 = __lasx_xvadd_w(vsum2, vs2);
+            vs2_0 = __lasx_xvadd_w(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = __lasx_xvadd_w(vs2_0, vs2);
+        vs3 = __lasx_xvslli_w(vs3, 6);
+        vs2 = __lasx_xvadd_w(vs3, vs2);
+        vs3 = __lasx_xvldi(0);
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            __m256i vbuf = __lasx_xvld(src, 0);
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = lasx_sad_bu(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+            if (COPY) {
+                __lasx_xvst(vbuf, dst, 0);
+                dst += 32;
+            }
+
+            vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+            vs3 = __lasx_xvadd_w(vs3, vs1_0);
+            __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            vs2 = __lasx_xvadd_w(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Defer the multiplication with 32 to outside of the loop */
+        vs3 = __lasx_xvslli_w(vs3, 5);
+        vs2 = __lasx_xvadd_w(vs2, vs3);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
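
The recurrence that the vector code above parallelizes is easiest to see in scalar form. The following is a minimal reference sketch, not part of the patch; the function name adler32_scalar is hypothetical, while BASE and NMAX are the standard zlib constants that zbuild.h and adler32_p.h supply to the real code:

    #include <stddef.h>
    #include <stdint.h>

    #define BASE 65521u  /* largest prime smaller than 65536 */
    #define NMAX 5552    /* max bytes before s1/s2 could overflow 32 bits */

    /* Plain scalar Adler-32: s1 accumulates byte sums, s2 accumulates
     * running s1 prefix sums; both are reduced mod BASE at least once
     * every NMAX bytes so the 32-bit accumulators cannot overflow. */
    static uint32_t adler32_scalar(uint32_t adler, const uint8_t *src, size_t len) {
        uint32_t s1 = adler & 0xffff;          /* adler0 / vs1 in the patch */
        uint32_t s2 = (adler >> 16) & 0xffff;  /* adler1 / vs2 in the patch */

        while (len > 0) {
            size_t k = len < NMAX ? len : NMAX;
            len -= k;
            while (k--) {
                s1 += *src++;  /* vs1: plain sum of bytes */
                s2 += s1;      /* vs2: sum of prefix sums = weighted byte sum */
            }
            s1 %= BASE;
            s2 %= BASE;
        }
        return s1 | (s2 << 16);
    }

In the patch, vs1 and vs2 hold these two accumulators spread across LASX lanes (lasx_sad_bu produces the byte sums, lasx_maddubs_w_h and lasx_madd_w_h the weighted sums), and partial_hsum256/hsum256 fold the lanes back into adler0 and adler1 before each modulo.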

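The "Defer the multiplication with 32" comment refers to the block form of the update: after consuming a 32-byte block, s2 grows by 32 * s1_before plus the dot product of the block with the weights 32..1 (dot2v_0; the 64-byte unrolled loop uses weights 64..33 and 32..1 with a shift by 6 instead). Rather than multiplying s1_before by 32 in every iteration, the patch accumulates the s1 snapshots in vs3 and applies a single shift per NMAX window. Below is a scalar mirror of that structure, again only a sketch: adler32_block32 is a hypothetical name, and it reuses the BASE and NMAX definitions from the sketch above.

    /* Processes 32 bytes at a time, deferring the *32 factor the way
     * the LASX loop defers it in vs3 (applied via xvslli_w(vs3, 5)). */
    static uint32_t adler32_block32(uint32_t adler, const uint8_t *src, size_t len) {
        uint32_t s1 = adler & 0xffff;
        uint32_t s2 = (adler >> 16) & 0xffff;

        while (len >= 32) {
            size_t k = len < NMAX ? len : NMAX;
            k -= k % 32;               /* like ALIGN_DOWN(MIN(len, NMAX), 32) */
            len -= k;
            uint32_t deferred = 0;     /* scalar stand-in for vs3 */
            while (k >= 32) {
                uint32_t sum = 0, dot = 0;
                for (int i = 0; i < 32; i++) {
                    sum += src[i];
                    dot += (uint32_t)(32 - i) * src[i];  /* weights 32..1, cf. dot2v_0 */
                }
                deferred += s1;        /* s1 before this block, cf. vs1_0 */
                s1 += sum;
                s2 += dot;
                src += 32;
                k -= 32;
            }
            s2 += deferred << 5;       /* one shift applies the factor of 32 */
            s1 %= BASE;
            s2 %= BASE;
        }
        while (len--) {                /* tail; the patch routes this to the LSX paths */
            s1 += *src++;
            s2 += s1;
        }
        s1 %= BASE;
        s2 %= BASE;
        return s1 | (s2 << 16);
    }

NMAX = 5552 is chosen exactly so that this windowed accumulation, including the deferred term, stays below 2^32 before the modulo.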