Diffstat (file mode, path, lines touched):
 -rw-r--r--  adler32_p.h                    107
 -rw-r--r--  arch/arm/adler32_neon.c         43
 -rw-r--r--  arch/generic/adler32_c.c         6
 -rw-r--r--  arch/loongarch/adler32_lasx.c    2
 -rw-r--r--  arch/loongarch/adler32_lsx.c     2
 -rw-r--r--  arch/power/adler32_power8.c      6
 -rw-r--r--  arch/power/adler32_vmx.c        25
 -rw-r--r--  arch/riscv/adler32_rvv.c        16
 -rw-r--r--  arch/x86/adler32_avx2.c          2
 -rw-r--r--  arch/x86/adler32_sse42.c         2
 -rw-r--r--  arch/x86/adler32_ssse3.c        12
 -rw-r--r--  zbuild.h                         8
12 files changed, 104 insertions, 127 deletions
diff --git a/adler32_p.h b/adler32_p.h index 12fb0ddf23..b5d5f1615c 100644 --- a/adler32_p.h +++ b/adler32_p.h @@ -18,54 +18,81 @@ #define ADLER_DO8(sum1, sum2, buf, i) {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);} #define ADLER_DO16(sum1, sum2, buf) {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);} -static inline uint32_t adler32_copy_len_1(uint32_t adler, uint8_t *dst, const uint8_t *buf, uint32_t sum2, const int COPY) { - uint8_t c = *buf; - if (COPY) { - *dst = c; +Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len, + uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) { + Z_UNUSED(MAX_LEN); + if (len & 1) { + if (COPY) { + *dst = *buf; + dst += 1; + } + ADLER_DO1(*adler, *sum2, buf, 0); + buf += 1; } - adler += c; - adler %= BASE; - sum2 += adler; - sum2 %= BASE; - return adler | (sum2 << 16); -} - -static inline uint32_t adler32_copy_len_16(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - while (len--) { - uint8_t c = *buf++; + if (len & 2) { if (COPY) { - *dst++ = c; + memcpy(dst, buf, 2); + dst += 2; } - adler += c; - sum2 += adler; + ADLER_DO2(*adler, *sum2, buf, 0); + buf += 2; + } + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(*adler, *sum2, buf, 0); + buf += 4; } - adler %= BASE; - sum2 %= BASE; /* only added so many BASE's */ - /* return recombined sums */ - return adler | (sum2 << 16); } -static inline uint32_t adler32_copy_len_64(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - const uint8_t *src = buf; - const size_t src_len = len; -#ifdef UNROLL_MORE - while (len >= 16) { - len -= 16; - ADLER_DO16(adler, sum2, buf); - buf += 16; -#else - while (len >= 8) { - len -= 8; - ADLER_DO8(adler, sum2, buf, 0); - buf += 8; -#endif +Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t 
*dst, const uint8_t *buf, size_t len, + uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { + if (len) { + /* DO16 loop for large remainders only (scalar, risc-v). */ + if (MAX_LEN >= 32) { + while (len >= 16) { + if (COPY) { + memcpy(dst, buf, 16); + dst += 16; + } + len -= 16; + ADLER_DO16(adler, sum2, buf); + buf += 16; + } + } + /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(adler, sum2, buf, 0); + buf += 4; + } + if (len & 2) { + if (COPY) { + memcpy(dst, buf, 2); + dst += 2; + } + ADLER_DO2(adler, sum2, buf, 0); + buf += 2; + } + if (len & 1) { + if (COPY) + *dst = *buf; + ADLER_DO1(adler, sum2, buf, 0); + } } - /* Process tail (len < 16). */ - adler = adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); - if (COPY) { - memcpy(dst, src, src_len); + if (REBASE) { + adler %= BASE; + sum2 %= BASE; } - return adler; + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. 
*/ + return adler | (sum2 << 16); } #endif /* ADLER32_P_H */ diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 2ef02398e6..cbb1c784ef 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -259,14 +259,6 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l s[1] = vget_lane_u32(as, 1); } -static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { /* split Adler-32 into component sums */ uint32_t sum2 = (adler >> 16) & 0xffff; @@ -274,11 +266,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); uint32_t pair[2]; int n = NMAX; @@ -308,21 +300,10 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co unsigned int align_adj = (align_offset) ? 
32 - align_offset : 0; if (align_offset && len >= (16 + align_adj)) { - NEON_handle_tail(pair, src, align_adj); - - if (COPY) { - const uint8_t* __restrict src_noalias = src; - uint8_t* __restrict dst_noalias = dst; - unsigned cpy_len = align_adj; - - while (cpy_len--) { - *dst_noalias++ = *src_noalias++; - } - } + adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY); n -= align_adj; done += align_adj; - } else { /* If here, we failed the len criteria test, it wouldn't be * worthwhile to do scalar aligning sums */ @@ -348,22 +329,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co done += actual_nsums; } - /* Handle the tail elements. */ - if (done < len) { - NEON_handle_tail(pair, (src + done), len - done); - if (COPY) { - const uint8_t* __restrict src_noalias = src + done; - uint8_t* __restrict dst_noalias = dst + done; - while (done++ != len) { - *dst_noalias++ = *src_noalias++; - } - } - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY); } Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c index 4bd5538168..84c946f452 100644 --- a/arch/generic/adler32_c.c +++ b/arch/generic/adler32_c.c @@ -17,11 +17,11 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { @@ -45,7 +45,7 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { } /* do remaining bytes (less than NMAX, still just one modulo) */ - return adler32_copy_len_64(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0); } Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { diff --git a/arch/loongarch/adler32_lasx.c b/arch/loongarch/adler32_lasx.c index 92b942ea4b..a7268e73ff 100644 --- a/arch/loongarch/adler32_lasx.c +++ b/arch/loongarch/adler32_lasx.c @@ -41,7 +41,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_lsx(adler, dst, src, len); diff --git a/arch/loongarch/adler32_lsx.c b/arch/loongarch/adler32_lsx.c index 
4c3603193a..389f74c683 100644 --- a/arch/loongarch/adler32_lsx.c +++ b/arch/loongarch/adler32_lsx.c @@ -36,7 +36,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c index b002e4ed3a..39b3cf399c 100644 --- a/arch/power/adler32_power8.c +++ b/arch/power/adler32_power8.c @@ -59,11 +59,11 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(s1, NULL, buf, s2, 0); + return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0); /* This is faster than VSX code for len < 64. */ if (len < 64) - return adler32_copy_len_64(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0); /* Use POWER VSX instructions for len >= 64. */ const vector unsigned int v_zeros = { 0 }; @@ -144,7 +144,7 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s s2 = vs2[0] % BASE; /* Process tail (len < 16). 
*/ - return adler32_copy_len_16(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0); } Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c index b1371da950..31eaf5e36d 100644 --- a/arch/power/adler32_vmx.c +++ b/arch/power/adler32_vmx.c @@ -15,14 +15,6 @@ #define vmx_zero() (vec_splat_u32(0)) -static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) { /* Different taps for the separable components of sums */ const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49}; @@ -127,11 +119,11 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); uint32_t pair[4] ALIGNED_(16); pair[0] = adler; @@ -144,7 +136,7 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) unsigned int done = 0; size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len); if (align_len) { - vmx_handle_head_or_tail(pair, buf, align_len); + adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0); done += align_len; /* Rather than rebasing, we can reduce the max sums for the * first round only */ @@ -163,15 +155,8 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) done += (n / 16) * 16; } - /* Handle the tail 
elements. */ - if (done < len) { - vmx_handle_head_or_tail(pair, (buf + done), len - done); - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). */ + return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0); } /* VMX stores can have higher latency than optimized memcpy */ diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c index 8c297a617b..e446189302 100644 --- a/arch/riscv/adler32_rvv.c +++ b/arch/riscv/adler32_rvv.c @@ -18,11 +18,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); size_t left = len; size_t vl = __riscv_vsetvlmax_e8m1(); @@ -104,16 +104,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric sum2 %= BASE; adler %= BASE; - while (left--) { - if (COPY) *dst++ = *src; - adler += *src++; - sum2 += adler; - } - - sum2 %= BASE; - adler %= BASE; - - return adler | (sum2 << 16); + /* Process tail (left < 256). 
*/ + return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY); } Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index 4b1f0dac98..d1811b254d 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -25,7 +25,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_sse42(adler, dst, src, len); diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index ea1d370372..c2301213f0 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -21,7 +21,7 @@ Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8 rem_peel: if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler0, dst, src, len, adler1, 1); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c index 4f9ee22f91..9d2715a435 100644 --- a/arch/x86/adler32_ssse3.c +++ b/arch/x86/adler32_ssse3.c @@ -21,11 +21,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); const 
__m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); @@ -60,13 +60,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len goto unaligned_jmp; } - for (size_t i = 0; i < align_offset; ++i) { - adler += *(buf++); - sum2 += adler; - } + adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0); /* lop off the max number of sums based on the scalar sums done * above */ + buf += align_offset; len -= align_offset; max_iters -= align_offset; } @@ -143,7 +141,7 @@ unaligned_jmp: } /* Process tail (len < 16). */ - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0); } /* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */ @@ -180,6 +180,14 @@ # define Z_REGISTER #endif +#if defined(_MSC_VER) +# define Z_RESTRICT __restrict +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +# define Z_RESTRICT restrict +#else +# define Z_RESTRICT __restrict__ +#endif + /* Reverse the bytes in a value. Use compiler intrinsics when possible to take advantage of hardware implementations. */ #if defined(_MSC_VER) && (_MSC_VER >= 1300) |
