summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--adler32_p.h107
-rw-r--r--arch/arm/adler32_neon.c43
-rw-r--r--arch/generic/adler32_c.c6
-rw-r--r--arch/loongarch/adler32_lasx.c2
-rw-r--r--arch/loongarch/adler32_lsx.c2
-rw-r--r--arch/power/adler32_power8.c6
-rw-r--r--arch/power/adler32_vmx.c25
-rw-r--r--arch/riscv/adler32_rvv.c16
-rw-r--r--arch/x86/adler32_avx2.c2
-rw-r--r--arch/x86/adler32_sse42.c2
-rw-r--r--arch/x86/adler32_ssse3.c12
-rw-r--r--zbuild.h8
12 files changed, 104 insertions, 127 deletions
diff --git a/adler32_p.h b/adler32_p.h
index 12fb0ddf23..b5d5f1615c 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -18,54 +18,81 @@
#define ADLER_DO8(sum1, sum2, buf, i) {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);}
#define ADLER_DO16(sum1, sum2, buf) {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);}
-static inline uint32_t adler32_copy_len_1(uint32_t adler, uint8_t *dst, const uint8_t *buf, uint32_t sum2, const int COPY) {
- uint8_t c = *buf;
- if (COPY) {
- *dst = c;
+Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len,
+ uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) {
+ Z_UNUSED(MAX_LEN);
+ if (len & 1) {
+ if (COPY) {
+ *dst = *buf;
+ dst += 1;
+ }
+ ADLER_DO1(*adler, *sum2, buf, 0);
+ buf += 1;
}
- adler += c;
- adler %= BASE;
- sum2 += adler;
- sum2 %= BASE;
- return adler | (sum2 << 16);
-}
-
-static inline uint32_t adler32_copy_len_16(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) {
- while (len--) {
- uint8_t c = *buf++;
+ if (len & 2) {
if (COPY) {
- *dst++ = c;
+ memcpy(dst, buf, 2);
+ dst += 2;
}
- adler += c;
- sum2 += adler;
+ ADLER_DO2(*adler, *sum2, buf, 0);
+ buf += 2;
+ }
+ while (len >= 4) {
+ if (COPY) {
+ memcpy(dst, buf, 4);
+ dst += 4;
+ }
+ len -= 4;
+ ADLER_DO4(*adler, *sum2, buf, 0);
+ buf += 4;
}
- adler %= BASE;
- sum2 %= BASE; /* only added so many BASE's */
- /* return recombined sums */
- return adler | (sum2 << 16);
}
-static inline uint32_t adler32_copy_len_64(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) {
- const uint8_t *src = buf;
- const size_t src_len = len;
-#ifdef UNROLL_MORE
- while (len >= 16) {
- len -= 16;
- ADLER_DO16(adler, sum2, buf);
- buf += 16;
-#else
- while (len >= 8) {
- len -= 8;
- ADLER_DO8(adler, sum2, buf, 0);
- buf += 8;
-#endif
+Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len,
+ uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
+ if (len) {
+ /* DO16 loop for large remainders only (scalar, risc-v). */
+ if (MAX_LEN >= 32) {
+ while (len >= 16) {
+ if (COPY) {
+ memcpy(dst, buf, 16);
+ dst += 16;
+ }
+ len -= 16;
+ ADLER_DO16(adler, sum2, buf);
+ buf += 16;
+ }
+ }
+ /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
+ while (len >= 4) {
+ if (COPY) {
+ memcpy(dst, buf, 4);
+ dst += 4;
+ }
+ len -= 4;
+ ADLER_DO4(adler, sum2, buf, 0);
+ buf += 4;
+ }
+ if (len & 2) {
+ if (COPY) {
+ memcpy(dst, buf, 2);
+ dst += 2;
+ }
+ ADLER_DO2(adler, sum2, buf, 0);
+ buf += 2;
+ }
+ if (len & 1) {
+ if (COPY)
+ *dst = *buf;
+ ADLER_DO1(adler, sum2, buf, 0);
+ }
}
- /* Process tail (len < 16). */
- adler = adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
- if (COPY) {
- memcpy(dst, src, src_len);
+ if (REBASE) {
+ adler %= BASE;
+ sum2 %= BASE;
}
- return adler;
+ /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+ return adler | (sum2 << 16);
}
#endif /* ADLER32_P_H */
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 2ef02398e6..cbb1c784ef 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -259,14 +259,6 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
s[1] = vget_lane_u32(as, 1);
}
-static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
- unsigned int i;
- for (i = 0; i < len; ++i) {
- pair[0] += buf[i];
- pair[1] += pair[0];
- }
-}
-
Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
@@ -274,11 +266,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(adler, dst, src, sum2, COPY);
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler, dst, src, len, sum2, COPY);
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
uint32_t pair[2];
int n = NMAX;
@@ -308,21 +300,10 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
unsigned int align_adj = (align_offset) ? 32 - align_offset : 0;
if (align_offset && len >= (16 + align_adj)) {
- NEON_handle_tail(pair, src, align_adj);
-
- if (COPY) {
- const uint8_t* __restrict src_noalias = src;
- uint8_t* __restrict dst_noalias = dst;
- unsigned cpy_len = align_adj;
-
- while (cpy_len--) {
- *dst_noalias++ = *src_noalias++;
- }
- }
+ adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY);
n -= align_adj;
done += align_adj;
-
} else {
/* If here, we failed the len criteria test, it wouldn't be
* worthwhile to do scalar aligning sums */
@@ -348,22 +329,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
done += actual_nsums;
}
- /* Handle the tail elements. */
- if (done < len) {
- NEON_handle_tail(pair, (src + done), len - done);
- if (COPY) {
- const uint8_t* __restrict src_noalias = src + done;
- uint8_t* __restrict dst_noalias = dst + done;
- while (done++ != len) {
- *dst_noalias++ = *src_noalias++;
- }
- }
- pair[0] %= BASE;
- pair[1] %= BASE;
- }
-
- /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
- return (pair[1] << 16) | pair[0];
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY);
}
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c
index 4bd5538168..84c946f452 100644
--- a/arch/generic/adler32_c.c
+++ b/arch/generic/adler32_c.c
@@ -17,11 +17,11 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
@@ -45,7 +45,7 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
}
/* do remaining bytes (less than NMAX, still just one modulo) */
- return adler32_copy_len_64(adler, NULL, buf, len, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0);
}
Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
diff --git a/arch/loongarch/adler32_lasx.c b/arch/loongarch/adler32_lasx.c
index 92b942ea4b..a7268e73ff 100644
--- a/arch/loongarch/adler32_lasx.c
+++ b/arch/loongarch/adler32_lasx.c
@@ -41,7 +41,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
rem_peel:
if (len < 16) {
- return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
} else if (len < 32) {
if (COPY) {
return adler32_copy_lsx(adler, dst, src, len);
diff --git a/arch/loongarch/adler32_lsx.c b/arch/loongarch/adler32_lsx.c
index 4c3603193a..389f74c683 100644
--- a/arch/loongarch/adler32_lsx.c
+++ b/arch/loongarch/adler32_lsx.c
@@ -36,7 +36,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
rem_peel:
if (len < 16)
- return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c
index b002e4ed3a..39b3cf399c 100644
--- a/arch/power/adler32_power8.c
+++ b/arch/power/adler32_power8.c
@@ -59,11 +59,11 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(s1, NULL, buf, s2, 0);
+ return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0);
/* This is faster than VSX code for len < 64. */
if (len < 64)
- return adler32_copy_len_64(s1, NULL, buf, len, s2, 0);
+ return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
@@ -144,7 +144,7 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
- return adler32_copy_len_16(s1, NULL, buf, len, s2, 0);
+ return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0);
}
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index b1371da950..31eaf5e36d 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -15,14 +15,6 @@
#define vmx_zero() (vec_splat_u32(0))
-static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
- unsigned int i;
- for (i = 0; i < len; ++i) {
- pair[0] += buf[i];
- pair[1] += pair[0];
- }
-}
-
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
@@ -127,11 +119,11 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
uint32_t pair[4] ALIGNED_(16);
pair[0] = adler;
@@ -144,7 +136,7 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
unsigned int done = 0;
size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len);
if (align_len) {
- vmx_handle_head_or_tail(pair, buf, align_len);
+ adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0);
done += align_len;
/* Rather than rebasing, we can reduce the max sums for the
* first round only */
@@ -163,15 +155,8 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
done += (n / 16) * 16;
}
- /* Handle the tail elements. */
- if (done < len) {
- vmx_handle_head_or_tail(pair, (buf + done), len - done);
- pair[0] %= BASE;
- pair[1] %= BASE;
- }
-
- /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
- return (pair[1] << 16) | pair[0];
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0);
}
/* VMX stores can have higher latency than optimized memcpy */
diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c
index 8c297a617b..e446189302 100644
--- a/arch/riscv/adler32_rvv.c
+++ b/arch/riscv/adler32_rvv.c
@@ -18,11 +18,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(adler, dst, src, sum2, COPY);
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler, dst, src, len, sum2, COPY);
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
@@ -104,16 +104,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric
sum2 %= BASE;
adler %= BASE;
- while (left--) {
- if (COPY) *dst++ = *src;
- adler += *src++;
- sum2 += adler;
- }
-
- sum2 %= BASE;
- adler %= BASE;
-
- return adler | (sum2 << 16);
+ /* Process tail (left < 256). */
+ return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY);
}
Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c
index 4b1f0dac98..d1811b254d 100644
--- a/arch/x86/adler32_avx2.c
+++ b/arch/x86/adler32_avx2.c
@@ -25,7 +25,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
rem_peel:
if (len < 16) {
- return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
} else if (len < 32) {
if (COPY) {
return adler32_copy_sse42(adler, dst, src, len);
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c
index ea1d370372..c2301213f0 100644
--- a/arch/x86/adler32_sse42.c
+++ b/arch/x86/adler32_sse42.c
@@ -21,7 +21,7 @@ Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8
rem_peel:
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler0, dst, src, len, adler1, 1);
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1);
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 4f9ee22f91..9d2715a435 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -21,11 +21,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
@@ -60,13 +60,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
goto unaligned_jmp;
}
- for (size_t i = 0; i < align_offset; ++i) {
- adler += *(buf++);
- sum2 += adler;
- }
+ adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0);
/* lop off the max number of sums based on the scalar sums done
* above */
+ buf += align_offset;
len -= align_offset;
max_iters -= align_offset;
}
@@ -143,7 +141,7 @@ unaligned_jmp:
}
/* Process tail (len < 16). */
- return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0);
}
/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
diff --git a/zbuild.h b/zbuild.h
index f1a0f3217e..39903d2176 100644
--- a/zbuild.h
+++ b/zbuild.h
@@ -180,6 +180,14 @@
# define Z_REGISTER
#endif
+#if defined(_MSC_VER)
+# define Z_RESTRICT __restrict
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+# define Z_RESTRICT restrict
+#else
+# define Z_RESTRICT __restrict__
+#endif
+
/* Reverse the bytes in a value. Use compiler intrinsics when
possible to take advantage of hardware implementations. */
#if defined(_MSC_VER) && (_MSC_VER >= 1300)