Diffstat (file mode, path, lines touched):
 -rw-r--r--  adler32_p.h                    107
 -rw-r--r--  arch/arm/adler32_neon.c         43
 -rw-r--r--  arch/generic/adler32_c.c         6
 -rw-r--r--  arch/loongarch/adler32_lasx.c    2
 -rw-r--r--  arch/loongarch/adler32_lsx.c     2
 -rw-r--r--  arch/power/adler32_power8.c      6
 -rw-r--r--  arch/power/adler32_vmx.c        25
 -rw-r--r--  arch/riscv/adler32_rvv.c        16
 -rw-r--r--  arch/x86/adler32_avx2.c          2
 -rw-r--r--  arch/x86/adler32_sse42.c         2
 -rw-r--r--  arch/x86/adler32_ssse3.c        12
 -rw-r--r--  zbuild.h                         8
12 files changed, 104 insertions, 127 deletions
diff --git a/adler32_p.h b/adler32_p.h index 12fb0ddf23..b5d5f1615c 100644 --- a/adler32_p.h +++ b/adler32_p.h @@ -18,54 +18,81 @@ #define ADLER_DO8(sum1, sum2, buf, i) {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);} #define ADLER_DO16(sum1, sum2, buf) {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);} -static inline uint32_t adler32_copy_len_1(uint32_t adler, uint8_t *dst, const uint8_t *buf, uint32_t sum2, const int COPY) { - uint8_t c = *buf; - if (COPY) { - *dst = c; +Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len, + uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) { + Z_UNUSED(MAX_LEN); + if (len & 1) { + if (COPY) { + *dst = *buf; + dst += 1; + } + ADLER_DO1(*adler, *sum2, buf, 0); + buf += 1; } - adler += c; - adler %= BASE; - sum2 += adler; - sum2 %= BASE; - return adler | (sum2 << 16); -} - -static inline uint32_t adler32_copy_len_16(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - while (len--) { - uint8_t c = *buf++; + if (len & 2) { if (COPY) { - *dst++ = c; + memcpy(dst, buf, 2); + dst += 2; } - adler += c; - sum2 += adler; + ADLER_DO2(*adler, *sum2, buf, 0); + buf += 2; + } + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(*adler, *sum2, buf, 0); + buf += 4; } - adler %= BASE; - sum2 %= BASE; /* only added so many BASE's */ - /* return recombined sums */ - return adler | (sum2 << 16); } -static inline uint32_t adler32_copy_len_64(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - const uint8_t *src = buf; - const size_t src_len = len; -#ifdef UNROLL_MORE - while (len >= 16) { - len -= 16; - ADLER_DO16(adler, sum2, buf); - buf += 16; -#else - while (len >= 8) { - len -= 8; - ADLER_DO8(adler, sum2, buf, 0); - buf += 8; -#endif +Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t 
*dst, const uint8_t *buf, size_t len, + uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { + if (len) { + /* DO16 loop for large remainders only (scalar, risc-v). */ + if (MAX_LEN >= 32) { + while (len >= 16) { + if (COPY) { + memcpy(dst, buf, 16); + dst += 16; + } + len -= 16; + ADLER_DO16(adler, sum2, buf); + buf += 16; + } + } + /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(adler, sum2, buf, 0); + buf += 4; + } + if (len & 2) { + if (COPY) { + memcpy(dst, buf, 2); + dst += 2; + } + ADLER_DO2(adler, sum2, buf, 0); + buf += 2; + } + if (len & 1) { + if (COPY) + *dst = *buf; + ADLER_DO1(adler, sum2, buf, 0); + } } - /* Process tail (len < 16). */ - adler = adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); - if (COPY) { - memcpy(dst, src, src_len); + if (REBASE) { + adler %= BASE; + sum2 %= BASE; } - return adler; + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. 
*/ + return adler | (sum2 << 16); } #endif /* ADLER32_P_H */ diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 2ef02398e6..cbb1c784ef 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -259,14 +259,6 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l s[1] = vget_lane_u32(as, 1); } -static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { /* split Adler-32 into component sums */ uint32_t sum2 = (adler >> 16) & 0xffff; @@ -274,11 +266,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); uint32_t pair[2]; int n = NMAX; @@ -308,21 +300,10 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co unsigned int align_adj = (align_offset) ? 
32 - align_offset : 0; if (align_offset && len >= (16 + align_adj)) { - NEON_handle_tail(pair, src, align_adj); - - if (COPY) { - const uint8_t* __restrict src_noalias = src; - uint8_t* __restrict dst_noalias = dst; - unsigned cpy_len = align_adj; - - while (cpy_len--) { - *dst_noalias++ = *src_noalias++; - } - } + adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY); n -= align_adj; done += align_adj; - } else { /* If here, we failed the len criteria test, it wouldn't be * worthwhile to do scalar aligning sums */ @@ -348,22 +329,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co done += actual_nsums; } - /* Handle the tail elements. */ - if (done < len) { - NEON_handle_tail(pair, (src + done), len - done); - if (COPY) { - const uint8_t* __restrict src_noalias = src + done; - uint8_t* __restrict dst_noalias = dst + done; - while (done++ != len) { - *dst_noalias++ = *src_noalias++; - } - } - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY); } Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c index 4bd5538168..84c946f452 100644 --- a/arch/generic/adler32_c.c +++ b/arch/generic/adler32_c.c @@ -17,11 +17,11 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { @@ -45,7 +45,7 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { } /* do remaining bytes (less than NMAX, still just one modulo) */ - return adler32_copy_len_64(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0); } Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { diff --git a/arch/loongarch/adler32_lasx.c b/arch/loongarch/adler32_lasx.c index 92b942ea4b..a7268e73ff 100644 --- a/arch/loongarch/adler32_lasx.c +++ b/arch/loongarch/adler32_lasx.c @@ -41,7 +41,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_lsx(adler, dst, src, len); diff --git a/arch/loongarch/adler32_lsx.c b/arch/loongarch/adler32_lsx.c index 
4c3603193a..389f74c683 100644 --- a/arch/loongarch/adler32_lsx.c +++ b/arch/loongarch/adler32_lsx.c @@ -36,7 +36,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c index b002e4ed3a..39b3cf399c 100644 --- a/arch/power/adler32_power8.c +++ b/arch/power/adler32_power8.c @@ -59,11 +59,11 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(s1, NULL, buf, s2, 0); + return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0); /* This is faster than VSX code for len < 64. */ if (len < 64) - return adler32_copy_len_64(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0); /* Use POWER VSX instructions for len >= 64. */ const vector unsigned int v_zeros = { 0 }; @@ -144,7 +144,7 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s s2 = vs2[0] % BASE; /* Process tail (len < 16). 
*/ - return adler32_copy_len_16(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0); } Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c index b1371da950..31eaf5e36d 100644 --- a/arch/power/adler32_vmx.c +++ b/arch/power/adler32_vmx.c @@ -15,14 +15,6 @@ #define vmx_zero() (vec_splat_u32(0)) -static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) { /* Different taps for the separable components of sums */ const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49}; @@ -127,11 +119,11 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); uint32_t pair[4] ALIGNED_(16); pair[0] = adler; @@ -144,7 +136,7 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) unsigned int done = 0; size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len); if (align_len) { - vmx_handle_head_or_tail(pair, buf, align_len); + adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0); done += align_len; /* Rather than rebasing, we can reduce the max sums for the * first round only */ @@ -163,15 +155,8 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) done += (n / 16) * 16; } - /* Handle the tail 
elements. */ - if (done < len) { - vmx_handle_head_or_tail(pair, (buf + done), len - done); - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). */ + return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0); } /* VMX stores can have higher latency than optimized memcpy */ diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c index 8c297a617b..e446189302 100644 --- a/arch/riscv/adler32_rvv.c +++ b/arch/riscv/adler32_rvv.c @@ -18,11 +18,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); size_t left = len; size_t vl = __riscv_vsetvlmax_e8m1(); @@ -104,16 +104,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric sum2 %= BASE; adler %= BASE; - while (left--) { - if (COPY) *dst++ = *src; - adler += *src++; - sum2 += adler; - } - - sum2 %= BASE; - adler %= BASE; - - return adler | (sum2 << 16); + /* Process tail (left < 256). 
*/ + return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY); } Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index 4b1f0dac98..d1811b254d 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -25,7 +25,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_sse42(adler, dst, src, len); diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index ea1d370372..c2301213f0 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -21,7 +21,7 @@ Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8 rem_peel: if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler0, dst, src, len, adler1, 1); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c index 4f9ee22f91..9d2715a435 100644 --- a/arch/x86/adler32_ssse3.c +++ b/arch/x86/adler32_ssse3.c @@ -21,11 +21,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); const 
__m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); @@ -60,13 +60,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len goto unaligned_jmp; } - for (size_t i = 0; i < align_offset; ++i) { - adler += *(buf++); - sum2 += adler; - } + adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0); /* lop off the max number of sums based on the scalar sums done * above */ + buf += align_offset; len -= align_offset; max_iters -= align_offset; } @@ -143,7 +141,7 @@ unaligned_jmp: } /* Process tail (len < 16). */ - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0); } /* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */ @@ -180,6 +180,14 @@ # define Z_REGISTER #endif +#if defined(_MSC_VER) +# define Z_RESTRICT __restrict +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +# define Z_RESTRICT restrict +#else +# define Z_RESTRICT __restrict__ +#endif + /* Reverse the bytes in a value. Use compiler intrinsics when possible to take advantage of hardware implementations. */ #if defined(_MSC_VER) && (_MSC_VER >= 1300) |
