From 5fd8b67c1a586302106606b2be3c784fb6dc2124 Mon Sep 17 00:00:00 2001 From: Nathan Moinvaziri Date: Sun, 18 Jan 2026 17:03:26 -0800 Subject: Refactor and unify adler32 short length processing. We have one function for aligning and one for tail processing. When processing the tail, we only need to rebase if there is data left to process, by checking for this condition we can reduce a rebase which is beneficial for slower machines. Used a DO4 loop maximum for the inlined tail for GCC/-O2 to limit register pressure on x86. For tails where MAX_LEN can be larger, we support using DO16 similar to the default loop used in the scalar C version of adler32. Z_RESTRICT is necessary to let the compiler know that src and dst won't overlap and that it doesn't have to account for that case. --- adler32_p.h | 107 ++++++++++++++++++++++++++---------------- arch/arm/adler32_neon.c | 43 ++--------------- arch/generic/adler32_c.c | 6 +-- arch/loongarch/adler32_lasx.c | 2 +- arch/loongarch/adler32_lsx.c | 2 +- arch/power/adler32_power8.c | 6 +-- arch/power/adler32_vmx.c | 25 ++-------- arch/riscv/adler32_rvv.c | 16 ++----- arch/x86/adler32_avx2.c | 2 +- arch/x86/adler32_sse42.c | 2 +- arch/x86/adler32_ssse3.c | 12 ++--- zbuild.h | 8 ++++ 12 files changed, 104 insertions(+), 127 deletions(-) diff --git a/adler32_p.h b/adler32_p.h index 12fb0ddf23..b5d5f1615c 100644 --- a/adler32_p.h +++ b/adler32_p.h @@ -18,54 +18,81 @@ #define ADLER_DO8(sum1, sum2, buf, i) {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);} #define ADLER_DO16(sum1, sum2, buf) {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);} -static inline uint32_t adler32_copy_len_1(uint32_t adler, uint8_t *dst, const uint8_t *buf, uint32_t sum2, const int COPY) { - uint8_t c = *buf; - if (COPY) { - *dst = c; +Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len, + uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) { + 
Z_UNUSED(MAX_LEN); + if (len & 1) { + if (COPY) { + *dst = *buf; + dst += 1; + } + ADLER_DO1(*adler, *sum2, buf, 0); + buf += 1; } - adler += c; - adler %= BASE; - sum2 += adler; - sum2 %= BASE; - return adler | (sum2 << 16); -} - -static inline uint32_t adler32_copy_len_16(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - while (len--) { - uint8_t c = *buf++; + if (len & 2) { if (COPY) { - *dst++ = c; + memcpy(dst, buf, 2); + dst += 2; } - adler += c; - sum2 += adler; + ADLER_DO2(*adler, *sum2, buf, 0); + buf += 2; + } + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(*adler, *sum2, buf, 0); + buf += 4; } - adler %= BASE; - sum2 %= BASE; /* only added so many BASE's */ - /* return recombined sums */ - return adler | (sum2 << 16); } -static inline uint32_t adler32_copy_len_64(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { - const uint8_t *src = buf; - const size_t src_len = len; -#ifdef UNROLL_MORE - while (len >= 16) { - len -= 16; - ADLER_DO16(adler, sum2, buf); - buf += 16; -#else - while (len >= 8) { - len -= 8; - ADLER_DO8(adler, sum2, buf, 0); - buf += 8; -#endif +Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, + uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { + if (len) { + /* DO16 loop for large remainders only (scalar, risc-v). */ + if (MAX_LEN >= 32) { + while (len >= 16) { + if (COPY) { + memcpy(dst, buf, 16); + dst += 16; + } + len -= 16; + ADLER_DO16(adler, sum2, buf); + buf += 16; + } + } + /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. 
*/ + while (len >= 4) { + if (COPY) { + memcpy(dst, buf, 4); + dst += 4; + } + len -= 4; + ADLER_DO4(adler, sum2, buf, 0); + buf += 4; + } + if (len & 2) { + if (COPY) { + memcpy(dst, buf, 2); + dst += 2; + } + ADLER_DO2(adler, sum2, buf, 0); + buf += 2; + } + if (len & 1) { + if (COPY) + *dst = *buf; + ADLER_DO1(adler, sum2, buf, 0); + } } - /* Process tail (len < 16). */ - adler = adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); - if (COPY) { - memcpy(dst, src, src_len); + if (REBASE) { + adler %= BASE; + sum2 %= BASE; } - return adler; + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ + return adler | (sum2 << 16); } #endif /* ADLER32_P_H */ diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 2ef02398e6..cbb1c784ef 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -259,14 +259,6 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l s[1] = vget_lane_u32(as, 1); } -static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { /* split Adler-32 into component sums */ uint32_t sum2 = (adler >> 16) & 0xffff; @@ -274,11 +266,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); uint32_t pair[2]; int n = NMAX; @@ -308,21 +300,10 @@ Z_FORCEINLINE static uint32_t 
adler32_copy_impl(uint32_t adler, uint8_t *dst, co unsigned int align_adj = (align_offset) ? 32 - align_offset : 0; if (align_offset && len >= (16 + align_adj)) { - NEON_handle_tail(pair, src, align_adj); - - if (COPY) { - const uint8_t* __restrict src_noalias = src; - uint8_t* __restrict dst_noalias = dst; - unsigned cpy_len = align_adj; - - while (cpy_len--) { - *dst_noalias++ = *src_noalias++; - } - } + adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY); n -= align_adj; done += align_adj; - } else { /* If here, we failed the len criteria test, it wouldn't be * worthwhile to do scalar aligning sums */ @@ -348,22 +329,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co done += actual_nsums; } - /* Handle the tail elements. */ - if (done < len) { - NEON_handle_tail(pair, (src + done), len - done); - if (COPY) { - const uint8_t* __restrict src_noalias = src + done; - uint8_t* __restrict dst_noalias = dst + done; - while (done++ != len) { - *dst_noalias++ = *src_noalias++; - } - } - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY); } Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c index 4bd5538168..84c946f452 100644 --- a/arch/generic/adler32_c.c +++ b/arch/generic/adler32_c.c @@ -17,11 +17,11 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { @@ -45,7 +45,7 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { } /* do remaining bytes (less than NMAX, still just one modulo) */ - return adler32_copy_len_64(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0); } Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { diff --git a/arch/loongarch/adler32_lasx.c b/arch/loongarch/adler32_lasx.c index 92b942ea4b..a7268e73ff 100644 --- a/arch/loongarch/adler32_lasx.c +++ b/arch/loongarch/adler32_lasx.c @@ -41,7 +41,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_lsx(adler, dst, src, len); diff --git a/arch/loongarch/adler32_lsx.c b/arch/loongarch/adler32_lsx.c index 
4c3603193a..389f74c683 100644 --- a/arch/loongarch/adler32_lsx.c +++ b/arch/loongarch/adler32_lsx.c @@ -36,7 +36,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c index b002e4ed3a..39b3cf399c 100644 --- a/arch/power/adler32_power8.c +++ b/arch/power/adler32_power8.c @@ -59,11 +59,11 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(s1, NULL, buf, s2, 0); + return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0); /* This is faster than VSX code for len < 64. */ if (len < 64) - return adler32_copy_len_64(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0); /* Use POWER VSX instructions for len >= 64. */ const vector unsigned int v_zeros = { 0 }; @@ -144,7 +144,7 @@ Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, s s2 = vs2[0] % BASE; /* Process tail (len < 16). 
*/ - return adler32_copy_len_16(s1, NULL, buf, len, s2, 0); + return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0); } Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c index b1371da950..31eaf5e36d 100644 --- a/arch/power/adler32_vmx.c +++ b/arch/power/adler32_vmx.c @@ -15,14 +15,6 @@ #define vmx_zero() (vec_splat_u32(0)) -static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) { - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) { /* Different taps for the separable components of sums */ const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49}; @@ -127,11 +119,11 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); uint32_t pair[4] ALIGNED_(16); pair[0] = adler; @@ -144,7 +136,7 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) unsigned int done = 0; size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len); if (align_len) { - vmx_handle_head_or_tail(pair, buf, align_len); + adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0); done += align_len; /* Rather than rebasing, we can reduce the max sums for the * first round only */ @@ -163,15 +155,8 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) done += (n / 16) * 16; } - /* Handle the tail 
elements. */ - if (done < len) { - vmx_handle_head_or_tail(pair, (buf + done), len - done); - pair[0] %= BASE; - pair[1] %= BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; + /* Process tail (len < 16). */ + return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0); } /* VMX stores can have higher latency than optimized memcpy */ diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c index 8c297a617b..e446189302 100644 --- a/arch/riscv/adler32_rvv.c +++ b/arch/riscv/adler32_rvv.c @@ -18,11 +18,11 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, dst, src, sum2, COPY); + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, dst, src, len, sum2, COPY); + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); size_t left = len; size_t vl = __riscv_vsetvlmax_e8m1(); @@ -104,16 +104,8 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restric sum2 %= BASE; adler %= BASE; - while (left--) { - if (COPY) *dst++ = *src; - adler += *src++; - sum2 += adler; - } - - sum2 %= BASE; - adler %= BASE; - - return adler | (sum2 << 16); + /* Process tail (left < 256). 
*/ + return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY); } Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index 4b1f0dac98..d1811b254d 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -25,7 +25,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co rem_peel: if (len < 16) { - return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); } else if (len < 32) { if (COPY) { return adler32_copy_sse42(adler, dst, src, len); diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index ea1d370372..c2301213f0 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -21,7 +21,7 @@ Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8 rem_peel: if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler0, dst, src, len, adler1, 1); + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1); __m128i vbuf, vbuf_0; __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c index 4f9ee22f91..9d2715a435 100644 --- a/arch/x86/adler32_ssse3.c +++ b/arch/x86/adler32_ssse3.c @@ -21,11 +21,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len /* in case user likes doing a byte at a time, keep it fast */ if (UNLIKELY(len == 1)) - return adler32_copy_len_1(adler, NULL, buf, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); /* in case short lengths are provided, keep it somewhat fast */ if (UNLIKELY(len < 16)) - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); const 
__m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); @@ -60,13 +60,11 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len goto unaligned_jmp; } - for (size_t i = 0; i < align_offset; ++i) { - adler += *(buf++); - sum2 += adler; - } + adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0); /* lop off the max number of sums based on the scalar sums done * above */ + buf += align_offset; len -= align_offset; max_iters -= align_offset; } @@ -143,7 +141,7 @@ unaligned_jmp: } /* Process tail (len < 16). */ - return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0); } /* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */ diff --git a/zbuild.h b/zbuild.h index f1a0f3217e..39903d2176 100644 --- a/zbuild.h +++ b/zbuild.h @@ -180,6 +180,14 @@ # define Z_REGISTER #endif +#if defined(_MSC_VER) +# define Z_RESTRICT __restrict +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +# define Z_RESTRICT restrict +#else +# define Z_RESTRICT __restrict__ +#endif + /* Reverse the bytes in a value. Use compiler intrinsics when possible to take advantage of hardware implementations. */ #if defined(_MSC_VER) && (_MSC_VER >= 1300) -- cgit 0.0.5-2-1-g0f52