Simplify adler32 alignment loops to advance pointers

Replace done-offset tracking with direct pointer advancement in NEON, VMX, and SSSE3 adler32 implementations. Use ALIGN_DIFF consistently across all architectures for the initial alignment step.
author: Nathan Moinvaziri <nathan@nathanm.com> 2026-02-23 14:26:41 -0800
committer: Hans Kristian Rosbach <hk-github@circlestorm.org> 2026-03-05 20:30:46 +0100
commit: b24577be61651aa14586864f3f49d98c07bfaee3 (patch)
tree: 20becf6c365a492af3cf04e4eefc49f6c10739f9
parent: d8136aea2a074c950b91f6c609c43a55a7990056 (diff)
download: Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.tar.gz Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.zip
3 files changed, 63 insertions, 73 deletions
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index a55c8c1353..5ba9b41178 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -273,8 +273,6 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
         return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
 
     uint32_t pair[2];
-    int n = NMAX;
-    unsigned int done = 0;
 
     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
@@ -295,43 +293,39 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
      * alignment, but it's unclear which other SIPs will benefit from it.
      * In the copying variant we use fallback to 4x loads and 4x stores,
      * as ld1x4 seems to block ILP when stores are in the mix */
-    unsigned int align_offset = ((uintptr_t)src & 31);
-    unsigned int align_adj = (align_offset) ? 32 - align_offset : 0;
-
-    if (len < (16 + align_adj)) {
-        return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY);
-    } else if (align_offset) {
-        adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY);
-
-        n -= align_adj;
-        done += align_adj;
-    } else {
-        /* If here, we failed the len criteria test, it wouldn't be
-         * worthwhile to do scalar aligning sums */
-        align_adj = 0;
-    }
+    size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+    size_t n = NMAX;
+    if (align_diff) {
+        adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
 
-    while (done < len) {
-        int remaining = (int)(len - done);
-        n = MIN(remaining, (done == align_adj) ? n : NMAX);
+        if (COPY)
+            dst += align_diff;
+        src += align_diff;
+        len -= align_diff;
+        n -= align_diff;
+    }
 
-        if (n < 16)
-            break;
+    while (len >= 16) {
+        n = MIN(len, n);
 
         if (COPY)
-            NEON_accum32_copy(pair, dst + done, src + done, n >> 4);
+            NEON_accum32_copy(pair, dst, src, n >> 4);
         else
-            NEON_accum32(pair, src + done, n >> 4);
+            NEON_accum32(pair, src, n >> 4);
 
         pair[0] %= BASE;
         pair[1] %= BASE;
 
-        int actual_nsums = (n >> 4) << 4;
-        done += actual_nsums;
+        size_t k = (n >> 4) << 4;
+        src += k;
+        if (COPY)
+            dst += k;
+        len -= k;
+        n = NMAX;
     }
 
     /* Process tail (len < 16).  */
-    return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY);
+    return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
 }
 
 Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 31eaf5e36d..5171bab35b 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -132,31 +132,31 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
     pair[3] = 0;
 
     // Align buffer
-    int n = NMAX;
-    unsigned int done = 0;
-    size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len);
-    if (align_len) {
-        adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0);
-        done += align_len;
-        /* Rather than rebasing, we can reduce the max sums for the
-         * first round only */
-        n -= align_len;
+    size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+    size_t n = NMAX;
+    if (align_diff) {
+        adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+        buf += align_diff;
+        len -= align_diff;
+        n -= align_diff;
     }
-    for (size_t i = align_len; i < len; i += n) {
-        int remaining = (int)(len-i);
-        n = MIN(remaining, (i == align_len) ? n : NMAX);
-        if (n < 16)
-            break;
 
-        vmx_accum32(pair, buf + i, n / 16);
+    while (len >= 16) {
+        n = MIN(len, n);
+
+        vmx_accum32(pair, buf, n / 16);
         pair[0] %= BASE;
         pair[1] %= BASE;
 
-        done += (n / 16) * 16;
+        size_t k = (n / 16) * 16;
+        buf += k;
+        len -= k;
+        n = NMAX;
     }
 
     /* Process tail (len < 16).  */
-    return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0);
+    return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
 }
 
 /* VMX stores can have higher latency than optimized memcpy */
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 9d2715a435..702db50251 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -38,37 +38,33 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
     /* If our buffer is unaligned (likely), make the determination whether
      * or not there's enough of a buffer to consume to make the scalar, aligning
      * additions worthwhile or if it's worth it to just eat the cost of an unaligned
-     * load. This is a pretty simple test, just test if 16 - the remainder + len is
-     * < 16 */
-    size_t max_iters = NMAX;
-    size_t rem = (uintptr_t)buf & 15;
-    size_t align_offset = 16 - rem;
+     * load. This is a pretty simple test, just test if len < 32 */
+    size_t n = NMAX;
     size_t k = 0;
-    if (rem) {
-        if (len < 16 + align_offset) {
-            /* Let's eat the cost of this one unaligned load so that
-             * we don't completely skip over the vectorization. Doing
-             * 16 bytes at a time unaligned is better than 16 + <= 15
-             * sums */
-            vbuf = _mm_loadu_si128((__m128i*)buf);
-            len -= 16;
-            buf += 16;
-            vs1 = _mm_cvtsi32_si128(adler);
-            vs2 = _mm_cvtsi32_si128(sum2);
-            vs3 = _mm_setzero_si128();
-            vs1_0 = vs1;
-            goto unaligned_jmp;
-        }
 
-        adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0);
-
-        /* lop off the max number of sums based on the scalar sums done
-         * above */
-        buf += align_offset;
-        len -= align_offset;
-        max_iters -= align_offset;
+    if (len < 32) {
+        /* Let's eat the cost of this one unaligned load so that
+         * we don't completely skip over the vectorization. Doing
+         * 16 bytes at a time unaligned is better than 16 + <= 15
+         * sums */
+        vbuf = _mm_loadu_si128((__m128i*)buf);
+        len -= 16;
+        buf += 16;
+        vs1 = _mm_cvtsi32_si128(adler);
+        vs2 = _mm_cvtsi32_si128(sum2);
+        vs3 = _mm_setzero_si128();
+        vs1_0 = vs1;
+        goto unaligned_jmp;
     }
 
+    size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+    if (align_diff) {
+        adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0);
+
+        buf += align_diff;
+        len -= align_diff;
+        n -= align_diff;
+    }
 
     while (len >= 16) {
         vs1 = _mm_cvtsi32_si128(adler);
@@ -77,7 +73,7 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
         vs2_0 = _mm_setzero_si128();
         vs1_0 = vs1;
 
-        k = ALIGN_DOWN(MIN(len, max_iters), 16);
+        k = ALIGN_DOWN(MIN(len, n), 16);
         len -= k;
 
         while (k >= 32) {
@@ -137,7 +133,7 @@ unaligned_jmp:
          * 0 and 2. This saves us some contention on the shuffle port(s) */
         adler = partial_hsum(vs1) % BASE;
         sum2 = hsum(vs2) % BASE;
-        max_iters = NMAX;
+        n = NMAX;
     }
 
     /* Process tail (len < 16).  */
author	Nathan Moinvaziri <nathan@nathanm.com>	2026-02-23 14:26:41 -0800
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>	2026-03-05 20:30:46 +0100
commit	b24577be61651aa14586864f3f49d98c07bfaee3 (patch)
tree	20becf6c365a492af3cf04e4eefc49f6c10739f9
parent	d8136aea2a074c950b91f6c609c43a55a7990056 (diff)
download	Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.tar.gz Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.zip