| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-03-06 11:38:28 -0800 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-08 23:37:37 +0100 |
| commit | a5b10a321d1a4011cb188e09f9d29f22fe6b5338 (patch) | |
| tree | 677824c625472cc162dad4869a969bf69906e54e | |
| parent | 30206c1cac40a8bed335405ed2e64559fb7b40bc (diff) | |
| download | Project-Tick-a5b10a321d1a4011cb188e09f9d29f22fe6b5338.tar.gz Project-Tick-a5b10a321d1a4011cb188e09f9d29f22fe6b5338.zip | |
Add NMAX_ALIGNED32 and use it in NEON adler32
Define NMAX_ALIGNED32 as NMAX rounded down to a multiple of 32 (5536)
and use it in the NEON adler32 implementation to ensure that src stays
32-byte aligned throughout the main SIMD loop. NMAX (5552) is not a
multiple of 32, so previously, after the alignment preamble, the first
iteration could process a number of bytes that was not a multiple of 32,
causing src to lose 32-byte alignment for all subsequent iterations.
The first iteration's budget is rounded down with ALIGN_DOWN after
subtracting align_diff, ensuring k is always a multiple of 32.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
| -rw-r--r-- | adler32_p.h | 2 |
| -rw-r--r-- | arch/arm/adler32_neon.c | 6 |
2 files changed, 5 insertions, 3 deletions
diff --git a/adler32_p.h b/adler32_p.h
index b5d5f1615c..836029b2ab 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -11,6 +11,8 @@
 #define BASE 65521U     /* largest prime smaller than 65536 */
 #define NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+#define NMAX_ALIGNED32 (NMAX & ~31)
+/* NMAX rounded down to a multiple of 32 is 5536 */
 
 #define ADLER_DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
 #define ADLER_DO2(sum1, sum2, buf, i) {ADLER_DO1(sum1, sum2, buf, i); ADLER_DO1(sum1, sum2, buf, i+1);}
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 9bbb293bd8..169a785edc 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -294,7 +294,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
      * In the copying variant we use fallback to 4x loads and 4x stores,
      * as ld1x4 seems to block ILP when stores are in the mix */
     size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
-    size_t n = NMAX;
+    size_t n = NMAX_ALIGNED32;
 
     if (align_diff) {
         adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
@@ -302,7 +302,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
         dst += align_diff;
         src += align_diff;
         len -= align_diff;
-        n -= align_diff;
+        n = ALIGN_DOWN(n - align_diff, 32);
     }
 
     while (len >= 16) {
@@ -321,7 +321,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
         if (COPY)
             dst += k;
         len -= k;
-        n = NMAX;
+        n = NMAX_ALIGNED32;
     }
 
     /* Process tail (len < 16). */
