summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Moinvaziri <nathan@nathanm.com>2026-03-06 11:38:28 -0800
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-08 23:37:37 +0100
commita5b10a321d1a4011cb188e09f9d29f22fe6b5338 (patch)
tree677824c625472cc162dad4869a969bf69906e54e
parent30206c1cac40a8bed335405ed2e64559fb7b40bc (diff)
downloadProject-Tick-a5b10a321d1a4011cb188e09f9d29f22fe6b5338.tar.gz
Project-Tick-a5b10a321d1a4011cb188e09f9d29f22fe6b5338.zip
Add NMAX_ALIGNED32 and use it in NEON adler32
Define NMAX_ALIGNED32 as NMAX rounded down to a multiple of 32 (5536) and use it in the NEON adler32 implementation to ensure that src stays 32-byte aligned throughout the main SIMD loop. Because NMAX (5552) is not a multiple of 32, previously, after the alignment preamble, the first iteration could process a non-32-aligned number of bytes, causing src to lose 32-byte alignment for all subsequent iterations. The first iteration's budget is now rounded down with ALIGN_DOWN after subtracting align_diff, ensuring k is always a multiple of 32. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
-rw-r--r--adler32_p.h2
-rw-r--r--arch/arm/adler32_neon.c6
2 files changed, 5 insertions, 3 deletions
diff --git a/adler32_p.h b/adler32_p.h
index b5d5f1615c..836029b2ab 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -11,6 +11,8 @@
#define BASE 65521U /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+#define NMAX_ALIGNED32 (NMAX & ~31)
+/* NMAX rounded down to a multiple of 32 is 5536 */
#define ADLER_DO1(sum1, sum2, buf, i) {(sum1) += buf[(i)]; (sum2) += (sum1);}
#define ADLER_DO2(sum1, sum2, buf, i) {ADLER_DO1(sum1, sum2, buf, i); ADLER_DO1(sum1, sum2, buf, i+1);}
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 9bbb293bd8..169a785edc 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -294,7 +294,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
* In the copying variant we use fallback to 4x loads and 4x stores,
* as ld1x4 seems to block ILP when stores are in the mix */
size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
- size_t n = NMAX;
+ size_t n = NMAX_ALIGNED32;
if (align_diff) {
adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
@@ -302,7 +302,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
dst += align_diff;
src += align_diff;
len -= align_diff;
- n -= align_diff;
+ n = ALIGN_DOWN(n - align_diff, 32);
}
while (len >= 16) {
@@ -321,7 +321,7 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
if (COPY)
dst += k;
len -= k;
- n = NMAX;
+ n = NMAX_ALIGNED32;
}
/* Process tail (len < 16). */