summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nathan Moinvaziri <nathan@nathanm.com> 2026-02-23 14:26:41 -0800
committer Hans Kristian Rosbach <hk-github@circlestorm.org> 2026-03-05 20:30:46 +0100
commit b24577be61651aa14586864f3f49d98c07bfaee3 (patch)
tree 20becf6c365a492af3cf04e4eefc49f6c10739f9
parent d8136aea2a074c950b91f6c609c43a55a7990056 (diff)
download Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.tar.gz
Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.zip
Simplify adler32 alignment loops to advance pointers
Replace done-offset tracking with direct pointer advancement in NEON, VMX, and SSSE3 adler32 implementations. Use ALIGN_DIFF consistently across all architectures for the initial alignment step.
-rw-r--r-- arch/arm/adler32_neon.c 48
-rw-r--r-- arch/power/adler32_vmx.c 34
-rw-r--r-- arch/x86/adler32_ssse3.c 54
3 files changed, 63 insertions, 73 deletions
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index a55c8c1353..5ba9b41178 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -273,8 +273,6 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
uint32_t pair[2];
- int n = NMAX;
- unsigned int done = 0;
/* Split Adler-32 into component sums, it can be supplied by
* the caller sites (e.g. in a PNG file).
@@ -295,43 +293,39 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
* alignment, but it's unclear which other SIPs will benefit from it.
* In the copying variant we use fallback to 4x loads and 4x stores,
* as ld1x4 seems to block ILP when stores are in the mix */
- unsigned int align_offset = ((uintptr_t)src & 31);
- unsigned int align_adj = (align_offset) ? 32 - align_offset : 0;
-
- if (len < (16 + align_adj)) {
- return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY);
- } else if (align_offset) {
- adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY);
-
- n -= align_adj;
- done += align_adj;
- } else {
- /* If here, we failed the len criteria test, it wouldn't be
- * worthwhile to do scalar aligning sums */
- align_adj = 0;
- }
+ size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+ size_t n = NMAX;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
- while (done < len) {
- int remaining = (int)(len - done);
- n = MIN(remaining, (done == align_adj) ? n : NMAX);
+ if (COPY)
+ dst += align_diff;
+ src += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
- if (n < 16)
- break;
+ while (len >= 16) {
+ n = MIN(len, n);
if (COPY)
- NEON_accum32_copy(pair, dst + done, src + done, n >> 4);
+ NEON_accum32_copy(pair, dst, src, n >> 4);
else
- NEON_accum32(pair, src + done, n >> 4);
+ NEON_accum32(pair, src, n >> 4);
pair[0] %= BASE;
pair[1] %= BASE;
- int actual_nsums = (n >> 4) << 4;
- done += actual_nsums;
+ size_t k = (n >> 4) << 4;
+ src += k;
+ if (COPY)
+ dst += k;
+ len -= k;
+ n = NMAX;
}
/* Process tail (len < 16). */
- return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY);
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
}
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 31eaf5e36d..5171bab35b 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -132,31 +132,31 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
pair[3] = 0;
// Align buffer
- int n = NMAX;
- unsigned int done = 0;
- size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len);
- if (align_len) {
- adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0);
- done += align_len;
- /* Rather than rebasing, we can reduce the max sums for the
- * first round only */
- n -= align_len;
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ size_t n = NMAX;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
}
- for (size_t i = align_len; i < len; i += n) {
- int remaining = (int)(len-i);
- n = MIN(remaining, (i == align_len) ? n : NMAX);
- if (n < 16)
- break;
- vmx_accum32(pair, buf + i, n / 16);
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ vmx_accum32(pair, buf, n / 16);
pair[0] %= BASE;
pair[1] %= BASE;
- done += (n / 16) * 16;
+ size_t k = (n / 16) * 16;
+ buf += k;
+ len -= k;
+ n = NMAX;
}
/* Process tail (len < 16). */
- return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0);
+ return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
}
/* VMX stores can have higher latency than optimized memcpy */
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 9d2715a435..702db50251 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -38,37 +38,33 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
/* If our buffer is unaligned (likely), make the determination whether
* or not there's enough of a buffer to consume to make the scalar, aligning
* additions worthwhile or if it's worth it to just eat the cost of an unaligned
- * load. This is a pretty simple test, just test if 16 - the remainder + len is
- * < 16 */
- size_t max_iters = NMAX;
- size_t rem = (uintptr_t)buf & 15;
- size_t align_offset = 16 - rem;
+ * load. This is a pretty simple test, just test if len < 32 */
+ size_t n = NMAX;
size_t k = 0;
- if (rem) {
- if (len < 16 + align_offset) {
- /* Let's eat the cost of this one unaligned load so that
- * we don't completely skip over the vectorization. Doing
- * 16 bytes at a time unaligned is better than 16 + <= 15
- * sums */
- vbuf = _mm_loadu_si128((__m128i*)buf);
- len -= 16;
- buf += 16;
- vs1 = _mm_cvtsi32_si128(adler);
- vs2 = _mm_cvtsi32_si128(sum2);
- vs3 = _mm_setzero_si128();
- vs1_0 = vs1;
- goto unaligned_jmp;
- }
- adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0);
-
- /* lop off the max number of sums based on the scalar sums done
- * above */
- buf += align_offset;
- len -= align_offset;
- max_iters -= align_offset;
+ if (len < 32) {
+ /* Let's eat the cost of this one unaligned load so that
+ * we don't completely skip over the vectorization. Doing
+ * 16 bytes at a time unaligned is better than 16 + <= 15
+ * sums */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
}
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ if (align_diff) {
+ adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
while (len >= 16) {
vs1 = _mm_cvtsi32_si128(adler);
@@ -77,7 +73,7 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len
vs2_0 = _mm_setzero_si128();
vs1_0 = vs1;
- k = ALIGN_DOWN(MIN(len, max_iters), 16);
+ k = ALIGN_DOWN(MIN(len, n), 16);
len -= k;
while (k >= 32) {
@@ -137,7 +133,7 @@ unaligned_jmp:
* 0 and 2. This saves us some contention on the shuffle port(s) */
adler = partial_hsum(vs1) % BASE;
sum2 = hsum(vs2) % BASE;
- max_iters = NMAX;
+ n = NMAX;
}
/* Process tail (len < 16). */