diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-02-23 14:26:41 -0800 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-05 20:30:46 +0100 |
| commit | b24577be61651aa14586864f3f49d98c07bfaee3 (patch) | |
| tree | 20becf6c365a492af3cf04e4eefc49f6c10739f9 | |
| parent | d8136aea2a074c950b91f6c609c43a55a7990056 (diff) | |
| download | Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.tar.gz Project-Tick-b24577be61651aa14586864f3f49d98c07bfaee3.zip | |
Simplify adler32 alignment loops to advance pointers

Replace done-offset tracking with direct pointer advancement in NEON,
VMX, and SSSE3 adler32 implementations. Use ALIGN_DIFF consistently
across all architectures for the initial alignment step.
| -rw-r--r-- | arch/arm/adler32_neon.c | 48 | ||||
| -rw-r--r-- | arch/power/adler32_vmx.c | 34 | ||||
| -rw-r--r-- | arch/x86/adler32_ssse3.c | 54 |
3 files changed, 63 insertions, 73 deletions
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index a55c8c1353..5ba9b41178 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -273,8 +273,6 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); uint32_t pair[2]; - int n = NMAX; - unsigned int done = 0; /* Split Adler-32 into component sums, it can be supplied by * the caller sites (e.g. in a PNG file). @@ -295,43 +293,39 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co * alignment, but it's unclear which other SIPs will benefit from it. * In the copying variant we use fallback to 4x loads and 4x stores, * as ld1x4 seems to block ILP when stores are in the mix */ - unsigned int align_offset = ((uintptr_t)src & 31); - unsigned int align_adj = (align_offset) ? 32 - align_offset : 0; - - if (len < (16 + align_adj)) { - return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY); - } else if (align_offset) { - adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY); - - n -= align_adj; - done += align_adj; - } else { - /* If here, we failed the len criteria test, it wouldn't be - * worthwhile to do scalar aligning sums */ - align_adj = 0; - } + size_t align_diff = MIN(ALIGN_DIFF(src, 32), len); + size_t n = NMAX; + if (align_diff) { + adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY); - while (done < len) { - int remaining = (int)(len - done); - n = MIN(remaining, (done == align_adj) ? 
n : NMAX); + if (COPY) + dst += align_diff; + src += align_diff; + len -= align_diff; + n -= align_diff; + } - if (n < 16) - break; + while (len >= 16) { + n = MIN(len, n); if (COPY) - NEON_accum32_copy(pair, dst + done, src + done, n >> 4); + NEON_accum32_copy(pair, dst, src, n >> 4); else - NEON_accum32(pair, src + done, n >> 4); + NEON_accum32(pair, src, n >> 4); pair[0] %= BASE; pair[1] %= BASE; - int actual_nsums = (n >> 4) << 4; - done += actual_nsums; + size_t k = (n >> 4) << 4; + src += k; + if (COPY) + dst += k; + len -= k; + n = NMAX; } /* Process tail (len < 16). */ - return adler32_copy_tail(pair[0], dst + done, src + done, len - done, pair[1], done < len, 15, COPY); + return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY); } Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c index 31eaf5e36d..5171bab35b 100644 --- a/arch/power/adler32_vmx.c +++ b/arch/power/adler32_vmx.c @@ -132,31 +132,31 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) pair[3] = 0; // Align buffer - int n = NMAX; - unsigned int done = 0; - size_t align_len = (size_t)MIN(ALIGN_DIFF(buf, 16), len); - if (align_len) { - adler32_copy_align(&pair[0], NULL, buf, align_len, &pair[1], 15, 0); - done += align_len; - /* Rather than rebasing, we can reduce the max sums for the - * first round only */ - n -= align_len; + size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len); + size_t n = NMAX; + if (align_diff) { + adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0); + + buf += align_diff; + len -= align_diff; + n -= align_diff; } - for (size_t i = align_len; i < len; i += n) { - int remaining = (int)(len-i); - n = MIN(remaining, (i == align_len) ? 
n : NMAX); - if (n < 16) - break; - vmx_accum32(pair, buf + i, n / 16); + while (len >= 16) { + n = MIN(len, n); + + vmx_accum32(pair, buf, n / 16); pair[0] %= BASE; pair[1] %= BASE; - done += (n / 16) * 16; + size_t k = (n / 16) * 16; + buf += k; + len -= k; + n = NMAX; } /* Process tail (len < 16). */ - return adler32_copy_tail(pair[0], NULL, buf + done, len - done, pair[1], done < len, 15, 0); + return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0); } /* VMX stores can have higher latency than optimized memcpy */ diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c index 9d2715a435..702db50251 100644 --- a/arch/x86/adler32_ssse3.c +++ b/arch/x86/adler32_ssse3.c @@ -38,37 +38,33 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len /* If our buffer is unaligned (likely), make the determination whether * or not there's enough of a buffer to consume to make the scalar, aligning * additions worthwhile or if it's worth it to just eat the cost of an unaligned - * load. This is a pretty simple test, just test if 16 - the remainder + len is - * < 16 */ - size_t max_iters = NMAX; - size_t rem = (uintptr_t)buf & 15; - size_t align_offset = 16 - rem; + * load. This is a pretty simple test, just test if len < 32 */ + size_t n = NMAX; size_t k = 0; - if (rem) { - if (len < 16 + align_offset) { - /* Let's eat the cost of this one unaligned load so that - * we don't completely skip over the vectorization. 
Doing - * 16 bytes at a time unaligned is better than 16 + <= 15 - * sums */ - vbuf = _mm_loadu_si128((__m128i*)buf); - len -= 16; - buf += 16; - vs1 = _mm_cvtsi32_si128(adler); - vs2 = _mm_cvtsi32_si128(sum2); - vs3 = _mm_setzero_si128(); - vs1_0 = vs1; - goto unaligned_jmp; - } - adler32_copy_align(&adler, NULL, buf, align_offset, &sum2, 15, 0); - - /* lop off the max number of sums based on the scalar sums done - * above */ - buf += align_offset; - len -= align_offset; - max_iters -= align_offset; + if (len < 32) { + /* Let's eat the cost of this one unaligned load so that + * we don't completely skip over the vectorization. Doing + * 16 bytes at a time unaligned is better than 16 + <= 15 + * sums */ + vbuf = _mm_loadu_si128((__m128i*)buf); + len -= 16; + buf += 16; + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs1_0 = vs1; + goto unaligned_jmp; } + size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len); + if (align_diff) { + adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0); + + buf += align_diff; + len -= align_diff; + n -= align_diff; + } while (len >= 16) { vs1 = _mm_cvtsi32_si128(adler); @@ -77,7 +73,7 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len vs2_0 = _mm_setzero_si128(); vs1_0 = vs1; - k = ALIGN_DOWN(MIN(len, max_iters), 16); + k = ALIGN_DOWN(MIN(len, n), 16); len -= k; while (k >= 32) { @@ -137,7 +133,7 @@ unaligned_jmp: * 0 and 2. This saves us some contention on the shuffle port(s) */ adler = partial_hsum(vs1) % BASE; sum2 = hsum(vs2) % BASE; - max_iters = NMAX; + n = NMAX; } /* Process tail (len < 16). */ |
