diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-03-12 22:03:04 -0700 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-14 02:14:21 +0100 |
| commit | 352c1a00c7786a0a169ce557d11c3804e88761fa (patch) | |
| tree | f9008a4b8acf9c50ad177fe1b050f38e692f757e | |
| parent | 8cd3ae20e37e4e7d38bc89b3fa7a88fe4b8af3b6 (diff) | |
| download | Project-Tick-352c1a00c7786a0a169ce557d11c3804e88761fa.tar.gz Project-Tick-352c1a00c7786a0a169ce557d11c3804e88761fa.zip | |
Unroll 64-byte CRC32+copy loop for ARMv8
Process 64 bytes per iteration using 8x uint64_t loads
with interleaved memcpy stores and __crc32d calls.
RPi5 benchmarks show 30-51% improvement over the
separate crc32 + memcpy baseline.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
| -rw-r--r-- | arch/arm/crc32_armv8.c | 35 |
1 file changed, 35 insertions, 0 deletions
diff --git a/arch/arm/crc32_armv8.c b/arch/arm/crc32_armv8.c index acc6e0be20..59f2b65009 100644 --- a/arch/arm/crc32_armv8.c +++ b/arch/arm/crc32_armv8.c @@ -26,6 +26,41 @@ Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t if (align_diff) c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY); + while (len >= 64) { + uint64_t d0 = *(const uint64_t *)src; + uint64_t d1 = *(const uint64_t *)(src + 8); + uint64_t d2 = *(const uint64_t *)(src + 16); + uint64_t d3 = *(const uint64_t *)(src + 24); + uint64_t d4 = *(const uint64_t *)(src + 32); + uint64_t d5 = *(const uint64_t *)(src + 40); + uint64_t d6 = *(const uint64_t *)(src + 48); + uint64_t d7 = *(const uint64_t *)(src + 56); + + if (COPY) { + memcpy(dst, &d0, 8); + memcpy(dst + 8, &d1, 8); + memcpy(dst + 16, &d2, 8); + memcpy(dst + 24, &d3, 8); + memcpy(dst + 32, &d4, 8); + memcpy(dst + 40, &d5, 8); + memcpy(dst + 48, &d6, 8); + memcpy(dst + 56, &d7, 8); + dst += 64; + } + + c = __crc32d(c, d0); + c = __crc32d(c, d1); + c = __crc32d(c, d2); + c = __crc32d(c, d3); + c = __crc32d(c, d4); + c = __crc32d(c, d5); + c = __crc32d(c, d6); + c = __crc32d(c, d7); + + src += 64; + len -= 64; + } + return crc32_armv8_tail(c, dst, src, len, COPY); } |
