summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nathan Moinvaziri <nathan@nathanm.com>, 2026-03-12 22:03:04 -0700
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>, 2026-03-14 02:14:21 +0100
commit: 352c1a00c7786a0a169ce557d11c3804e88761fa (patch)
tree: f9008a4b8acf9c50ad177fe1b050f38e692f757e
parent: 8cd3ae20e37e4e7d38bc89b3fa7a88fe4b8af3b6 (diff)
download: Project-Tick-352c1a00c7786a0a169ce557d11c3804e88761fa.tar.gz
download: Project-Tick-352c1a00c7786a0a169ce557d11c3804e88761fa.zip
Unroll 64-byte CRC32+copy loop for ARMv8
Process 64 bytes per iteration using 8x uint64_t loads with interleaved memcpy stores and __crc32d calls. RPi5 benchmarks show 30-51% improvement over the separate crc32 + memcpy baseline.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
-rw-r--r-- arch/arm/crc32_armv8.c | 35
1 file changed, 35 insertions, 0 deletions
diff --git a/arch/arm/crc32_armv8.c b/arch/arm/crc32_armv8.c
index acc6e0be20..59f2b65009 100644
--- a/arch/arm/crc32_armv8.c
+++ b/arch/arm/crc32_armv8.c
@@ -26,6 +26,41 @@ Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t
if (align_diff)
c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY);
+ while (len >= 64) {
+ uint64_t d0 = *(const uint64_t *)src;
+ uint64_t d1 = *(const uint64_t *)(src + 8);
+ uint64_t d2 = *(const uint64_t *)(src + 16);
+ uint64_t d3 = *(const uint64_t *)(src + 24);
+ uint64_t d4 = *(const uint64_t *)(src + 32);
+ uint64_t d5 = *(const uint64_t *)(src + 40);
+ uint64_t d6 = *(const uint64_t *)(src + 48);
+ uint64_t d7 = *(const uint64_t *)(src + 56);
+
+ if (COPY) {
+ memcpy(dst, &d0, 8);
+ memcpy(dst + 8, &d1, 8);
+ memcpy(dst + 16, &d2, 8);
+ memcpy(dst + 24, &d3, 8);
+ memcpy(dst + 32, &d4, 8);
+ memcpy(dst + 40, &d5, 8);
+ memcpy(dst + 48, &d6, 8);
+ memcpy(dst + 56, &d7, 8);
+ dst += 64;
+ }
+
+ c = __crc32d(c, d0);
+ c = __crc32d(c, d1);
+ c = __crc32d(c, d2);
+ c = __crc32d(c, d3);
+ c = __crc32d(c, d4);
+ c = __crc32d(c, d5);
+ c = __crc32d(c, d6);
+ c = __crc32d(c, d7);
+
+ src += 64;
+ len -= 64;
+ }
+
return crc32_armv8_tail(c, dst, src, len, COPY);
}