summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Moinvaziri <nathan@nathanm.com>2026-02-24 09:14:53 -0800
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-14 02:14:21 +0100
commit7987e76b818539cb9e70d2d9b11a5e067d7207f8 (patch)
tree4319b1982d67e95426abe8f2a95481a88dc67bc4
parent110f7c753d11b8a3037ab8c5fd4bcfac589a85f4 (diff)
downloadProject-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.tar.gz
Project-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.zip
Implement interleaved copying for CRC32 ARMv8 PMULL+EOR3.
-rw-r--r-- arch/arm/crc32_armv8_pmull_eor3.c | 190
1 file changed, 133 insertions, 57 deletions
diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c
index 27e7006b3f..0498b2ea0b 100644
--- a/arch/arm/crc32_armv8_pmull_eor3.c
+++ b/arch/arm/crc32_armv8_pmull_eor3.c
@@ -68,50 +68,69 @@ static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
}
-Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+ size_t len, const int COPY) {
uint32_t crc0 = ~crc;
- uint8_t *dst = NULL;
if (UNLIKELY(len == 1)) {
- crc0 = __crc32b(crc0, *buf);
+ if (COPY)
+ *dst = *src;
+ crc0 = __crc32b(crc0, *src);
return ~crc0;
}
/* Align to 16-byte boundary for vector path */
- uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
if (align_diff)
- crc0 = crc32_armv8_align(crc0, &dst, &buf, &len, align_diff, 0);
+ crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
/* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
if (len >= 192) {
- const uint8_t *end = buf + len;
- size_t blk = len / 192; /* Number of 192-byte blocks */
- size_t klen = blk * 16; /* Scalar stride per CRC lane */
- const uint8_t *buf2 = buf + klen * 3; /* Vector data starts after scalar lanes */
+ size_t blk = len / 192; /* Number of 192-byte blocks */
+ size_t klen = blk * 16; /* Scalar stride per CRC lane */
+ const uint8_t *end = src + len;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + klen;
+ const uint8_t *src2 = src + klen * 2;
+ const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */
uint32_t crc1 = 0, crc2 = 0;
uint64x2_t vc0, vc1, vc2;
uint64_t vc;
/* Load first 9 vector chunks (144 bytes) */
- uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0;
- uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1;
- uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2;
- uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3;
- uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4;
- uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5;
- uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6;
- uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7;
- uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8;
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
uint64x2_t k;
/* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
{ static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
- buf2 += 144;
+
+ /* Per-region dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ uint8_t *dst_v = NULL;
+
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ dst_v = dst + klen * 3;
+ memcpy(dst_v, srcv, 144);
+ dst_v += 144;
+ }
+ srcv += 144;
/* Fold 9 vectors + 3-way parallel scalar CRC */
if (blk > 1) {
/* Only form a limit pointer when we have at least 2 blocks. */
- const uint8_t *limit = buf + klen - 32;
- while (buf <= limit) {
+ const uint8_t *limit = src0 + klen - 32;
+ while (src0 <= limit) {
/* Fold all 9 vector lanes using PMULL */
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
@@ -124,25 +143,43 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
/* EOR3: combine hi*k, lo*k, and new data in one instruction */
- x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128));
- x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128));
- x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128));
- x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128));
- x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128));
- x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128));
- x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128));
- x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128));
- x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128));
+ x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)srcv, 128));
+ x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(srcv + 16), 128));
+ x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(srcv + 32), 128));
+ x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(srcv + 48), 128));
+ x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(srcv + 64), 128));
+ x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(srcv + 80), 128));
+ x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(srcv + 96), 128));
+ x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(srcv + 112), 128));
+ x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(srcv + 128), 128));
+ if (COPY) {
+ memcpy(dst_v, srcv, 144);
+ dst_v += 144;
+ }
/* 3-way parallel scalar CRC (16 bytes each) */
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
- buf += 16;
- buf2 += 144;
+ if (COPY) {
+ memcpy(dst0, src0, 16);
+ dst0 += 16;
+ }
+ crc0 = __crc32d(crc0, *(const uint64_t*)src0);
+ crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8));
+ if (COPY) {
+ memcpy(dst1, src1, 16);
+ dst1 += 16;
+ }
+ crc1 = __crc32d(crc1, *(const uint64_t*)src1);
+ crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8));
+ if (COPY) {
+ memcpy(dst2, src2, 16);
+ dst2 += 16;
+ }
+ crc2 = __crc32d(crc2, *(const uint64_t*)src2);
+ crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8));
+ src0 += 16;
+ src1 += 16;
+ src2 += 16;
+ srcv += 144;
}
}
@@ -176,12 +213,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
x0 = veor3q_u64(x0, y0, x4);
/* Process final scalar chunk */
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
+ if (COPY)
+ memcpy(dst0, src0, 16);
+ crc0 = __crc32d(crc0, *(const uint64_t*)src0);
+ crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8));
+ if (COPY)
+ memcpy(dst1, src1, 16);
+ crc1 = __crc32d(crc1, *(const uint64_t*)src1);
+ crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8));
+ if (COPY)
+ memcpy(dst2, src2, 16);
+ crc2 = __crc32d(crc2, *(const uint64_t*)src2);
+ crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8));
/* Shift and combine 3 scalar CRCs */
vc0 = crc_shift(crc0, klen * 2 + blk * 144);
@@ -192,23 +235,51 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
/* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
- buf = buf2;
- len = end - buf;
+ if (COPY)
+ dst += blk * 192;
+ src = srcv;
+ len = end - srcv;
}
/* 3-way scalar CRC (24 bytes/iter) */
if (len >= 80) {
size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */
+ const uint8_t *buf0 = src;
+ const uint8_t *buf1 = src + klen;
+ const uint8_t *buf2 = src + klen * 2;
uint32_t crc1 = 0, crc2 = 0;
uint64x2_t vc0, vc1;
uint64_t vc;
+ /* Per-lane dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ }
+
/* 3-way parallel scalar CRC */
do {
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- buf += 8;
+ if (COPY) {
+ memcpy(dst0, buf0, 8);
+ dst0 += 8;
+ }
+ crc0 = __crc32d(crc0, *(const uint64_t*)buf0);
+ if (COPY) {
+ memcpy(dst1, buf1, 8);
+ dst1 += 8;
+ }
+ crc1 = __crc32d(crc1, *(const uint64_t*)buf1);
+ if (COPY) {
+ memcpy(dst2, buf2, 8);
+ dst2 += 8;
+ }
+ crc2 = __crc32d(crc2, *(const uint64_t*)buf2);
+ buf0 += 8;
+ buf1 += 8;
+ buf2 += 8;
len -= 24;
} while (len >= 32);
@@ -218,20 +289,25 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
/* Process final 8 bytes with combined CRC */
- buf += klen * 2;
crc0 = crc2;
- crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc);
- buf += 8;
+ if (COPY)
+ memcpy(dst2, buf2, 8);
+ crc0 = __crc32d(crc0, *(const uint64_t*)buf2 ^ vc);
+ src = buf2 + 8;
len -= 8;
+ if (COPY)
+ dst = dst2 + 8;
}
/* Process remaining bytes */
- return crc32_armv8_tail(crc0, NULL, buf, len, 0);
+ return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
}
Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
- crc = crc32_armv8_pmull_eor3(crc, src, len);
- memcpy(dst, src, len);
- return crc;
+ return crc32_copy_impl(crc, dst, src, len, 1);
}
#endif