summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Moinvaziri <nathan@nathanm.com>2026-02-24 09:14:53 -0800
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-14 02:14:21 +0100
commit7987e76b818539cb9e70d2d9b11a5e067d7207f8 (patch)
tree4319b1982d67e95426abe8f2a95481a88dc67bc4
parent110f7c753d11b8a3037ab8c5fd4bcfac589a85f4 (diff)
downloadProject-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.tar.gz
Project-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.zip
Implement interleaved copying for CRC32 ARMv8 PMULL+EOR3.
-rw-r--r-- arch/arm/crc32_armv8_pmull_eor3.c | 190
1 file changed, 133 insertions, 57 deletions
diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c
index 27e7006b3f..0498b2ea0b 100644
--- a/arch/arm/crc32_armv8_pmull_eor3.c
+++ b/arch/arm/crc32_armv8_pmull_eor3.c
@@ -68,50 +68,69 @@ static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
}
-Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+ size_t len, const int COPY) {
uint32_t crc0 = ~crc;
- uint8_t *dst = NULL;
if (UNLIKELY(len == 1)) {
- crc0 = __crc32b(crc0, *buf);
+ if (COPY)
+ *dst = *src;
+ crc0 = __crc32b(crc0, *src);
return ~crc0;
}
/* Align to 16-byte boundary for vector path */
- uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
if (align_diff)
- crc0 = crc32_armv8_align(crc0, &dst, &buf, &len, align_diff, 0);
+ crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
/* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
if (len >= 192) {
- const uint8_t *end = buf + len;
- size_t blk = len / 192; /* Number of 192-byte blocks */
- size_t klen = blk * 16; /* Scalar stride per CRC lane */
- const uint8_t *buf2 = buf + klen * 3; /* Vector data starts after scalar lanes */
+ size_t blk = len / 192; /* Number of 192-byte blocks */
+ size_t klen = blk * 16; /* Scalar stride per CRC lane */
+ const uint8_t *end = src + len;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + klen;
+ const uint8_t *src2 = src + klen * 2;
+ const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */
uint32_t crc1 = 0, crc2 = 0;
uint64x2_t vc0, vc1, vc2;
uint64_t vc;
/* Load first 9 vector chunks (144 bytes) */
- uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0;
- uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1;
- uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2;
- uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3;
- uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4;
- uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5;
- uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6;
- uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7;
- uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8;
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
uint64x2_t k;
/* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
{ static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
- buf2 += 144;
+
+ /* Per-region dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ uint8_t *dst_v = NULL;
+
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ dst_v = dst + klen * 3;
+ memcpy(dst_v, srcv, 144);
+ dst_v += 144;
+ }
+ srcv += 144;
/* Fold 9 vectors + 3-way parallel scalar CRC */
if (blk > 1) {
/* Only form a limit pointer when we have at least 2 blocks. */
- const uint8_t *limit = buf + klen - 32;
- while (buf <= limit) {
+ const uint8_t *limit = src0 + klen - 32;
+ while (src0 <= limit) {
/* Fold all 9 vector lanes using PMULL */
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
@@ -124,25 +143,43 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
/* EOR3: combine hi*k, lo*k, and new data in one instruction */
- x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128));
- x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128));
- x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128));
- x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128));
- x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128));
- x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128));
- x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128));
- x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128));
- x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128));
+ x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)srcv, 128));
+ x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(srcv + 16), 128));
+ x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(srcv + 32), 128));
+ x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(srcv + 48), 128));
+ x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(srcv + 64), 128));
+ x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(srcv + 80), 128));
+ x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(srcv + 96), 128));
+ x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(srcv + 112), 128));
+ x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(srcv + 128), 128));
+ if (COPY) {
+ memcpy(dst_v, srcv, 144);
+ dst_v += 144;
+ }
/* 3-way parallel scalar CRC (16 bytes each) */
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
- buf += 16;
- buf2 += 144;
+ if (COPY) {
+ memcpy(dst0, src0, 16);
+ dst0 += 16;
+ }
+ crc0 = __crc32d(crc0, *(const uint64_t*)src0);
+ crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8));
+ if (COPY) {
+ memcpy(dst1, src1, 16);
+ dst1 += 16;
+ }
+ crc1 = __crc32d(crc1, *(const uint64_t*)src1);
+ crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8));
+ if (COPY) {
+ memcpy(dst2, src2, 16);
+ dst2 += 16;
+ }
+ crc2 = __crc32d(crc2, *(const uint64_t*)src2);
+ crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8));
+ src0 += 16;
+ src1 += 16;
+ src2 += 16;
+ srcv += 144;
}
}
@@ -176,12 +213,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
x0 = veor3q_u64(x0, y0, x4);
/* Process final scalar chunk */
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
+ if (COPY)
+ memcpy(dst0, src0, 16);
+ crc0 = __crc32d(crc0, *(const uint64_t*)src0);
+ crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8));
+ if (COPY)
+ memcpy(dst1, src1, 16);
+ crc1 = __crc32d(crc1, *(const uint64_t*)src1);
+ crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8));
+ if (COPY)
+ memcpy(dst2, src2, 16);
+ crc2 = __crc32d(crc2, *(const uint64_t*)src2);
+ crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8));
/* Shift and combine 3 scalar CRCs */
vc0 = crc_shift(crc0, klen * 2 + blk * 144);
@@ -192,23 +235,51 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
/* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
- buf = buf2;
- len = end - buf;
+ if (COPY)
+ dst += blk * 192;
+ src = srcv;
+ len = end - srcv;
}
/* 3-way scalar CRC (24 bytes/iter) */
if (len >= 80) {
size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */
+ const uint8_t *buf0 = src;
+ const uint8_t *buf1 = src + klen;
+ const uint8_t *buf2 = src + klen * 2;
uint32_t crc1 = 0, crc2 = 0;
uint64x2_t vc0, vc1;
uint64_t vc;
+ /* Per-lane dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ }
+
/* 3-way parallel scalar CRC */
do {
- crc0 = __crc32d(crc0, *(const uint64_t*)buf);
- crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
- crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
- buf += 8;
+ if (COPY) {
+ memcpy(dst0, buf0, 8);
+ dst0 += 8;
+ }
+ crc0 = __crc32d(crc0, *(const uint64_t*)buf0);
+ if (COPY) {
+ memcpy(dst1, buf1, 8);
+ dst1 += 8;
+ }
+ crc1 = __crc32d(crc1, *(const uint64_t*)buf1);
+ if (COPY) {
+ memcpy(dst2, buf2, 8);
+ dst2 += 8;
+ }
+ crc2 = __crc32d(crc2, *(const uint64_t*)buf2);
+ buf0 += 8;
+ buf1 += 8;
+ buf2 += 8;
len -= 24;
} while (len >= 32);
@@ -218,20 +289,25 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
/* Process final 8 bytes with combined CRC */
- buf += klen * 2;
crc0 = crc2;
- crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc);
- buf += 8;
+ if (COPY)
+ memcpy(dst2, buf2, 8);
+ crc0 = __crc32d(crc0, *(const uint64_t*)buf2 ^ vc);
+ src = buf2 + 8;
len -= 8;
+ if (COPY)
+ dst = dst2 + 8;
}
/* Process remaining bytes */
- return crc32_armv8_tail(crc0, NULL, buf, len, 0);
+ return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
}
Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
- crc = crc32_armv8_pmull_eor3(crc, src, len);
- memcpy(dst, src, len);
- return crc;
+ return crc32_copy_impl(crc, dst, src, len, 1);
}
#endif