diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-02-24 09:14:53 -0800 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-14 02:14:21 +0100 |
| commit | 7987e76b818539cb9e70d2d9b11a5e067d7207f8 (patch) | |
| tree | 4319b1982d67e95426abe8f2a95481a88dc67bc4 | |
| parent | 110f7c753d11b8a3037ab8c5fd4bcfac589a85f4 (diff) | |
| download | Project-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.tar.gz Project-Tick-7987e76b818539cb9e70d2d9b11a5e067d7207f8.zip | |
Implement interleaved copying for CRC32 ARMv8 PMULL+EOR3.
| -rw-r--r-- | arch/arm/crc32_armv8_pmull_eor3.c | 190 |
1 files changed, 133 insertions, 57 deletions
diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c index 27e7006b3f..0498b2ea0b 100644 --- a/arch/arm/crc32_armv8_pmull_eor3.c +++ b/arch/arm/crc32_armv8_pmull_eor3.c @@ -68,50 +68,69 @@ static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); } -Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) { +Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, + size_t len, const int COPY) { uint32_t crc0 = ~crc; - uint8_t *dst = NULL; if (UNLIKELY(len == 1)) { - crc0 = __crc32b(crc0, *buf); + if (COPY) + *dst = *src; + crc0 = __crc32b(crc0, *src); return ~crc0; } /* Align to 16-byte boundary for vector path */ - uintptr_t align_diff = ALIGN_DIFF(buf, 16); + uintptr_t align_diff = ALIGN_DIFF(src, 16); if (align_diff) - crc0 = crc32_armv8_align(crc0, &dst, &buf, &len, align_diff, 0); + crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY); /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */ if (len >= 192) { - const uint8_t *end = buf + len; - size_t blk = len / 192; /* Number of 192-byte blocks */ - size_t klen = blk * 16; /* Scalar stride per CRC lane */ - const uint8_t *buf2 = buf + klen * 3; /* Vector data starts after scalar lanes */ + size_t blk = len / 192; /* Number of 192-byte blocks */ + size_t klen = blk * 16; /* Scalar stride per CRC lane */ + const uint8_t *end = src + len; + const uint8_t *src0 = src; + const uint8_t *src1 = src + klen; + const uint8_t *src2 = src + klen * 2; + const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */ uint32_t crc1 = 0, crc2 = 0; uint64x2_t vc0, vc1, vc2; uint64_t vc; /* Load first 9 vector chunks (144 bytes) */ - uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0; - uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1; - uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2; - uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3; - uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4; - uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5; - uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6; - uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7; - uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8; + uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0; + uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1; + uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2; + uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3; + uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4; + uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5; + uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6; + uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7; + uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8; uint64x2_t k; /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); } - buf2 += 144; + + /* Per-region dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + uint8_t *dst_v = NULL; + + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + dst_v = dst + klen * 3; + memcpy(dst_v, srcv, 144); + dst_v += 144; + } + srcv += 144; /* Fold 9 vectors + 3-way parallel scalar CRC */ if (blk > 1) { /* Only form a limit pointer when we have at least 2 blocks. */ - const uint8_t *limit = buf + klen - 32; - while (buf <= limit) { + const uint8_t *limit = src0 + klen - 32; + while (src0 <= limit) { /* Fold all 9 vector lanes using PMULL */ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); @@ -124,25 +143,43 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); /* EOR3: combine hi*k, lo*k, and new data in one instruction */ - x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128)); - x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128)); - x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128)); - x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128)); - x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128)); - x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128)); - x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128)); - x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128)); - x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128)); + x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)srcv, 128)); + x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(srcv + 16), 128)); + x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(srcv + 32), 128)); + x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(srcv + 48), 128)); + x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(srcv + 64), 128)); + x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(srcv + 80), 128)); + x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(srcv + 96), 128)); + x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(srcv + 112), 128)); + x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(srcv + 128), 128)); + if (COPY) { + memcpy(dst_v, srcv, 144); + dst_v += 144; + } /* 3-way parallel scalar CRC (16 bytes each) */ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; + if (COPY) { + memcpy(dst0, src0, 16); + dst0 += 16; + } + crc0 = __crc32d(crc0, *(const uint64_t*)src0); + crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8)); + if (COPY) { + memcpy(dst1, src1, 16); + dst1 += 16; + } + crc1 = __crc32d(crc1, *(const uint64_t*)src1); + crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8)); + if (COPY) { + memcpy(dst2, src2, 16); + dst2 += 16; + } + crc2 = __crc32d(crc2, *(const uint64_t*)src2); + crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8)); + src0 += 16; + src1 += 16; + src2 += 16; + srcv += 144; } } @@ -176,12 +213,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con x0 = veor3q_u64(x0, y0, x4); /* Process final scalar chunk */ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); + if (COPY) + memcpy(dst0, src0, 16); + crc0 = __crc32d(crc0, *(const uint64_t*)src0); + crc0 = __crc32d(crc0, *(const uint64_t*)(src0 + 8)); + if (COPY) + memcpy(dst1, src1, 16); + crc1 = __crc32d(crc1, *(const uint64_t*)src1); + crc1 = __crc32d(crc1, *(const uint64_t*)(src1 + 8)); + if (COPY) + memcpy(dst2, src2, 16); + crc2 = __crc32d(crc2, *(const uint64_t*)src2); + crc2 = __crc32d(crc2, *(const uint64_t*)(src2 + 8)); /* Shift and combine 3 scalar CRCs */ vc0 = crc_shift(crc0, klen * 2 + blk * 144); @@ -192,23 +235,51 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */ crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; + if (COPY) + dst += blk * 192; + src = srcv; + len = end - srcv; } /* 3-way scalar CRC (24 bytes/iter) */ if (len >= 80) { size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */ + const uint8_t *buf0 = src; + const uint8_t *buf1 = src + klen; + const uint8_t *buf2 = src + klen * 2; uint32_t crc1 = 0, crc2 = 0; uint64x2_t vc0, vc1; uint64_t vc; + /* Per-lane dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + } + /* 3-way parallel scalar CRC */ do { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; + if (COPY) { + memcpy(dst0, buf0, 8); + dst0 += 8; + } + crc0 = __crc32d(crc0, *(const uint64_t*)buf0); + if (COPY) { + memcpy(dst1, buf1, 8); + dst1 += 8; + } + crc1 = __crc32d(crc1, *(const uint64_t*)buf1); + if (COPY) { + memcpy(dst2, buf2, 8); + dst2 += 8; + } + crc2 = __crc32d(crc2, *(const uint64_t*)buf2); + buf0 += 8; + buf1 += 8; + buf2 += 8; len -= 24; } while (len >= 32); @@ -218,20 +289,25 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); /* Process final 8 bytes with combined CRC */ - buf += klen * 2; crc0 = crc2; - crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc); - buf += 8; + if (COPY) + memcpy(dst2, buf2, 8); + crc0 = __crc32d(crc0, *(const uint64_t*)buf2 ^ vc); + src = buf2 + 8; len -= 8; + if (COPY) + dst = dst2 + 8; } /* Process remaining bytes */ - return crc32_armv8_tail(crc0, NULL, buf, len, 0); + return crc32_armv8_tail(crc0, dst, src, len, COPY); +} + +Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); } Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc = crc32_armv8_pmull_eor3(crc, src, len); - memcpy(dst, src, len); - return crc; + return crc32_copy_impl(crc, dst, src, len, 1); } #endif |
