/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension) * Copyright (C) 2025 Peter Cawley * https://github.com/corsix/fast-crc32 * For conditions of distribution and use, see copyright notice in zlib.h * * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions. * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter. */ #ifdef ARM_PMULL_EOR3 #include "zbuild.h" #include "zutil.h" #include "acle_intrins.h" #include "neon_intrins.h" #include "crc32_armv8_p.h" /* Carryless multiply low 64 bits: a[0] * b[0] */ static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { #ifdef _MSC_VER return vreinterpretq_u64_p128(vmull_p64( vget_low_p64(vreinterpret_p64_u64(a)), vget_low_p64(vreinterpret_p64_u64(b)))); #else return vreinterpretq_u64_p128(vmull_p64( vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0), vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0))); #endif } /* Carryless multiply high 64 bits: a[1] * b[1] */ static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b))); } /* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */ static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { #ifdef _MSC_VER return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b))); #else return vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b)); #endif } /* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */ static uint32_t xnmodp(uint64_t n) { uint64_t stack = ~(uint64_t)1; uint32_t acc, low; for (; n > 191; n = (n >> 1) - 16) { stack = (stack << 1) + (n & 1); } stack = ~stack; acc = ((uint32_t)0x80000000) >> (n & 31); for (n >>= 5; n; --n) { acc = __crc32w(acc, 0); } while ((low = stack & 1), stack >>= 1) { poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); acc = __crc32d(0, y << low); } return acc; } /* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */ static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { Assert(nbytes >= 5, "crc_shift requires nbytes >= 5"); return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); } Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { uint32_t crc0 = ~crc; if (UNLIKELY(len == 1)) { if (COPY) *dst = *src; crc0 = __crc32b(crc0, *src); return ~crc0; } /* Align to 16-byte boundary for vector path */ uintptr_t align_diff = ALIGN_DIFF(src, 16); if (align_diff) crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY); /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */ if (len >= 192) { size_t blk = len / 192; /* Number of 192-byte blocks */ size_t klen = blk * 16; /* Scalar stride per CRC lane */ const uint8_t *end = src + len; const uint8_t *src0 = src; const uint8_t *src1 = src + klen; const uint8_t *src2 = src + klen * 2; const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */ uint32_t crc1 = 0, crc2 = 0; uint64x2_t vc0, vc1, vc2; uint64_t vc; /* Load first 9 vector chunks (144 bytes) */ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0; uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1; uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2; uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3; uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4; uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5; uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6; uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7; uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8; uint64x2_t k; /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); } /* Per-region dst pointers */ uint8_t *dst0 = dst; uint8_t *dst1 = NULL; uint8_t *dst2 = NULL; uint8_t *dst_v = NULL; if (COPY) { dst1 = dst + klen; dst2 = dst + klen * 2; dst_v = dst + klen * 3; vst1q_u8(dst_v, vreinterpretq_u8_u64(x0)); vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1)); vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2)); vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3)); vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4)); vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5)); vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6)); vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7)); vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8)); dst_v += 144; } srcv += 144; /* Fold 9 vectors + 3-way parallel scalar CRC */ if (blk > 1) { /* Only form a limit pointer when we have at least 2 blocks. */ const uint8_t *limit = src0 + klen - 32; while (src0 <= limit) { /* Fold all 9 vector lanes using PMULL */ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); /* EOR3: combine hi*k, lo*k, and new data in one instruction */ { uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128); uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128); uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128); uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128); uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128); uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128); uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128); uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128); uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128); if (COPY) { vst1q_u8(dst_v, vreinterpretq_u8_u64(d0)); vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1)); vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2)); vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3)); vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4)); vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5)); vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6)); vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7)); vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8)); dst_v += 144; } x0 = veor3q_u64(x0, y0, d0); x1 = veor3q_u64(x1, y1, d1); x2 = veor3q_u64(x2, y2, d2); x3 = veor3q_u64(x3, y3, d3); x4 = veor3q_u64(x4, y4, d4); x5 = veor3q_u64(x5, y5, d5); x6 = veor3q_u64(x6, y6, d6); x7 = veor3q_u64(x7, y7, d7); x8 = veor3q_u64(x8, y8, d8); } /* 3-way parallel scalar CRC (16 bytes each) */ { uint64_t s0a = *(const uint64_t*)src0; uint64_t s0b = *(const uint64_t*)(src0 + 8); uint64_t s1a = *(const uint64_t*)src1; uint64_t s1b = *(const uint64_t*)(src1 + 8); uint64_t s2a = *(const uint64_t*)src2; uint64_t s2b = *(const uint64_t*)(src2 + 8); if (COPY) { memcpy(dst0, &s0a, 8); memcpy(dst0 + 8, &s0b, 8); dst0 += 16; memcpy(dst1, &s1a, 8); memcpy(dst1 + 8, &s1b, 8); dst1 += 16; memcpy(dst2, &s2a, 8); memcpy(dst2 + 8, &s2b, 8); dst2 += 16; } crc0 = __crc32d(crc0, s0a); crc0 = __crc32d(crc0, s0b); crc1 = __crc32d(crc1, s1a); crc1 = __crc32d(crc1, s1b); crc2 = __crc32d(crc2, s2a); crc2 = __crc32d(crc2, s2b); } src0 += 16; src1 += 16; src2 += 16; srcv += 144; } } /* Reduce 9 vectors to 1 using tree reduction */ /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */ { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); x0 = veor3q_u64(x0, y0, x1); x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); x0 = veor3q_u64(x0, y0, x1); x2 = veor3q_u64(x2, y2, x3); x4 = veor3q_u64(x4, y4, x5); x6 = veor3q_u64(x6, y6, x7); /* Step 3: fold pairs (x0,x2), (x4,x6) */ { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); x0 = veor3q_u64(x0, y0, x2); x4 = veor3q_u64(x4, y4, x6); /* Step 4: final fold (x0, x4) -> x0 */ { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); x0 = veor3q_u64(x0, y0, x4); /* Process final scalar chunk */ { uint64_t s0a = *(const uint64_t*)src0; uint64_t s0b = *(const uint64_t*)(src0 + 8); uint64_t s1a = *(const uint64_t*)src1; uint64_t s1b = *(const uint64_t*)(src1 + 8); uint64_t s2a = *(const uint64_t*)src2; uint64_t s2b = *(const uint64_t*)(src2 + 8); if (COPY) { memcpy(dst0, &s0a, 8); memcpy(dst0 + 8, &s0b, 8); memcpy(dst1, &s1a, 8); memcpy(dst1 + 8, &s1b, 8); memcpy(dst2, &s2a, 8); memcpy(dst2 + 8, &s2b, 8); } crc0 = __crc32d(crc0, s0a); crc0 = __crc32d(crc0, s0b); crc1 = __crc32d(crc1, s1a); crc1 = __crc32d(crc1, s1b); crc2 = __crc32d(crc2, s2a); crc2 = __crc32d(crc2, s2b); } /* Shift and combine 3 scalar CRCs */ vc0 = crc_shift(crc0, klen * 2 + blk * 144); vc1 = crc_shift(crc1, klen + blk * 144); vc2 = crc_shift(crc2, blk * 144); vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */ crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); if (COPY) dst += blk * 192; src = srcv; len = end - srcv; } /* 3-way scalar CRC (24 bytes/iter) */ if (len >= 80) { size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */ const uint8_t *buf0 = src; const uint8_t *buf1 = src + klen; const uint8_t *buf2 = src + klen * 2; uint32_t crc1 = 0, crc2 = 0; uint64x2_t vc0, vc1; uint64_t vc; /* Per-lane dst pointers */ uint8_t *dst0 = dst; uint8_t *dst1 = NULL; uint8_t *dst2 = NULL; if (COPY) { dst1 = dst + klen; dst2 = dst + klen * 2; } /* 3-way parallel scalar CRC */ do { uint64_t v0 = *(const uint64_t*)buf0; uint64_t v1 = *(const uint64_t*)buf1; uint64_t v2 = *(const uint64_t*)buf2; if (COPY) { memcpy(dst0, &v0, 8); dst0 += 8; memcpy(dst1, &v1, 8); dst1 += 8; memcpy(dst2, &v2, 8); dst2 += 8; } crc0 = __crc32d(crc0, v0); crc1 = __crc32d(crc1, v1); crc2 = __crc32d(crc2, v2); buf0 += 8; buf1 += 8; buf2 += 8; len -= 24; } while (len >= 32); /* Combine the 3 CRCs */ vc0 = crc_shift(crc0, klen * 2 + 8); vc1 = crc_shift(crc1, klen + 8); vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); /* Process final 8 bytes with combined CRC */ crc0 = crc2; { uint64_t vf = *(const uint64_t*)buf2; if (COPY) memcpy(dst2, &vf, 8); crc0 = __crc32d(crc0, vf ^ vc); } src = buf2 + 8; len -= 8; if (COPY) dst = dst2 + 8; } /* Process remaining bytes */ return crc32_armv8_tail(crc0, dst, src, len, COPY); } Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) { return crc32_copy_impl(crc, NULL, buf, len, 0); } Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { #if OPTIMAL_CMP >= 32 return crc32_copy_impl(crc, dst, src, len, 1); #else /* Without unaligned access, interleaved stores get decomposed into byte ops */ crc = crc32_armv8_pmull_eor3(crc, src, len); memcpy(dst, src, len); return crc; #endif } #endif