diff options
Diffstat (limited to 'docs/handbook/neozip/arm-optimizations.md')
| -rw-r--r-- | docs/handbook/neozip/arm-optimizations.md | 403 |
1 files changed, 403 insertions, 0 deletions
diff --git a/docs/handbook/neozip/arm-optimizations.md b/docs/handbook/neozip/arm-optimizations.md new file mode 100644 index 0000000000..c7fa94e505 --- /dev/null +++ b/docs/handbook/neozip/arm-optimizations.md @@ -0,0 +1,403 @@ +# ARM Optimizations + +## Overview + +Neozip provides ARM SIMD optimizations using NEON (Advanced SIMD), CRC32 +hardware instructions, and PMULL (polynomial multiply long). These cover +both AArch32 (ARMv7+) and AArch64 (ARMv8+) targets. All implementations +reside in `arch/arm/`. + +--- + +## Source Files + +| File | ISA Extension | Function | +|---|---|---| +| `arm_features.c/h` | — | Feature detection | +| `adler32_neon.c` | NEON | Adler-32 checksum | +| `chunkset_neon.c` | NEON | Pattern fill for inflate | +| `compare256_neon.c` | NEON | 256-byte string comparison | +| `crc32_acle.c` | CRC32 | Hardware CRC-32 | +| `crc32_pmull.c` | PMULL | CLMUL-based CRC-32 | +| `insert_string_acle.c` | CRC32 | CRC-based hash insertion | +| `slide_hash_neon.c` | NEON | Hash table slide | +| `inffast_neon.c` | NEON | Fast inflate inner loop | + +--- + +## Feature Detection + +### `arm_cpu_features` Structure + +```c +struct arm_cpu_features { + int has_simd; // ARMv6 SIMD (AArch32 only) + int has_neon; // NEON / ASIMD + int has_crc32; // CRC32 instructions (ARMv8.0-A optional, ARMv8.1-A mandatory) + int has_pmull; // PMULL (polynomial multiply long, 64→128-bit) + int has_eor3; // SHA3 EOR3 instruction (ARMv8.2-A+SHA3) + int has_fast_pmull; // High-perf PMULL +}; +``` + +### Linux Detection + +```c +void Z_INTERNAL arm_check_features(struct cpu_features *features) { +#if defined(__linux__) + unsigned long hwcap = getauxval(AT_HWCAP); +#if defined(__aarch64__) + features->arm.has_neon = !!(hwcap & HWCAP_ASIMD); + features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32); + features->arm.has_pmull = !!(hwcap & HWCAP_PMULL); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + features->arm.has_eor3 = !!(hwcap2 & HWCAP2_SHA3); +#else // AArch32 + 
unsigned long hwcap2 = getauxval(AT_HWCAP2); + features->arm.has_simd = !!(hwcap & HWCAP_ARM_VFPv3); + features->arm.has_neon = !!(hwcap & HWCAP_ARM_NEON); + features->arm.has_crc32 = !!(hwcap2 & HWCAP2_CRC32); + features->arm.has_pmull = !!(hwcap2 & HWCAP2_PMULL); +#endif +#endif +} +``` + +### macOS/iOS Detection + +```c +#if defined(__APPLE__) + // NEON is always available on Apple Silicon + features->arm.has_neon = 1; + features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32"); + features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL"); +#endif +``` + +### Windows Detection + +```c +#if defined(_WIN32) + features->arm.has_neon = IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE); + features->arm.has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); +#endif +``` + +--- + +## NEON Adler-32 (`adler32_neon.c`) + +Uses 128-bit NEON registers to process 16 bytes per iteration: + +```c +Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t s1 = adler & 0xffff; + uint32_t s2 = adler >> 16; + + // Position weight vector: {16,15,14,...,1} + static const uint8_t taps[] = {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; + uint8x16_t vtaps = vld1q_u8(taps); + + while (len >= 16) { + uint32x4_t vs1 = vdupq_n_u32(0); + uint32x4_t vs2 = vdupq_n_u32(0); + uint32x4_t vs1_0 = vdupq_n_u32(s1); + + // Process up to NMAX bytes before reduction + size_t block = MIN(len, NMAX); + size_t nblocks = block / 16; + + for (size_t i = 0; i < nblocks; i++) { + uint8x16_t vbuf = vld1q_u8(buf); + + // s1 += sum(bytes) + uint16x8_t sum16 = vpaddlq_u8(vbuf); + uint32x4_t sum32 = vpaddlq_u16(sum16); + vs1 = vaddq_u32(vs1, sum32); + + // s2 += 16 * s1_prev + weighted_sum(bytes) + vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_0, 4)); // accumulate 16 * s1_prev + // Multiply-accumulate: weighted position sum + uint16x8_t prod = vmull_u8(vget_low_u8(vbuf), vget_low_u8(vtaps)); + prod = vmlal_u8(prod, vget_high_u8(vbuf), vget_high_u8(vtaps)); + vs2 = vaddq_u32(vs2, vpaddlq_u16(prod)); + + vs1_0 = vs1; + 
buf += 16; + } + + // Horizontal reduction + s1 += vaddvq_u32(vs1); + s2 += vaddvq_u32(vs2); + s1 %= BASE; + s2 %= BASE; + len -= nblocks * 16; + } + return s1 | (s2 << 16); +} +``` + +Key NEON intrinsics used: +- `vpaddlq_u8` — Pairwise add long (u8→u16) +- `vpaddlq_u16` — Pairwise add long (u16→u32) +- `vmull_u8` — Multiply long (u8×u8→u16) +- `vmlal_u8` — Multiply-accumulate long +- `vaddvq_u32` — Horizontal sum across vector (AArch64) + +--- + +## Hardware CRC-32 (`crc32_acle.c`) + +Uses ARMv8 CRC32 instructions via ACLE (ARM C Language Extensions): + +```c +Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) { + crc = ~crc; // CRC32 instructions use inverted convention + + // Process 8 bytes at a time + while (len >= 8) { + crc = __crc32d(crc, *(uint64_t *)buf); + buf += 8; + len -= 8; + } + + // Process 4 bytes + if (len >= 4) { + crc = __crc32w(crc, *(uint32_t *)buf); + buf += 4; + len -= 4; + } + + // Process remaining bytes + while (len--) { + crc = __crc32b(crc, *buf++); + } + + return ~crc; +} +``` + +The `__crc32b`, `__crc32w`, `__crc32d` intrinsics compile to single CRC32 +instructions, computing CRC-32 of 1/4/8 bytes per instruction. + +--- + +## PMULL CRC-32 (`crc32_pmull.c`) + +For larger data, polynomial multiply (PMULL) provides higher throughput +via carry-less multiplication, similar to x86 PCLMULQDQ: + +```c +Z_INTERNAL uint32_t crc32_pmull(uint32_t crc, const uint8_t *buf, size_t len) { + poly128_t fold_const; + uint64x2_t crc0, crc1, crc2, crc3; + + // Initialize four accumulators with first 64 bytes + crc0 = veorq_u64(vld1q_u64((uint64_t *)buf), + vcombine_u64(vcreate_u64(crc), vcreate_u64(0))); + // ... 
crc1, crc2, crc3 + + // Main fold loop: 64 bytes per iteration + while (len >= 64) { + // vmull_p64: 64×64→128-bit polynomial multiply + poly128_t h0 = vmull_p64(vgetq_lane_u64(crc0, 0), fold_lo); + poly128_t h1 = vmull_p64(vgetq_lane_u64(crc0, 1), fold_hi); + crc0 = veorq_u64(vreinterpretq_u64_p128(h0), + vreinterpretq_u64_p128(h1)); + crc0 = veorq_u64(crc0, vld1q_u64((uint64_t *)buf)); + // repeat for crc1..crc3 + } + + // Barrett reduction to 32-bit CRC +} +``` + +With `has_eor3` (SHA3 extension), three-way XOR is done in a single +instruction: + +```c +#ifdef ARM_FEATURE_SHA3 + // EOR3: a ^= b ^ c in one instruction + crc0 = vreinterpretq_u64_u8(veor3q_u8( + vreinterpretq_u8_p128(h0), + vreinterpretq_u8_p128(h1), + vreinterpretq_u8_u64(data))); +#endif +``` + +--- + +## NEON String Comparison (`compare256_neon.c`) + +```c +Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + uint8x16_t v0 = vld1q_u8(src0 + len); + uint8x16_t v1 = vld1q_u8(src1 + len); + uint8x16_t cmp = vceqq_u8(v0, v1); + + // Check if all bytes matched + uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0); + uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1); + + if (mask_lo != ~0ULL) { + // First mismatch in lower 8 bytes + return len + (__builtin_ctzll(~mask_lo) >> 3); + } + if (mask_hi != ~0ULL) { + return len + 8 + (__builtin_ctzll(~mask_hi) >> 3); + } + len += 16; + } while (len < 256); + return 256; +} +``` + +--- + +## NEON Slide Hash (`slide_hash_neon.c`) + +```c +Z_INTERNAL void slide_hash_neon(deflate_state *s) { + unsigned n; + Pos *p; + uint16x8_t vw = vdupq_n_u16((uint16_t)s->w_size); + + n = HASH_SIZE; + p = &s->head[n]; + do { + p -= 8; + uint16x8_t val = vld1q_u16(p); + val = vqsubq_u16(val, vw); // Saturating subtract + vst1q_u16(p, val); + n -= 8; + } while (n); + + // Same loop for s->prev[0..w_size-1] + n = s->w_size; + p = &s->prev[n]; + do { + p -= 8; + uint16x8_t val = vld1q_u16(p); + val = 
vqsubq_u16(val, vw); + vst1q_u16(p, val); + n -= 8; + } while (n); +} +``` + +`vqsubq_u16` performs an unsigned saturating subtract — differences that +would underflow clamp to zero rather than wrapping around. + +--- + +## NEON Chunk Memory Set (`chunkset_neon.c`) + +Used during inflate for back-reference copies: + +```c +Z_INTERNAL uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, + unsigned dist, unsigned len) { + if (dist == 1) { + // Broadcast single byte + uint8x16_t vfill = vdupq_n_u8(*from); + while (len >= 16) { + vst1q_u8(out, vfill); + out += 16; + len -= 16; + } + } else if (dist == 2) { + uint8x16_t v = vreinterpretq_u8_u16(vdupq_n_u16(*(uint16_t *)from)); + // ... + } else if (dist >= 16) { + // Standard copy + while (len >= 16) { + vst1q_u8(out, vld1q_u8(from)); + out += 16; + from += 16; + len -= 16; + } + } else { + // Replicate dist-byte pattern into 16 bytes + // ... + } + return out; +} +``` + +--- + +## CRC-Based Hash Insertion (`insert_string_acle.c`) + +When ARMv8 CRC32 instructions are available, they provide excellent hash +distribution: + +```c +Z_INTERNAL Pos insert_string_acle(deflate_state *s, Pos str, unsigned count) { + Pos idx = 0; // returned unchanged if count == 0 + for (unsigned i = 0; i < count; i++) { + uint32_t val = *(uint32_t *)(s->window + str + i); + uint32_t h = __crc32w(0, val); + h &= s->hash_mask; + idx = s->head[h]; + s->prev[(str + i) & s->w_mask] = idx; + s->head[h] = (Pos)(str + i); + } + return idx; +} +``` + +--- + +## CMake Configuration + +ARM features are detected via compiler intrinsic checks: + +```cmake +option(WITH_NEON "Build with NEON SIMD" ON) +option(WITH_ACLE "Build with ACLE CRC" ON) + +# AArch64 compiler flags +if(WITH_NEON) + check_c_compiler_flag("-march=armv8-a+simd" HAS_NEON) + if(HAS_NEON) + set_property(SOURCE arch/arm/adler32_neon.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+simd) + # ... 
other NEON sources + add_definitions(-DARM_NEON) + endif() +endif() + +if(WITH_ACLE) + check_c_compiler_flag("-march=armv8-a+crc" HAS_CRC32) + if(HAS_CRC32) + set_property(SOURCE arch/arm/crc32_acle.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+crc) + add_definitions(-DARM_ACLE_CRC_HASH) + endif() + check_c_compiler_flag("-march=armv8-a+crypto" HAS_PMULL) + if(HAS_PMULL) + set_property(SOURCE arch/arm/crc32_pmull.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+crypto) + add_definitions(-DARM_PMULL_CRC) + endif() +endif() +``` + +--- + +## Performance Notes + +| Operation | NEON | CRC32 HW | PMULL | +|---|---|---|---| +| Adler-32 | ~8 bytes/cycle | — | — | +| CRC-32 | — | ~4 bytes/cycle | ~16 bytes/cycle | +| CRC-32+Copy | — | — | ~12 bytes/cycle | +| Compare256 | ~16 bytes/cycle | — | — | +| Slide Hash | ~8 entries/cycle | — | — | + +Apple Silicon (M1+) provides particularly fast CRC32 and PMULL +implementations with low latency per instruction. + +On Cortex-A55 and similar in-order cores, the throughput numbers are roughly +halved compared to Cortex-A76/A78 and Apple Silicon out-of-order cores. |
