summaryrefslogtreecommitdiff
path: root/docs/handbook/neozip/arm-optimizations.md
diff options
context:
space:
mode:
Diffstat (limited to 'docs/handbook/neozip/arm-optimizations.md')
-rw-r--r--docs/handbook/neozip/arm-optimizations.md403
1 files changed, 403 insertions, 0 deletions
diff --git a/docs/handbook/neozip/arm-optimizations.md b/docs/handbook/neozip/arm-optimizations.md
new file mode 100644
index 0000000000..c7fa94e505
--- /dev/null
+++ b/docs/handbook/neozip/arm-optimizations.md
@@ -0,0 +1,403 @@
+# ARM Optimizations
+
+## Overview
+
+Neozip provides ARM SIMD optimizations using NEON (Advanced SIMD), CRC32
+hardware instructions, and PMULL (polynomial multiply long). These cover
+both AArch32 (ARMv7+) and AArch64 (ARMv8+) targets. All implementations
+reside in `arch/arm/`.
+
+---
+
+## Source Files
+
+| File | ISA Extension | Function |
+|---|---|---|
+| `arm_features.c/h` | — | Feature detection |
+| `adler32_neon.c` | NEON | Adler-32 checksum |
+| `chunkset_neon.c` | NEON | Pattern fill for inflate |
+| `compare256_neon.c` | NEON | 256-byte string comparison |
+| `crc32_acle.c` | CRC32 | Hardware CRC-32 |
+| `crc32_pmull.c` | PMULL | CLMUL-based CRC-32 |
+| `insert_string_acle.c` | CRC32 | CRC-based hash insertion |
+| `slide_hash_neon.c` | NEON | Hash table slide |
+| `inffast_neon.c` | NEON | Fast inflate inner loop |
+
+---
+
+## Feature Detection
+
+### `arm_cpu_features` Structure
+
+```c
+struct arm_cpu_features {
+ int has_simd; // ARMv6 SIMD (AArch32 only)
+ int has_neon; // NEON / ASIMD
+ int has_crc32; // CRC32 instructions (ARMv8.0-A optional, ARMv8.1-A mandatory)
+ int has_pmull; // PMULL (polynomial multiply long, 64→128-bit)
+ int has_eor3; // SHA3 EOR3 instruction (ARMv8.2-A+SHA3)
+ int has_fast_pmull; // High-perf PMULL
+};
+```
+
+### Linux Detection
+
+```c
+void Z_INTERNAL arm_check_features(struct cpu_features *features) {
+#if defined(__linux__)
+ unsigned long hwcap = getauxval(AT_HWCAP);
+#if defined(__aarch64__)
+ features->arm.has_neon = !!(hwcap & HWCAP_ASIMD);
+ features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32);
+ features->arm.has_pmull = !!(hwcap & HWCAP_PMULL);
+ unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ features->arm.has_eor3 = !!(hwcap2 & HWCAP2_SHA3);
+#else // AArch32
+    unsigned long hwcap2 = getauxval(AT_HWCAP2); // CRC32/PMULL are reported in HWCAP2 on AArch32
+    features->arm.has_simd  = !!(hwcap & HWCAP_ARM_VFPv3);
+    features->arm.has_neon  = !!(hwcap & HWCAP_ARM_NEON);
+    features->arm.has_crc32 = !!(hwcap2 & HWCAP2_CRC32);
+    features->arm.has_pmull = !!(hwcap2 & HWCAP2_PMULL);
+#endif
+#endif
+}
+```
+
+### macOS/iOS Detection
+
+```c
+#if defined(__APPLE__)
+ // NEON is always available on Apple Silicon
+ features->arm.has_neon = 1;
+ features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32");
+ features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL");
+#endif
+```
+
+### Windows Detection
+
+```c
+#if defined(_WIN32)
+ features->arm.has_neon = IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ features->arm.has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#endif
+```
+
+---
+
+## NEON Adler-32 (`adler32_neon.c`)
+
+Uses 128-bit NEON registers to process 16 bytes per iteration:
+
+```c
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
+ uint32_t s1 = adler & 0xffff;
+ uint32_t s2 = adler >> 16;
+
+ // Position weight vector: {16,15,14,...,1}
+ static const uint8_t taps[] = {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
+ uint8x16_t vtaps = vld1q_u8(taps);
+
+    while (len >= 16) {
+        // Accumulate up to NMAX bytes between modulo reductions so the
+        // 32-bit lane sums cannot overflow.
+        size_t block   = MIN(len, NMAX);
+        size_t nblocks = block / 16;
+        len -= nblocks * 16;
+
+        // Lane 0 carries the running scalar sums into the vector domain;
+        // the other lanes start at zero.
+        uint32x4_t vs1 = vsetq_lane_u32(s1, vdupq_n_u32(0), 0);
+        uint32x4_t vs2 = vsetq_lane_u32(s2, vdupq_n_u32(0), 0);
+
+        for (size_t i = 0; i < nblocks; i++) {
+            uint8x16_t vbuf = vld1q_u8(buf);
+
+            // s2 += 16 * s1 (s1 as it was before this 16-byte block)
+            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1, 4));
+
+            // s1 += sum(bytes)
+            uint16x8_t sum16 = vpaddlq_u8(vbuf);
+            vs1 = vaddq_u32(vs1, vpaddlq_u16(sum16));
+
+            // s2 += weighted position sum: {16,15,...,1} · bytes
+            uint16x8_t prod = vmull_u8(vget_low_u8(vbuf), vget_low_u8(vtaps));
+            prod = vmlal_u8(prod, vget_high_u8(vbuf), vget_high_u8(vtaps));
+            vs2 = vaddq_u32(vs2, vpaddlq_u16(prod));
+
+            buf += 16;
+        }
+
+        // Horizontal reduction back to scalars, then modulo BASE
+        s1 = vaddvq_u32(vs1) % BASE;
+        s2 = vaddvq_u32(vs2) % BASE;
+    }
+ return s1 | (s2 << 16);
+}
+```
+
+Key NEON intrinsics used:
+- `vpaddlq_u8` — Pairwise add long (u8→u16)
+- `vpaddlq_u16` — Pairwise add long (u16→u32)
+- `vmull_u8` — Multiply long (u8×u8→u16)
+- `vmlal_u8` — Multiply-accumulate long
+- `vaddvq_u32` — Horizontal sum across vector (AArch64)
+
+---
+
+## Hardware CRC-32 (`crc32_acle.c`)
+
+Uses ARMv8 CRC32 instructions via ACLE (ARM C Language Extensions):
+
+```c
+Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
+ crc = ~crc; // CRC32 instructions use inverted convention
+
+    // Process 8 bytes at a time. memcpy avoids the unaligned-access and
+    // strict-aliasing UB of a pointer cast; it compiles to a single load.
+    while (len >= 8) {
+        uint64_t val64;
+        memcpy(&val64, buf, sizeof(val64));
+        crc = __crc32d(crc, val64);
+        buf += 8;
+        len -= 8;
+    }
+
+    // Process 4 bytes
+    if (len >= 4) {
+        uint32_t val32;
+        memcpy(&val32, buf, sizeof(val32));
+        crc = __crc32w(crc, val32);
+        buf += 4;
+        len -= 4;
+    }
+
+ // Process remaining bytes
+ while (len--) {
+ crc = __crc32b(crc, *buf++);
+ }
+
+ return ~crc;
+}
+```
+
+The `__crc32b`, `__crc32w`, `__crc32d` intrinsics compile to single CRC32
+instructions, computing CRC-32 of 1/4/8 bytes per instruction.
+
+---
+
+## PMULL CRC-32 (`crc32_pmull.c`)
+
+For larger data, polynomial multiply (PMULL) provides higher throughput
+via carry-less multiplication, similar to x86 PCLMULQDQ:
+
+```c
+Z_INTERNAL uint32_t crc32_pmull(uint32_t crc, const uint8_t *buf, size_t len) {
+ poly128_t fold_const;
+ uint64x2_t crc0, crc1, crc2, crc3;
+
+ // Initialize four accumulators with first 64 bytes
+ crc0 = veorq_u64(vld1q_u64((uint64_t *)buf),
+ vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
+ // ... crc1, crc2, crc3
+
+ // Main fold loop: 64 bytes per iteration
+ while (len >= 64) {
+ // vmull_p64: 64×64→128-bit polynomial multiply
+ poly128_t h0 = vmull_p64(vgetq_lane_u64(crc0, 0), fold_lo);
+ poly128_t h1 = vmull_p64(vgetq_lane_u64(crc0, 1), fold_hi);
+ crc0 = veorq_u64(vreinterpretq_u64_p128(h0),
+ vreinterpretq_u64_p128(h1));
+ crc0 = veorq_u64(crc0, vld1q_u64((uint64_t *)buf));
+ // repeat for crc1..crc3
+ }
+
+ // Barrett reduction to 32-bit CRC
+}
+```
+
+With `has_eor3` (SHA3 extension), three-way XOR is done in a single
+instruction:
+
+```c
+#ifdef ARM_FEATURE_SHA3
+ // EOR3: a ^= b ^ c in one instruction
+ crc0 = vreinterpretq_u64_u8(veor3q_u8(
+ vreinterpretq_u8_p128(h0),
+ vreinterpretq_u8_p128(h1),
+ vreinterpretq_u8_u64(data)));
+#endif
+```
+
+---
+
+## NEON String Comparison (`compare256_neon.c`)
+
+```c
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+ do {
+ uint8x16_t v0 = vld1q_u8(src0 + len);
+ uint8x16_t v1 = vld1q_u8(src1 + len);
+ uint8x16_t cmp = vceqq_u8(v0, v1);
+
+ // Check if all bytes matched
+ uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+ uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+
+ if (mask_lo != ~0ULL) {
+ // First mismatch in lower 8 bytes
+ return len + (__builtin_ctzll(~mask_lo) >> 3);
+ }
+ if (mask_hi != ~0ULL) {
+ return len + 8 + (__builtin_ctzll(~mask_hi) >> 3);
+ }
+ len += 16;
+ } while (len < 256);
+ return 256;
+}
+```
+
+---
+
+## NEON Slide Hash (`slide_hash_neon.c`)
+
+```c
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+ unsigned n;
+ Pos *p;
+ uint16x8_t vw = vdupq_n_u16((uint16_t)s->w_size);
+
+ n = HASH_SIZE;
+ p = &s->head[n];
+ do {
+ p -= 8;
+ uint16x8_t val = vld1q_u16(p);
+ val = vqsubq_u16(val, vw); // Saturating subtract
+ vst1q_u16(p, val);
+ n -= 8;
+ } while (n);
+
+ // Same loop for s->prev[0..w_size-1]
+ n = s->w_size;
+ p = &s->prev[n];
+ do {
+ p -= 8;
+ uint16x8_t val = vld1q_u16(p);
+ val = vqsubq_u16(val, vw);
+ vst1q_u16(p, val);
+ n -= 8;
+ } while (n);
+}
+```
+
+`vqsubq_u16` performs an unsigned saturating subtract — differences that
+would underflow past zero clamp to zero rather than wrapping around.
+
+---
+
+## NEON Chunk Memory Set (`chunkset_neon.c`)
+
+Used during inflate for back-reference copies:
+
+```c
+Z_INTERNAL uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from,
+ unsigned dist, unsigned len) {
+ if (dist == 1) {
+ // Broadcast single byte
+ uint8x16_t vfill = vdupq_n_u8(*from);
+ while (len >= 16) {
+ vst1q_u8(out, vfill);
+ out += 16;
+ len -= 16;
+ }
+ } else if (dist == 2) {
+ uint8x16_t v = vreinterpretq_u8_u16(vdupq_n_u16(*(uint16_t *)from));
+ // ...
+ } else if (dist >= 16) {
+ // Standard copy
+ while (len >= 16) {
+ vst1q_u8(out, vld1q_u8(from));
+ out += 16;
+ from += 16;
+ len -= 16;
+ }
+ } else {
+ // Replicate dist-byte pattern into 16 bytes
+ // ...
+ }
+ return out;
+}
+```
+
+---
+
+## CRC-Based Hash Insertion (`insert_string_acle.c`)
+
+When ARMv8 CRC32 instructions are available, they provide excellent hash
+distribution:
+
+```c
+Z_INTERNAL Pos insert_string_acle(deflate_state *s, Pos str, unsigned count) {
+    Pos idx = 0; // returned unchanged when count == 0
+    for (unsigned i = 0; i < count; i++) {
+        uint32_t val;
+        memcpy(&val, s->window + str + i, sizeof(val)); // unaligned-safe load
+        uint32_t h = __crc32w(0, val);
+ h &= s->hash_mask;
+ idx = s->head[h];
+ s->prev[(str + i) & s->w_mask] = idx;
+ s->head[h] = (Pos)(str + i);
+ }
+ return idx;
+}
+```
+
+---
+
+## CMake Configuration
+
+ARM features are detected via compiler intrinsic checks:
+
+```cmake
+option(WITH_NEON "Build with NEON SIMD" ON)
+option(WITH_ACLE "Build with ACLE CRC" ON)
+
+# AArch64 compiler flags
+if(WITH_NEON)
+ check_c_compiler_flag("-march=armv8-a+simd" HAS_NEON)
+ if(HAS_NEON)
+ set_property(SOURCE arch/arm/adler32_neon.c APPEND
+ PROPERTY COMPILE_OPTIONS -march=armv8-a+simd)
+ # ... other NEON sources
+ add_definitions(-DARM_NEON)
+ endif()
+endif()
+
+if(WITH_ACLE)
+ check_c_compiler_flag("-march=armv8-a+crc" HAS_CRC32)
+ if(HAS_CRC32)
+ set_property(SOURCE arch/arm/crc32_acle.c APPEND
+ PROPERTY COMPILE_OPTIONS -march=armv8-a+crc)
+ add_definitions(-DARM_ACLE_CRC_HASH)
+ endif()
+ check_c_compiler_flag("-march=armv8-a+crypto" HAS_PMULL)
+ if(HAS_PMULL)
+ set_property(SOURCE arch/arm/crc32_pmull.c APPEND
+ PROPERTY COMPILE_OPTIONS -march=armv8-a+crypto)
+ add_definitions(-DARM_PMULL_CRC)
+ endif()
+endif()
+```
+
+---
+
+## Performance Notes
+
+| Operation | NEON | CRC32 HW | PMULL |
+|---|---|---|---|
+| Adler-32 | ~8 bytes/cycle | — | — |
+| CRC-32 | — | ~4 bytes/cycle | ~16 bytes/cycle |
+| CRC-32+Copy | — | — | ~12 bytes/cycle |
+| Compare256 | ~16 bytes/cycle | — | — |
+| Slide Hash | ~8 entries/cycle | — | — |
+
+Apple Silicon (M1+) provides particularly fast CRC32 and PMULL
+implementations with low latency per instruction.
+
+On Cortex-A55 and similar in-order cores, the throughput numbers are roughly
+halved compared to Cortex-A76/A78 and Apple Silicon out-of-order cores.