diff options
Diffstat (limited to 'docs/handbook/neozip/arm-optimizations.md')
| -rw-r--r-- | docs/handbook/neozip/arm-optimizations.md | 403 |
1 files changed, 403 insertions, 0 deletions
diff --git a/docs/handbook/neozip/arm-optimizations.md b/docs/handbook/neozip/arm-optimizations.md new file mode 100644 index 0000000000..c7fa94e505 --- /dev/null +++ b/docs/handbook/neozip/arm-optimizations.md @@ -0,0 +1,403 @@ +# ARM Optimizations + +## Overview + +Neozip provides ARM SIMD optimizations using NEON (Advanced SIMD), CRC32 +hardware instructions, and PMULL (polynomial multiply long). These cover +both AArch32 (ARMv7+) and AArch64 (ARMv8+) targets. All implementations +reside in `arch/arm/`. + +--- + +## Source Files + +| File | ISA Extension | Function | +|---|---|---| +| `arm_features.c/h` | — | Feature detection | +| `adler32_neon.c` | NEON | Adler-32 checksum | +| `chunkset_neon.c` | NEON | Pattern fill for inflate | +| `compare256_neon.c` | NEON | 256-byte string comparison | +| `crc32_acle.c` | CRC32 | Hardware CRC-32 | +| `crc32_pmull.c` | PMULL | CLMUL-based CRC-32 | +| `insert_string_acle.c` | CRC32 | CRC-based hash insertion | +| `slide_hash_neon.c` | NEON | Hash table slide | +| `inffast_neon.c` | NEON | Fast inflate inner loop | + +--- + +## Feature Detection + +### `arm_cpu_features` Structure + +```c +struct arm_cpu_features { + int has_simd; // ARMv6 SIMD (AArch32 only) + int has_neon; // NEON / ASIMD + int has_crc32; // CRC32 instructions (ARMv8.0-A optional, ARMv8.1-A mandatory) + int has_pmull; // PMULL (polynomial multiply long, 64→128-bit) + int has_eor3; // SHA3 EOR3 instruction (ARMv8.2-A+SHA3) + int has_fast_pmull; // High-perf PMULL +}; +``` + +### Linux Detection + +```c +void Z_INTERNAL arm_check_features(struct cpu_features *features) { +#if defined(__linux__) + unsigned long hwcap = getauxval(AT_HWCAP); +#if defined(__aarch64__) + features->arm.has_neon = !!(hwcap & HWCAP_ASIMD); + features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32); + features->arm.has_pmull = !!(hwcap & HWCAP_PMULL); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + features->arm.has_eor3 = !!(hwcap2 & HWCAP2_SHA3); +#else // AArch32 + 
unsigned long hwcap2 = getauxval(AT_HWCAP2); + features->arm.has_simd = !!(hwcap & HWCAP_ARM_VFPv3); + features->arm.has_neon = !!(hwcap & HWCAP_ARM_NEON); + features->arm.has_crc32 = !!(hwcap2 & HWCAP2_CRC32); + features->arm.has_pmull = !!(hwcap2 & HWCAP2_PMULL); +#endif +#endif +} +``` + +### macOS/iOS Detection + +```c +#if defined(__APPLE__) + // NEON is always available on Apple Silicon + features->arm.has_neon = 1; + features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32"); + features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL"); +#endif +``` + +### Windows Detection + +```c +#if defined(_WIN32) + features->arm.has_neon = IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE); + features->arm.has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); +#endif +``` + +--- + +## NEON Adler-32 (`adler32_neon.c`) + +Uses 128-bit NEON registers to process 16 bytes per iteration: + +```c +Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t s1 = adler & 0xffff; + uint32_t s2 = adler >> 16; + + // Position weight vector: {16,15,14,...,1} + static const uint8_t taps[] = {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; + uint8x16_t vtaps = vld1q_u8(taps); + + while (len >= 16) { + uint32x4_t vs1 = vdupq_n_u32(0); + uint32x4_t vs2 = vdupq_n_u32(0); + uint32x4_t vs1_0 = vdupq_n_u32(s1); + + // Process up to NMAX bytes before reduction + size_t block = MIN(len, NMAX); + size_t nblocks = block / 16; + + for (size_t i = 0; i < nblocks; i++) { + uint8x16_t vbuf = vld1q_u8(buf); + + // s1 += sum(bytes) + uint16x8_t sum16 = vpaddlq_u8(vbuf); + uint32x4_t sum32 = vpaddlq_u16(sum16); + vs1 = vaddq_u32(vs1, sum32); + + // s2 += 16 * s1_prev + weighted_sum(bytes) + vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_0, 4)); // accumulate 16 * s1_prev + // Multiply-accumulate: weighted position sum + uint16x8_t prod = vmull_u8(vget_low_u8(vbuf), vget_low_u8(vtaps)); + prod = vmlal_u8(prod, vget_high_u8(vbuf), vget_high_u8(vtaps)); + vs2 = vaddq_u32(vs2, vpaddlq_u16(prod)); + + vs1_0 = vs1; + 
buf += 16; + } + + // Horizontal reduction + s1 += vaddvq_u32(vs1); + s2 += vaddvq_u32(vs2); + s1 %= BASE; + s2 %= BASE; + len -= nblocks * 16; + } + return s1 | (s2 << 16); +} +``` + +Key NEON intrinsics used: +- `vpaddlq_u8` — Pairwise add long (u8→u16) +- `vpaddlq_u16` — Pairwise add long (u16→u32) +- `vmull_u8` — Multiply long (u8×u8→u16) +- `vmlal_u8` — Multiply-accumulate long +- `vaddvq_u32` — Horizontal sum across vector (AArch64) + +--- + +## Hardware CRC-32 (`crc32_acle.c`) + +Uses ARMv8 CRC32 instructions via ACLE (ARM C Language Extensions): + +```c +Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) { + crc = ~crc; // CRC32 instructions use inverted convention + + // Process 8 bytes at a time + while (len >= 8) { + crc = __crc32d(crc, *(uint64_t *)buf); + buf += 8; + len -= 8; + } + + // Process 4 bytes + if (len >= 4) { + crc = __crc32w(crc, *(uint32_t *)buf); + buf += 4; + len -= 4; + } + + // Process remaining bytes + while (len--) { + crc = __crc32b(crc, *buf++); + } + + return ~crc; +} +``` + +The `__crc32b`, `__crc32w`, `__crc32d` intrinsics compile to single CRC32 +instructions, computing CRC-32 of 1/4/8 bytes per instruction. + +--- + +## PMULL CRC-32 (`crc32_pmull.c`) + +For larger data, polynomial multiply (PMULL) provides higher throughput +via carry-less multiplication, similar to x86 PCLMULQDQ: + +```c +Z_INTERNAL uint32_t crc32_pmull(uint32_t crc, const uint8_t *buf, size_t len) { + poly128_t fold_const; + uint64x2_t crc0, crc1, crc2, crc3; + + // Initialize four accumulators with first 64 bytes + crc0 = veorq_u64(vld1q_u64((uint64_t *)buf), + vcombine_u64(vcreate_u64(crc), vcreate_u64(0))); + // ... 
crc1, crc2, crc3 + + // Main fold loop: 64 bytes per iteration + while (len >= 64) { + // vmull_p64: 64×64→128-bit polynomial multiply + poly128_t h0 = vmull_p64(vgetq_lane_u64(crc0, 0), fold_lo); + poly128_t h1 = vmull_p64(vgetq_lane_u64(crc0, 1), fold_hi); + crc0 = veorq_u64(vreinterpretq_u64_p128(h0), + vreinterpretq_u64_p128(h1)); + crc0 = veorq_u64(crc0, vld1q_u64((uint64_t *)buf)); + // repeat for crc1..crc3 + } + + // Barrett reduction to 32-bit CRC +} +``` + +With `has_eor3` (SHA3 extension), three-way XOR is done in a single +instruction: + +```c +#ifdef ARM_FEATURE_SHA3 + // EOR3: a ^= b ^ c in one instruction + crc0 = vreinterpretq_u64_u8(veor3q_u8( + vreinterpretq_u8_p128(h0), + vreinterpretq_u8_p128(h1), + vreinterpretq_u8_u64(data))); +#endif +``` + +--- + +## NEON String Comparison (`compare256_neon.c`) + +```c +Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + uint8x16_t v0 = vld1q_u8(src0 + len); + uint8x16_t v1 = vld1q_u8(src1 + len); + uint8x16_t cmp = vceqq_u8(v0, v1); + + // Check if all bytes matched + uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0); + uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1); + + if (mask_lo != ~0ULL) { + // First mismatch in lower 8 bytes + return len + (__builtin_ctzll(~mask_lo) >> 3); + } + if (mask_hi != ~0ULL) { + return len + 8 + (__builtin_ctzll(~mask_hi) >> 3); + } + len += 16; + } while (len < 256); + return 256; +} +``` + +--- + +## NEON Slide Hash (`slide_hash_neon.c`) + +```c +Z_INTERNAL void slide_hash_neon(deflate_state *s) { + unsigned n; + Pos *p; + uint16x8_t vw = vdupq_n_u16((uint16_t)s->w_size); + + n = HASH_SIZE; + p = &s->head[n]; + do { + p -= 8; + uint16x8_t val = vld1q_u16(p); + val = vqsubq_u16(val, vw); // Saturating subtract + vst1q_u16(p, val); + n -= 8; + } while (n); + + // Same loop for s->prev[0..w_size-1] + n = s->w_size; + p = &s->prev[n]; + do { + p -= 8; + uint16x8_t val = vld1q_u16(p); + val = 
vqsubq_u16(val, vw); + vst1q_u16(p, val); + n -= 8; + } while (n); +} +``` + +`vqsubq_u16` performs an unsigned saturating subtract — differences that +would underflow clamp to zero rather than wrapping around. + +--- + +## NEON Chunk Memory Set (`chunkset_neon.c`) + +Used during inflate for back-reference copies: + +```c +Z_INTERNAL uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, + unsigned dist, unsigned len) { + if (dist == 1) { + // Broadcast single byte + uint8x16_t vfill = vdupq_n_u8(*from); + while (len >= 16) { + vst1q_u8(out, vfill); + out += 16; + len -= 16; + } + } else if (dist == 2) { + uint8x16_t v = vreinterpretq_u8_u16(vdupq_n_u16(*(uint16_t *)from)); + // ... + } else if (dist >= 16) { + // Standard copy + while (len >= 16) { + vst1q_u8(out, vld1q_u8(from)); + out += 16; + from += 16; + len -= 16; + } + } else { + // Replicate dist-byte pattern into 16 bytes + // ... + } + return out; +} +``` + +--- + +## CRC-Based Hash Insertion (`insert_string_acle.c`) + +When ARMv8 CRC32 instructions are available, they provide excellent hash +distribution: + +```c +Z_INTERNAL Pos insert_string_acle(deflate_state *s, Pos str, unsigned count) { + Pos idx = 0; // returned unchanged if count == 0 + for (unsigned i = 0; i < count; i++) { + uint32_t val = *(uint32_t *)(s->window + str + i); + uint32_t h = __crc32w(0, val); + h &= s->hash_mask; + idx = s->head[h]; + s->prev[(str + i) & s->w_mask] = idx; + s->head[h] = (Pos)(str + i); + } + return idx; +} +``` + +--- + +## CMake Configuration + +ARM features are detected via compiler intrinsic checks: + +```cmake +option(WITH_NEON "Build with NEON SIMD" ON) +option(WITH_ACLE "Build with ACLE CRC" ON) + +# AArch64 compiler flags +if(WITH_NEON) + check_c_compiler_flag("-march=armv8-a+simd" HAS_NEON) + if(HAS_NEON) + set_property(SOURCE arch/arm/adler32_neon.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+simd) + # ... 
other NEON sources + add_definitions(-DARM_NEON) + endif() +endif() + +if(WITH_ACLE) + check_c_compiler_flag("-march=armv8-a+crc" HAS_CRC32) + if(HAS_CRC32) + set_property(SOURCE arch/arm/crc32_acle.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+crc) + add_definitions(-DARM_ACLE_CRC_HASH) + endif() + check_c_compiler_flag("-march=armv8-a+crypto" HAS_PMULL) + if(HAS_PMULL) + set_property(SOURCE arch/arm/crc32_pmull.c APPEND + PROPERTY COMPILE_OPTIONS -march=armv8-a+crypto) + add_definitions(-DARM_PMULL_CRC) + endif() +endif() +``` + +--- + +## Performance Notes + +| Operation | NEON | CRC32 HW | PMULL | +|---|---|---|---| +| Adler-32 | ~8 bytes/cycle | — | — | +| CRC-32 | — | ~4 bytes/cycle | ~16 bytes/cycle | +| CRC-32+Copy | — | — | ~12 bytes/cycle | +| Compare256 | ~16 bytes/cycle | — | — | +| Slide Hash | ~8 entries/cycle | — | — | + +Apple Silicon (M1+) provides particularly fast CRC32 and PMULL +implementations with low latency per instruction. + +On Cortex-A55 and similar in-order cores, the throughput numbers are roughly +halved compared to Cortex-A76/A78 and Apple Silicon out-of-order cores. |
