1 files changed, 447 insertions, 0 deletions
diff --git a/docs/handbook/neozip/hardware-acceleration.md b/docs/handbook/neozip/hardware-acceleration.md
new file mode 100644
index 0000000000..b087e5a2ab
--- /dev/null
+++ b/docs/handbook/neozip/hardware-acceleration.md
@@ -0,0 +1,447 @@
+# Hardware Acceleration
+
+## Overview
+
+Neozip dispatches compression and decompression operations to the best
+available hardware-accelerated implementation at runtime. This is achieved
+through a function table (`functable`), CPU feature detection, and
+architecture-specific source files compiled with appropriate SIMD flags.
+
+---
+
+## CPU Feature Detection
+
+### `cpu_features.c`
+
+The entry point for feature detection:
+
+```c
+void Z_INTERNAL cpu_check_features(struct cpu_features *features) {
+    // Zero out features
+    memset(features, 0, sizeof(*features));
+
+#if defined(X86_FEATURES)
+    x86_check_features(features);
+#elif defined(ARM_FEATURES)
+    arm_check_features(features);
+#elif defined(POWER_FEATURES)
+    power_check_features(features);
+#elif defined(S390_FEATURES)
+    s390_check_features(features);
+#elif defined(RISCV_FEATURES)
+    riscv_check_features(features);
+#elif defined(LOONGARCH_FEATURES)
+    loongarch_check_features(features);
+#endif
+}
+```
+
+### CPU Feature Structures
+
+```c
+struct cpu_features {
+    union {
+#if defined(X86_FEATURES)
+        struct x86_cpu_features x86;
+#elif defined(ARM_FEATURES)
+        struct arm_cpu_features arm;
+#elif defined(POWER_FEATURES)
+        struct power_cpu_features power;
+#elif defined(S390_FEATURES)
+        struct s390_cpu_features s390;
+#elif defined(RISCV_FEATURES)
+        struct riscv_cpu_features riscv;
+#elif defined(LOONGARCH_FEATURES)
+        struct loongarch_cpu_features loongarch;
+#endif
+    };
+};
+```
+
+Each architecture defines its own feature structure:
+
+**x86** (`x86_features.h`):
+```c
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512f;
+    int has_avx512dq;
+    int has_avx512bw;
+    int has_avx512vl;
+    int has_avx512_common;   // All of f+dq+bw+vl
+    int has_avx512vnni;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse41;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;
+    int has_os_save_zmm;
+};
+```
+
+**ARM** (`arm_features.h`):
+```c
+struct arm_cpu_features {
+    int has_simd;        // ARMv6 SIMD
+    int has_neon;        // ARMv7+ NEON / AArch64 ASIMD
+    int has_crc32;       // CRC32 instructions
+    int has_pmull;       // PMULL (polynomial multiply long)
+    int has_eor3;        // SHA3 EOR3 instruction
+    int has_fast_pmull;  // High-perf PMULL available
+};
+```
+
+---
+
+## x86 Feature Detection
+
+`x86_check_features()` in `arch/x86/x86_features.c` uses CPUID:
+
+```c
+void Z_INTERNAL x86_check_features(struct cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+
+    // CPUID leaf 1
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    features->x86.has_sse2 = !!(edx & (1 << 26));
+    features->x86.has_ssse3 = !!(ecx & (1 << 9));
+    features->x86.has_sse41 = !!(ecx & (1 << 19));
+    features->x86.has_sse42 = !!(ecx & (1 << 20));
+    features->x86.has_pclmulqdq = !!(ecx & (1 << 1));
+
+    // Check XSAVE support for YMM
+    if (ecx & (1 << 27)) {  // OSXSAVE
+        uint64_t xcr0 = xgetbv(0);
+        features->x86.has_os_save_ymm = (xcr0 & 0x06) == 0x06;
+        features->x86.has_os_save_zmm = (xcr0 & 0xe6) == 0xe6;
+    }
+
+    // CPUID leaf 7
+    cpuidp(7, 0, &eax, &ebx, &ecx, &edx);
+    if (features->x86.has_os_save_ymm) {
+        features->x86.has_avx2 = !!(ebx & (1 << 5));
+    }
+    if (features->x86.has_os_save_zmm) {
+        features->x86.has_avx512f  = !!(ebx & (1 << 16));
+        features->x86.has_avx512dq = !!(ebx & (1 << 17));
+        features->x86.has_avx512bw = !!(ebx & (1 << 30));
+        features->x86.has_avx512vl = !!(ebx & (1 << 31));
+        features->x86.has_vpclmulqdq = !!(ecx & (1 << 10));
+        features->x86.has_avx512vnni = !!(ecx & (1 << 11));
+    }
+    features->x86.has_avx512_common =
+        features->x86.has_avx512f && features->x86.has_avx512dq &&
+        features->x86.has_avx512bw && features->x86.has_avx512vl;
+}
+```
+
+### OS Support Verification
+
+YMM (256-bit) and ZMM (512-bit) registers are usable only if the OS saves and
+restores them across context switches. `xgetbv(0)` reads the XCR0 register:
+
+- Bits 1+2 set → YMM state is saved (required for AVX2)
+- Bits 1+2+5+6+7 set → ZMM state is saved (required for AVX-512)
+
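+Without OS support, executing AVX2/AVX-512 instructions faults, which is why the AVX2 and AVX-512 flags above are only set when the matching `has_os_save_*` check passes.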
+
+---
+
+## ARM Feature Detection
+
+ARM detection in `arch/arm/arm_features.c` uses platform-specific methods:
+
+**Linux**: Reads `/proc/cpuinfo` or uses `getauxval(AT_HWCAP)`:
+```c
+features->arm.has_neon = !!(hwcap & HWCAP_NEON);     // AArch32
+features->arm.has_neon = !!(hwcap & HWCAP_ASIMD);    // AArch64
+features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32);
+features->arm.has_pmull = !!(hwcap & HWCAP_PMULL);
+```
+
+**macOS**: Uses `sysctlbyname()`:
+```c
+features->arm.has_neon = 1;  // Always available on Apple Silicon
+features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32");
+features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL");
+```
+
+---
+
+## The Function Table
+
+### `functable_s` Structure
+
+```c
+struct functable_s {
+    adler32_func          adler32;
+    adler32_copy_func     adler32_copy;
+    compare256_func       compare256;
+    crc32_func            crc32;
+    crc32_copy_func       crc32_copy;
+    inflate_fast_func     inflate_fast;
+    longest_match_func    longest_match;
+    longest_match_slow_func longest_match_slow;
+    slide_hash_func       slide_hash;
+    chunksize_func        chunksize;
+    chunkmemset_safe_func chunkmemset_safe;
+};
+```
+
+### Dispatch Cascade
+
+`functable.c` initialises the function table using a cascade:
+
+```c
+static void init_functable(void) {
+    struct cpu_features cf;
+    cpu_check_features(&cf);
+
+    // Start with generic C implementations
+    functable.adler32        = adler32_c;
+    functable.crc32          = crc32_braid;
+    functable.compare256     = compare256_c;
+    functable.longest_match  = longest_match_c;
+    functable.slide_hash     = slide_hash_c;
+    functable.inflate_fast   = inflate_fast_c;
+    functable.chunksize      = chunksize_c;
+    functable.chunkmemset_safe = chunkmemset_safe_c;
+
+#ifdef X86_SSE2
+    if (cf.x86.has_sse2) {
+        functable.chunksize = chunksize_sse2;
+        functable.chunkmemset_safe = chunkmemset_safe_sse2;
+        functable.compare256 = compare256_sse2;
+        functable.inflate_fast = inflate_fast_sse2;
+        functable.longest_match = longest_match_sse2;
+        functable.slide_hash = slide_hash_sse2;
+    }
+#endif
+#ifdef X86_SSSE3
+    if (cf.x86.has_ssse3) {
+        functable.adler32 = adler32_ssse3;
+    }
+#endif
+#ifdef X86_SSE42
+    if (cf.x86.has_sse42) {
+        functable.adler32 = adler32_sse42;
+        functable.compare256 = compare256_sse42;
+        functable.longest_match = longest_match_sse42;
+    }
+#endif
+#ifdef X86_PCLMULQDQ
+    if (cf.x86.has_pclmulqdq) {
+        functable.crc32 = crc32_pclmulqdq;
+    }
+#endif
+#ifdef X86_AVX2
+    if (cf.x86.has_avx2) {
+        functable.adler32 = adler32_avx2;
+        functable.chunksize = chunksize_avx2;
+        functable.chunkmemset_safe = chunkmemset_safe_avx2;
+        functable.compare256 = compare256_avx2;
+        functable.inflate_fast = inflate_fast_avx2;
+        functable.longest_match = longest_match_avx2;
+        functable.slide_hash = slide_hash_avx2;
+    }
+#endif
+#ifdef X86_AVX512
+    if (cf.x86.has_avx512_common) {
+        functable.adler32 = adler32_avx512;
+        functable.slide_hash = slide_hash_avx512;
+    }
+#endif
+#ifdef X86_AVX512VNNI
+    if (cf.x86.has_avx512vnni) {
+        functable.adler32 = adler32_avx512_vnni;
+    }
+#endif
+#ifdef X86_VPCLMULQDQ
+    if (cf.x86.has_vpclmulqdq && cf.x86.has_avx512_common) {
+        functable.crc32 = crc32_vpclmulqdq;
+    }
+#endif
+
+    // ARM cascade
+#ifdef ARM_NEON
+    if (cf.arm.has_neon) {
+        functable.adler32 = adler32_neon;
+        functable.chunksize = chunksize_neon;
+        functable.chunkmemset_safe = chunkmemset_safe_neon;
+        functable.compare256 = compare256_neon;
+        functable.slide_hash = slide_hash_neon;
+        functable.inflate_fast = inflate_fast_neon;
+        functable.longest_match = longest_match_neon;
+    }
+#endif
+#ifdef ARM_ACLE_CRC_HASH
+    if (cf.arm.has_crc32) {
+        functable.crc32 = crc32_acle;
+    }
+#endif
+
+    // Store with release semantics for thread safety
+    atomic_store_explicit(&functable_init_done, 1, memory_order_release);
+}
+```
+
+The checks run from least to most capable, and each later assignment
+overwrites the earlier one, so the best available implementation wins.
+
+### Thread-Safe Initialisation
+
+The function table uses atomic operations for thread safety:
+
+```c
+static atomic_int functable_init_done = 0;
+static struct functable_s functable;
+
+#define FUNCTABLE_CALL(name) \
+    do { \
+        if (!atomic_load_explicit(&functable_init_done, memory_order_acquire)) \
+            init_functable(); \
+    } while (0); \
+    functable.name
+```
+
+The first call triggers initialisation; later calls see the flag set by the
+release store at the end of `init_functable()` and skip it.
+
+---
+
+## Accelerated Operations
+
+### 1. Adler-32 Checksum
+
+- **Scalar**: `adler32_c()` — byte-by-byte with NMAX blocking
+- **SIMD**: Uses horizontal sums and dot products — SSSE3/SSE4.1/SSE4.2/AVX2/AVX-512/VNNI/NEON/VMX/Power8/RVV/LSX/LASX
+
+### 2. CRC-32 Checksum
+
+- **Scalar**: `crc32_braid()` — braided 5-word parallel CRC
+- **SIMD**: Carry-less multiplication (CLMUL) for fast polynomial arithmetic — PCLMULQDQ/VPCLMULQDQ/PMULL/Power8/Zbc; ARM can alternatively use its dedicated CRC32 instructions (`crc32_acle`)
+
+### 3. String Matching (`compare256`)
+
+Compares two buffers, up to 256 bytes, to find how many leading bytes match:
+
+- **Scalar**: `compare256_c()` — byte-by-byte comparison
+- **SIMD**: Loads 16/32/64 bytes at a time, uses `_mm_cmpeq_epi8` + `_mm_movemask_epi8` (SSE2) or equivalent to find the first mismatch
+
+### 4. Longest Match
+
+Wraps `compare256` with hash chain walking:
+
+```c
+longest_match_func longest_match;
+longest_match_slow_func longest_match_slow;
+```
+
+The `_slow` variant also inserts intermediate hash entries for level ≥ 9.
+
+### 5. Slide Hash
+
+Subtracts the window size (`w_size`) from every hash-table entry, clamping at zero:
+
+- **Scalar**: `slide_hash_c()` — loop over HASH_SIZE + w_size entries
+- **SIMD**: Processes 8/16/32 entries at a time using saturating subtract
+
+```c
+// SSE2 example pattern:
+__m128i vw = _mm_set1_epi16((uint16_t)s->w_size);
+for (...) {
+    __m128i v = _mm_loadu_si128(p);
+    v = _mm_subs_epu16(v, vw);  // Saturating subtract
+    _mm_storeu_si128(p, v);
+}
+```
+
+### 6. Chunk Memory Set (`chunkmemset_safe`)
+
+Fast memset/memcpy for inflate back-reference copying:
+
+- **Scalar**: `chunkmemset_safe_c()` — handles overlap via small loops
+- **SIMD**: Replicates the pattern into vector registers, so even
+  overlapping copies are handled via broadcast
+
+### 7. Inflate Fast
+
+The hot inner loop of the inflate engine:
+
+- **Scalar**: `inflate_fast_c()` — standard decode loop
+- **SIMD**: Uses wider copy operations from chunkmemset for the back-reference
+  copy step
+
+---
+
+## Compile-Time vs Runtime Detection
+
+### Runtime Detection (Default)
+
+Enabled by `WITH_RUNTIME_CPU_DETECTION=ON` (default):
+- All SIMD variants are compiled as separate translation units
+- `functable.c` selects the best at runtime
+- Binary runs on any CPU of the target architecture
+
+### Native Compilation
+
+Enabled by `WITH_NATIVE_INSTRUCTIONS=ON`:
+- Compiles with `-march=native` (or equivalent)
+- The compiler uses host CPU features directly
+- Slightly faster: no function pointer indirection
+- Binary only runs on the build machine's CPU (or compatible)
+
+### Disabling Runtime Detection
+
+`DISABLE_RUNTIME_CPU_DETECTION` can be defined to skip runtime checks
+and use only the generic C implementations, useful for constrained
+environments.
+
+---
+
+## Adding a New Architecture
+
+To add SIMD support for a new architecture:
+
+1. **Create `arch/<arch>/` directory** with feature detection and implementations
+2. **Define a feature structure** in `<arch>_features.h`
+3. **Implement `<arch>_check_features()`** using platform-specific detection
+4. **Implement accelerated functions** matching the `functable_s` signatures
+5. **Add dispatch entries** in `functable.c` guarded by feature flags
+6. **Add CMake detection** in `CMakeLists.txt`:
+   ```cmake
+   check_<arch>_intrinsics()
+   if(WITH_<ARCH>_<FEATURE>)
+       add_compile_options(-m<flag>)
+       list(APPEND ZLIB_ARCH_SRCS arch/<arch>/...)
+       add_definitions(-D<ARCH>_<FEATURE>)
+   endif()
+   ```
+
+---
+
+## Supported Architecture Matrix
+
+| Architecture | adler32 | crc32 | compare256 | longest_match | slide_hash | inflate_fast | chunkmemset |
+|---|---|---|---|---|---|---|---|
+| x86 SSE2 | – | – | ✓ | ✓ | ✓ | ✓ | ✓ |
+| x86 SSSE3 | ✓ | – | – | – | – | – | – |
+| x86 SSE4.1 | ✓ | – | – | – | – | – | – |
+| x86 SSE4.2 | ✓ | – | ✓ | ✓ | – | – | – |
+| x86 PCLMULQDQ | – | ✓ | – | – | – | – | – |
+| x86 AVX2 | ✓ | – | ✓ | ✓ | ✓ | ✓ | ✓ |
+| x86 AVX-512 | ✓ | – | – | – | ✓ | – | – |
+| x86 AVX-512+VNNI | ✓ | – | – | – | – | – | – |
+| x86 VPCLMULQDQ | – | ✓ | – | – | – | – | – |
+| ARM NEON | ✓ | – | ✓ | ✓ | ✓ | ✓ | ✓ |
+| ARM CRC32 | – | ✓ | – | – | – | – | – |
+| ARM PMULL | – | ✓ | – | – | – | – | – |
+| Power VMX | ✓ | – | – | – | ✓ | – | – |
+| Power8 | ✓ | ✓ | – | – | – | – | – |
+| Power9 | – | – | ✓ | ✓ | – | – | – |
+| RISC-V RVV | ✓ | – | ✓ | ✓ | – | – | – |
+| s390 CRC | – | ✓ | – | – | – | – | – |
+| LoongArch LSX | ✓ | – | – | – | ✓ | – | ✓ |
+| LoongArch LASX | ✓ | – | – | – | – | – | – |