diff options
Diffstat (limited to 'docs/handbook/neozip/x86-optimizations.md')
| -rw-r--r-- | docs/handbook/neozip/x86-optimizations.md | 439 |
1 files changed, 439 insertions, 0 deletions
diff --git a/docs/handbook/neozip/x86-optimizations.md b/docs/handbook/neozip/x86-optimizations.md new file mode 100644 index 0000000000..21b1a711d9 --- /dev/null +++ b/docs/handbook/neozip/x86-optimizations.md @@ -0,0 +1,439 @@ +# x86 Optimizations + +## Overview + +Neozip provides extensive x86 SIMD optimizations spanning SSE2, SSSE3, +SSE4.1, SSE4.2, PCLMULQDQ, AVX2, AVX-512, AVX-512+VNNI, and VPCLMULQDQ. +All implementations live in `arch/x86/` and are selected at runtime by +`functable.c` based on CPUID detection. + +--- + +## Source Files + +| File | ISA | Function | +|---|---|---| +| `x86_features.c/h` | — | CPUID feature detection | +| `adler32_avx2.c` | AVX2 | Adler-32 checksum | +| `adler32_avx512.c` | AVX-512 | Adler-32 checksum | +| `adler32_avx512_vnni.c` | AVX-512+VNNI | Adler-32 checksum | +| `adler32_sse42.c` | SSE4.2 | Adler-32 checksum | +| `adler32_ssse3.c` | SSSE3 | Adler-32 checksum | +| `crc32_pclmulqdq.c` | PCLMULQDQ | CRC-32 (carry-less multiply) | +| `crc32_vpclmulqdq.c` | VPCLMULQDQ | CRC-32 (AVX-512 CLMUL) | +| `compare256_avx2.c` | AVX2 | 256-byte comparison | +| `compare256_sse2.c` | SSE2 | 256-byte comparison | +| `compare256_sse42.c` | SSE4.2 | 256-byte comparison | +| `chunkset_avx2.c` | AVX2 | Pattern fill for inflate | +| `chunkset_sse2.c` | SSE2 | Pattern fill for inflate | +| `slide_hash_avx2.c` | AVX2 | Hash table slide | +| `slide_hash_avx512.c` | AVX-512 | Hash table slide | +| `slide_hash_sse2.c` | SSE2 | Hash table slide | +| `insert_string_sse42.c` | SSE4.2 | CRC-based hash insertion | +| `inffast_avx2.c` | AVX2 | Fast inflate inner loop | +| `inffast_sse2.c` | SSE2 | Fast inflate inner loop | + +--- + +## Feature Detection + +### CPUID Queries + +`x86_features.c` queries CPUID leaves 1 and 7: + +```c +void Z_INTERNAL x86_check_features(struct cpu_features *features) { + unsigned eax, ebx, ecx, edx; + + // Leaf 1 — basic features + cpuid(1, &eax, &ebx, &ecx, &edx); + features->x86.has_sse2 = !!(edx & (1 << 26)); + 
features->x86.has_ssse3 = !!(ecx & (1 << 9)); + features->x86.has_sse41 = !!(ecx & (1 << 19)); + features->x86.has_sse42 = !!(ecx & (1 << 20)); + features->x86.has_pclmulqdq = !!(ecx & (1 << 1)); + + // Check OS YMM/ZMM support via XSAVE/XGETBV + if (ecx & (1 << 27)) { + uint64_t xcr0 = xgetbv(0); + features->x86.has_os_save_ymm = ((xcr0 & 0x06) == 0x06); + features->x86.has_os_save_zmm = ((xcr0 & 0xe6) == 0xe6); + } + + // Leaf 7, sub-leaf 0 — extended features + cpuidp(7, 0, &eax, &ebx, &ecx, &edx); + if (features->x86.has_os_save_ymm) + features->x86.has_avx2 = !!(ebx & (1 << 5)); + if (features->x86.has_os_save_zmm) { + features->x86.has_avx512f = !!(ebx & (1 << 16)); + features->x86.has_avx512dq = !!(ebx & (1 << 17)); + features->x86.has_avx512bw = !!(ebx & (1 << 30)); + features->x86.has_avx512vl = !!(ebx & (1 << 31)); + features->x86.has_vpclmulqdq = !!(ecx & (1 << 10)); + features->x86.has_avx512vnni = !!(ecx & (1 << 11)); + } + features->x86.has_avx512_common = + features->x86.has_avx512f && features->x86.has_avx512dq && + features->x86.has_avx512bw && features->x86.has_avx512vl; +} +``` + +### `xgetbv()` — Reading Extended Control Register + +```c +static inline uint64_t xgetbv(unsigned xcr) { + uint32_t eax, edx; + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return ((uint64_t)edx << 32) | eax; +} +``` + +This verifies the OS has enabled the save/restore of wider register files. +Without this check, using YMM/ZMM registers would cause a #UD fault. 
+ +--- + +## Adler-32 Implementations + +### SSSE3 (`adler32_ssse3.c`) + +Uses `_mm_maddubs_epi16` for weighted position sums on 16-byte vectors: + +```c +Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) { + __m128i vs1 = _mm_cvtsi32_si128(adler & 0xffff); + __m128i vs2 = _mm_cvtsi32_si128(adler >> 16); + const __m128i dot2v = _mm_setr_epi8(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); + + while (len >= 16) { + __m128i vbuf = _mm_loadu_si128((__m128i *)buf); + // sum1 += bytes + vs1 = _mm_add_epi32(vs1, _mm_sad_epu8(vbuf, _mm_setzero_si128())); + // sum2 += position_weight * bytes + __m128i vtmp = _mm_maddubs_epi16(vbuf, dot2v); + vs2 = _mm_add_epi32(vs2, _mm_madd_epi16(vtmp, _mm_set1_epi16(1))); + // Accumulate 16 * prev_s1 into s2 + vs2 = _mm_add_epi32(vs2, _mm_slli_epi32(vs1_0, 4)); + buf += 16; + len -= 16; + } + // Horizontal reduction and MOD BASE +} +``` + +### AVX2 (`adler32_avx2.c`) + +Processes 32 bytes per iteration using 256-bit registers: + +```c +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len) { + static const uint8_t dot2v_data[] = {32,31,30,...,2,1}; + __m256i vdot2v = _mm256_loadu_si256((__m256i*)dot2v_data); + __m256i vs1 = _mm256_set_epi32(0,0,0,0,0,0,0, adler & 0xffff); + __m256i vs2 = _mm256_set_epi32(0,0,0,0,0,0,0, adler >> 16); + + while (len >= 32) { + __m256i vbuf = _mm256_loadu_si256((__m256i *)buf); + // s1 += sum of all bytes (using SAD against zero) + vs1 = _mm256_add_epi32(vs1, + _mm256_sad_epu8(vbuf, _mm256_setzero_si256())); + // s2 += weighted sum (dot product approach) + __m256i vtmp = _mm256_maddubs_epi16(vbuf, vdot2v); + vs2 = _mm256_add_epi32(vs2, + _mm256_madd_epi16(vtmp, _mm256_set1_epi16(1))); + // s2 += 32 * prev_s1 + vs2 = _mm256_add_epi32(vs2, _mm256_slli_epi32(vs1_0, 5)); + buf += 32; + len -= 32; + } +} +``` + +The `_mm256_maddubs_epi16` instruction multiplies unsigned bytes by signed +bytes and sums adjacent pairs, computing the weighted position sum in one 
+instruction. `_mm256_sad_epu8` computes the horizontal sum of bytes. + +### AVX-512 (`adler32_avx512.c`) + +Processes 64 bytes per iteration using 512-bit `__m512i` registers: + +```c +__m512i vs1 = _mm512_set_epi32(0,...,0, adler & 0xffff); +__m512i vs2 = _mm512_set_epi32(0,...,0, adler >> 16); + +while (len >= 64) { + __m512i vbuf = _mm512_loadu_si512(buf); + vs1 = _mm512_add_epi32(vs1, _mm512_sad_epu8(vbuf, _mm512_setzero_si512())); + __m512i vtmp = _mm512_maddubs_epi16(vbuf, vdot2v); + vs2 = _mm512_add_epi32(vs2, _mm512_madd_epi16(vtmp, vones)); + vs2 = _mm512_add_epi32(vs2, _mm512_slli_epi32(vs1_0, 6)); + buf += 64; + len -= 64; +} +``` + +### AVX-512+VNNI (`adler32_avx512_vnni.c`) + +Uses `_mm512_dpbusd_epi32` (dot product of unsigned bytes and signed bytes), +available with the VNNI extension: + +```c +// VPDPBUSD replaces maddubs + madd sequence with a single instruction +vs2 = _mm512_dpbusd_epi32(vs2, vbuf, vdot2v); +``` + +--- + +## CRC-32 Implementations + +### PCLMULQDQ (`crc32_pclmulqdq.c`) + +Uses carry-less multiplication for CRC folding. 
Processes 64 bytes per +iteration with four XMM accumulators: + +```c +Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) { + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; + __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + // Init: XOR CRC into first 16 bytes of data + xmm_crc0 = _mm_xor_si128(_mm_loadu_si128(buf), _mm_cvtsi32_si128(crc)); + xmm_crc1 = _mm_loadu_si128(buf + 16); + xmm_crc2 = _mm_loadu_si128(buf + 32); + xmm_crc3 = _mm_loadu_si128(buf + 48); + + // Main loop: fold 64 bytes per iteration + while (len >= 64) { + // For each accumulator: + // crc_n = clmul(crc_n, fold4, 0x01) ^ clmul(crc_n, fold4, 0x10) ^ next_data + __m128i xmm_t0 = _mm_clmulepi64_si128(xmm_crc0, xmm_fold4, 0x01); + __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_crc0, xmm_fold4, 0x10); + xmm_crc0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, xmm_t1), + _mm_loadu_si128(next++)); + // repeat for crc1..crc3 + } + + // Fold 4→1, then Barrett reduction to 32-bit CRC + // ... +} +``` + +### VPCLMULQDQ (`crc32_vpclmulqdq.c`) + +Uses AVX-512 carry-less multiply to process 256 bytes per iteration +with four ZMM (512-bit) accumulators: + +```c +__m512i zmm_crc0 = _mm512_loadu_si512(buf); +zmm_crc0 = _mm512_xor_si512(zmm_crc0, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); +// ... 3 more accumulators + +while (len >= 256) { + __m512i zmm_t0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + __m512i zmm_t1 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_ternarylogic_epi64(zmm_t0, zmm_t1, + _mm512_loadu_si512(next++), 0x96); + // XOR three values in one instruction via ternarylogic +} +``` + +`_mm512_ternarylogic_epi64(..., 0x96)` computes `A ^ B ^ C` in a single +instruction, fusing two XOR operations. 
+ +--- + +## String Comparison (`compare256`) + +### SSE2 (`compare256_sse2.c`) + +```c +Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + __m128i v0 = _mm_loadu_si128((__m128i *)(src0 + len)); + __m128i v1 = _mm_loadu_si128((__m128i *)(src1 + len)); + __m128i cmp = _mm_cmpeq_epi8(v0, v1); + unsigned mask = (unsigned)_mm_movemask_epi8(cmp); + if (mask != 0xffff) { + // Find first mismatch + return len + __builtin_ctz(~mask); + } + len += 16; + } while (len < 256); + return 256; +} +``` + +### AVX2 (`compare256_avx2.c`) + +Same approach with 32-byte vectors: + +```c +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + __m256i v0 = _mm256_loadu_si256((__m256i *)(src0 + len)); + __m256i v1 = _mm256_loadu_si256((__m256i *)(src1 + len)); + __m256i cmp = _mm256_cmpeq_epi8(v0, v1); + unsigned mask = (unsigned)_mm256_movemask_epi8(cmp); + if (mask != 0xffffffff) { + return len + __builtin_ctz(~mask); + } + len += 32; + } while (len < 256); + return 256; +} +``` + +### SSE4.2 (`compare256_sse42.c`) + +Uses `_mm_cmpistri` (string compare instruction): + +```c +Z_INTERNAL uint32_t compare256_sse42(const uint8_t *src0, const uint8_t *src1) { + // _mm_cmpistri with EQUAL_EACH | NEGATIVE_POLARITY finds first mismatch + // in a 16-byte comparison +} +``` + +--- + +## Slide Hash + +### SSE2 (`slide_hash_sse2.c`) + +```c +Z_INTERNAL void slide_hash_sse2(deflate_state *s) { + Pos *p; + unsigned n; + __m128i xmm_wsize = _mm_set1_epi16((uint16_t)s->w_size); + + n = HASH_SIZE; + p = &s->head[n]; + do { + p -= 8; + __m128i value = _mm_loadu_si128((__m128i *)p); + _mm_storeu_si128((__m128i *)p, + _mm_subs_epu16(value, xmm_wsize)); // Saturating subtract + n -= 8; + } while (n); + // Same for s->prev +} +``` + +### AVX-512 (`slide_hash_avx512.c`) + +Processes 32 entries (64 bytes) per iteration: + +```c +Z_INTERNAL void slide_hash_avx512(deflate_state *s) { + __m512i 
zmm_wsize = _mm512_set1_epi16((uint16_t)s->w_size); + // Process 32 uint16_t entries per iteration + for (...) { + __m512i v = _mm512_loadu_si512(p); + _mm512_storeu_si512(p, _mm512_subs_epu16(v, zmm_wsize)); + } +} +``` + +--- + +## Hash Insertion (SSE4.2) + +`insert_string_sse42.c` uses the hardware CRC32 instruction for hashing: + +```c +Z_INTERNAL Pos insert_string_sse42(deflate_state *s, + Pos str, unsigned count) { + Pos idx; + for (unsigned i = 0; i < count; i++) { + unsigned val = *(uint32_t *)(s->window + str + i); + uint32_t h = 0; + h = _mm_crc32_u32(h, val); // Hardware CRC32C + h &= s->hash_mask; + idx = s->head[h]; + s->prev[(str + i) & s->w_mask] = idx; + s->head[h] = (Pos)(str + i); + } + return idx; +} +``` + +The CRC32C instruction provides excellent hash distribution with near-zero +cost. + +--- + +## Chunkset (Inflate Copy) + +### SSE2 (`chunkset_sse2.c`) + +Used during inflate for back-reference copying: + +```c +Z_INTERNAL uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, + unsigned dist, unsigned len) { + if (dist >= 16) { + // Standard copy with SSE2 loads/stores + while (len >= 16) { + _mm_storeu_si128((__m128i *)out, _mm_loadu_si128((__m128i *)from)); + out += 16; + from += 16; + len -= 16; + } + } else { + // Replicate pattern: broadcast dist-byte pattern into 16 bytes + // Handle dist=1 (memset), dist=2, dist=4, dist=8 specially + __m128i pattern = replicate_pattern(from, dist); + while (len >= 16) { + _mm_storeu_si128((__m128i *)out, pattern); + out += 16; + len -= 16; + } + } + return out; +} +``` + +### AVX2 (`chunkset_avx2.c`) + +Same pattern with 32-byte chunks: + +```c +// Replicate to 256-bit and store 32 bytes at a time +__m256i pattern = _mm256_broadcastsi128_si256(pattern_128); +while (len >= 32) { + _mm256_storeu_si256((__m256i *)out, pattern); + out += 32; + len -= 32; +} +``` + +--- + +## CMake Configuration + +Each x86 SIMD feature has a corresponding `WITH_` option: + +```cmake +option(WITH_SSE2 "Build with SSE2" 
ON) +option(WITH_SSSE3 "Build with SSSE3" ON) +option(WITH_SSE42 "Build with SSE4.2" ON) +option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON) +option(WITH_AVX2 "Build with AVX2" ON) +option(WITH_AVX512 "Build with AVX-512" ON) +option(WITH_AVX512VNNI "Build with AVX512VNNI" ON) +option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON) +``` + +Each source file is compiled with its minimum required flags: + +```cmake +set_property(SOURCE arch/x86/adler32_avx2.c APPEND PROPERTY COMPILE_OPTIONS -mavx2) +set_property(SOURCE arch/x86/crc32_pclmulqdq.c APPEND PROPERTY COMPILE_OPTIONS -mpclmul -msse4.2) +set_property(SOURCE arch/x86/crc32_vpclmulqdq.c APPEND PROPERTY COMPILE_OPTIONS -mvpclmulqdq -mavx512f) +``` + +This ensures the main code compiles without SIMD requirements while +individual acceleration files use their specific instruction sets. |
