diff options
Diffstat (limited to 'docs/handbook/neozip/x86-optimizations.md')
| -rw-r--r-- | docs/handbook/neozip/x86-optimizations.md | 439 |
1 files changed, 439 insertions, 0 deletions
diff --git a/docs/handbook/neozip/x86-optimizations.md b/docs/handbook/neozip/x86-optimizations.md new file mode 100644 index 0000000000..21b1a711d9 --- /dev/null +++ b/docs/handbook/neozip/x86-optimizations.md @@ -0,0 +1,439 @@ +# x86 Optimizations + +## Overview + +Neozip provides extensive x86 SIMD optimizations spanning SSE2, SSSE3, +SSE4.1, SSE4.2, PCLMULQDQ, AVX2, AVX-512, AVX-512+VNNI, and VPCLMULQDQ. +All implementations live in `arch/x86/` and are selected at runtime by +`functable.c` based on CPUID detection. + +--- + +## Source Files + +| File | ISA | Function | +|---|---|---| +| `x86_features.c/h` | — | CPUID feature detection | +| `adler32_avx2.c` | AVX2 | Adler-32 checksum | +| `adler32_avx512.c` | AVX-512 | Adler-32 checksum | +| `adler32_avx512_vnni.c` | AVX-512+VNNI | Adler-32 checksum | +| `adler32_sse42.c` | SSE4.2 | Adler-32 checksum | +| `adler32_ssse3.c` | SSSE3 | Adler-32 checksum | +| `crc32_pclmulqdq.c` | PCLMULQDQ | CRC-32 (carry-less multiply) | +| `crc32_vpclmulqdq.c` | VPCLMULQDQ | CRC-32 (AVX-512 CLMUL) | +| `compare256_avx2.c` | AVX2 | 256-byte comparison | +| `compare256_sse2.c` | SSE2 | 256-byte comparison | +| `compare256_sse42.c` | SSE4.2 | 256-byte comparison | +| `chunkset_avx2.c` | AVX2 | Pattern fill for inflate | +| `chunkset_sse2.c` | SSE2 | Pattern fill for inflate | +| `slide_hash_avx2.c` | AVX2 | Hash table slide | +| `slide_hash_avx512.c` | AVX-512 | Hash table slide | +| `slide_hash_sse2.c` | SSE2 | Hash table slide | +| `insert_string_sse42.c` | SSE4.2 | CRC-based hash insertion | +| `inffast_avx2.c` | AVX2 | Fast inflate inner loop | +| `inffast_sse2.c` | SSE2 | Fast inflate inner loop | + +--- + +## Feature Detection + +### CPUID Queries + +`x86_features.c` queries CPUID leaves 1 and 7: + +```c +void Z_INTERNAL x86_check_features(struct cpu_features *features) { + unsigned eax, ebx, ecx, edx; + + // Leaf 1 — basic features + cpuid(1, &eax, &ebx, &ecx, &edx); + features->x86.has_sse2 = !!(edx & (1 << 26)); + 
features->x86.has_ssse3 = !!(ecx & (1 << 9)); + features->x86.has_sse41 = !!(ecx & (1 << 19)); + features->x86.has_sse42 = !!(ecx & (1 << 20)); + features->x86.has_pclmulqdq = !!(ecx & (1 << 1)); + + // Check OS YMM/ZMM support via XSAVE/XGETBV + if (ecx & (1 << 27)) { + uint64_t xcr0 = xgetbv(0); + features->x86.has_os_save_ymm = ((xcr0 & 0x06) == 0x06); + features->x86.has_os_save_zmm = ((xcr0 & 0xe6) == 0xe6); + } + + // Leaf 7, sub-leaf 0 — extended features + cpuidp(7, 0, &eax, &ebx, &ecx, &edx); + if (features->x86.has_os_save_ymm) + features->x86.has_avx2 = !!(ebx & (1 << 5)); + if (features->x86.has_os_save_zmm) { + features->x86.has_avx512f = !!(ebx & (1 << 16)); + features->x86.has_avx512dq = !!(ebx & (1 << 17)); + features->x86.has_avx512bw = !!(ebx & (1 << 30)); + features->x86.has_avx512vl = !!(ebx & (1 << 31)); + features->x86.has_vpclmulqdq = !!(ecx & (1 << 10)); + features->x86.has_avx512vnni = !!(ecx & (1 << 11)); + } + features->x86.has_avx512_common = + features->x86.has_avx512f && features->x86.has_avx512dq && + features->x86.has_avx512bw && features->x86.has_avx512vl; +} +``` + +### `xgetbv()` — Reading Extended Control Register + +```c +static inline uint64_t xgetbv(unsigned xcr) { + uint32_t eax, edx; + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return ((uint64_t)edx << 32) | eax; +} +``` + +This verifies the OS has enabled the save/restore of wider register files. +Without this check, using YMM/ZMM registers would cause a #UD fault. 
+ +--- + +## Adler-32 Implementations + +### SSSE3 (`adler32_ssse3.c`) + +Uses `_mm_maddubs_epi16` for weighted position sums on 16-byte vectors: + +```c +Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) { + __m128i vs1 = _mm_cvtsi32_si128(adler & 0xffff); + __m128i vs2 = _mm_cvtsi32_si128(adler >> 16); + const __m128i dot2v = _mm_setr_epi8(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); + + while (len >= 16) { + __m128i vbuf = _mm_loadu_si128((__m128i *)buf); + // sum1 += bytes + vs1 = _mm_add_epi32(vs1, _mm_sad_epu8(vbuf, _mm_setzero_si128())); + // sum2 += position_weight * bytes + __m128i vtmp = _mm_maddubs_epi16(vbuf, dot2v); + vs2 = _mm_add_epi32(vs2, _mm_madd_epi16(vtmp, _mm_set1_epi16(1))); + // Accumulate 16 * prev_s1 into s2 + vs2 = _mm_add_epi32(vs2, _mm_slli_epi32(vs1_0, 4)); + buf += 16; + len -= 16; + } + // Horizontal reduction and MOD BASE +} +``` + +### AVX2 (`adler32_avx2.c`) + +Processes 32 bytes per iteration using 256-bit registers: + +```c +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len) { + static const uint8_t dot2v_data[] = {32,31,30,...,2,1}; + __m256i vdot2v = _mm256_loadu_si256((__m256i*)dot2v_data); + __m256i vs1 = _mm256_set_epi32(0,0,0,0,0,0,0, adler & 0xffff); + __m256i vs2 = _mm256_set_epi32(0,0,0,0,0,0,0, adler >> 16); + + while (len >= 32) { + __m256i vbuf = _mm256_loadu_si256((__m256i *)buf); + // s1 += sum of all bytes (using SAD against zero) + vs1 = _mm256_add_epi32(vs1, + _mm256_sad_epu8(vbuf, _mm256_setzero_si256())); + // s2 += weighted sum (dot product approach) + __m256i vtmp = _mm256_maddubs_epi16(vbuf, vdot2v); + vs2 = _mm256_add_epi32(vs2, + _mm256_madd_epi16(vtmp, _mm256_set1_epi16(1))); + // s2 += 32 * prev_s1 + vs2 = _mm256_add_epi32(vs2, _mm256_slli_epi32(vs1_0, 5)); + buf += 32; + len -= 32; + } +} +``` + +The `_mm256_maddubs_epi16` instruction multiplies unsigned bytes by signed +bytes and sums adjacent pairs, computing the weighted position sum in one 
+instruction. `_mm256_sad_epu8` computes the horizontal sum of bytes. + +### AVX-512 (`adler32_avx512.c`) + +Processes 64 bytes per iteration using 512-bit `__m512i` registers: + +```c +__m512i vs1 = _mm512_set_epi32(0,...,0, adler & 0xffff); +__m512i vs2 = _mm512_set_epi32(0,...,0, adler >> 16); + +while (len >= 64) { + __m512i vbuf = _mm512_loadu_si512(buf); + vs1 = _mm512_add_epi32(vs1, _mm512_sad_epu8(vbuf, _mm512_setzero_si512())); + __m512i vtmp = _mm512_maddubs_epi16(vbuf, vdot2v); + vs2 = _mm512_add_epi32(vs2, _mm512_madd_epi16(vtmp, vones)); + vs2 = _mm512_add_epi32(vs2, _mm512_slli_epi32(vs1_0, 6)); + buf += 64; + len -= 64; +} +``` + +### AVX-512+VNNI (`adler32_avx512_vnni.c`) + +Uses `_mm512_dpbusd_epi32` (dot product of unsigned bytes and signed bytes), +available with the VNNI extension: + +```c +// VPDPBUSD replaces maddubs + madd sequence with a single instruction +vs2 = _mm512_dpbusd_epi32(vs2, vbuf, vdot2v); +``` + +--- + +## CRC-32 Implementations + +### PCLMULQDQ (`crc32_pclmulqdq.c`) + +Uses carry-less multiplication for CRC folding. 
Processes 64 bytes per +iteration with four XMM accumulators: + +```c +Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) { + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; + __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + // Init: XOR CRC into first 16 bytes of data + xmm_crc0 = _mm_xor_si128(_mm_loadu_si128(buf), _mm_cvtsi32_si128(crc)); + xmm_crc1 = _mm_loadu_si128(buf + 16); + xmm_crc2 = _mm_loadu_si128(buf + 32); + xmm_crc3 = _mm_loadu_si128(buf + 48); + + // Main loop: fold 64 bytes per iteration + while (len >= 64) { + // For each accumulator: + // crc_n = clmul(crc_n, fold4, 0x01) ^ clmul(crc_n, fold4, 0x10) ^ next_data + __m128i xmm_t0 = _mm_clmulepi64_si128(xmm_crc0, xmm_fold4, 0x01); + __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_crc0, xmm_fold4, 0x10); + xmm_crc0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, xmm_t1), + _mm_loadu_si128(next++)); + // repeat for crc1..crc3 + } + + // Fold 4→1, then Barrett reduction to 32-bit CRC + // ... +} +``` + +### VPCLMULQDQ (`crc32_vpclmulqdq.c`) + +Uses AVX-512 carry-less multiply to process 256 bytes per iteration +with four ZMM (512-bit) accumulators: + +```c +__m512i zmm_crc0 = _mm512_loadu_si512(buf); +zmm_crc0 = _mm512_xor_si512(zmm_crc0, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); +// ... 3 more accumulators + +while (len >= 256) { + __m512i zmm_t0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + __m512i zmm_t1 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_ternarylogic_epi64(zmm_t0, zmm_t1, + _mm512_loadu_si512(next++), 0x96); + // XOR three values in one instruction via ternarylogic +} +``` + +`_mm512_ternarylogic_epi64(..., 0x96)` computes `A ^ B ^ C` in a single +instruction, fusing two XOR operations. 
+ +--- + +## String Comparison (`compare256`) + +### SSE2 (`compare256_sse2.c`) + +```c +Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + __m128i v0 = _mm_loadu_si128((__m128i *)(src0 + len)); + __m128i v1 = _mm_loadu_si128((__m128i *)(src1 + len)); + __m128i cmp = _mm_cmpeq_epi8(v0, v1); + unsigned mask = (unsigned)_mm_movemask_epi8(cmp); + if (mask != 0xffff) { + // Find first mismatch + return len + __builtin_ctz(~mask); + } + len += 16; + } while (len < 256); + return 256; +} +``` + +### AVX2 (`compare256_avx2.c`) + +Same approach with 32-byte vectors: + +```c +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + do { + __m256i v0 = _mm256_loadu_si256((__m256i *)(src0 + len)); + __m256i v1 = _mm256_loadu_si256((__m256i *)(src1 + len)); + __m256i cmp = _mm256_cmpeq_epi8(v0, v1); + unsigned mask = (unsigned)_mm256_movemask_epi8(cmp); + if (mask != 0xffffffff) { + return len + __builtin_ctz(~mask); + } + len += 32; + } while (len < 256); + return 256; +} +``` + +### SSE4.2 (`compare256_sse42.c`) + +Uses `_mm_cmpistri` (string compare instruction): + +```c +Z_INTERNAL uint32_t compare256_sse42(const uint8_t *src0, const uint8_t *src1) { + // _mm_cmpistri with EQUAL_EACH | NEGATIVE_POLARITY finds first mismatch + // in a 16-byte comparison +} +``` + +--- + +## Slide Hash + +### SSE2 (`slide_hash_sse2.c`) + +```c +Z_INTERNAL void slide_hash_sse2(deflate_state *s) { + Pos *p; + unsigned n; + __m128i xmm_wsize = _mm_set1_epi16((uint16_t)s->w_size); + + n = HASH_SIZE; + p = &s->head[n]; + do { + p -= 8; + __m128i value = _mm_loadu_si128((__m128i *)p); + _mm_storeu_si128((__m128i *)p, + _mm_subs_epu16(value, xmm_wsize)); // Saturating subtract + n -= 8; + } while (n); + // Same for s->prev +} +``` + +### AVX-512 (`slide_hash_avx512.c`) + +Processes 32 entries (64 bytes) per iteration: + +```c +Z_INTERNAL void slide_hash_avx512(deflate_state *s) { + __m512i 
zmm_wsize = _mm512_set1_epi16((uint16_t)s->w_size); + // Process 32 uint16_t entries per iteration + for (...) { + __m512i v = _mm512_loadu_si512(p); + _mm512_storeu_si512(p, _mm512_subs_epu16(v, zmm_wsize)); + } +} +``` + +--- + +## Hash Insertion (SSE4.2) + +`insert_string_sse42.c` uses the hardware CRC32 instruction for hashing: + +```c +Z_INTERNAL Pos insert_string_sse42(deflate_state *s, + Pos str, unsigned count) { + Pos idx; + for (unsigned i = 0; i < count; i++) { + unsigned val = *(uint32_t *)(s->window + str + i); + uint32_t h = 0; + h = _mm_crc32_u32(h, val); // Hardware CRC32C + h &= s->hash_mask; + idx = s->head[h]; + s->prev[(str + i) & s->w_mask] = idx; + s->head[h] = (Pos)(str + i); + } + return idx; +} +``` + +The CRC32C instruction provides excellent hash distribution with near-zero +cost. + +--- + +## Chunkset (Inflate Copy) + +### SSE2 (`chunkset_sse2.c`) + +Used during inflate for back-reference copying: + +```c +Z_INTERNAL uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, + unsigned dist, unsigned len) { + if (dist >= 16) { + // Standard copy with SSE2 loads/stores + while (len >= 16) { + _mm_storeu_si128((__m128i *)out, _mm_loadu_si128((__m128i *)from)); + out += 16; + from += 16; + len -= 16; + } + } else { + // Replicate pattern: broadcast dist-byte pattern into 16 bytes + // Handle dist=1 (memset), dist=2, dist=4, dist=8 specially + __m128i pattern = replicate_pattern(from, dist); + while (len >= 16) { + _mm_storeu_si128((__m128i *)out, pattern); + out += 16; + len -= 16; + } + } + return out; +} +``` + +### AVX2 (`chunkset_avx2.c`) + +Same pattern with 32-byte chunks: + +```c +// Replicate to 256-bit and store 32 bytes at a time +__m256i pattern = _mm256_broadcastsi128_si256(pattern_128); +while (len >= 32) { + _mm256_storeu_si256((__m256i *)out, pattern); + out += 32; + len -= 32; +} +``` + +--- + +## CMake Configuration + +Each x86 SIMD feature has a corresponding `WITH_` option: + +```cmake +option(WITH_SSE2 "Build with SSE2" 
ON) +option(WITH_SSSE3 "Build with SSSE3" ON) +option(WITH_SSE42 "Build with SSE4.2" ON) +option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON) +option(WITH_AVX2 "Build with AVX2" ON) +option(WITH_AVX512 "Build with AVX-512" ON) +option(WITH_AVX512VNNI "Build with AVX512VNNI" ON) +option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON) +``` + +Each source file is compiled with its minimum required flags: + +```cmake +set_property(SOURCE arch/x86/adler32_avx2.c APPEND PROPERTY COMPILE_OPTIONS -mavx2) +set_property(SOURCE arch/x86/crc32_pclmulqdq.c APPEND PROPERTY COMPILE_OPTIONS -mpclmul -msse4.2) +set_property(SOURCE arch/x86/crc32_vpclmulqdq.c APPEND PROPERTY COMPILE_OPTIONS -mvpclmulqdq -mavx512f) +``` + +This ensures the main code compiles without SIMD requirements while +individual acceleration files use their specific instruction sets. |
