diff options
Diffstat (limited to 'docs/handbook/neozip/hardware-acceleration.md')
| -rw-r--r-- | docs/handbook/neozip/hardware-acceleration.md | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/docs/handbook/neozip/hardware-acceleration.md b/docs/handbook/neozip/hardware-acceleration.md new file mode 100644 index 0000000000..b087e5a2ab --- /dev/null +++ b/docs/handbook/neozip/hardware-acceleration.md @@ -0,0 +1,447 @@ +# Hardware Acceleration + +## Overview + +Neozip dispatches compression and decompression operations to the best +available hardware-accelerated implementation at runtime. This is achieved +through a function table (`functable`), CPU feature detection, and +architecture-specific source files compiled with appropriate SIMD flags. + +--- + +## CPU Feature Detection + +### `cpu_features.c` + +The entry point for feature detection: + +```c +void Z_INTERNAL cpu_check_features(struct cpu_features *features) { + // Zero out features + memset(features, 0, sizeof(*features)); + +#if defined(X86_FEATURES) + x86_check_features(features); +#elif defined(ARM_FEATURES) + arm_check_features(features); +#elif defined(POWER_FEATURES) + power_check_features(features); +#elif defined(S390_FEATURES) + s390_check_features(features); +#elif defined(RISCV_FEATURES) + riscv_check_features(features); +#elif defined(LOONGARCH_FEATURES) + loongarch_check_features(features); +#endif +} +``` + +### CPU Feature Structures + +```c +struct cpu_features { + union { +#if defined(X86_FEATURES) + struct x86_cpu_features x86; +#elif defined(ARM_FEATURES) + struct arm_cpu_features arm; +#elif defined(POWER_FEATURES) + struct power_cpu_features power; +#elif defined(S390_FEATURES) + struct s390_cpu_features s390; +#elif defined(RISCV_FEATURES) + struct riscv_cpu_features riscv; +#elif defined(LOONGARCH_FEATURES) + struct loongarch_cpu_features loongarch; +#endif + }; +}; +``` + +Each architecture defines its own feature structure: + +**x86** (`x86_features.h`): +```c +struct x86_cpu_features { + int has_avx2; + int has_avx512f; + int has_avx512dq; + int has_avx512bw; + int has_avx512vl; + int has_avx512_common; // All of f+dq+bw+vl + int has_avx512vnni; + int 
has_sse2; + int has_ssse3; + int has_sse41; + int has_sse42; + int has_pclmulqdq; + int has_vpclmulqdq; + int has_os_save_ymm; + int has_os_save_zmm; +}; +``` + +**ARM** (`arm_features.h`): +```c +struct arm_cpu_features { + int has_simd; // ARMv6 SIMD + int has_neon; // ARMv7+ NEON / AArch64 ASIMD + int has_crc32; // CRC32 instructions + int has_pmull; // PMULL (polynomial multiply long) + int has_eor3; // SHA3 EOR3 instruction + int has_fast_pmull; // High-perf PMULL available +}; +``` + +--- + +## x86 Feature Detection + +`x86_check_features()` in `arch/x86/x86_features.c` uses CPUID: + +```c +void Z_INTERNAL x86_check_features(struct cpu_features *features) { + unsigned eax, ebx, ecx, edx; + + // CPUID leaf 1 + cpuid(1, &eax, &ebx, &ecx, &edx); + features->x86.has_sse2 = !!(edx & (1 << 26)); + features->x86.has_ssse3 = !!(ecx & (1 << 9)); + features->x86.has_sse41 = !!(ecx & (1 << 19)); + features->x86.has_sse42 = !!(ecx & (1 << 20)); + features->x86.has_pclmulqdq = !!(ecx & (1 << 1)); + + // Check XSAVE support for YMM + if (ecx & (1 << 27)) { // OSXSAVE + uint64_t xcr0 = xgetbv(0); + features->x86.has_os_save_ymm = (xcr0 & 0x06) == 0x06; + features->x86.has_os_save_zmm = (xcr0 & 0xe6) == 0xe6; + } + + // CPUID leaf 7 + cpuidp(7, 0, &eax, &ebx, &ecx, &edx); + if (features->x86.has_os_save_ymm) { + features->x86.has_avx2 = !!(ebx & (1 << 5)); + } + if (features->x86.has_os_save_zmm) { + features->x86.has_avx512f = !!(ebx & (1 << 16)); + features->x86.has_avx512dq = !!(ebx & (1 << 17)); + features->x86.has_avx512bw = !!(ebx & (1 << 30)); + features->x86.has_avx512vl = !!(ebx & (1 << 31)); + features->x86.has_vpclmulqdq = !!(ecx & (1 << 10)); + features->x86.has_avx512vnni = !!(ecx & (1 << 11)); + } + features->x86.has_avx512_common = + features->x86.has_avx512f && features->x86.has_avx512dq && + features->x86.has_avx512bw && features->x86.has_avx512vl; +} +``` + +### OS Support Verification + +YMM (256-bit) and ZMM (512-bit) registers require OS support to 
save/restore +context during context switches. `xgetbv(0)` reads the XCR0 register: + +- Bits 1 and 2 set (`xcr0 & 0x06`: SSE and AVX state) → YMM state is saved (required for AVX2) +- Bits 1, 2, 5, 6 and 7 set (`xcr0 & 0xe6`: additionally the opmask and upper ZMM state) → ZMM state is saved (required for AVX-512) + +Without OS support, using AVX2/AVX-512 instructions will fault, which is why +`has_avx2` and the AVX-512 flags are only set when the matching state is enabled. + +--- + +## ARM Feature Detection + +ARM detection in `arch/arm/arm_features.c` uses platform-specific methods: + +**Linux**: Reads `/proc/cpuinfo` or uses `getauxval(AT_HWCAP)`: +```c +features->arm.has_neon = !!(hwcap & HWCAP_NEON); // AArch32 +features->arm.has_neon = !!(hwcap & HWCAP_ASIMD); // AArch64 +features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32); +features->arm.has_pmull = !!(hwcap & HWCAP_PMULL); +``` + +**macOS**: Uses `sysctlbyname()`: +```c +features->arm.has_neon = 1; // Always available on Apple Silicon +features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32"); +features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL"); +``` + +--- + +## The Function Table + +### `functable_s` Structure + +```c +struct functable_s { + adler32_func adler32; + adler32_copy_func adler32_copy; + compare256_func compare256; + crc32_func crc32; + crc32_copy_func crc32_copy; + inflate_fast_func inflate_fast; + longest_match_func longest_match; + longest_match_slow_func longest_match_slow; + slide_hash_func slide_hash; + chunksize_func chunksize; + chunkmemset_safe_func chunkmemset_safe; +}; +``` + +### Dispatch Cascade + +`functable.c` initialises the function table using a cascade: + +```c +static void init_functable(void) { + struct cpu_features cf; + cpu_check_features(&cf); + + // Start with generic C implementations + functable.adler32 = adler32_c; + functable.crc32 = crc32_braid; + functable.compare256 = compare256_c; + functable.longest_match = longest_match_c; + functable.slide_hash = slide_hash_c; + functable.inflate_fast = inflate_fast_c; + functable.chunksize = chunksize_c; + functable.chunkmemset_safe = chunkmemset_safe_c; + +#ifdef X86_SSE2 + if (cf.x86.has_sse2) { + 
functable.chunksize = chunksize_sse2; + functable.chunkmemset_safe = chunkmemset_safe_sse2; + functable.compare256 = compare256_sse2; + functable.inflate_fast = inflate_fast_sse2; + functable.longest_match = longest_match_sse2; + functable.slide_hash = slide_hash_sse2; + } +#endif +#ifdef X86_SSSE3 + if (cf.x86.has_ssse3) { + functable.adler32 = adler32_ssse3; + } +#endif +#ifdef X86_SSE42 + if (cf.x86.has_sse42) { + functable.adler32 = adler32_sse42; + functable.compare256 = compare256_sse42; + functable.longest_match = longest_match_sse42; + } +#endif +#ifdef X86_PCLMULQDQ + if (cf.x86.has_pclmulqdq) { + functable.crc32 = crc32_pclmulqdq; + } +#endif +#ifdef X86_AVX2 + if (cf.x86.has_avx2) { + functable.adler32 = adler32_avx2; + functable.chunksize = chunksize_avx2; + functable.chunkmemset_safe = chunkmemset_safe_avx2; + functable.compare256 = compare256_avx2; + functable.inflate_fast = inflate_fast_avx2; + functable.longest_match = longest_match_avx2; + functable.slide_hash = slide_hash_avx2; + } +#endif +#ifdef X86_AVX512 + if (cf.x86.has_avx512_common) { + functable.adler32 = adler32_avx512; + functable.slide_hash = slide_hash_avx512; + } +#endif +#ifdef X86_AVX512VNNI + if (cf.x86.has_avx512vnni) { + functable.adler32 = adler32_avx512_vnni; + } +#endif +#ifdef X86_VPCLMULQDQ + if (cf.x86.has_vpclmulqdq && cf.x86.has_avx512_common) { + functable.crc32 = crc32_vpclmulqdq; + } +#endif + + // ARM cascade +#ifdef ARM_NEON + if (cf.arm.has_neon) { + functable.adler32 = adler32_neon; + functable.chunksize = chunksize_neon; + functable.chunkmemset_safe = chunkmemset_safe_neon; + functable.compare256 = compare256_neon; + functable.slide_hash = slide_hash_neon; + functable.inflate_fast = inflate_fast_neon; + functable.longest_match = longest_match_neon; + } +#endif +#ifdef ARM_ACLE_CRC_HASH + if (cf.arm.has_crc32) { + functable.crc32 = crc32_acle; + } +#endif + + // Store with release semantics for thread safety + atomic_store_explicit(&functable_init_done, 1, 
memory_order_release); +} +``` + +Later features override earlier ones, so the best available implementation +wins. + +### Thread-Safe Initialisation + +An atomic flag with acquire/release ordering guards the lazy initialisation of +the function table: + +```c +static atomic_int functable_init_done = 0; +static struct functable_s functable; + +#define FUNCTABLE_CALL(name) \ + do { \ + if (!atomic_load_explicit(&functable_init_done, memory_order_acquire)) \ + init_functable(); \ + } while (0); \ + functable.name +``` + +The first call triggers initialisation; subsequent calls skip it via the +atomic flag. + +--- + +## Accelerated Operations + +### 1. Adler-32 Checksum + +**Scalar**: `adler32_c()` — byte-by-byte with NMAX blocking +**SIMD**: Uses horizontal sum and dot product — SSE4.1/SSSE3/AVX2/AVX-512/VNNI/NEON/VMX/Power8/RVV/LASX + +### 2. CRC-32 Checksum + +**Scalar**: `crc32_braid()` — braided 5-word parallel CRC +**SIMD**: Carry-less multiplication (CLMUL) for fast polynomial arithmetic — PCLMULQDQ/VPCLMULQDQ/PMULL/Power8/Zbc + +### 3. String Matching (`compare256`) + +Compares two buffers of up to 256 bytes and locates the first mismatching byte, +which gives the length of the match: + +**Scalar**: `compare256_c()` — byte-by-byte comparison +**SIMD**: Loads 16/32/64 bytes at a time, uses `_mm_cmpeq_epi8` + `_mm_movemask_epi8` (SSE2) or equivalent to find the first mismatch + +### 4. Longest Match + +Wraps `compare256` with hash chain walking: + +```c +longest_match_func longest_match; +longest_match_slow_func longest_match_slow; +``` + +The `_slow` variant also inserts intermediate hash entries for level ≥ 9. + +### 5. Slide Hash + +Rebases the hash table after the window slides by subtracting the window size +(`w_size`) from every entry, saturating at zero: + +**Scalar**: `slide_hash_c()` — loop over HASH_SIZE + w_size entries +**SIMD**: Processes 8/16/32 entries at a time using saturating subtract + +```c +// SSE2 example pattern: +__m128i vw = _mm_set1_epi16((uint16_t)s->w_size); +for (...) { + __m128i v = _mm_loadu_si128(p); + v = _mm_subs_epu16(v, vw); // Saturating subtract + _mm_storeu_si128(p, v); +} +``` + +### 6. 
Chunk Memory Set (`chunkmemset_safe`) + +Fast memset/memcpy for inflate back-reference copying: + +**Scalar**: `chunkmemset_safe_c()` — handles overlap via small loops +**SIMD**: Replicates the pattern into vector registers, handles even +overlapping copies via broadcast + +### 7. Inflate Fast + +The hot inner loop of the inflate engine: + +**Scalar**: `inflate_fast_c()` — standard decode loop +**SIMD**: Uses wider copy operations from chunkmemset for the back-reference +copy step + +--- + +## Compile-Time vs Runtime Detection + +### Runtime Detection (Default) + +Enabled by `WITH_RUNTIME_CPU_DETECTION=ON` (default): +- All SIMD variants are compiled as separate translation units +- `functable.c` selects the best at runtime +- Binary runs on any CPU of the target architecture + +### Native Compilation + +Enabled by `WITH_NATIVE_INSTRUCTIONS=ON`: +- Compiles with `-march=native` (or equivalent) +- The compiler uses host CPU features directly +- Slightly faster: no function pointer indirection +- Binary only runs on the build machine's CPU (or compatible) + +### Disabling Runtime Detection + +`DISABLE_RUNTIME_CPU_DETECTION` can be defined to skip runtime checks +and use only the generic C implementations, useful for constrained +environments. + +--- + +## Adding a New Architecture + +To add SIMD support for a new architecture: + +1. **Create `arch/<arch>/` directory** with feature detection and implementations +2. **Define a feature structure** in `<arch>_features.h` +3. **Implement `<arch>_check_features()`** using platform-specific detection +4. **Implement accelerated functions** matching the `functable_s` signatures +5. **Add dispatch entries** in `functable.c` guarded by feature flags +6. **Add CMake detection** in `CMakeLists.txt`: + ```cmake + check_<arch>_intrinsics() + if(WITH_<ARCH>_<FEATURE>) + add_compile_options(-m<flag>) + list(APPEND ZLIB_ARCH_SRCS arch/<arch>/...) 
+ add_definitions(-D<ARCH>_<FEATURE>) + endif() + ``` + +--- + +## Supported Architecture Matrix + +| Architecture | adler32 | crc32 | compare256 | longest_match | slide_hash | inflate_fast | chunkmemset | +|---|---|---|---|---|---|---|---| +| x86 SSE2 | – | – | ✓ | ✓ | ✓ | ✓ | ✓ | +| x86 SSSE3 | ✓ | – | – | – | – | – | – | +| x86 SSE4.1 | ✓ | – | – | – | – | – | – | +| x86 SSE4.2 | ✓ | – | ✓ | ✓ | – | – | – | +| x86 PCLMULQDQ | – | ✓ | – | – | – | – | – | +| x86 AVX2 | ✓ | – | ✓ | ✓ | ✓ | ✓ | ✓ | +| x86 AVX-512 | ✓ | – | – | – | ✓ | – | – | +| x86 AVX-512+VNNI | ✓ | – | – | – | – | – | – | +| x86 VPCLMULQDQ | – | ✓ | – | – | – | – | – | +| ARM NEON | ✓ | – | ✓ | ✓ | ✓ | ✓ | ✓ | +| ARM CRC32 | – | ✓ | – | – | – | – | – | +| ARM PMULL | – | ✓ | – | – | – | – | – | +| Power VMX | ✓ | – | – | – | ✓ | – | – | +| Power8 | ✓ | ✓ | – | – | – | – | – | +| Power9 | – | – | ✓ | ✓ | – | – | – | +| RISC-V RVV | ✓ | – | ✓ | ✓ | – | – | – | +| s390 CRC | – | ✓ | – | – | – | – | – | +| LoongArch LSX | ✓ | – | – | – | ✓ | – | ✓ | +| LoongArch LASX | ✓ | – | – | – | – | – | – | |
