/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef DISABLE_RUNTIME_CPU_DETECTION

#include "zbuild.h"

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  ifdef ARCH_ARM
#    define FUNCTABLE_BARRIER() do { \
         _ReadWriteBarrier(); \
         __dmb(0xB); /* _ARM_BARRIER_ISH */ \
         _ReadWriteBarrier(); \
     } while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
      *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif

/* Verify all pointers are valid before assigning; return 1 on failure.
 * This allows the inflateInit/deflateInit functions to gracefully return
 * Z_VERSION_ERROR if functable initialization fails.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    if (!VAR.FUNC_NAME) { \
        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
        return 1; \
    } \
    FUNCTABLE_ASSIGN(VAR, FUNC_NAME);

/* Functable init & abort on failure.
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateInit/deflateInit functions, and have no error propagation.
 */
#define FUNCTABLE_INIT_ABORT \
    if (init_functable()) { \
        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
        abort(); \
    }

// Empty stub, used when functable has already been initialized
static int force_init_empty(void) {
    return 0;
}

/* Functable initialization.
 * Selects the best available optimized functions appropriate for the runtime CPU.
 */
static int init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    memset(&ft, 0, sizeof(struct functable_s));
    cpu_check_features(&cf);

    ft.force_init = &force_init_empty;

    // Set up generic C code fallbacks
#ifndef WITH_ALL_FALLBACKS
    // Only use necessary generic functions when no suitable SIMD versions are available.
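    /* A sketch of what this ladder achieves (illustrative, not literal
     * preprocessor output): on an X86_SSE2_NATIVE build the SIMD assignments
     * further down are compiled in unconditionally, so only the members that
     * still lack an implementation at that ISA level get a generic default
     * here, e.g.:
     *
     *     ft.adler32 = &adler32_c;    // upgraded below when SSSE3 is present
     *     ft.crc32   = &crc32_braid;  // upgraded by the Chorba/PCLMULQDQ tiers
     *
     * Runtime-dispatch (non-native) builds presumably define WITH_ALL_FALLBACKS
     * instead, taking the #else branch below so every member starts out valid.
     */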
#  ifdef X86_SSE2_NATIVE
    // x86_64 always has SSE2
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#  elif defined(ARM_NEON_NATIVE)
#    ifndef ARM_CRC32_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(POWER8_VSX_NATIVE)
#    ifndef POWER9_NATIVE
    ft.compare256 = &compare256_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
#    endif
#    ifndef POWER8_VSX_CRC32_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(LOONGARCH_LSX_NATIVE)
#    ifndef LOONGARCH_CRC
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(RISCV_RVV_NATIVE)
#    ifndef RISCV_ZBC_NATIVE
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#    endif
#  elif defined(S390_CRC32_VX_NATIVE)
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.compare256 = &compare256_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.slide_hash = &slide_hash_c;
#  endif
#else // WITH_ALL_FALLBACKS
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.compare256 = &compare256_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
    ft.inflate_fast = &inflate_fast_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.slide_hash = &slide_hash_c;
#endif

    // Select arch-optimized functions
#ifdef WITH_OPTIM
    // Chorba generic C fallback
#ifndef WITHOUT_CHORBA
    ft.crc32 = &crc32_chorba;
    ft.crc32_copy = &crc32_copy_chorba;
#endif

    // X86 - SSE2
#ifdef X86_SSE2
#  ifndef X86_SSE2_NATIVE
    if (cf.x86.has_sse2)
#  endif
    {
#  ifndef X86_AVX2_NATIVE
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.compare256 = &compare256_sse2;
        ft.inflate_fast = &inflate_fast_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
        ft.slide_hash = &slide_hash_sse2;
#  endif
#  if !defined(WITHOUT_CHORBA_SSE) && !defined(X86_PCLMULQDQ_NATIVE)
        ft.crc32 = &crc32_chorba_sse2;
        ft.crc32_copy = &crc32_copy_chorba_sse2;
#  endif
    }
#endif

    // X86 - SSSE3
#ifdef X86_SSSE3
#  ifndef X86_SSSE3_NATIVE
    if (cf.x86.has_ssse3)
#  endif
    {
        ft.adler32 = &adler32_ssse3;
        ft.adler32_copy = &adler32_copy_ssse3;
#  ifndef X86_AVX2_NATIVE
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
#  endif
    }
#endif

    // X86 - SSE4.1
#if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE)
#  ifndef X86_SSE41_NATIVE
    if (cf.x86.has_sse41)
#  endif
    {
#  ifndef WITHOUT_CHORBA_SSE
        ft.crc32 = &crc32_chorba_sse41;
        ft.crc32_copy = &crc32_copy_chorba_sse41;
#  endif
    }
#endif

    // X86 - SSE4.2
#if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE)
#  ifndef X86_SSE42_NATIVE
    if (cf.x86.has_sse42)
#  endif
    {
        ft.adler32_copy = &adler32_copy_sse42;
    }
#endif

    // X86 - PCLMUL
#if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE)
#  ifndef X86_PCLMULQDQ_NATIVE
    if (cf.x86.has_pclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_copy = &crc32_copy_pclmulqdq;
    }
#endif
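
    /* A note on the recurring pattern below (illustrative): where the build
     * guarantees the target ISA (the *_NATIVE macros), the preprocessor drops
     * the runtime feature test and the brace block runs unconditionally:
     *
     *     #ifndef X86_AVX2_NATIVE
     *         if (cf.x86.has_avx2 && cf.x86.has_bmi2)  // runtime-dispatch builds
     *     #endif
     *         { ... }                                  // always taken on native builds
     */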

    // X86 - AVX2
#ifdef X86_AVX2
    /* BMI2 support is all but implicit with AVX2, but let's sanity-check this just in case.
     * Enabling BMI2 allows for flagless shifts, resulting in fewer flag stalls for the
     * pipeline, and allows us to set destination registers for the shift results as an
     * operand, eliminating several register-register moves when the original value needs
     * to remain intact. BMI2 shifts also allow a count operand that isn't the CL register,
     * avoiding contention there. */
#  ifndef X86_AVX2_NATIVE
    if (cf.x86.has_avx2 && cf.x86.has_bmi2)
#  endif
    {
#  ifndef X86_AVX512_NATIVE
        ft.adler32 = &adler32_avx2;
        ft.adler32_copy = &adler32_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.compare256 = &compare256_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
#  endif
        ft.slide_hash = &slide_hash_avx2;
    }
#endif

    // X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
#  ifndef X86_AVX512_NATIVE
    if (cf.x86.has_avx512_common)
#  endif
    {
#  ifndef X86_AVX512VNNI_NATIVE
        ft.adler32 = &adler32_avx512;
        ft.adler32_copy = &adler32_copy_avx512;
#  endif
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.compare256 = &compare256_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
    }
#endif

    // X86 - AVX512 VNNI
#ifdef X86_AVX512VNNI
#  ifndef X86_AVX512VNNI_NATIVE
    if (cf.x86.has_avx512vnni)
#  endif
    {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_copy = &adler32_copy_avx512_vnni;
    }
#endif

    // X86 - VPCLMULQDQ (AVX2)
#ifdef X86_VPCLMULQDQ_AVX2
#  ifndef X86_VPCLMULQDQ_AVX2_NATIVE
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_vpclmulqdq_avx2;
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2;
    }
#endif

    // X86 - VPCLMULQDQ (AVX-512)
#ifdef X86_VPCLMULQDQ_AVX512
#  ifndef X86_VPCLMULQDQ_AVX512_NATIVE
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
#  endif
    {
        ft.crc32 = &crc32_vpclmulqdq_avx512;
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512;
    }
#endif

    // ARM - SIMD
#if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE)
#  ifndef ARM_SIMD_NATIVE
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif

    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NEON_NATIVE
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_copy = &adler32_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.compare256 = &compare256_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
        ft.slide_hash = &slide_hash_neon;
    }
#endif

    // ARM - CRC32
#if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE)
#  ifndef ARM_CRC32_NATIVE
    if (cf.arm.has_crc32)
#  endif
    {
        ft.crc32 = &crc32_armv8;
        ft.crc32_copy = &crc32_copy_armv8;
    }
#endif

    // ARM - PMULL EOR3
#ifdef ARM_PMULL_EOR3
#  ifndef ARM_PMULL_EOR3_NATIVE
    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull)
#  endif
    {
        ft.crc32 = &crc32_armv8_pmull_eor3;
        ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
    }
#endif
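
    /* Ordering note (illustrative): within each architecture the tiers are
     * applied from least to most capable, and each simply overwrites the
     * previous assignment. On a runtime-dispatch build running on an ARMv8
     * core with both CRC32 and fast PMULL+EOR3, for example, ft.crc32 is
     * written twice and the last write wins:
     *
     *     ft.crc32 = &crc32_armv8;             // CRC32 extension tier
     *     ft.crc32 = &crc32_armv8_pmull_eor3;  // overrides the line above
     */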

    // Power - VMX
#ifdef PPC_VMX
#  ifndef PPC_VMX_NATIVE
    if (cf.power.has_altivec)
#  endif
    {
        ft.adler32 = &adler32_vmx;
        ft.adler32_copy = &adler32_copy_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif

    // Power8 - VSX
#ifdef POWER8_VSX
#  ifndef POWER8_VSX_NATIVE
    if (cf.power.has_arch_2_07)
#  endif
    {
        ft.adler32 = &adler32_power8;
        ft.adler32_copy = &adler32_copy_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif

    // Power8 - VSX CRC32
#ifdef POWER8_VSX_CRC32
#  ifndef POWER8_VSX_CRC32_NATIVE
    if (cf.power.has_arch_2_07)
#  endif
    {
        ft.crc32 = &crc32_power8;
        ft.crc32_copy = &crc32_copy_power8;
    }
#endif

    // Power9
#ifdef POWER9
#  ifndef POWER9_NATIVE
    if (cf.power.has_arch_3_00)
#  endif
    {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif

    // RISCV - RVV
#ifdef RISCV_RVV
#  ifndef RISCV_RVV_NATIVE
    if (cf.riscv.has_rvv)
#  endif
    {
        ft.adler32 = &adler32_rvv;
        ft.adler32_copy = &adler32_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
#  ifndef RISCV_ZBC_NATIVE
    if (cf.riscv.has_zbc)
#  endif
    {
        ft.crc32 = &crc32_riscv64_zbc;
        ft.crc32_copy = &crc32_copy_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
#  ifndef S390_CRC32_VX_NATIVE
    if (cf.s390.has_vx)
#  endif
    {
        ft.crc32 = &crc32_s390_vx;
        ft.crc32_copy = &crc32_copy_s390_vx;
    }
#endif

    // LOONGARCH - CRC
#ifdef LOONGARCH_CRC
#  ifndef LOONGARCH_CRC_NATIVE
    if (cf.loongarch.has_crc)
#  endif
    {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_copy = &crc32_copy_loongarch64;
    }
#endif

    // LOONGARCH - LSX
#if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE)
#  ifndef LOONGARCH_LSX_NATIVE
    if (cf.loongarch.has_lsx)
#  endif
    {
        ft.adler32 = &adler32_lsx;
        ft.adler32_copy = &adler32_copy_lsx;
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.compare256 = &compare256_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
        ft.slide_hash = &slide_hash_lsx;
    }
#endif

    // LOONGARCH - LASX
#ifdef LOONGARCH_LASX
#  ifndef LOONGARCH_LASX_NATIVE
    if (cf.loongarch.has_lasx)
#  endif
    {
        ft.adler32 = &adler32_lasx;
        ft.adler32_copy = &adler32_copy_lasx;
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.compare256 = &compare256_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
        ft.slide_hash = &slide_hash_lasx;
    }
#endif
#endif // WITH_OPTIM

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();

    return Z_OK;
}
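
/* Thread-safety sketch (a reading of the machinery above, not a spec): two
 * threads hitting stubs at once may both run init_functable(), but selection
 * depends only on the detected CPU features, so a double run writes identical
 * values. On the compiler paths with real atomic stores, each member is
 * published as a pointer-size atomic write, so a concurrent reader observes
 * either the original stub or the final implementation, never a torn pointer;
 * FUNCTABLE_BARRIER() orders the stores for weakly ordered CPUs.
 */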

/* stub functions */
static int force_init_stub(void) {
    return init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.adler32_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t *out, uint8_t *from, size_t len, size_t left) {
    FUNCTABLE_INIT_ABORT;
    return functable.chunkmemset_safe(out, from, len, left);
}

static uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
    FUNCTABLE_INIT_ABORT;
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t *buf, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.crc32(crc, buf, len);
}

static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.crc32_copy(crc, dst, src, len);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    FUNCTABLE_INIT_ABORT;
    functable.inflate_fast(strm, start);
}

static uint32_t longest_match_stub(deflate_state *const s, uint32_t cur_match) {
    FUNCTABLE_INIT_ABORT;
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state *const s, uint32_t cur_match) {
    FUNCTABLE_INIT_ABORT;
    return functable.longest_match_slow(s, cur_match);
}

static void slide_hash_stub(deflate_state *s) {
    FUNCTABLE_INIT_ABORT;
    functable.slide_hash(s);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_copy_stub,
    chunkmemset_safe_stub,
    compare256_stub,
    crc32_stub,
    crc32_copy_stub,
    inflate_fast_stub,
    longest_match_stub,
    longest_match_slow_stub,
    slide_hash_stub,
};

#endif
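
/* Usage sketch (illustrative; buf/len are hypothetical caller variables):
 * callers dispatch through the table rather than naming an implementation:
 *
 *     uint32_t crc = functable.crc32(0, buf, len);   // first call: crc32_stub
 *     crc = functable.crc32(crc, buf2, len2);        // later calls: selected impl
 *
 * The first call through any member runs init_functable() via its stub, then
 * forwards to the freshly selected function. Code that wants to surface an
 * initialization failure as an error instead of the stubs' abort() can call
 * functable.force_init() up front; this is what lets the inflateInit/deflateInit
 * paths return Z_VERSION_ERROR, per the comment on FUNCTABLE_VERIFY_ASSIGN.
 */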