diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-02-07 00:00:44 -0800 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-08 23:33:57 +0100 |
| commit | 30206c1cac40a8bed335405ed2e64559fb7b40bc (patch) | |
| tree | 80867b5eed2ee5f9ba4cce677b9a8071dd494cdb | |
| parent | b3bcd2104f483b47f4483be9d17be0d00b2a384a (diff) | |
| download | Project-Tick-30206c1cac40a8bed335405ed2e64559fb7b40bc.tar.gz Project-Tick-30206c1cac40a8bed335405ed2e64559fb7b40bc.zip | |
Add compile-time native feature detection macros
Creates [ARCH]_[FEAT]_NATIVE preprocessor defines that can be re-used
in functable to bypass CPU checks.
They are derived from the existing DISABLE_RUNTIME_CPU_DETECTION preprocessor logic.
| -rw-r--r-- | arch/arm/arm_functions.h | 11 | ||||
| -rw-r--r-- | arch/arm/arm_natives.h | 31 | ||||
| -rw-r--r-- | arch/loongarch/loongarch_functions.h | 10 | ||||
| -rw-r--r-- | arch/loongarch/loongarch_natives.h | 25 | ||||
| -rw-r--r-- | arch/power/power_functions.h | 11 | ||||
| -rw-r--r-- | arch/power/power_natives.h | 27 | ||||
| -rw-r--r-- | arch/riscv/riscv_functions.h | 7 | ||||
| -rw-r--r-- | arch/riscv/riscv_natives.h | 19 | ||||
| -rw-r--r-- | arch/s390/s390_functions.h | 5 | ||||
| -rw-r--r-- | arch/s390/s390_natives.h | 14 | ||||
| -rw-r--r-- | arch/x86/x86_functions.h | 28 | ||||
| -rw-r--r-- | arch/x86/x86_natives.h | 52 | ||||
| -rw-r--r-- | arch_natives.h | 24 | ||||
| -rw-r--r-- | functable.c | 177 |
14 files changed, 374 insertions, 67 deletions
diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h index 34ba87b067..bc77adb977 100644 --- a/arch/arm/arm_functions.h +++ b/arch/arm/arm_functions.h @@ -5,6 +5,8 @@ #ifndef ARM_FUNCTIONS_H_ #define ARM_FUNCTIONS_H_ +#include "arm_natives.h" + #ifdef ARM_NEON uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); @@ -29,15 +31,14 @@ uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t * void slide_hash_armv6(deflate_state *s); #endif - #ifdef DISABLE_RUNTIME_CPU_DETECTION // ARM - SIMD -# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD) +# ifdef ARM_SIMD_NATIVE # undef native_slide_hash # define native_slide_hash slide_hash_armv6 # endif // ARM - NEON -# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON +# ifdef ARM_NEON_NATIVE # undef native_adler32 # define native_adler32 adler32_neon # undef native_adler32_copy @@ -56,14 +57,14 @@ void slide_hash_armv6(deflate_state *s); # define native_slide_hash slide_hash_neon # endif // ARM - CRC32 -# if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32)) +# ifdef ARM_CRC32_NATIVE # undef native_crc32 # define native_crc32 crc32_armv8 # undef native_crc32_copy # define native_crc32_copy crc32_copy_armv8 # endif // ARM - PMULL EOR3 -# if (defined(ARM_PMULL_EOR3) && defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)) +# ifdef ARM_PMULL_EOR3_NATIVE # undef native_crc32 # define native_crc32 crc32_armv8_pmull_eor3 # undef native_crc32_copy diff --git a/arch/arm/arm_natives.h b/arch/arm/arm_natives.h new file mode 100644 index 0000000000..311e33e958 --- /dev/null +++ b/arch/arm/arm_natives.h @@ -0,0 +1,31 @@ +/* arm_natives.h -- ARM compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_NATIVES_H_ +#define ARM_NATIVES_H_ + +#if defined(__ARM_FEATURE_SIMD32) +# ifdef ARM_SIMD +# define ARM_SIMD_NATIVE +# endif +#endif +/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT) +# ifdef ARM_NEON +# define ARM_NEON_NATIVE +# endif +#endif +/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */ +#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801) +# ifdef ARM_CRC32 +# define ARM_CRC32_NATIVE +# endif +#endif +#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3) +# ifdef ARM_PMULL_EOR3 +# define ARM_PMULL_EOR3_NATIVE +# endif +#endif + +#endif /* ARM_NATIVES_H_ */ diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h index 922c6c4165..0ec8bd66d7 100644 --- a/arch/loongarch/loongarch_functions.h +++ b/arch/loongarch/loongarch_functions.h @@ -8,6 +8,8 @@ #ifndef LOONGARCH_FUNCTIONS_H_ #define LOONGARCH_FUNCTIONS_H_ +#include "loongarch_natives.h" + #ifdef LOONGARCH_CRC uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); @@ -36,14 +38,14 @@ void slide_hash_lasx(deflate_state *s); #endif #ifdef DISABLE_RUNTIME_CPU_DETECTION -// LOONGARCH - CRC32 - All known CPUs has crc instructions -# if defined(LOONGARCH_CRC) +// LOONGARCH - CRC32 +# ifdef LOONGARCH_CRC_NATIVE # undef native_crc32 # define native_crc32 crc32_loongarch64 # undef native_crc32_copy # define native_crc32_copy crc32_copy_loongarch64 # endif -# if defined(LOONGARCH_LSX) && defined(__loongarch_sx) +# ifdef LOONGARCH_LSX_NATIVE # undef native_adler32 # define native_adler32 adler32_lsx # undef native_adler32_copy @@ -61,7 +63,7 @@ void slide_hash_lasx(deflate_state *s); # undef native_slide_hash # define 
native_slide_hash slide_hash_lsx # endif -# if defined(LOONGARCH_LASX) && defined(__loongarch_asx) +# ifdef LOONGARCH_LASX_NATIVE # undef native_adler32 # define native_adler32 adler32_lasx # undef native_adler32_copy diff --git a/arch/loongarch/loongarch_natives.h b/arch/loongarch/loongarch_natives.h new file mode 100644 index 0000000000..35f6d3c7bd --- /dev/null +++ b/arch/loongarch/loongarch_natives.h @@ -0,0 +1,25 @@ +/* loongarch_natives.h -- LoongArch compile-time feature detection macros. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef LOONGARCH_NATIVES_H_ +#define LOONGARCH_NATIVES_H_ + +#if defined(__loongarch__) +// All known CPUs have crc instructions +# ifdef LOONGARCH_CRC +# define LOONGARCH_CRC_NATIVE +# endif +#endif +#if defined(__loongarch_sx) +# ifdef LOONGARCH_LSX +# define LOONGARCH_LSX_NATIVE +# endif +#endif +#if defined(__loongarch_asx) +# ifdef LOONGARCH_LASX +# define LOONGARCH_LASX_NATIVE +# endif +#endif + +#endif /* LOONGARCH_NATIVES_H_ */ diff --git a/arch/power/power_functions.h b/arch/power/power_functions.h index 49ea89e819..ccc7754a4c 100644 --- a/arch/power/power_functions.h +++ b/arch/power/power_functions.h @@ -7,6 +7,8 @@ #ifndef POWER_FUNCTIONS_H_ #define POWER_FUNCTIONS_H_ +#include "power_natives.h" + #ifdef PPC_VMX uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); @@ -29,10 +31,9 @@ uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match); uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match); #endif - #ifdef DISABLE_RUNTIME_CPU_DETECTION // Power - VMX -# if defined(PPC_VMX) && defined(__ALTIVEC__) +# ifdef PPC_VMX_NATIVE # undef native_adler32 # define native_adler32 adler32_vmx # undef native_adler32_copy @@ -41,7 +42,7 @@ uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match); # define native_slide_hash 
slide_hash_vmx # endif // Power8 - VSX -# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__) +# ifdef POWER8_VSX_NATIVE # undef native_adler32 # define native_adler32 adler32_power8 # undef native_adler32_copy @@ -53,14 +54,14 @@ uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match); # undef native_slide_hash # define native_slide_hash slide_hash_power8 # endif -# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__) +# ifdef POWER8_VSX_CRC32_NATIVE # undef native_crc32 # define native_crc32 crc32_power8 # undef native_crc32_copy # define native_crc32_copy crc32_copy_power8 # endif // Power9 -# if defined(POWER9) && defined(_ARCH_PWR9) +# ifdef POWER9_NATIVE # undef native_compare256 # define native_compare256 compare256_power9 # undef native_longest_match diff --git a/arch/power/power_natives.h b/arch/power/power_natives.h new file mode 100644 index 0000000000..59ec8a8aed --- /dev/null +++ b/arch/power/power_natives.h @@ -0,0 +1,27 @@ +/* power_natives.h -- POWER compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_NATIVES_H_ +#define POWER_NATIVES_H_ + +#if defined(__ALTIVEC__) +# ifdef PPC_VMX +# define PPC_VMX_NATIVE +# endif +#endif +#if defined(_ARCH_PWR8) && defined(__VSX__) +# ifdef POWER8_VSX +# define POWER8_VSX_NATIVE +# endif +# ifdef POWER8_VSX_CRC32 +# define POWER8_VSX_CRC32_NATIVE +# endif +#endif +#if defined(_ARCH_PWR9) +# ifdef POWER9 +# define POWER9_NATIVE +# endif +#endif + +#endif /* POWER_NATIVES_H_ */ diff --git a/arch/riscv/riscv_functions.h b/arch/riscv/riscv_functions.h index 9e641966a0..89120ffabf 100644 --- a/arch/riscv/riscv_functions.h +++ b/arch/riscv/riscv_functions.h @@ -9,6 +9,8 @@ #ifndef RISCV_FUNCTIONS_H_ #define RISCV_FUNCTIONS_H_ +#include "riscv_natives.h" + #ifdef RISCV_RVV uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); @@ -28,7 +30,7 @@ uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, #ifdef DISABLE_RUNTIME_CPU_DETECTION // RISCV - RVV -# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__) +# ifdef RISCV_RVV_NATIVE # undef native_adler32 # define native_adler32 adler32_rvv # undef native_adler32_copy @@ -46,9 +48,8 @@ uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, # undef native_slide_hash # define native_slide_hash slide_hash_rvv # endif - // RISCV - CRC32 -# if (defined(RISCV_CRC32_ZBC) && defined (__riscv_zbc)) +# ifdef RISCV_ZBC_NATIVE # undef native_crc32 # define native_crc32 crc32_riscv64_zbc # undef native_crc32_copy diff --git a/arch/riscv/riscv_natives.h b/arch/riscv/riscv_natives.h new file mode 100644 index 0000000000..38d7aba648 --- /dev/null +++ b/arch/riscv/riscv_natives.h @@ -0,0 +1,19 @@ +/* riscv_natives.h -- RISCV compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_NATIVES_H_ +#define RISCV_NATIVES_H_ + +#if defined(__riscv_v) && defined(__linux__) +# ifdef RISCV_RVV +# define RISCV_RVV_NATIVE +# endif +#endif +#if defined(__riscv_zbc) +# ifdef RISCV_CRC32_ZBC +# define RISCV_ZBC_NATIVE +# endif +#endif + +#endif /* RISCV_NATIVES_H_ */ diff --git a/arch/s390/s390_functions.h b/arch/s390/s390_functions.h index 7de83abb6e..30647051f4 100644 --- a/arch/s390/s390_functions.h +++ b/arch/s390/s390_functions.h @@ -5,6 +5,8 @@ #ifndef S390_FUNCTIONS_H_ #define S390_FUNCTIONS_H_ +#include "s390_natives.h" + #ifdef S390_CRC32_VX uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); @@ -19,7 +21,8 @@ uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size #endif #ifdef DISABLE_RUNTIME_CPU_DETECTION -# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__) +// S390 - CRC32 VX +# ifdef S390_CRC32_VX_NATIVE # undef native_crc32 # define native_crc32 crc32_s390_vx # undef native_crc32_copy diff --git a/arch/s390/s390_natives.h b/arch/s390/s390_natives.h new file mode 100644 index 0000000000..5da913daf5 --- /dev/null +++ b/arch/s390/s390_natives.h @@ -0,0 +1,14 @@ +/* s390_natives.h -- s390 compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef S390_NATIVES_H_ +#define S390_NATIVES_H_ + +#if defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__) +# ifdef S390_CRC32_VX +# define S390_CRC32_VX_NATIVE +# endif +#endif + +#endif /* S390_NATIVES_H_ */ diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index f6ec9a137c..7b628a851a 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -6,6 +6,8 @@ #ifndef X86_FUNCTIONS_H_ #define X86_FUNCTIONS_H_ +#include "x86_natives.h" + /* So great news, your compiler is broken and causes stack smashing. Rather than * notching out its compilation we'll just remove the assignment in the functable. * Further context: @@ -80,7 +82,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s #ifdef DISABLE_RUNTIME_CPU_DETECTION // X86 - SSE2 -# if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT)) +# ifdef X86_SSE2_NATIVE # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_sse2 # undef native_compare256 @@ -101,7 +103,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # define native_slide_hash slide_hash_sse2 # endif // X86 - SSSE3 -# if defined(X86_SSSE3) && defined(__SSSE3__) +# ifdef X86_SSSE3_NATIVE # undef native_adler32 # define native_adler32 adler32_ssse3 # undef native_adler32_copy @@ -112,26 +114,26 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # define native_inflate_fast inflate_fast_ssse3 # endif // X86 - SSE4.1 -# if defined(X86_SSE41) && defined(__SSE4_1__) && !defined(WITHOUT_CHORBA_SSE) -# undef native_crc32 -# define native_crc32 crc32_chorba_sse41 -# undef native_crc32_copy -# define native_crc32_copy crc32_copy_chorba_sse41 +# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse41 +# undef native_crc32_copy 
+# define native_crc32_copy crc32_copy_chorba_sse41 # endif // X86 - SSE4.2 -# if defined(X86_SSE42) && defined(__SSE4_2__) +# ifdef X86_SSE42_NATIVE # undef native_adler32_copy # define native_adler32_copy adler32_copy_sse42 # endif // X86 - PCLMUL -# if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) +# ifdef X86_PCLMULQDQ_NATIVE # undef native_crc32 # define native_crc32 crc32_pclmulqdq # undef native_crc32_copy # define native_crc32_copy crc32_copy_pclmulqdq # endif // X86 - AVX2 -# if defined(X86_AVX2) && defined(__AVX2__) +# ifdef X86_AVX2_NATIVE # undef native_adler32 # define native_adler32 adler32_avx2 # undef native_adler32_copy @@ -150,7 +152,7 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # define native_slide_hash slide_hash_avx2 # endif // X86 - AVX512 (F,DQ,BW,Vl) -# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# ifdef X86_AVX512_NATIVE # undef native_adler32 # define native_adler32 adler32_avx512 # undef native_adler32_copy @@ -166,14 +168,14 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # undef native_longest_match_slow # define native_longest_match_slow longest_match_slow_avx512 // X86 - AVX512 (VNNI) -# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__) +# ifdef X86_AVX512VNNI_NATIVE # undef native_adler32 # define native_adler32 adler32_avx512_vnni # undef native_adler32_copy # define native_adler32_copy adler32_copy_avx512_vnni # endif // X86 - VPCLMULQDQ -# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__) +# ifdef X86_VPCLMULQDQ_NATIVE # undef native_crc32 # define native_crc32 crc32_vpclmulqdq # undef native_crc32_copy diff --git a/arch/x86/x86_natives.h b/arch/x86/x86_natives.h new file mode 100644 index 0000000000..75f249d909 --- /dev/null +++ b/arch/x86/x86_natives.h @@ -0,0 +1,52 @@ +/* x86_natives.h -- x86 compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_NATIVES_H_ +#define X86_NATIVES_H_ + +#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT)) +# ifdef X86_SSE2 +# define X86_SSE2_NATIVE +# endif +#endif +#if defined(__SSSE3__) +# ifdef X86_SSSE3 +# define X86_SSSE3_NATIVE +# endif +#endif +#if defined(__SSE4_1__) +# ifdef X86_SSE41 +# define X86_SSE41_NATIVE +# endif +#endif +#if defined(__SSE4_2__) +# ifdef X86_SSE42 +# define X86_SSE42_NATIVE +# endif +#endif +#if defined(__PCLMUL__) +# ifdef X86_PCLMULQDQ_CRC +# define X86_PCLMULQDQ_NATIVE +# endif +# if defined(__AVX512F__) && defined(__VPCLMULQDQ__) +# define X86_VPCLMULQDQ_NATIVE +# endif +#endif +#if defined(__AVX2__) +# ifdef X86_AVX2 +# define X86_AVX2_NATIVE +# endif +#endif +#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# ifdef X86_AVX512 +# define X86_AVX512_NATIVE +# endif +#endif +#if defined(__AVX512VNNI__) +# ifdef X86_AVX512VNNI +# define X86_AVX512VNNI_NATIVE +# endif +#endif + +#endif /* X86_NATIVES_H_ */ diff --git a/arch_natives.h b/arch_natives.h new file mode 100644 index 0000000000..5fe44516d4 --- /dev/null +++ b/arch_natives.h @@ -0,0 +1,24 @@ +/* arch_natives.h -- Compile-time feature detection macros for all architectures. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARCH_NATIVES_H_ +#define ARCH_NATIVES_H_ + +#include "zbuild.h" + +#if defined(X86_FEATURES) +# include "arch/x86/x86_natives.h" +#elif defined(ARM_FEATURES) +# include "arch/arm/arm_natives.h" +#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) +# include "arch/power/power_natives.h" +#elif defined(S390_FEATURES) +# include "arch/s390/s390_natives.h" +#elif defined(RISCV_FEATURES) +# include "arch/riscv/riscv_natives.h" +#elif defined(LOONGARCH_FEATURES) +# include "arch/loongarch/loongarch_natives.h" +#endif + +#endif /* ARCH_NATIVES_H_ */ diff --git a/functable.c b/functable.c index b976e72509..478051f40d 100644 --- a/functable.c +++ b/functable.c @@ -77,12 +77,47 @@ static int init_functable(void) { // Set up generic C code fallbacks #ifndef WITH_ALL_FALLBACKS -# if defined(ARCH_X86) && defined(ARCH_64BIT) && defined(X86_SSE2) - // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available. + // Only use necessary generic functions when no suitable simd versions are available. 
+# ifdef X86_SSE2_NATIVE + // x86_64 always has SSE2 ft.adler32 = &adler32_c; ft.adler32_copy = &adler32_copy_c; ft.crc32 = &crc32_braid; ft.crc32_copy = &crc32_copy_braid; +# elif defined(ARM_NEON_NATIVE) +# ifndef ARM_CRC32_NATIVE + ft.crc32 = &crc32_braid; + ft.crc32_copy = &crc32_copy_braid; +# endif +# elif defined(POWER8_VSX_NATIVE) +# ifndef POWER9_NATIVE + ft.compare256 = &compare256_c; + ft.longest_match = &longest_match_c; + ft.longest_match_slow = &longest_match_slow_c; +# endif +# ifndef POWER8_VSX_CRC32_NATIVE + ft.crc32 = &crc32_braid; + ft.crc32_copy = &crc32_copy_braid; +# endif +# elif defined(LOONGARCH_LSX_NATIVE) +# ifndef LOONGARCH_CRC + ft.crc32 = &crc32_braid; + ft.crc32_copy = &crc32_copy_braid; +# endif +# elif defined(RISCV_RVV_NATIVE) +# ifndef RISCV_ZBC_NATIVE + ft.crc32 = &crc32_braid; + ft.crc32_copy = &crc32_copy_braid; +# endif +# elif defined(S390_CRC32_VX_NATIVE) + ft.adler32 = &adler32_c; + ft.adler32_copy = &adler32_copy_c; + ft.chunkmemset_safe = &chunkmemset_safe_c; + ft.compare256 = &compare256_c; + ft.inflate_fast = &inflate_fast_c; + ft.longest_match = &longest_match_c; + ft.longest_match_slow = &longest_match_slow_c; + ft.slide_hash = &slide_hash_c; # endif #else // WITH_ALL_FALLBACKS ft.adler32 = &adler32_c; @@ -108,60 +143,82 @@ static int init_functable(void) { // X86 - SSE2 #ifdef X86_SSE2 -# ifdef ARCH_32BIT +# ifndef X86_SSE2_NATIVE if (cf.x86.has_sse2) # endif { +# ifndef X86_AVX2_NATIVE ft.chunkmemset_safe = &chunkmemset_safe_sse2; ft.compare256 = &compare256_sse2; -# if !defined(WITHOUT_CHORBA_SSE) - ft.crc32 = &crc32_chorba_sse2; - ft.crc32_copy = &crc32_copy_chorba_sse2; -# endif ft.inflate_fast = &inflate_fast_sse2; ft.longest_match = &longest_match_sse2; ft.longest_match_slow = &longest_match_slow_sse2; ft.slide_hash = &slide_hash_sse2; +# endif +# if !defined(WITHOUT_CHORBA_SSE) && !defined(X86_PCLMULQDQ_NATIVE) + ft.crc32 = &crc32_chorba_sse2; + ft.crc32_copy = &crc32_copy_chorba_sse2; +# endif } #endif // X86 
- SSSE3 #ifdef X86_SSSE3 - if (cf.x86.has_ssse3) { +# ifndef X86_SSSE3_NATIVE + if (cf.x86.has_ssse3) +# endif + { ft.adler32 = &adler32_ssse3; ft.adler32_copy = &adler32_copy_ssse3; +# ifndef X86_AVX2_NATIVE ft.chunkmemset_safe = &chunkmemset_safe_ssse3; ft.inflate_fast = &inflate_fast_ssse3; +# endif } #endif // X86 - SSE4.1 -#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) - if (cf.x86.has_sse41) { +#if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE) +# ifndef X86_SSE41_NATIVE + if (cf.x86.has_sse41) +# endif + { +# ifndef WITHOUT_CHORBA_SSE ft.crc32 = &crc32_chorba_sse41; ft.crc32_copy = &crc32_copy_chorba_sse41; +# endif } #endif // X86 - SSE4.2 -#ifdef X86_SSE42 - if (cf.x86.has_sse42) { +#if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE) +# ifndef X86_SSE42_NATIVE + if (cf.x86.has_sse42) +# endif + { ft.adler32_copy = &adler32_copy_sse42; } #endif // X86 - PCLMUL -#ifdef X86_PCLMULQDQ_CRC - if (cf.x86.has_pclmulqdq) { +#if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE) +# ifndef X86_PCLMULQDQ_NATIVE + if (cf.x86.has_pclmulqdq) +# endif + { ft.crc32 = &crc32_pclmulqdq; ft.crc32_copy = &crc32_copy_pclmulqdq; } #endif - // X86 - AVX + // X86 - AVX2 #ifdef X86_AVX2 /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers * for the shift results as an operand, eliminating several register-register moves when the original value needs * to remain intact. 
They also allow for a count operand that isn't the CL register, avoiding contention there */ - if (cf.x86.has_avx2 && cf.x86.has_bmi2) { +# ifndef X86_AVX2_NATIVE + if (cf.x86.has_avx2 && cf.x86.has_bmi2) +# endif + { +# ifndef X86_AVX512_NATIVE ft.adler32 = &adler32_avx2; ft.adler32_copy = &adler32_copy_avx2; ft.chunkmemset_safe = &chunkmemset_safe_avx2; @@ -169,14 +226,20 @@ static int init_functable(void) { ft.inflate_fast = &inflate_fast_avx2; ft.longest_match = &longest_match_avx2; ft.longest_match_slow = &longest_match_slow_avx2; +# endif ft.slide_hash = &slide_hash_avx2; } #endif // X86 - AVX512 (F,DQ,BW,Vl) #ifdef X86_AVX512 - if (cf.x86.has_avx512_common) { +# ifndef X86_AVX512_NATIVE + if (cf.x86.has_avx512_common) +# endif + { +# ifndef X86_AVX512VNNI_NATIVE ft.adler32 = &adler32_avx512; ft.adler32_copy = &adler32_copy_avx512; +# endif ft.chunkmemset_safe = &chunkmemset_safe_avx512; ft.compare256 = &compare256_avx512; ft.inflate_fast = &inflate_fast_avx512; @@ -185,14 +248,20 @@ static int init_functable(void) { } #endif #ifdef X86_AVX512VNNI - if (cf.x86.has_avx512vnni) { +# ifndef X86_AVX512VNNI_NATIVE + if (cf.x86.has_avx512vnni) +# endif + { ft.adler32 = &adler32_avx512_vnni; ft.adler32_copy = &adler32_copy_avx512_vnni; } #endif // X86 - VPCLMULQDQ #ifdef X86_VPCLMULQDQ_CRC - if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) { +# ifndef X86_VPCLMULQDQ_NATIVE + if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) +# endif + { ft.crc32 = &crc32_vpclmulqdq; ft.crc32_copy = &crc32_copy_vpclmulqdq; } @@ -200,8 +269,8 @@ static int init_functable(void) { // ARM - SIMD -#ifdef ARM_SIMD -# ifndef ARM_NOCHECK_SIMD +#if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE) +# ifndef ARM_SIMD_NATIVE if (cf.arm.has_simd) # endif { @@ -210,7 +279,7 @@ static int init_functable(void) { #endif // ARM - NEON #ifdef ARM_NEON -# ifndef ARM_NOCHECK_NEON +# ifndef ARM_NEON_NATIVE if (cf.arm.has_neon) # endif { @@ -225,15 
+294,21 @@ static int init_functable(void) { } #endif // ARM - CRC32 -#ifdef ARM_CRC32 - if (cf.arm.has_crc32) { +#if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE) +# ifndef ARM_CRC32_NATIVE + if (cf.arm.has_crc32) +# endif + { ft.crc32 = &crc32_armv8; ft.crc32_copy = &crc32_copy_armv8; } #endif // ARM - PMULL EOR3 #ifdef ARM_PMULL_EOR3 - if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) { +# ifndef ARM_PMULL_EOR3_NATIVE + if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) +# endif + { ft.crc32 = &crc32_armv8_pmull_eor3; ft.crc32_copy = &crc32_copy_armv8_pmull_eor3; } @@ -241,7 +316,10 @@ static int init_functable(void) { // Power - VMX #ifdef PPC_VMX - if (cf.power.has_altivec) { +# ifndef PPC_VMX_NATIVE + if (cf.power.has_altivec) +# endif + { ft.adler32 = &adler32_vmx; ft.adler32_copy = &adler32_copy_vmx; ft.slide_hash = &slide_hash_vmx; @@ -249,7 +327,10 @@ static int init_functable(void) { #endif // Power8 - VSX #ifdef POWER8_VSX - if (cf.power.has_arch_2_07) { +# ifndef POWER8_VSX_NATIVE + if (cf.power.has_arch_2_07) +# endif + { ft.adler32 = &adler32_power8; ft.adler32_copy = &adler32_copy_power8; ft.chunkmemset_safe = &chunkmemset_safe_power8; @@ -258,14 +339,20 @@ static int init_functable(void) { } #endif #ifdef POWER8_VSX_CRC32 - if (cf.power.has_arch_2_07) { +# ifndef POWER8_VSX_CRC32_NATIVE + if (cf.power.has_arch_2_07) +# endif + { ft.crc32 = &crc32_power8; ft.crc32_copy = &crc32_copy_power8; } #endif // Power9 #ifdef POWER9 - if (cf.power.has_arch_3_00) { +# ifndef POWER9_NATIVE + if (cf.power.has_arch_3_00) +# endif + { ft.compare256 = &compare256_power9; ft.longest_match = &longest_match_power9; ft.longest_match_slow = &longest_match_slow_power9; @@ -275,7 +362,10 @@ static int init_functable(void) { // RISCV - RVV #ifdef RISCV_RVV - if (cf.riscv.has_rvv) { +# ifndef RISCV_RVV_NATIVE + if (cf.riscv.has_rvv) +# endif + { ft.adler32 = &adler32_rvv; ft.adler32_copy = 
&adler32_copy_rvv; ft.chunkmemset_safe = &chunkmemset_safe_rvv; @@ -289,7 +379,10 @@ static int init_functable(void) { // RISCV - ZBC #ifdef RISCV_CRC32_ZBC - if (cf.riscv.has_zbc) { +# ifndef RISCV_ZBC_NATIVE + if (cf.riscv.has_zbc) +# endif + { ft.crc32 = &crc32_riscv64_zbc; ft.crc32_copy = &crc32_copy_riscv64_zbc; } @@ -297,7 +390,10 @@ static int init_functable(void) { // S390 #ifdef S390_CRC32_VX - if (cf.s390.has_vx) { +# ifndef S390_CRC32_VX_NATIVE + if (cf.s390.has_vx) +# endif + { ft.crc32 = &crc32_s390_vx; ft.crc32_copy = &crc32_copy_s390_vx; } @@ -305,13 +401,19 @@ static int init_functable(void) { // LOONGARCH #ifdef LOONGARCH_CRC - if (cf.loongarch.has_crc) { +# ifndef LOONGARCH_CRC_NATIVE + if (cf.loongarch.has_crc) +# endif + { ft.crc32 = &crc32_loongarch64; ft.crc32_copy = &crc32_copy_loongarch64; } #endif -#ifdef LOONGARCH_LSX - if (cf.loongarch.has_lsx) { +#if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE) +# ifndef LOONGARCH_LSX_NATIVE + if (cf.loongarch.has_lsx) +# endif + { ft.adler32 = &adler32_lsx; ft.adler32_copy = &adler32_copy_lsx; ft.chunkmemset_safe = &chunkmemset_safe_lsx; @@ -323,7 +425,10 @@ static int init_functable(void) { } #endif #ifdef LOONGARCH_LASX - if (cf.loongarch.has_lasx) { +# ifndef LOONGARCH_LASX_NATIVE + if (cf.loongarch.has_lasx) +# endif + { ft.adler32 = &adler32_lasx; ft.adler32_copy = &adler32_copy_lasx; ft.chunkmemset_safe = &chunkmemset_safe_lasx; |
