Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/arm_functions.h               19
-rw-r--r--  arch/arm/compare256_neon.c              2
-rw-r--r--  arch/generic/Makefile.in                4
-rw-r--r--  arch/generic/compare256_c.c            73
-rw-r--r--  arch/generic/compare256_p.h           111
-rw-r--r--  arch/generic/generic_functions.h        4
-rw-r--r--  arch/loongarch/compare256_lasx.c        3
-rw-r--r--  arch/loongarch/compare256_lsx.c         3
-rw-r--r--  arch/loongarch/loongarch_functions.h   56
-rw-r--r--  arch/riscv/compare256_rvv.c             1
-rw-r--r--  arch/x86/compare256_avx2.c              3
-rw-r--r--  arch/x86/compare256_avx512.c            5
-rw-r--r--  arch/x86/compare256_sse2.c              3
-rw-r--r--  arch/x86/x86_functions.h               83
14 files changed, 149 insertions, 221 deletions
diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h
index b18af8f80f..35dd12a2d9 100644
--- a/arch/arm/arm_functions.h
+++ b/arch/arm/arm_functions.h
@@ -9,14 +9,11 @@
 uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
 uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZLL
 uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
 uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
 uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
-# endif
 void slide_hash_neon(deflate_state *s);
-void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
 #endif
 
 #ifdef ARM_CRC32
@@ -47,18 +44,16 @@ void slide_hash_armv6(deflate_state *s);
 # define native_adler32_copy adler32_copy_neon
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
 # undef native_slide_hash
 # define native_slide_hash slide_hash_neon
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_neon
-# undef native_longest_match
-# define native_longest_match longest_match_neon
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_neon
-# endif
 # endif
 // ARM - CRC32
 # if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32))
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c
index afaf42f5bc..4ced9fc9ca 100644
--- a/arch/arm/compare256_neon.c
+++ b/arch/arm/compare256_neon.c
@@ -8,7 +8,7 @@
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#if defined(ARM_NEON)
 #include "neon_intrins.h"
 
 static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in
index 07a168f2f1..1d9cc4df5b 100644
--- a/arch/generic/Makefile.in
+++ b/arch/generic/Makefile.in
@@ -33,10 +33,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
 chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
 
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
 
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
 
 crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c
index ad535523a5..6934a55565 100644
--- a/arch/generic/compare256_c.c
+++ b/arch/generic/compare256_c.c
@@ -4,17 +4,74 @@
  */
 
 #include "zbuild.h"
-#include "compare256_p.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison for hardware without unaligned loads */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (src0[0] != src1[0])
+            return len;
+        if (src0[1] != src1[1])
+            return len + 1;
+        if (src0[2] != src1[2])
+            return len + 2;
+        if (src0[3] != src1[3])
+            return len + 3;
+        if (src0[4] != src1[4])
+            return len + 4;
+        if (src0[5] != src1[5])
+            return len + 5;
+        if (src0[6] != src1[6])
+            return len + 6;
+        if (src0[7] != src1[7])
+            return len + 7;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* 64-bit integer comparison for hardware with unaligned loads */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint64_t sv = zng_memread_8(src0);
+        uint64_t mv = zng_memread_8(src1);
+        uint64_t diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+
+        sv = zng_memread_8(src0);
+        mv = zng_memread_8(src1);
+        diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
 
-// Set optimal COMPARE256 function variant
 #if OPTIMAL_CMP == 8
-# define COMPARE256 compare256_8
-#elif defined(HAVE_BUILTIN_CTZLL)
-# define COMPARE256 compare256_64
-#elif defined(HAVE_BUILTIN_CTZ)
-# define COMPARE256 compare256_32
+# define COMPARE256 compare256_8_static
 #else
-# define COMPARE256 compare256_16
+# define COMPARE256 compare256_64_static
+#endif
+
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_64_static(src0, src1);
+}
 #endif
 
 Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h
index 331a14bfc4..e69de29bb2 100644
--- a/arch/generic/compare256_p.h
+++ b/arch/generic/compare256_p.h
@@ -1,111 +0,0 @@
-/* compare256_p.h -- 256 byte memory comparison with match length return
- * Copyright (C) 2020 Nathan Moinvaziri
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "zmemory.h"
-#include "deflate.h"
-#include "fallback_builtins.h"
-
-/* 8-bit integer comparison */
-static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-    } while (len < 256);
-
-    return 256;
-}
-
-/* 16-bit integer comparison */
-static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-    } while (len < 256);
-
-    return 256;
-}
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit integer comparison */
-static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        uint32_t sv, mv, diff;
-
-        sv = zng_memread_4(src0);
-        mv = zng_memread_4(src1);
-
-        diff = sv ^ mv;
-        if (diff)
-            return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
-
-        src0 += 4, src1 += 4, len += 4;
-    } while (len < 256);
-
-    return 256;
-}
-#endif
-
-#ifdef HAVE_BUILTIN_CTZLL
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-
-    do {
-        uint64_t sv, mv, diff;
-
-        sv = zng_memread_8(src0);
-        mv = zng_memread_8(src1);
-
-        diff = sv ^ mv;
-        if (diff)
-            return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
-
-        src0 += 8, src1 += 8, len += 8;
-    } while (len < 256);
-
-    return 256;
-}
-#endif
diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h
index 3496427fdb..f8e564432d 100644
--- a/arch/generic/generic_functions.h
+++ b/arch/generic/generic_functions.h
@@ -22,6 +22,10 @@ uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t
 
 uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
 
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif
 uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
 
 uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
index 2db428b6ba..d61d6e57b3 100644
--- a/arch/loongarch/compare256_lasx.c
+++ b/arch/loongarch/compare256_lasx.c
@@ -5,11 +5,12 @@
  */
 
 #include "zbuild.h"
+#include "zendian.h"
 #include "zmemory.h"
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LASX
 
 #include <lasxintrin.h>
 #include "lasxintrin_ext.h"
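
The new 64-bit generic path above is the core technique of this change: XOR two unaligned 8-byte loads, then count the trailing zero bits of the little-endian difference to locate the first mismatching byte, instead of testing byte by byte. A minimal standalone sketch under stated assumptions: memcpy and GCC/Clang's __builtin_ctzll stand in for the zng_memread_8()/zng_ctz64() internals, and a little-endian target is assumed (the real code normalizes byte order via Z_U64_TO_LE):

#include <stdint.h>
#include <string.h>

/* Sketch: return the match length (0..256) of two 256-byte buffers. */
static uint32_t compare256_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint64_t sv, mv;
        memcpy(&sv, src0, 8);       /* unaligned-safe 8-byte loads */
        memcpy(&mv, src1, 8);
        uint64_t diff = sv ^ mv;    /* nonzero bits mark mismatching bytes */
        if (diff)                   /* lowest set bit lies in the first differing byte */
            return len + (uint32_t)(__builtin_ctzll(diff) / 8);
        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}
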
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
index e02329db09..4afd261e76 100644
--- a/arch/loongarch/compare256_lsx.c
+++ b/arch/loongarch/compare256_lsx.c
@@ -5,11 +5,12 @@
  */
 
 #include "zbuild.h"
+#include "zendian.h"
 #include "zmemory.h"
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LSX
 
 #include <lsxintrin.h>
 #include "lsxintrin_ext.h"
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index 939a3a03e8..34281432f5 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -16,27 +16,23 @@ uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src,
 #ifdef LOONGARCH_LSX
 uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
 uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lsx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
-    uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
-    uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
-    uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
-# endif
 uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
 void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lsx(deflate_state *s);
 #endif
 
 #ifdef LOONGARCH_LASX
 uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
 uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lasx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
-    uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
-    uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
-    uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
-# endif
 uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
 void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lasx(deflate_state *s);
 #endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -52,40 +48,36 @@ void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
 # define native_adler32 adler32_lsx
 # undef native_adler32_copy
 # define native_adler32_copy adler32_copy_lsx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lsx
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_compare256
+# define native_compare256 compare256_lsx
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_lsx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lsx
-# undef native_longest_match
-# define native_longest_match longest_match_lsx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lsx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
 # endif
 
 # if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
 # undef native_adler32
 # define native_adler32 adler32_lasx
 # undef native_adler32_copy
 # define native_adler32_copy adler32_copy_lasx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lasx
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_lasx
+# undef native_compare256
+# define native_compare256 compare256_lasx
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_lasx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lasx
-# undef native_longest_match
-# define native_longest_match longest_match_lasx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lasx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
 # endif
 #endif
diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c
index 3ddb4db080..edb18a3766 100644
--- a/arch/riscv/compare256_rvv.c
+++ b/arch/riscv/compare256_rvv.c
@@ -9,7 +9,6 @@
 #include "zbuild.h"
 #include "zmemory.h"
 #include "deflate.h"
-#include "fallback_builtins.h"
 
 #include <riscv_vector.h>
 
diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c
index c99db3b34d..5e2b1716cf 100644
--- a/arch/x86/compare256_avx2.c
+++ b/arch/x86/compare256_avx2.c
@@ -4,11 +4,12 @@
  */
 
 #include "zbuild.h"
+#include "zendian.h"
 #include "zmemory.h"
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_AVX2
 #include <immintrin.h>
 
 #ifdef _MSC_VER
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
index f61402ae6d..f3105505cb 100644
--- a/arch/x86/compare256_avx512.c
+++ b/arch/x86/compare256_avx512.c
@@ -5,11 +5,12 @@
  */
 
 #include "zbuild.h"
+#include "zendian.h"
 #include "zmemory.h"
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef X86_AVX512
 #include <immintrin.h>
 
 #ifdef _MSC_VER
@@ -33,7 +34,7 @@ static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8
     // 16 bytes
     xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
     xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
-    mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+    mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
     if (mask_0 != 0x0000FFFF)
         return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
 
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 2864b4df92..cfaff82cfa 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -4,11 +4,12 @@
  */
 
 #include "zbuild.h"
+#include "zendian.h"
 #include "zmemory.h"
 #include "deflate.h"
 #include "fallback_builtins.h"
 
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_SSE2
 
 #include <emmintrin.h>
 
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 9523ec0d6c..e1c99137dd 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -16,14 +16,12 @@
 #ifdef X86_SSE2
 uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
 
-# ifdef HAVE_BUILTIN_CTZ
-    uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
-    uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
-    uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
-# endif
-    void slide_hash_sse2(deflate_state *s);
-    void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
 # if !defined(WITHOUT_CHORBA_SSE)
 uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
 uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
 
@@ -51,25 +49,20 @@ uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, si
 uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
 uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZ
-    uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
-    uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
-    uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
-# endif
-    void slide_hash_avx2(deflate_state *s);
-    void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
 #endif
 #ifdef X86_AVX512
 uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
 uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
 void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
-# ifdef HAVE_BUILTIN_CTZLL
-    uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
-    uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
-    uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
-# endif
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
 #endif
 #ifdef X86_AVX512VNNI
 uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -90,22 +83,20 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
 # if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT))
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# endif
 # undef native_slide_hash
 # define native_slide_hash slide_hash_sse2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_sse2
-# undef native_longest_match
-# define native_longest_match longest_match_sse2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_sse2
-# if !defined(WITHOUT_CHORBA_SSE)
-# undef native_crc32
-# define native_crc32 crc32_chorba_sse2
-# endif
-# endif
 # endif
 // X86 - SSSE3
 # if defined(X86_SSSE3) && defined(__SSSE3__)
@@ -145,18 +136,16 @@
 # define native_adler32_copy adler32_copy_avx2
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
 # undef native_slide_hash
 # define native_slide_hash slide_hash_avx2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_avx2
-# undef native_longest_match
-# define native_longest_match longest_match_avx2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx2
-# endif
 # endif
 // X86 - AVX512 (F,DQ,BW,Vl)
 # if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
@@ -166,16 +155,14 @@
 # define native_adler32_copy adler32_copy_avx512
 # undef native_chunkmemset_safe
 # define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
 # undef native_inflate_fast
 # define native_inflate_fast inflate_fast_avx512
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_avx512
-# undef native_longest_match
-# define native_longest_match longest_match_avx512
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx512
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
 // X86 - AVX512 (VNNI)
 # if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
 # undef native_adler32
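
The compare256_avx512.c hunk above only drops a stale comment, but the mask trick it documented is worth spelling out: _mm_cmpeq_epu8_mask() yields one bit per equal byte, so inverting the zero-extended mask turns the first mismatch into the lowest set bit. A sketch of that 16-byte step under stated assumptions: it requires AVX512BW+VL (e.g. -mavx512bw -mavx512vl) and a GCC/Clang builtin in place of zng_ctz32(); first_diff16 is an illustrative name, not a zlib-ng function:

#include <immintrin.h>
#include <stdint.h>

/* Sketch: index of the first mismatching byte in a 16-byte block, or 16. */
static inline uint32_t first_diff16(const uint8_t *src0, const uint8_t *src1) {
    __m128i a = _mm_loadu_si128((const __m128i *)src0);
    __m128i b = _mm_loadu_si128((const __m128i *)src1);
    uint32_t eq = (uint32_t)_mm_cmpeq_epu8_mask(a, b); /* bit i set if src0[i] == src1[i] */
    if (eq != 0x0000FFFF)
        return (uint32_t)__builtin_ctz(~eq);           /* first zero bit = first mismatch */
    return 16;                                         /* all 16 bytes equal */
}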

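Across all of the *_functions.h hunks, the native_* blocks implement compile-time dispatch for DISABLE_RUNTIME_CPU_DETECTION builds: each ISA section enabled by the compiler flags re-points the macro, so the strongest variant defined last wins, and with the HAVE_BUILTIN_CTZ guards gone every section can now map compare256 and longest_match unconditionally. A condensed illustration of the override chain (simplified guards, not the full zlib-ng header):

/* Generic default is claimed first... */
#define native_compare256 compare256_c
/* ...then each ISA block that is compiled in overrides it. */
#if defined(__SSE2__)
#  undef native_compare256
#  define native_compare256 compare256_sse2
#endif
#if defined(__AVX2__)
#  undef native_compare256
#  define native_compare256 compare256_avx2  /* last matching block wins */
#endif
/* Callers simply invoke native_compare256(src0, src1). */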