summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/arm_functions.h19
-rw-r--r--arch/arm/compare256_neon.c2
-rw-r--r--arch/generic/Makefile.in4
-rw-r--r--arch/generic/compare256_c.c73
-rw-r--r--arch/generic/compare256_p.h111
-rw-r--r--arch/generic/generic_functions.h4
-rw-r--r--arch/loongarch/compare256_lasx.c3
-rw-r--r--arch/loongarch/compare256_lsx.c3
-rw-r--r--arch/loongarch/loongarch_functions.h56
-rw-r--r--arch/riscv/compare256_rvv.c1
-rw-r--r--arch/x86/compare256_avx2.c3
-rw-r--r--arch/x86/compare256_avx512.c5
-rw-r--r--arch/x86/compare256_sse2.c3
-rw-r--r--arch/x86/x86_functions.h83
14 files changed, 149 insertions, 221 deletions
diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h
index b18af8f80f..35dd12a2d9 100644
--- a/arch/arm/arm_functions.h
+++ b/arch/arm/arm_functions.h
@@ -9,14 +9,11 @@
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
-# endif
void slide_hash_neon(deflate_state *s);
-void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef ARM_CRC32
@@ -47,18 +44,16 @@ void slide_hash_armv6(deflate_state *s);
# define native_adler32_copy adler32_copy_neon
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
# undef native_slide_hash
# define native_slide_hash slide_hash_neon
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_neon
-# undef native_longest_match
-# define native_longest_match longest_match_neon
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_neon
-# endif
# endif
// ARM - CRC32
# if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32))
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c
index afaf42f5bc..4ced9fc9ca 100644
--- a/arch/arm/compare256_neon.c
+++ b/arch/arm/compare256_neon.c
@@ -8,7 +8,7 @@
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#if defined(ARM_NEON)
#include "neon_intrins.h"
static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in
index 07a168f2f1..1d9cc4df5b 100644
--- a/arch/generic/Makefile.in
+++ b/arch/generic/Makefile.in
@@ -33,10 +33,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c
index ad535523a5..6934a55565 100644
--- a/arch/generic/compare256_c.c
+++ b/arch/generic/compare256_c.c
@@ -4,17 +4,74 @@
*/
#include "zbuild.h"
-#include "compare256_p.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison for hardware without unaligned loads */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ if (src0[0] != src1[0])
+ return len;
+ if (src0[1] != src1[1])
+ return len + 1;
+ if (src0[2] != src1[2])
+ return len + 2;
+ if (src0[3] != src1[3])
+ return len + 3;
+ if (src0[4] != src1[4])
+ return len + 4;
+ if (src0[5] != src1[5])
+ return len + 5;
+ if (src0[6] != src1[6])
+ return len + 6;
+ if (src0[7] != src1[7])
+ return len + 7;
+ src0 += 8, src1 += 8, len += 8;
+ } while (len < 256);
+
+ return 256;
+}
+
+/* 64-bit integer comparison for hardware with unaligned loads */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint64_t sv = zng_memread_8(src0);
+ uint64_t mv = zng_memread_8(src1);
+ uint64_t diff = sv ^ mv;
+ if (diff)
+ return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+ src0 += 8, src1 += 8, len += 8;
+
+ sv = zng_memread_8(src0);
+ mv = zng_memread_8(src1);
+ diff = sv ^ mv;
+ if (diff)
+ return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+ src0 += 8, src1 += 8, len += 8;
+ } while (len < 256);
+
+ return 256;
+}
-// Set optimal COMPARE256 function variant
#if OPTIMAL_CMP == 8
-# define COMPARE256 compare256_8
-#elif defined(HAVE_BUILTIN_CTZLL)
-# define COMPARE256 compare256_64
-#elif defined(HAVE_BUILTIN_CTZ)
-# define COMPARE256 compare256_32
+# define COMPARE256 compare256_8_static
#else
-# define COMPARE256 compare256_16
+# define COMPARE256 compare256_64_static
+#endif
+
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_64_static(src0, src1);
+}
#endif
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h
index 331a14bfc4..e69de29bb2 100644
--- a/arch/generic/compare256_p.h
+++ b/arch/generic/compare256_p.h
@@ -1,111 +0,0 @@
-/* compare256_p.h -- 256 byte memory comparison with match length return
- * Copyright (C) 2020 Nathan Moinvaziri
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "zmemory.h"
-#include "deflate.h"
-#include "fallback_builtins.h"
-
-/* 8-bit integer comparison */
-static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- if (*src0 != *src1)
- return len;
- src0 += 1, src1 += 1, len += 1;
- } while (len < 256);
-
- return 256;
-}
-
-/* 16-bit integer comparison */
-static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
-
- if (zng_memcmp_2(src0, src1) != 0)
- return len + (*src0 == *src1);
- src0 += 2, src1 += 2, len += 2;
- } while (len < 256);
-
- return 256;
-}
-
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit integer comparison */
-static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint32_t sv, mv, diff;
-
- sv = zng_memread_4(src0);
- mv = zng_memread_4(src1);
-
- diff = sv ^ mv;
- if (diff)
- return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8;
-
- src0 += 4, src1 += 4, len += 4;
- } while (len < 256);
-
- return 256;
-}
-#endif
-
-#ifdef HAVE_BUILTIN_CTZLL
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
- uint32_t len = 0;
-
- do {
- uint64_t sv, mv, diff;
-
- sv = zng_memread_8(src0);
- mv = zng_memread_8(src1);
-
- diff = sv ^ mv;
- if (diff)
- return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8;
-
- src0 += 8, src1 += 8, len += 8;
- } while (len < 256);
-
- return 256;
-}
-#endif
diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h
index 3496427fdb..f8e564432d 100644
--- a/arch/generic/generic_functions.h
+++ b/arch/generic/generic_functions.h
@@ -22,6 +22,10 @@ uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t
uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
index 2db428b6ba..d61d6e57b3 100644
--- a/arch/loongarch/compare256_lasx.c
+++ b/arch/loongarch/compare256_lasx.c
@@ -5,11 +5,12 @@
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LASX
#include <lasxintrin.h>
#include "lasxintrin_ext.h"
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
index e02329db09..4afd261e76 100644
--- a/arch/loongarch/compare256_lsx.c
+++ b/arch/loongarch/compare256_lsx.c
@@ -5,11 +5,12 @@
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+#ifdef LOONGARCH_LSX
#include <lsxintrin.h>
#include "lsxintrin_ext.h"
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index 939a3a03e8..34281432f5 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -16,27 +16,23 @@ uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src,
#ifdef LOONGARCH_LSX
uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lsx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
-# endif
uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lsx(deflate_state *s);
#endif
#ifdef LOONGARCH_LASX
uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-void slide_hash_lasx(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
-# endif
uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lasx(deflate_state *s);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -52,40 +48,36 @@ void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
# define native_adler32 adler32_lsx
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_lsx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lsx
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_compare256
+# define native_compare256 compare256_lsx
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_lsx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lsx
-# undef native_longest_match
-# define native_longest_match longest_match_lsx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lsx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
# endif
# if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
# undef native_adler32
# define native_adler32 adler32_lasx
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_lasx
-# undef native_slide_hash
-# define native_slide_hash slide_hash_lasx
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_lasx
+# undef native_compare256
+# define native_compare256 compare256_lasx
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_lasx
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_lasx
-# undef native_longest_match
-# define native_longest_match longest_match_lasx
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_lasx
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
# endif
#endif
diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c
index 3ddb4db080..edb18a3766 100644
--- a/arch/riscv/compare256_rvv.c
+++ b/arch/riscv/compare256_rvv.c
@@ -9,7 +9,6 @@
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
-#include "fallback_builtins.h"
#include <riscv_vector.h>
diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c
index c99db3b34d..5e2b1716cf 100644
--- a/arch/x86/compare256_avx2.c
+++ b/arch/x86/compare256_avx2.c
@@ -4,11 +4,12 @@
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_AVX2
#include <immintrin.h>
#ifdef _MSC_VER
diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c
index f61402ae6d..f3105505cb 100644
--- a/arch/x86/compare256_avx512.c
+++ b/arch/x86/compare256_avx512.c
@@ -5,11 +5,12 @@
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef X86_AVX512
#include <immintrin.h>
#ifdef _MSC_VER
@@ -33,7 +34,7 @@ static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8
// 16 bytes
xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
- mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
if (mask_0 != 0x0000FFFF)
return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c
index 2864b4df92..cfaff82cfa 100644
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -4,11 +4,12 @@
*/
#include "zbuild.h"
+#include "zendian.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
-#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+#ifdef X86_SSE2
#include <emmintrin.h>
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 9523ec0d6c..e1c99137dd 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -16,14 +16,12 @@
#ifdef X86_SSE2
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
-# endif
- void slide_hash_sse2(deflate_state *s);
- void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
# if !defined(WITHOUT_CHORBA_SSE)
uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
@@ -51,25 +49,20 @@ uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, si
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
-
-# ifdef HAVE_BUILTIN_CTZ
- uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
-# endif
- void slide_hash_avx2(deflate_state *s);
- void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
-# ifdef HAVE_BUILTIN_CTZLL
- uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
- uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
- uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
-# endif
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -90,22 +83,20 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
# if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT))
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# endif
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_sse2
-# undef native_longest_match
-# define native_longest_match longest_match_sse2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_sse2
-# if !defined(WITHOUT_CHORBA_SSE)
-# undef native_crc32
-# define native_crc32 crc32_chorba_sse2
-# endif
-# endif
# endif
// X86 - SSSE3
# if defined(X86_SSSE3) && defined(__SSSE3__)
@@ -145,18 +136,16 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
# define native_adler32_copy adler32_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
-# ifdef HAVE_BUILTIN_CTZ
-# undef native_compare256
-# define native_compare256 compare256_avx2
-# undef native_longest_match
-# define native_longest_match longest_match_avx2
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx2
-# endif
# endif
// X86 - AVX512 (F,DQ,BW,Vl)
# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
@@ -166,16 +155,14 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
# define native_adler32_copy adler32_copy_avx512
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx512
-# ifdef HAVE_BUILTIN_CTZLL
-# undef native_compare256
-# define native_compare256 compare256_avx512
-# undef native_longest_match
-# define native_longest_match longest_match_avx512
-# undef native_longest_match_slow
-# define native_longest_match_slow longest_match_slow_avx512
-# endif
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
// X86 - AVX512 (VNNI)
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
# undef native_adler32