diff options
| -rw-r--r-- | CMakeLists.txt | 6 | ||||
| -rw-r--r-- | arch/arm/arm_functions.h | 19 | ||||
| -rw-r--r-- | arch/arm/compare256_neon.c | 2 | ||||
| -rw-r--r-- | arch/generic/Makefile.in | 4 | ||||
| -rw-r--r-- | arch/generic/compare256_c.c | 73 | ||||
| -rw-r--r-- | arch/generic/compare256_p.h | 111 | ||||
| -rw-r--r-- | arch/generic/generic_functions.h | 4 | ||||
| -rw-r--r-- | arch/loongarch/compare256_lasx.c | 3 | ||||
| -rw-r--r-- | arch/loongarch/compare256_lsx.c | 3 | ||||
| -rw-r--r-- | arch/loongarch/loongarch_functions.h | 56 | ||||
| -rw-r--r-- | arch/riscv/compare256_rvv.c | 1 | ||||
| -rw-r--r-- | arch/x86/compare256_avx2.c | 3 | ||||
| -rw-r--r-- | arch/x86/compare256_avx512.c | 5 | ||||
| -rw-r--r-- | arch/x86/compare256_sse2.c | 3 | ||||
| -rw-r--r-- | arch/x86/x86_functions.h | 83 | ||||
| -rw-r--r-- | compare256_rle.h | 104 | ||||
| -rw-r--r-- | deflate_rle.c | 6 | ||||
| -rw-r--r-- | functable.c | 47 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_compare256.cc | 19 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_compare256_rle.cc | 6 | ||||
| -rw-r--r-- | test/test_compare256.cc | 19 | ||||
| -rw-r--r-- | test/test_compare256_rle.cc | 6 |
22 files changed, 201 insertions, 382 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e7b8fe0d8..20c9c3f61a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1218,7 +1218,6 @@ set(ZLIB_PUBLIC_HDRS ) set(ZLIB_PRIVATE_HDRS arch/generic/chunk_permute_table.h - arch/generic/compare256_p.h arch/generic/generic_functions.h adler32_p.h arch_functions.h @@ -1289,11 +1288,6 @@ elseif(BASEARCH_X86_FOUND AND ARCH_64BIT AND WITH_SSE2) arch/generic/adler32_c.c arch/generic/crc32_braid_c.c ) - - # x86_64 does not need compare256 fallback if we have BUILTIN_CTZ - if(NOT HAVE_BUILTIN_CTZ) - list(APPEND ZLIB_GENERIC_SRCS arch/generic/compare256_c.c) - endif() else() list(APPEND ZLIB_GENERIC_SRCS ${ZLIB_ALL_FALLBACK_SRCS}) add_definitions(-DWITH_ALL_FALLBACKS) diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h index b18af8f80f..35dd12a2d9 100644 --- a/arch/arm/arm_functions.h +++ b/arch/arm/arm_functions.h @@ -9,14 +9,11 @@ uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left); - -# ifdef HAVE_BUILTIN_CTZLL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match); uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match); -# endif void slide_hash_neon(deflate_state *s); -void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); #endif #ifdef ARM_CRC32 @@ -47,18 +44,16 @@ void slide_hash_armv6(deflate_state *s); # define native_adler32_copy adler32_copy_neon # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_compare256 +# define native_compare256 compare256_neon # undef native_inflate_fast # define native_inflate_fast inflate_fast_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon # undef native_slide_hash # define native_slide_hash slide_hash_neon -# ifdef HAVE_BUILTIN_CTZLL -# undef native_compare256 -# define native_compare256 compare256_neon -# undef native_longest_match -# define native_longest_match longest_match_neon -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_neon -# endif # endif // ARM - CRC32 # if (defined(ARM_CRC32) && defined(__ARM_FEATURE_CRC32)) diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c index afaf42f5bc..4ced9fc9ca 100644 --- a/arch/arm/compare256_neon.c +++ b/arch/arm/compare256_neon.c @@ -8,7 +8,7 @@ #include "deflate.h" #include "fallback_builtins.h" -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +#if defined(ARM_NEON) #include "neon_intrins.h" static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) { diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in index 07a168f2f1..1d9cc4df5b 100644 --- a/arch/generic/Makefile.in +++ b/arch/generic/Makefile.in @@ -33,10 +33,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl. chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c -compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h +compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c -compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h +compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h diff --git a/arch/generic/compare256_c.c b/arch/generic/compare256_c.c index ad535523a5..6934a55565 100644 --- a/arch/generic/compare256_c.c +++ b/arch/generic/compare256_c.c @@ -4,17 +4,74 @@ */ #include "zbuild.h" -#include "compare256_p.h" +#include "zendian.h" +#include "deflate.h" +#include "fallback_builtins.h" + +/* 8-bit integer comparison for hardware without unaligned loads */ +static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + if (src0[0] != src1[0]) + return len; + if (src0[1] != src1[1]) + return len + 1; + if (src0[2] != src1[2]) + return len + 2; + if (src0[3] != src1[3]) + return len + 3; + if (src0[4] != src1[4]) + return len + 4; + if (src0[5] != src1[5]) + return len + 5; + if (src0[6] != src1[6]) + return len + 6; + if (src0[7] != src1[7]) + return len + 7; + src0 += 8, src1 += 8, len += 8; + } while (len < 256); + + return 256; +} + +/* 64-bit integer comparison for hardware with unaligned loads */ +static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + uint64_t sv = zng_memread_8(src0); + uint64_t mv = zng_memread_8(src1); + uint64_t diff = sv ^ mv; + if (diff) + return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8; + src0 += 8, src1 += 8, len += 8; + + sv = zng_memread_8(src0); + mv = zng_memread_8(src1); + diff = sv ^ mv; + if (diff) + return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8; + src0 += 8, src1 += 8, len += 8; + } while (len < 256); + + return 256; +} -// Set optimal COMPARE256 function variant #if OPTIMAL_CMP == 8 -# define COMPARE256 compare256_8 -#elif defined(HAVE_BUILTIN_CTZLL) -# define COMPARE256 compare256_64 -#elif defined(HAVE_BUILTIN_CTZ) -# define COMPARE256 compare256_32 +# define COMPARE256 compare256_8_static #else -# define COMPARE256 compare256_16 +# define COMPARE256 compare256_64_static +#endif + +#ifdef WITH_ALL_FALLBACKS +Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) { + return compare256_8_static(src0, src1); +} + +Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) { + return compare256_64_static(src0, src1); +} #endif Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) { diff --git a/arch/generic/compare256_p.h b/arch/generic/compare256_p.h index 331a14bfc4..e69de29bb2 100644 --- a/arch/generic/compare256_p.h +++ b/arch/generic/compare256_p.h @@ -1,111 +0,0 @@ -/* compare256_p.h -- 256 byte memory comparison with match length return - * Copyright (C) 2020 Nathan Moinvaziri - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include "zmemory.h" -#include "deflate.h" -#include "fallback_builtins.h" - -/* 8-bit integer comparison */ -static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) { - uint32_t len = 0; - - do { - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src0 += 1, src1 += 1, len += 1; - } while (len < 256); - - return 256; -} - -/* 16-bit integer comparison */ -static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) { - uint32_t len = 0; - - do { - if (zng_memcmp_2(src0, src1) != 0) - return len + (*src0 == *src1); - src0 += 2, src1 += 2, len += 2; - - if (zng_memcmp_2(src0, src1) != 0) - return len + (*src0 == *src1); - src0 += 2, src1 += 2, len += 2; - - if (zng_memcmp_2(src0, src1) != 0) - return len + (*src0 == *src1); - src0 += 2, src1 += 2, len += 2; - - if (zng_memcmp_2(src0, src1) != 0) - return len + (*src0 == *src1); - src0 += 2, src1 += 2, len += 2; - } while (len < 256); - - return 256; -} - -#ifdef HAVE_BUILTIN_CTZ -/* 32-bit integer comparison */ -static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) { - uint32_t len = 0; - - do { - uint32_t sv, mv, diff; - - sv = zng_memread_4(src0); - mv = zng_memread_4(src1); - - diff = sv ^ mv; - if (diff) - return len + zng_ctz32(Z_U32_FROM_LE(diff)) / 8; - - src0 += 4, src1 += 4, len += 4; - } while (len < 256); - - return 256; -} -#endif - -#ifdef HAVE_BUILTIN_CTZLL -/* 64-bit integer comparison */ -static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) { - uint32_t len = 0; - - do { - uint64_t sv, mv, diff; - - sv = zng_memread_8(src0); - mv = zng_memread_8(src1); - - diff = sv ^ mv; - if (diff) - return len + zng_ctz64(Z_U64_FROM_LE(diff)) / 8; - - src0 += 8, src1 += 8, len += 8; - } while (len < 256); - - return 256; -} -#endif diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h index 3496427fdb..f8e564432d 100644 --- a/arch/generic/generic_functions.h +++ b/arch/generic/generic_functions.h @@ -22,6 +22,10 @@ uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +#ifdef WITH_ALL_FALLBACKS +uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1); +uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1); +#endif uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len); diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c index 2db428b6ba..d61d6e57b3 100644 --- a/arch/loongarch/compare256_lasx.c +++ b/arch/loongarch/compare256_lasx.c @@ -5,11 +5,12 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" -#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LASX #include <lasxintrin.h> #include "lasxintrin_ext.h" diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c index e02329db09..4afd261e76 100644 --- a/arch/loongarch/compare256_lsx.c +++ b/arch/loongarch/compare256_lsx.c @@ -5,11 +5,12 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" -#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LSX #include <lsxintrin.h> #include "lsxintrin_ext.h" diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h index 939a3a03e8..34281432f5 100644 --- a/arch/loongarch/loongarch_functions.h +++ b/arch/loongarch/loongarch_functions.h @@ -16,27 +16,23 @@ uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, #ifdef LOONGARCH_LSX uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -void slide_hash_lsx(deflate_state *s); -# ifdef HAVE_BUILTIN_CTZ - uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1); - uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match); - uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match); -# endif uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1); void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match); +void slide_hash_lsx(deflate_state *s); #endif #ifdef LOONGARCH_LASX uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len); uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); -void slide_hash_lasx(deflate_state *s); -# ifdef HAVE_BUILTIN_CTZ - uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1); - uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match); - uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match); -# endif uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1); void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match); +void slide_hash_lasx(deflate_state *s); #endif #ifdef DISABLE_RUNTIME_CPU_DETECTION @@ -52,40 +48,36 @@ void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start); # define native_adler32 adler32_lsx # undef native_adler32_copy # define native_adler32_copy adler32_copy_lsx -# undef native_slide_hash -# define native_slide_hash slide_hash_lsx # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_lsx +# undef native_compare256 +# define native_compare256 compare256_lsx # undef native_inflate_fast # define native_inflate_fast inflate_fast_lsx -# ifdef HAVE_BUILTIN_CTZ -# undef native_compare256 -# define native_compare256 compare256_lsx -# undef native_longest_match -# define native_longest_match longest_match_lsx -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_lsx -# endif +# undef native_longest_match +# define native_longest_match longest_match_lsx +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_lsx +# undef native_slide_hash +# define native_slide_hash slide_hash_lsx # endif # if defined(LOONGARCH_LASX) && defined(__loongarch_asx) # undef native_adler32 # define native_adler32 adler32_lasx # undef native_adler32_copy # define native_adler32_copy adler32_copy_lasx -# undef native_slide_hash -# define native_slide_hash slide_hash_lasx # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_lasx +# undef native_compare256 +# define native_compare256 compare256_lasx # undef native_inflate_fast # define native_inflate_fast inflate_fast_lasx -# ifdef HAVE_BUILTIN_CTZ -# undef native_compare256 -# define native_compare256 compare256_lasx -# undef native_longest_match -# define native_longest_match longest_match_lasx -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_lasx -# endif +# undef native_longest_match +# define native_longest_match longest_match_lasx +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_lasx +# undef native_slide_hash +# define native_slide_hash slide_hash_lasx # endif #endif diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c index 3ddb4db080..edb18a3766 100644 --- a/arch/riscv/compare256_rvv.c +++ b/arch/riscv/compare256_rvv.c @@ -9,7 +9,6 @@ #include "zbuild.h" #include "zmemory.h" #include "deflate.h" -#include "fallback_builtins.h" #include <riscv_vector.h> diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c index c99db3b34d..5e2b1716cf 100644 --- a/arch/x86/compare256_avx2.c +++ b/arch/x86/compare256_avx2.c @@ -4,11 +4,12 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_AVX2 #include <immintrin.h> #ifdef _MSC_VER diff --git a/arch/x86/compare256_avx512.c b/arch/x86/compare256_avx512.c index f61402ae6d..f3105505cb 100644 --- a/arch/x86/compare256_avx512.c +++ b/arch/x86/compare256_avx512.c @@ -5,11 +5,12 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" -#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL) +#ifdef X86_AVX512 #include <immintrin.h> #ifdef _MSC_VER @@ -33,7 +34,7 @@ static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8 // 16 bytes xmm_src0_0 = _mm_loadu_si128((__m128i*)src0); xmm_src1_0 = _mm_loadu_si128((__m128i*)src1); - mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz + mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); if (mask_0 != 0x0000FFFF) return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */ diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c index 2864b4df92..cfaff82cfa 100644 --- a/arch/x86/compare256_sse2.c +++ b/arch/x86/compare256_sse2.c @@ -4,11 +4,12 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "deflate.h" #include "fallback_builtins.h" -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_SSE2 #include <emmintrin.h> diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index 9523ec0d6c..e1c99137dd 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -16,14 +16,12 @@ #ifdef X86_SSE2 uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); +uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match); +void slide_hash_sse2(deflate_state *s); -# ifdef HAVE_BUILTIN_CTZ - uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); - uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match); - uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match); -# endif - void slide_hash_sse2(deflate_state *s); - void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); # if !defined(WITHOUT_CHORBA_SSE) uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); @@ -51,25 +49,20 @@ uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, si uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left); - -# ifdef HAVE_BUILTIN_CTZ - uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); - uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match); - uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match); -# endif - void slide_hash_avx2(deflate_state *s); - void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start); +uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start); +uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match); +void slide_hash_avx2(deflate_state *s); #endif #ifdef X86_AVX512 uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1); void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start); -# ifdef HAVE_BUILTIN_CTZLL - uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1); - uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match); - uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match); -# endif +uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match); #endif #ifdef X86_AVX512VNNI uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); @@ -90,22 +83,20 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # if (defined(X86_SSE2) && defined(__SSE2__)) || (defined(ARCH_X86) && defined(ARCH_64BIT)) # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_sse2 +# undef native_compare256 +# define native_compare256 compare256_sse2 # undef native_inflate_fast # define native_inflate_fast inflate_fast_sse2 +# undef native_longest_match +# define native_longest_match longest_match_sse2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_sse2 +# if !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse2 +# endif # undef native_slide_hash # define native_slide_hash slide_hash_sse2 -# ifdef HAVE_BUILTIN_CTZ -# undef native_compare256 -# define native_compare256 compare256_sse2 -# undef native_longest_match -# define native_longest_match longest_match_sse2 -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_sse2 -# if !defined(WITHOUT_CHORBA_SSE) -# undef native_crc32 -# define native_crc32 crc32_chorba_sse2 -# endif -# endif # endif // X86 - SSSE3 # if defined(X86_SSSE3) && defined(__SSSE3__) @@ -145,18 +136,16 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # define native_adler32_copy adler32_copy_avx2 # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_avx2 +# undef native_compare256 +# define native_compare256 compare256_avx2 # undef native_inflate_fast # define native_inflate_fast inflate_fast_avx2 +# undef native_longest_match +# define native_longest_match longest_match_avx2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx2 # undef native_slide_hash # define native_slide_hash slide_hash_avx2 -# ifdef HAVE_BUILTIN_CTZ -# undef native_compare256 -# define native_compare256 compare256_avx2 -# undef native_longest_match -# define native_longest_match longest_match_avx2 -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_avx2 -# endif # endif // X86 - AVX512 (F,DQ,BW,Vl) # if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) @@ -166,16 +155,14 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s # define native_adler32_copy adler32_copy_avx512 # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_avx512 +# undef native_compare256 +# define native_compare256 compare256_avx512 # undef native_inflate_fast # define native_inflate_fast inflate_fast_avx512 -# ifdef HAVE_BUILTIN_CTZLL -# undef native_compare256 -# define native_compare256 compare256_avx512 -# undef native_longest_match -# define native_longest_match longest_match_avx512 -# undef native_longest_match_slow -# define native_longest_match_slow longest_match_slow_avx512 -# endif +# undef native_longest_match +# define native_longest_match longest_match_avx512 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx512 // X86 - AVX512 (VNNI) # if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__) # undef native_adler32 diff --git a/compare256_rle.h b/compare256_rle.h index 8fac7e1080..02e6bd496a 100644 --- a/compare256_rle.h +++ b/compare256_rle.h @@ -4,111 +4,52 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zmemory.h" #include "fallback_builtins.h" typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1); -/* 8-bit integer comparison */ +/* 8-bit RLE comparison for hardware without unaligned loads */ static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; + uint8_t val = *src0; do { - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - if (*src0 != *src1) - return len; - src1 += 1, len += 1; - } while (len < 256); - - return 256; -} - -/* 16-bit integer comparison */ -static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) { - uint32_t len = 0; - uint16_t src0_cmp; - - src0_cmp = zng_memread_2(src0); - - do { - if (src0_cmp != zng_memread_2(src1)) - return len + (*src0 == *src1); - src1 += 2, len += 2; - if (src0_cmp != zng_memread_2(src1)) - return len + (*src0 == *src1); - src1 += 2, len += 2; - if (src0_cmp != zng_memread_2(src1)) - return len + (*src0 == *src1); - src1 += 2, len += 2; - if (src0_cmp != zng_memread_2(src1)) - return len + (*src0 == *src1); - src1 += 2, len += 2; - } while (len < 256); - - return 256; -} - -#ifdef HAVE_BUILTIN_CTZ -/* 32-bit integer comparison */ -static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) { - uint32_t sv, len = 0; - uint16_t src0_cmp; - - src0_cmp = zng_memread_2(src0); - sv = ((uint32_t)src0_cmp << 16) | src0_cmp; - - do { - uint32_t mv, diff; - - mv = zng_memread_4(src1); - - diff = sv ^ mv; - if (diff) - return len + zng_ctz32(Z_U32_TO_LE(diff)) / 8; - - src1 += 4, len += 4; + if (val != src1[0]) + return len; + if (val != src1[1]) + return len + 1; + if (val != src1[2]) + return len + 2; + if (val != src1[3]) + return len + 3; + if (val != src1[4]) + return len + 4; + if (val != src1[5]) + return len + 5; + if (val != src1[6]) + return len + 6; + if (val != src1[7]) + return len + 7; + src1 += 8, len += 8; } while (len < 256); return 256; } -#endif -#ifdef HAVE_BUILTIN_CTZLL -/* 64-bit integer comparison */ +/* 64-bit RLE comparison for hardware with unaligned loads */ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) { uint32_t src0_cmp32, len = 0; uint16_t src0_cmp; - uint64_t sv; + uint64_t sv, mv, diff; src0_cmp = zng_memread_2(src0); src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp; sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32; do { - uint64_t mv, diff; - mv = zng_memread_8(src1); - diff = sv ^ mv; if (diff) return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8; @@ -118,4 +59,3 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src return 256; } -#endif diff --git a/deflate_rle.c b/deflate_rle.c index e468bc6b6e..b52fa2f0af 100644 --- a/deflate_rle.c +++ b/deflate_rle.c @@ -12,12 +12,8 @@ #if OPTIMAL_CMP == 8 # define compare256_rle compare256_rle_8 -#elif defined(HAVE_BUILTIN_CTZLL) -# define compare256_rle compare256_rle_64 -#elif defined(HAVE_BUILTIN_CTZ) -# define compare256_rle compare256_rle_32 #else -# define compare256_rle compare256_rle_16 +# define compare256_rle compare256_rle_64 #endif /* =========================================================================== diff --git a/functable.c b/functable.c index 0e8911e161..632115c586 100644 --- a/functable.c +++ b/functable.c @@ -83,23 +83,18 @@ static int init_functable(void) { ft.adler32_copy = &adler32_copy_c; ft.crc32 = &crc32_braid; ft.crc32_copy = &crc32_copy_braid; -# ifndef HAVE_BUILTIN_CTZ - ft.longest_match = &longest_match_c; - ft.longest_match_slow = &longest_match_slow_c; - ft.compare256 = &compare256_c; -# endif # endif #else // WITH_ALL_FALLBACKS ft.adler32 = &adler32_c; ft.adler32_copy = &adler32_copy_c; ft.chunkmemset_safe = &chunkmemset_safe_c; + ft.compare256 = &compare256_c; ft.crc32 = &crc32_braid; ft.crc32_copy = &crc32_copy_braid; ft.inflate_fast = &inflate_fast_c; - ft.slide_hash = &slide_hash_c; ft.longest_match = &longest_match_c; ft.longest_match_slow = &longest_match_slow_c; - ft.compare256 = &compare256_c; + ft.slide_hash = &slide_hash_c; #endif // Select arch-optimized functions @@ -118,17 +113,15 @@ static int init_functable(void) { # endif { ft.chunkmemset_safe = &chunkmemset_safe_sse2; + ft.compare256 = &compare256_sse2; # if !defined(WITHOUT_CHORBA_SSE) ft.crc32 = &crc32_chorba_sse2; ft.crc32_copy = &crc32_copy_chorba_sse2; # endif ft.inflate_fast = &inflate_fast_sse2; - ft.slide_hash = &slide_hash_sse2; -# ifdef HAVE_BUILTIN_CTZ - ft.compare256 = &compare256_sse2; ft.longest_match = &longest_match_sse2; ft.longest_match_slow = &longest_match_slow_sse2; -# endif + ft.slide_hash = &slide_hash_sse2; } #endif // X86 - SSSE3 @@ -172,13 +165,11 @@ static int init_functable(void) { ft.adler32 = &adler32_avx2; ft.adler32_copy = &adler32_copy_avx2; ft.chunkmemset_safe = &chunkmemset_safe_avx2; - ft.inflate_fast = &inflate_fast_avx2; - ft.slide_hash = &slide_hash_avx2; -# ifdef HAVE_BUILTIN_CTZ ft.compare256 = &compare256_avx2; + ft.inflate_fast = &inflate_fast_avx2; ft.longest_match = &longest_match_avx2; ft.longest_match_slow = &longest_match_slow_avx2; -# endif + ft.slide_hash = &slide_hash_avx2; } #endif // X86 - AVX512 (F,DQ,BW,Vl) @@ -187,12 +178,10 @@ static int init_functable(void) { ft.adler32 = &adler32_avx512; ft.adler32_copy = &adler32_copy_avx512; ft.chunkmemset_safe = &chunkmemset_safe_avx512; - ft.inflate_fast = &inflate_fast_avx512; -# ifdef HAVE_BUILTIN_CTZLL ft.compare256 = &compare256_avx512; + ft.inflate_fast = &inflate_fast_avx512; ft.longest_match = &longest_match_avx512; ft.longest_match_slow = &longest_match_slow_avx512; -# endif } #endif #ifdef X86_AVX512VNNI @@ -228,13 +217,11 @@ static int init_functable(void) { ft.adler32 = &adler32_neon; ft.adler32_copy = &adler32_copy_neon; ft.chunkmemset_safe = &chunkmemset_safe_neon; - ft.inflate_fast = &inflate_fast_neon; - ft.slide_hash = &slide_hash_neon; -# ifdef HAVE_BUILTIN_CTZLL ft.compare256 = &compare256_neon; + ft.inflate_fast = &inflate_fast_neon; ft.longest_match = &longest_match_neon; ft.longest_match_slow = &longest_match_slow_neon; -# endif + ft.slide_hash = &slide_hash_neon; } #endif // ARM - CRC32 @@ -327,28 +314,24 @@ static int init_functable(void) { if (cf.loongarch.has_lsx) { ft.adler32 = &adler32_lsx; ft.adler32_copy = &adler32_copy_lsx; - ft.slide_hash = slide_hash_lsx; -# ifdef HAVE_BUILTIN_CTZ + ft.chunkmemset_safe = &chunkmemset_safe_lsx; ft.compare256 = &compare256_lsx; + ft.inflate_fast = &inflate_fast_lsx; ft.longest_match = &longest_match_lsx; ft.longest_match_slow = &longest_match_slow_lsx; -# endif - ft.chunkmemset_safe = &chunkmemset_safe_lsx; - ft.inflate_fast = &inflate_fast_lsx; + ft.slide_hash = slide_hash_lsx; } #endif #ifdef LOONGARCH_LASX if (cf.loongarch.has_lasx) { ft.adler32 = &adler32_lasx; ft.adler32_copy = &adler32_copy_lasx; - ft.slide_hash = slide_hash_lasx; -# ifdef HAVE_BUILTIN_CTZ + ft.chunkmemset_safe = &chunkmemset_safe_lasx; ft.compare256 = &compare256_lasx; + ft.inflate_fast = &inflate_fast_lasx; ft.longest_match = &longest_match_lasx; ft.longest_match_slow = &longest_match_slow_lasx; -# endif - ft.chunkmemset_safe = &chunkmemset_safe_lasx; - ft.inflate_fast = &inflate_fast_lasx; + ft.slide_hash = slide_hash_lasx; } #endif diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index eb103af0d1..2d8352879d 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -9,7 +9,6 @@ extern "C" { # include "zbuild.h" # include "arch_functions.h" # include "../test_cpu_features.h" -# include "arch/generic/compare256_p.h" } #define MAX_COMPARE_SIZE (256 + 64) @@ -74,25 +73,21 @@ public: BENCHMARK_COMPARE256(native, native_compare256, 1); #else +#ifdef WITH_ALL_FALLBACKS BENCHMARK_COMPARE256(8, compare256_8, 1); -BENCHMARK_COMPARE256(16, compare256_16, 1); -#if defined(HAVE_BUILTIN_CTZ) -BENCHMARK_COMPARE256(32, compare256_32, 1); -#endif -#if defined(HAVE_BUILTIN_CTZLL) BENCHMARK_COMPARE256(64, compare256_64, 1); #endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_SSE2 BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2); #endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_AVX2 BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2); #endif -#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL) +#ifdef X86_AVX512 BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common); #endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +#ifdef ARM_NEON BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon); #endif #ifdef POWER9 @@ -101,10 +96,10 @@ BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch #ifdef RISCV_RVV BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv); #endif -#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LSX BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx); #endif -#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LASX BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx); #endif diff --git a/test/benchmarks/benchmark_compare256_rle.cc b/test/benchmarks/benchmark_compare256_rle.cc index 96489cf59a..db5adacc19 100644 --- a/test/benchmarks/benchmark_compare256_rle.cc +++ b/test/benchmarks/benchmark_compare256_rle.cc @@ -69,10 +69,4 @@ public: BENCHMARK_REGISTER_F(compare256_rle, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);; BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1); -BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1); -#if defined(HAVE_BUILTIN_CTZ) -BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1); -#endif -#if defined(HAVE_BUILTIN_CTZLL) BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1); -#endif diff --git a/test/test_compare256.cc b/test/test_compare256.cc index c689023a64..b3efe79fb9 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -12,7 +12,6 @@ extern "C" { # include "zutil.h" # include "arch_functions.h" # include "test_cpu_features.h" -# include "arch/generic/compare256_p.h" } #include <gtest/gtest.h> @@ -64,25 +63,21 @@ static inline void compare256_match_check(compare256_func compare256) { TEST_COMPARE256(native, native_compare256, 1) #else +#ifdef WITH_ALL_FALLBACKS TEST_COMPARE256(8, compare256_8, 1) -TEST_COMPARE256(16, compare256_16, 1) -#if defined(HAVE_BUILTIN_CTZ) -TEST_COMPARE256(32, compare256_32, 1) -#endif -#if defined(HAVE_BUILTIN_CTZLL) TEST_COMPARE256(64, compare256_64, 1) #endif -#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_SSE2 TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2) #endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +#ifdef X86_AVX2 TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2) #endif -#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL) +#ifdef X86_AVX512 TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common) #endif -#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL) +#ifdef ARM_NEON TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon) #endif #ifdef POWER9 @@ -91,10 +86,10 @@ TEST_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00 #ifdef RISCV_RVV TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv) #endif -#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LSX TEST_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx) #endif -#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ) +#ifdef LOONGARCH_LASX TEST_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx) #endif diff --git a/test/test_compare256_rle.cc b/test/test_compare256_rle.cc index 2ed85083fe..7a77fce59f 100644 --- a/test/test_compare256_rle.cc +++ b/test/test_compare256_rle.cc @@ -51,10 +51,4 @@ static inline void compare256_rle_match_check(compare256_rle_func compare256_rle } TEST_COMPARE256_RLE(8, compare256_rle_8, 1) -TEST_COMPARE256_RLE(16, compare256_rle_16, 1) -#if defined(HAVE_BUILTIN_CTZ) -TEST_COMPARE256_RLE(32, compare256_rle_32, 1) -#endif -#if defined(HAVE_BUILTIN_CTZLL) TEST_COMPARE256_RLE(64, compare256_rle_64, 1) -#endif |
