diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2025-12-02 15:25:56 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-12-03 00:25:56 +0100 |
| commit | ceef7bfce81e62306b4ad14f1f175413397735c6 (patch) | |
| tree | 9dc2591682bbc84c77d031edbf5625accaf0e121 | |
| parent | 368a926fdb0fee977719ff927178b8eff0de4a62 (diff) | |
| download | Project-Tick-ceef7bfce81e62306b4ad14f1f175413397735c6.tar.gz Project-Tick-ceef7bfce81e62306b4ad14f1f175413397735c6.zip | |
Rename adler32_fold_copy to adler32_copy (#2026)
The adler32 implementations do not use any folding technique; these functions simply compute the checksum while copying.
- Rename adler32_fold_copy to adler32_copy.
- Remove unnecessary arch/generic/adler32_fold_c.c file.
- Reorder adler32_copy functions last in source file for consistency.
- Rename adler32_rvv_impl to adler32_copy_impl for consistency.
- Replace dst != NULL with 1 in adler32_copy_neon to remove branching.
| -rw-r--r-- | CMakeLists.txt | 2 | ||||
| -rw-r--r-- | Makefile.in | 2 | ||||
| -rw-r--r-- | arch/arm/adler32_neon.c | 8 | ||||
| -rw-r--r-- | arch/arm/arm_functions.h | 6 | ||||
| -rw-r--r-- | arch/generic/Makefile.in | 7 | ||||
| -rw-r--r-- | arch/generic/adler32_c.c | 7 | ||||
| -rw-r--r-- | arch/generic/adler32_fold_c.c | 15 | ||||
| -rw-r--r-- | arch/generic/generic_functions.h | 4 | ||||
| -rw-r--r-- | arch/loongarch/adler32_lasx.c | 12 | ||||
| -rw-r--r-- | arch/loongarch/adler32_lsx.c | 8 | ||||
| -rw-r--r-- | arch/loongarch/loongarch_functions.h | 12 | ||||
| -rw-r--r-- | arch/riscv/adler32_rvv.c | 10 | ||||
| -rw-r--r-- | arch/riscv/riscv_functions.h | 6 | ||||
| -rw-r--r-- | arch/x86/adler32_avx2.c | 14 | ||||
| -rw-r--r-- | arch/x86/adler32_avx512.c | 11 | ||||
| -rw-r--r-- | arch/x86/adler32_avx512_vnni.c | 2 | ||||
| -rw-r--r-- | arch/x86/adler32_sse42.c | 2 | ||||
| -rw-r--r-- | arch/x86/x86_functions.h | 24 | ||||
| -rw-r--r-- | deflate_p.h | 2 | ||||
| -rw-r--r-- | functable.c | 28 | ||||
| -rw-r--r-- | functable.h | 2 | ||||
| -rw-r--r-- | inflate.c | 2 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_adler32_copy.cc | 18 |
23 files changed, 91 insertions, 113 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d184bb5d..f59bd9e68d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1366,7 +1366,6 @@ set(ZLIB_SRCS set(ZLIB_ALL_FALLBACK_SRCS arch/generic/adler32_c.c - arch/generic/adler32_fold_c.c arch/generic/chunkset_c.c arch/generic/compare256_c.c arch/generic/crc32_braid_c.c @@ -1381,7 +1380,6 @@ elseif(${ARCH} STREQUAL "x86_64" AND WITH_SSE2) # x86_64 always has SSE2, so let the SSE2 functions act as fallbacks. list(APPEND ZLIB_GENERIC_SRCS arch/generic/adler32_c.c - arch/generic/adler32_fold_c.c arch/generic/crc32_braid_c.c arch/generic/crc32_fold_c.c ) diff --git a/Makefile.in b/Makefile.in index 1ef4f85d75..1e248a8935 100644 --- a/Makefile.in +++ b/Makefile.in @@ -76,7 +76,6 @@ pkgconfigdir = ${libdir}/pkgconfig OBJZ = \ arch/generic/adler32_c.o \ - arch/generic/adler32_fold_c.o \ arch/generic/chunkset_c.o \ arch/generic/compare256_c.o \ arch/generic/crc32_braid_c.o \ @@ -117,7 +116,6 @@ OBJC = $(OBJZ) $(OBJG) PIC_OBJZ = \ arch/generic/adler32_c.lo \ - arch/generic/adler32_fold_c.lo \ arch/generic/chunkset_c.lo \ arch/generic/compare256_c.lo \ arch/generic/crc32_braid_c.lo \ diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 33c84228a7..53cf48253f 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -265,7 +265,7 @@ static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) { } } -static Z_FORCEINLINE uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { +static Z_FORCEINLINE uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { /* split Adler-32 into component sums */ uint32_t sum2 = (adler >> 16) & 0xffff; adler &= 0xffff; @@ -376,11 +376,11 @@ static Z_FORCEINLINE uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *ds } Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, 
NULL, src, len, 0); + return adler32_copy_impl(adler, NULL, src, len, 0); } -Z_INTERNAL uint32_t adler32_fold_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, dst, src, len, dst != NULL); +Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h index f313655e79..2175c94d59 100644 --- a/arch/arm/arm_functions.h +++ b/arch/arm/arm_functions.h @@ -7,7 +7,7 @@ #ifdef ARM_NEON uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left); # ifdef HAVE_BUILTIN_CTZLL @@ -40,8 +40,8 @@ void slide_hash_armv6(deflate_state *s); # if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON # undef native_adler32 # define native_adler32 adler32_neon -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_neon +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_neon # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_neon # undef native_inflate_fast diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in index ba20e9e5fb..46e59894d9 100644 --- a/arch/generic/Makefile.in +++ b/arch/generic/Makefile.in @@ -14,7 +14,6 @@ TOPDIR=$(SRCTOP) all: \ adler32_c.o adler32_c.lo \ - adler32_fold_c.o adler32_fold_c.lo \ chunkset_c.o chunkset_c.lo \ compare256_c.o compare256_c.lo \ crc32_braid_c.o crc32_braid_c.lo \ @@ -29,12 +28,6 @@ adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h adler32_c.lo: $(SRCDIR)/adler32_c.c 
$(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c -adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c - -adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c - chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c index da32c95a3d..99aeb6767d 100644 --- a/arch/generic/adler32_c.c +++ b/arch/generic/adler32_c.c @@ -7,7 +7,6 @@ #include "functable.h" #include "adler32_p.h" -/* ========================================================================= */ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { uint32_t sum2; unsigned n; @@ -52,3 +51,9 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { /* do remaining bytes (less than NMAX, still just one modulo) */ return adler32_len_64(adler, buf, len, sum2); } + +Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler = FUNCTABLE_CALL(adler32)(adler, src, len); + memcpy(dst, src, len); + return adler; +} diff --git a/arch/generic/adler32_fold_c.c b/arch/generic/adler32_fold_c.c deleted file mode 100644 index 397dd10400..0000000000 --- a/arch/generic/adler32_fold_c.c +++ /dev/null @@ -1,15 +0,0 @@ -/* adler32_fold.c -- adler32 folding interface - * Copyright (C) 2022 Adam Stylinski - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include "zbuild.h" -#include "functable.h" - -#include <limits.h> - -Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - adler = FUNCTABLE_CALL(adler32)(adler, src, len); - 
memcpy(dst, src, len); - return adler; -} diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h index 6e18e34045..a04aca3825 100644 --- a/arch/generic/generic_functions.h +++ b/arch/generic/generic_functions.h @@ -19,7 +19,7 @@ typedef void (*slide_hash_func)(deflate_state *s); uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left); @@ -51,7 +51,7 @@ void slide_hash_c(deflate_state *s); #ifdef DISABLE_RUNTIME_CPU_DETECTION // Generic code # define native_adler32 adler32_c -# define native_adler32_fold_copy adler32_fold_copy_c +# define native_adler32_copy adler32_copy_c # define native_chunkmemset_safe chunkmemset_safe_c #ifndef WITHOUT_CHORBA # define native_crc32 crc32_chorba diff --git a/arch/loongarch/adler32_lasx.c b/arch/loongarch/adler32_lasx.c index 2cef16c0cf..ab416f9b27 100644 --- a/arch/loongarch/adler32_lasx.c +++ b/arch/loongarch/adler32_lasx.c @@ -31,10 +31,10 @@ static inline uint32_t partial_hsum256(__m256i x) { return (uint32_t)__lasx_xvpickve2gr_wu(sum2, 0); } -extern uint32_t adler32_fold_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); extern uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); -static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { +static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { if (src == NULL) return 1L; if (len == 0) return adler; @@ -51,7 +51,7 @@ rem_peel: } } else if (len < 32) { if (COPY) { - return adler32_fold_copy_lsx(adler, dst, src, len); 
+ return adler32_copy_lsx(adler, dst, src, len); } else { return adler32_lsx(adler, src, len); } @@ -117,11 +117,11 @@ rem_peel: } Z_INTERNAL uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, NULL, src, len, 0); + return adler32_copy_impl(adler, NULL, src, len, 0); } -Z_INTERNAL uint32_t adler32_fold_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, dst, src, len, 1); +Z_INTERNAL uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif diff --git a/arch/loongarch/adler32_lsx.c b/arch/loongarch/adler32_lsx.c index 7f43262ec0..8c997f3ac0 100644 --- a/arch/loongarch/adler32_lsx.c +++ b/arch/loongarch/adler32_lsx.c @@ -29,7 +29,7 @@ static inline uint32_t hsum(__m128i x) { return __lsx_vpickve2gr_w(sum4, 0); } -static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { +static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { if (src == NULL) return 1L; if (len == 0) return adler; @@ -146,11 +146,11 @@ rem_peel: } Z_INTERNAL uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, NULL, src, len, 0); + return adler32_copy_impl(adler, NULL, src, len, 0); } -Z_INTERNAL uint32_t adler32_fold_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, dst, src, len, 1); +Z_INTERNAL uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h index 798c1484c2..54950629cb 100644 --- a/arch/loongarch/loongarch_functions.h +++ 
b/arch/loongarch/loongarch_functions.h @@ -16,7 +16,7 @@ void crc32_fold_loongarch64(crc32_fold *crc, const uint8_t *src, size_t len, #ifdef LOONGARCH_LSX uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); -uint32_t adler32_fold_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); void slide_hash_lsx(deflate_state *s); # ifdef HAVE_BUILTIN_CTZ uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1); @@ -29,7 +29,7 @@ void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start); #ifdef LOONGARCH_LASX uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len); -uint32_t adler32_fold_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); void slide_hash_lasx(deflate_state *s); # ifdef HAVE_BUILTIN_CTZ uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1); @@ -53,8 +53,8 @@ void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start); # if defined(LOONGARCH_LSX) && defined(__loongarch_sx) # undef native_adler32 # define native_adler32 adler32_lsx -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_lsx +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_lsx # undef native_slide_hash # define native_slide_hash slide_hash_lsx # undef native_chunkmemset_safe @@ -73,8 +73,8 @@ void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start); # if defined(LOONGARCH_LASX) && defined(__loongarch_asx) # undef native_adler32 # define native_adler32 adler32_lasx -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_lasx +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_lasx # undef native_slide_hash # define native_slide_hash slide_hash_lasx # undef native_chunkmemset_safe diff --git 
a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c index d822d75af6..586f99a22c 100644 --- a/arch/riscv/adler32_rvv.c +++ b/arch/riscv/adler32_rvv.c @@ -12,7 +12,7 @@ #include "zbuild.h" #include "adler32_p.h" -static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { +static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { /* split Adler-32 into component sums */ uint32_t sum2 = (adler >> 16) & 0xffff; adler &= 0xffff; @@ -125,12 +125,12 @@ static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, c return adler | (sum2 << 16); } -Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_rvv_impl(adler, dst, src, len, 1); +Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { + return adler32_copy_impl(adler, NULL, buf, len, 0); } -Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { - return adler32_rvv_impl(adler, NULL, buf, len, 0); +Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif // RISCV_RVV diff --git a/arch/riscv/riscv_functions.h b/arch/riscv/riscv_functions.h index d68dded92c..7334eb64fd 100644 --- a/arch/riscv/riscv_functions.h +++ b/arch/riscv/riscv_functions.h @@ -11,7 +11,7 @@ #ifdef RISCV_RVV uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, unsigned len, unsigned left); uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); @@ -30,8 +30,8 @@ uint32_t crc32_riscv64_zbc(uint32_t crc, const 
uint8_t *buf, size_t len); # if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__) # undef native_adler32 # define native_adler32 adler32_rvv -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_rvv +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_rvv # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_rvv # undef native_compare256 diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index df502fd383..90c0605581 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -15,10 +15,10 @@ #include "adler32_avx2_p.h" #include "x86_intrins.h" -extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); -static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { +static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { if (src == NULL) return 1L; if (len == 0) return adler; @@ -35,7 +35,7 @@ rem_peel: } } else if (len < 32) { if (COPY) { - return adler32_fold_copy_sse42(adler, dst, src, len); + return adler32_copy_sse42(adler, dst, src, len); } else { return adler32_ssse3(adler, src, len); } @@ -108,7 +108,7 @@ rem_peel: _mm256_storeu_si256((__m256i*)dst, vbuf); dst += 32; } - + vs1 = _mm256_add_epi32(vs1, vs1_sad); vs3 = _mm256_add_epi32(vs3, vs1_0); __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts @@ -170,11 +170,11 @@ rem_peel: } Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, NULL, src, len, 0); + return adler32_copy_impl(adler, NULL, src, len, 0); } -Z_INTERNAL uint32_t 
adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, dst, src, len, 1); +Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c index 626c4807f8..88d3a80b6e 100644 --- a/arch/x86/adler32_avx512.c +++ b/arch/x86/adler32_avx512.c @@ -15,7 +15,7 @@ #include "x86_intrins.h" #include "adler32_avx512_p.h" -static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { +static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { if (src == NULL) return 1L; if (len == 0) return adler; @@ -96,13 +96,12 @@ rem_peel: return adler; } -Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, dst, src, len, 1); +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); } -Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { - return adler32_fold_copy_impl(adler, NULL, src, len, 0); +Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); } #endif - diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c index 4c5cfc1cad..2ab73bc3ca 100644 --- a/arch/x86/adler32_avx512_vnni.c +++ b/arch/x86/adler32_avx512_vnni.c @@ -109,7 +109,7 @@ rem_peel: return adler; } -Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { if (src == NULL) 
return 1L; if (len == 0) return adler; diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index df0739d165..86ff9fe49d 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -13,7 +13,7 @@ #ifdef X86_SSE42 -Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { uint32_t adler0, adler1; adler1 = (adler >> 16) & 0xffff; adler0 = adler & 0xffff; diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index 5d9065e1b3..8cace60f2f 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -41,12 +41,12 @@ void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); #endif #ifdef X86_SSE42 -uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX2 uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left); # ifdef HAVE_BUILTIN_CTZ @@ -59,7 +59,7 @@ uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsign #endif #ifdef X86_AVX512 uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left); void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start); # ifdef HAVE_BUILTIN_CTZLL @@ -70,7 +70,7 @@ void 
inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start); #endif #ifdef X86_AVX512VNNI uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); -uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_PCLMULQDQ_CRC @@ -126,8 +126,8 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); # endif // X86 - SSE4.2 # if defined(X86_SSE42) && defined(__SSE4_2__) -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_sse42 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_sse42 # endif // X86 - PCLMUL # if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) @@ -146,8 +146,8 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); # if defined(X86_AVX2) && defined(__AVX2__) # undef native_adler32 # define native_adler32 adler32_avx2 -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_avx2 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx2 # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_avx2 # undef native_inflate_fast @@ -167,8 +167,8 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); # if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) # undef native_adler32 # define native_adler32 adler32_avx512 -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_avx512 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx512 # undef native_chunkmemset_safe # define native_chunkmemset_safe chunkmemset_safe_avx512 # undef native_inflate_fast @@ -185,8 +185,8 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); # if defined(X86_AVX512VNNI) && 
defined(__AVX512VNNI__) # undef native_adler32 # define native_adler32 adler32_avx512_vnni -# undef native_adler32_fold_copy -# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx512_vnni # endif // X86 - VPCLMULQDQ # if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__) diff --git a/deflate_p.h b/deflate_p.h index e803300e9a..ecaad5c554 100644 --- a/deflate_p.h +++ b/deflate_p.h @@ -156,7 +156,7 @@ Z_FORCEINLINE static unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf FUNCTABLE_CALL(crc32_fold_copy)(&s->crc_fold, buf, strm->next_in, len); #endif } else if (s->wrap == 1) { - strm->adler = FUNCTABLE_CALL(adler32_fold_copy)(strm->adler, buf, strm->next_in, len); + strm->adler = FUNCTABLE_CALL(adler32_copy)(strm->adler, buf, strm->next_in, len); } else { memcpy(buf, strm->next_in, len); } diff --git a/functable.c b/functable.c index 8924f7351f..fcfb2f36d0 100644 --- a/functable.c +++ b/functable.c @@ -79,7 +79,7 @@ static int init_functable(void) { # if (defined(__x86_64__) || defined(_M_X64)) && defined(X86_SSE2) // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available. 
ft.adler32 = &adler32_c; - ft.adler32_fold_copy = &adler32_fold_copy_c; + ft.adler32_copy = &adler32_copy_c; ft.crc32 = &crc32_braid; ft.crc32_fold = &crc32_fold_c; ft.crc32_fold_copy = &crc32_fold_copy_c; @@ -93,7 +93,7 @@ static int init_functable(void) { # endif #else // WITH_ALL_FALLBACKS ft.adler32 = &adler32_c; - ft.adler32_fold_copy = &adler32_fold_copy_c; + ft.adler32_copy = &adler32_copy_c; ft.chunkmemset_safe = &chunkmemset_safe_c; ft.crc32 = &crc32_braid; ft.crc32_fold = &crc32_fold_c; @@ -153,7 +153,7 @@ static int init_functable(void) { // X86 - SSE4.2 #ifdef X86_SSE42 if (cf.x86.has_sse42) { - ft.adler32_fold_copy = &adler32_fold_copy_sse42; + ft.adler32_copy = &adler32_copy_sse42; } #endif // X86 - PCLMUL @@ -174,7 +174,7 @@ static int init_functable(void) { * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */ if (cf.x86.has_avx2 && cf.x86.has_bmi2) { ft.adler32 = &adler32_avx2; - ft.adler32_fold_copy = &adler32_fold_copy_avx2; + ft.adler32_copy = &adler32_copy_avx2; ft.chunkmemset_safe = &chunkmemset_safe_avx2; ft.inflate_fast = &inflate_fast_avx2; ft.slide_hash = &slide_hash_avx2; @@ -189,7 +189,7 @@ static int init_functable(void) { #ifdef X86_AVX512 if (cf.x86.has_avx512_common) { ft.adler32 = &adler32_avx512; - ft.adler32_fold_copy = &adler32_fold_copy_avx512; + ft.adler32_copy = &adler32_copy_avx512; ft.chunkmemset_safe = &chunkmemset_safe_avx512; ft.inflate_fast = &inflate_fast_avx512; # ifdef HAVE_BUILTIN_CTZLL @@ -202,7 +202,7 @@ static int init_functable(void) { #ifdef X86_AVX512VNNI if (cf.x86.has_avx512vnni) { ft.adler32 = &adler32_avx512_vnni; - ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; + ft.adler32_copy = &adler32_copy_avx512_vnni; } #endif // X86 - VPCLMULQDQ @@ -233,7 +233,7 @@ static int init_functable(void) { # endif { ft.adler32 = &adler32_neon; - ft.adler32_fold_copy = &adler32_fold_copy_neon; + ft.adler32_copy = &adler32_copy_neon; ft.chunkmemset_safe = 
&chunkmemset_safe_neon; ft.inflate_fast = &inflate_fast_neon; ft.slide_hash = &slide_hash_neon; @@ -288,7 +288,7 @@ static int init_functable(void) { #ifdef RISCV_RVV if (cf.riscv.has_rvv) { ft.adler32 = &adler32_rvv; - ft.adler32_fold_copy = &adler32_fold_copy_rvv; + ft.adler32_copy = &adler32_copy_rvv; ft.chunkmemset_safe = &chunkmemset_safe_rvv; ft.compare256 = &compare256_rvv; ft.inflate_fast = &inflate_fast_rvv; @@ -322,7 +322,7 @@ static int init_functable(void) { #ifdef LOONGARCH_LSX if (cf.loongarch.has_lsx) { ft.adler32 = &adler32_lsx; - ft.adler32_fold_copy = &adler32_fold_copy_lsx; + ft.adler32_copy = &adler32_copy_lsx; ft.slide_hash = slide_hash_lsx; # ifdef HAVE_BUILTIN_CTZ ft.compare256 = &compare256_lsx; @@ -336,7 +336,7 @@ static int init_functable(void) { #ifdef LOONGARCH_LASX if (cf.loongarch.has_lasx) { ft.adler32 = &adler32_lasx; - ft.adler32_fold_copy = &adler32_fold_copy_lasx; + ft.adler32_copy = &adler32_copy_lasx; ft.slide_hash = slide_hash_lasx; # ifdef HAVE_BUILTIN_CTZ ft.compare256 = &compare256_lasx; @@ -353,7 +353,7 @@ static int init_functable(void) { // Assign function pointers individually for atomic operation FUNCTABLE_ASSIGN(ft, force_init); FUNCTABLE_VERIFY_ASSIGN(ft, adler32); - FUNCTABLE_VERIFY_ASSIGN(ft, adler32_fold_copy); + FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy); FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe); FUNCTABLE_VERIFY_ASSIGN(ft, compare256); FUNCTABLE_VERIFY_ASSIGN(ft, crc32); @@ -382,9 +382,9 @@ static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) { return functable.adler32(adler, buf, len); } -static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) { +static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) { FUNCTABLE_INIT_ABORT; - return functable.adler32_fold_copy(adler, dst, src, len); + return functable.adler32_copy(adler, dst, src, len); } static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t 
*from, unsigned len, unsigned left) { @@ -446,7 +446,7 @@ static void slide_hash_stub(deflate_state* s) { Z_INTERNAL struct functable_s functable = { force_init_stub, adler32_stub, - adler32_fold_copy_stub, + adler32_copy_stub, chunkmemset_safe_stub, compare256_stub, crc32_stub, diff --git a/functable.h b/functable.h index 91308e5686..209db4a67d 100644 --- a/functable.h +++ b/functable.h @@ -26,7 +26,7 @@ struct functable_s { int (* force_init) (void); uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len); - uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); + uint32_t (* adler32_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, unsigned len, unsigned left); uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1); uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len); @@ -32,7 +32,7 @@ static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, } else #endif { - strm->adler = state->check = FUNCTABLE_CALL(adler32_fold_copy)(state->check, dst, src, copy); + strm->adler = state->check = FUNCTABLE_CALL(adler32_copy)(state->check, dst, src, copy); } } diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc index 2027904af5..2be1d39fd0 100644 --- a/test/benchmarks/benchmark_adler32_copy.cc +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -92,16 +92,16 @@ BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1); #ifdef ARM_NEON /* If we inline this copy for neon, the function would go here */ -BENCHMARK_ADLER32_COPY(neon, adler32_fold_copy_neon, test_cpu_features.arm.has_neon); +BENCHMARK_ADLER32_COPY(neon, adler32_copy_neon, test_cpu_features.arm.has_neon); BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon); #endif #ifdef PPC_VMX -//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, 
test_cpu_features.power.has_altivec); +//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_copy_vmx, test_cpu_features.power.has_altivec); BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, test_cpu_features.power.has_altivec); #endif #ifdef POWER8_VSX -//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, test_cpu_features.power.has_arch_2_07); +//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_copy_power8, test_cpu_features.power.has_arch_2_07); BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, test_cpu_features.power.has_arch_2_07); #endif @@ -112,28 +112,28 @@ BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rv #ifdef X86_SSE42 BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3); -BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, test_cpu_features.x86.has_sse42); +BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42); #endif #ifdef X86_AVX2 BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2); -BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, test_cpu_features.x86.has_avx2); +BENCHMARK_ADLER32_COPY(avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2); #endif #ifdef X86_AVX512 BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common); -BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, test_cpu_features.x86.has_avx512_common); +BENCHMARK_ADLER32_COPY(avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common); #endif #ifdef X86_AVX512VNNI BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni); -BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni); +BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni); #endif #ifdef LOONGARCH_LSX 
BENCHMARK_ADLER32_BASELINE_COPY(lsx_baseline, adler32_lsx, test_cpu_features.loongarch.has_lsx); -BENCHMARK_ADLER32_COPY(lsx, adler32_fold_copy_lsx, test_cpu_features.loongarch.has_lsx); +BENCHMARK_ADLER32_COPY(lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx); #endif #ifdef LOONGARCH_LASX BENCHMARK_ADLER32_BASELINE_COPY(lasx_baseline, adler32_lasx, test_cpu_features.loongarch.has_lasx); -BENCHMARK_ADLER32_COPY(lasx, adler32_fold_copy_lasx, test_cpu_features.loongarch.has_lasx); +BENCHMARK_ADLER32_COPY(lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx); #endif #endif |
