 adler32_fold.c                             |   2
 adler32_fold.h                             |   2
 adler32_p.h                                |  16
 arch/x86/adler32_avx2.c                    |  18
 arch/x86/adler32_avx2_p.h                  |   6
 arch/x86/adler32_avx2_tpl.h                |  13
 arch/x86/adler32_avx512.c                  |  11
 arch/x86/adler32_avx512_tpl.h              |  12
 arch/x86/adler32_avx512_vnni.c             | 191
 arch/x86/adler32_sse42.c                   |   3
 arch/x86/adler32_ssse3_tpl.h               | 188
 cpu_features.h                             |  21
 deflate.c                                  |   2
 deflate.h                                  |   2
 functable.c                                |   6
 inflate.c                                  |   1
 inflate.h                                  |   2
 test/benchmarks/CMakeLists.txt             |   1
 test/benchmarks/benchmark_adler32_copy.cc  | 117
 win32/Makefile.msc                         |  22
 20 files changed, 317 insertions(+), 319 deletions(-)
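The common thread in this patch is the "fold copy" interface: a single pass that computes the Adler-32 checksum of src while copying the same bytes to dst, declared as adler32_fold_copy_c in adler32_fold.h and dispatched per-CPU through functable. As a minimal scalar sketch of that contract (illustrative only — the project's actual adler32_fold_copy_c is not shown in this diff and may be structured differently):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U   /* largest prime smaller than 65536 */
#define NMAX 5552     /* max bytes before s1/s2 must be reduced mod BASE to avoid 32-bit overflow */

static uint32_t adler32_copy_sketch(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t s1 = adler & 0xffff;           /* low 16 bits: running byte sum */
    uint32_t s2 = (adler >> 16) & 0xffff;   /* high 16 bits: running sum of s1 */
    while (len > 0) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        while (n--) {
            *dst = *src++;   /* copy folded into the checksum loop, as in adler32_copy_len_16 below */
            s1 += *dst++;
            s2 += s1;
        }
        s1 %= BASE;          /* deferred modulus, once per NMAX-byte block */
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}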
diff --git a/adler32_fold.c b/adler32_fold.c
index 688f848533..20fec2bd3c 100644
--- a/adler32_fold.c
+++ b/adler32_fold.c
@@ -1,4 +1,4 @@
-/* crc32_fold.c -- adler32 folding interface
+/* adler32_fold.c -- adler32 folding interface
  * Copyright (C) 2022 Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
diff --git a/adler32_fold.h b/adler32_fold.h
index ea456adc31..20aa1c7400 100644
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -6,8 +6,6 @@
 #ifndef ADLER32_FOLD_H_
 #define ADLER32_FOLD_H_
 
-#include <stdint.h>
-
 Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 
 #endif
diff --git a/adler32_p.h b/adler32_p.h
index 5a14172f73..1d2e77f49f 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,10 +26,10 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
-    while (len--) {
-        *dst = *buf++;
-        adler += *dst++;
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++;
         sum2 += adler;
     }
     adler %= BASE;
@@ -38,10 +38,10 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
-    while (len) {
-        --len;
-        adler += *buf++;
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++;
+        adler += *dst++;
         sum2 += adler;
     }
     adler %= BASE;
diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c
index fcca34ec53..dcd1166f34 100644
--- a/arch/x86/adler32_avx2.c
+++ b/arch/x86/adler32_avx2.c
@@ -5,29 +5,13 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../fallback_builtins.h"
-#include "adler32_avx2_p.h"
-#include "../../adler32_fold.h"
-#include <stdio.h>
-
 #include <immintrin.h>
 
 #ifdef X86_AVX2_ADLER32
 
 #include "adler32_avx2_tpl.h"
-#undef ADLER32_AVX2_TPL_H_
+
 #define COPY
 #include "adler32_avx2_tpl.h"
-#undef COPY
-
-/*
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 1L;
-    if (len == 0) return adler;
-    return adler32_fold_avx2(adler, buf, len);
-}
-*/
 
 #endif
diff --git a/arch/x86/adler32_avx2_p.h b/arch/x86/adler32_avx2_p.h
index 1c80bde057..f7079bf3eb 100644
--- a/arch/x86/adler32_avx2_p.h
+++ b/arch/x86/adler32_avx2_p.h
@@ -6,10 +6,10 @@
 #ifndef ADLER32_AVX2_P_H_
 #define ADLER32_AVX2_P_H_
 
-#ifdef X86_AVX2_ADLER32
+#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32)
 
 /* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
-static inline uint32_t hsum(__m256i x) {
+static inline uint32_t hsum256(__m256i x) {
     __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), _mm256_castsi256_si128(x));
     __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
@@ -17,7 +17,7 @@ static inline uint32_t hsum(__m256i x) {
     return (uint32_t)_mm_cvtsi128_si32(sum3);
 }
 
-static inline uint32_t partial_hsum(__m256i x) {
+static inline uint32_t partial_hsum256(__m256i x) {
     /* We need a permutation vector to extract every other integer. The
      * rest are going to be zeros */
     const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h
index 7df51d573d..59cacfa483 100644
--- a/arch/x86/adler32_avx2_tpl.h
+++ b/arch/x86/adler32_avx2_tpl.h
@@ -3,9 +3,6 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifndef ADLER32_AVX2_TPL_H_
-#define ADLER32_AVX2_TPL_H_
-
 #include "../../zbuild.h"
 #include <immintrin.h>
 #include "../../adler32_fold.h"
@@ -38,9 +35,9 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len)
 rem_peel:
     if (len < 16) {
 #ifdef COPY
-        return adler32_copy_len_16(adler0, src, dst, len, adler1);
+        return adler32_copy_len_16(adler0, src, dst, len, adler1);
 #else
-        return adler32_len_16(adler0, src, len, adler1);
+        return adler32_len_16(adler0, src, len, adler1);
 #endif
     } else if (len < 32) {
 #ifdef COPY
@@ -129,8 +126,8 @@
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
          * what the compiler is doing to avoid integer divisions. */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
     }
 
     adler = adler0 | (adler1 << 16);
@@ -141,5 +138,3 @@ rem_peel:
 
     return adler;
 }
-
-#endif
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c
index e26b9cc524..c0bf0721f2 100644
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -6,20 +6,11 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
-#include "../../fallback_builtins.h"
-#include <immintrin.h>
-#include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
-
 #ifdef X86_AVX512_ADLER32
 
 #include "adler32_avx512_tpl.h"
-#undef ADLER32_AVX512_TPL_H_
+
 #define COPY
 #include "adler32_avx512_tpl.h"
-#undef COPY
 
 #endif
diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h
index df5dd3810f..d324ce9859 100644
--- a/arch/x86/adler32_avx512_tpl.h
+++ b/arch/x86/adler32_avx512_tpl.h
@@ -3,16 +3,13 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifndef ADLER32_AVX512_TPL_H_
-#define ADLER32_AVX512_TPL_H_
-
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../adler32_fold.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
 #include <immintrin.h>
 #include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
 
 #ifdef X86_AVX512_ADLER32
 
@@ -22,13 +19,13 @@ Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const
 Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
 #endif
 
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
     uint32_t adler0, adler1;
     adler1 = (adler >> 16) & 0xffff;
     adler0 = adler & 0xffff;
 
-    if (src == NULL) return 1L;
-    if (len == 0) return adler;
-
 rem_peel:
     if (len < 64) {
         /* This handles the remaining copies, just call normal adler checksum after this */
@@ -107,4 +104,3 @@ rem_peel:
 }
 
 #endif
-#endif
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index 253eed9c6a..330bfe38e7 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -7,66 +7,54 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
+#ifdef X86_AVX512VNNI_ADLER32
+
#include "../../zbuild.h" #include "../../adler32_p.h" #include "../../cpu_features.h" #include "../../fallback_builtins.h" #include <immintrin.h> +#include "../../adler32_fold.h" #include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" -#ifdef X86_AVX512VNNI_ADLER32 -Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) { - uint32_t sum2; +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; - /* For impossibly tiny sizes, use the smaller width versions. We still need - * to check for compile time support for these but they are likely there */ -#ifdef X86_SSE41_ADLER32 +rem_peel: if (len < 32) - return adler32_sse41(adler, buf, len); +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif -#ifdef X86_AVX2_ADLER32 if (len < 64) - return adler32_avx2(adler, buf, len); -#endif - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support - * This should make for shorter compiled code */ -#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32) - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif - /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up - * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial, - * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better - * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be - * aliased up to a 512 bit wide zmm */ - __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); - const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; while (len >= 64) { - int k = (len < NMAX ? 
-        int k = (len < NMAX ? (int)len : NMAX);
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
         k -= k % 64;
         len -= k;
         __m512i vs1_0 = vs1;
@@ -77,8 +65,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
 
         /* Remainder peeling */
         if (k % 128) {
-            vbuf1 = _mm512_loadu_si512(buf);
-            buf += 64;
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
             k -= 64;
 
             __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
@@ -94,9 +83,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
                vs1 = adler + sum(c[i])
                vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
             */
-            vbuf0 = _mm512_loadu_si512(buf);
-            vbuf1 = _mm512_loadu_si512(buf + 64);
-            buf += 128;
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
             k -= 128;
 
             __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
@@ -117,14 +106,120 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
         vs2 = _mm512_add_epi32(vs2, vs3);
         vs2 = _mm512_add_epi32(vs2, vs2_1);
 
-        adler = partial_hsum(vs1) % BASE;
-        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
-        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3_ADLER32)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
     }
 
+    adler = adler0 | (adler1 << 16);
+
     /* Process tail (len < 64). */
-    return adler32_len_16(adler, buf, len, sum2);
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler;
 }
 #endif
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c
index 4f21702aaf..92efe4d8db 100644
--- a/arch/x86/adler32_sse42.c
+++ b/arch/x86/adler32_sse42.c
@@ -1,4 +1,4 @@
-/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Authors:
  *   Adam Stylinski <kungfujesus06@gmail.com>
@@ -15,7 +15,6 @@
 #ifdef X86_SSE42_ADLER32
 
 Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-
     uint32_t adler0, adler1;
     adler1 = (adler >> 16) & 0xffff;
     adler0 = adler & 0xffff;
diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h
deleted file mode 100644
index aedfa81241..0000000000
--- a/arch/x86/adler32_ssse3_tpl.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_SSSE3_TPL_H_
-#define ADLER32_SSSE3_TPL_H_
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
-#include "adler32_ssse3_p.h"
-
-#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
-#else
-Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
-#endif
-    uint32_t adler0, adler1;
-
-    /* split Adler-32 into component sums */
-    adler1 = (adler->nsums >> 16) & 0xffff;
-    adler0 = adler->nsums & 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1)) {
-#ifdef COPY
-        *(dst++) = *src;
-#endif
-        adler->nsums = adler32_len_1(adler0, src, adler1);
-        return;
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(src == NULL)) {
-        adler->nsums = 1L;
-        return;
-    }
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16)) {
-        goto sub16;
-    }
-
-    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
-    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-    const __m128i dot3v = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-
-    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
-            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
-    /* If our buffer is unaligned (likely), make the determination whether
-     * or not there's enough of a buffer to consume to make the scalar, aligning
-     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
-     * load. This is a pretty simple test, just test if 16 - the remainder + len is
-     * < 16 */
-    size_t max_iters = NMAX;
-    size_t rem = (uintptr_t)src & 15;
-    size_t align_offset = 16 - rem;
-    size_t k = 0;
-    if (rem) {
-        if (len < 16 + align_offset) {
-            /* Let's eat the cost of this one unaligned load so that
-             * we don't completely skip over the vectorization. Doing
-             * 16 bytes at a time unaligned is is better than 16 + <= 15
-             * sums */
-            vbuf = _mm_loadu_si128((__m128i*)src);
-            len -= 16;
-            src += 16;
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_cvtsi32_si128(adler0);
-            vs2 = _mm_cvtsi32_si128(adler1);
-            vs3 = _mm_setzero_si128();
-            vs1_0 = vs1;
-            goto unaligned_jmp;
-        }
-
-#ifdef COPY
-        memcpy(dst, src, align_offset);
-        dst += align_offset;
-#endif
-        for (size_t i = 0; i < align_offset; ++i) {
-            adler0 += *(src++);
-            adler1 += adler0;
-        }
-
-        /* lop off the max number of sums based on the scalar sums done
-         * above */
-        len -= align_offset;
-        max_iters -= align_offset;
-    }
-
-    while (len >= 16) {
-        vs1 = _mm_cvtsi32_si128(adler0);
-        vs2 = _mm_cvtsi32_si128(adler1);
-        vs3 = _mm_setzero_si128();
-        vs2_0 = _mm_setzero_si128();
-        vs1_0 = vs1;
-
-        k = (len < max_iters ? len : max_iters);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
-            src += 32;
-            k -= 32;
-
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
-            dst += 32;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-
-            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
-            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
-            vs1_0 = vs1;
-        }
-
-        vs2 = _mm_add_epi32(vs2_0, vs2);
-        vs3 = _mm_slli_epi32(vs3, 5);
-        vs2 = _mm_add_epi32(vs3, vs2);
-        vs3 = _mm_setzero_si128();
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            src += 16;
-            k -= 16;
-
-unaligned_jmp:
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vs1_0 = vs1;
-        }
-
-        vs3 = _mm_slli_epi32(vs3, 4);
-        vs2 = _mm_add_epi32(vs2, vs3);
-
-        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
-         * a partial reduction sum implicitly and only summing to integers in vector positions
-         * 0 and 2. This saves us some contention on the shuffle port(s) */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
-        max_iters = NMAX;
-    }
-
-sub16:
-#ifdef COPY
-    adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
-#else
-    /* Process tail (len < 16). */
-    adler->nsums = adler32_len_16(adler0, src, len, adler1);
-#endif
-}
-
-#endif
diff --git a/cpu_features.h b/cpu_features.h
index 9e0d5cb95d..fc1b5d7339 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -6,8 +6,8 @@
 #ifndef CPU_FEATURES_H_
 #define CPU_FEATURES_H_
 
-#include "crc32_fold.h"
 #include "adler32_fold.h"
+#include "crc32_fold.h"
 
 #if defined(X86_FEATURES)
 #  include "arch/x86/x86_features.h"
@@ -35,16 +35,11 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len
 #ifdef X86_SSSE3_ADLER32
 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
-#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
 #ifdef X86_AVX2_ADLER32
 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512_ADLER32
 extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512VNNI_ADLER32
 extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
@@ -53,6 +48,20 @@ extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, si
 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
 #endif
 
+/* adler32 folding */
+#ifdef X86_SSE42_ADLER32
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
 /* CRC32 folding */
 #ifdef X86_PCLMULQDQ_CRC
 extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
diff --git a/deflate.c b/deflate.c
--- a/deflate.c
+++ b/deflate.c
@@ -52,7 +52,6 @@
 #include "deflate.h"
 #include "deflate_p.h"
 #include "functable.h"
-#include <stdio.h>
 
 const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
 /*
@@ -446,7 +445,6 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
 
 #ifdef GZIP
     if (s->wrap == 2) {
-        /* Ensure that there's always a reset, regardless of "wrap" */
         strm->adler = functable.crc32_fold_reset(&s->crc_fold);
     } else
 #endif
diff --git a/deflate.h b/deflate.h
--- a/deflate.h
+++ b/deflate.h
@@ -10,9 +10,9 @@
    subject to change. Applications should only use zlib.h.
*/ -#include "adler32_fold.h" #include "zutil.h" #include "zendian.h" +#include "adler32_fold.h" #include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and diff --git a/functable.c b/functable.c index 8328359702..3945323125 100644 --- a/functable.c +++ b/functable.c @@ -204,7 +204,7 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { functable.adler32_fold_copy = &adler32_fold_copy_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) +#if (defined X86_SSE42_ADLER32) if (x86_cpu_has_sse42) functable.adler32_fold_copy = &adler32_fold_copy_sse42; #endif @@ -216,6 +216,10 @@ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const u if (x86_cpu_has_avx512) functable.adler32_fold_copy = &adler32_fold_copy_avx512; #endif +#ifdef X86_AVX512VNNI_ADLER32 + if (x86_cpu_has_avx512vnni) + functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; +#endif return functable.adler32_fold_copy(adler, dst, src, len); } @@ -609,7 +609,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #endif case DICTID: NEEDBITS(32); - //strm->adler = state->check = ZSWAP32(hold); strm->adler = state->check = ZSWAP32(hold); INITBITS(); state->mode = DICT; @@ -11,8 +11,8 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "crc32_fold.h" #include "adler32_fold.h" +#include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt index df1df49731..19762fc738 100644 --- a/test/benchmarks/CMakeLists.txt +++ b/test/benchmarks/CMakeLists.txt @@ -24,6 +24,7 @@ endif() add_executable(benchmark_zlib benchmark_adler32.cc + benchmark_adler32_copy.cc benchmark_compare256.cc benchmark_crc32.cc benchmark_main.cc diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc new file mode 100644 index 0000000000..fac4c7f1cd --- /dev/null +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -0,0 +1,117 @@ +/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <string.h> + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# include "cpu_features.h" +} + +#define MAX_RANDOM_INTS (1024 * 1024) +#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) + +typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len); + +class adler32_copy: public benchmark::Fixture { +private: + uint32_t *random_ints_src; + uint32_t *random_ints_dst; + +public: + void SetUp(const ::benchmark::State& state) { + /* Control the alignment so that we have the best case scenario for loads. With + * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. + * And while this is a realistic scenario, it makes it difficult to compare benchmark + * to benchmark because one allocation could have been aligned perfectly for the loads + * while the subsequent one happened to not be. 
+         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
+         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
+         * control the _consistency_ of the results */
+        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints_src[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+                (const unsigned char*)random_ints_src, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints_src);
+        zng_free(random_ints_dst);
+    }
+};
+
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const unsigned char *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, len); \
+            return fptr(init_sum, buf, len); \
+        }); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 8db2633b44..8a398e4993 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -30,15 +30,15 @@
 WFLAGS = \
     -DX86_FEATURES \
     -DX86_PCLMULQDQ_CRC \
    -DX86_SSE2 \
-    -DX86_SSE42_ADLER32 \
+    -DX86_SSE42_ADLER32 \
     -DX86_SSE42_CRC_INTRIN \
     -DX86_SSE42_CRC_HASH \
-    -DX86_SSSE3_ADLER32 \
+    -DX86_SSSE3_ADLER32 \
     -DX86_AVX2 \
-    -DX86_AVX2_ADLER32 \
+    -DX86_AVX2_ADLER32 \
     -DX86_AVX_CHUNKSET \
-    -DX86_SSE2_CHUNKSET \
-    #
+    -DX86_SSE2_CHUNKSET
+
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dWIN32 /r
@@ -51,12 +51,12 @@ SUFFIX =
 
 OBJS = \
     adler32.obj \
-    adler32_avx2.obj \
-    adler32_avx512.obj \
-    adler32_avx512_vnni.obj \
-    adler32_sse42.obj \
-    adler32_ssse3.obj \
-    adler32_fold.obj \
+    adler32_avx2.obj \
+    adler32_avx512.obj \
+    adler32_avx512_vnni.obj \
+    adler32_sse42.obj \
+    adler32_ssse3.obj \
+    adler32_fold.obj \
     chunkset.obj \
     chunkset_avx.obj \
     chunkset_sse2.obj \
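All of the SIMD variants above (SSSE3/SSE4.2, AVX2, AVX-512, AVX-512 VNNI) vectorize the same per-block recurrence: over a block of W bytes, vs1 grows by the plain byte sum while vs2 grows by W times the previous vs1 plus a byte sum weighted W..1. Those weights are what the dot2v constants encode, and the weighted sum is what _mm512_dpbusd_epi32 / _mm256_dpbusd_epi32 (VPDPBUSD) accumulate directly, which is why the VNNI paths no longer need the maddubs/madd pair from the deleted SSSE3 template. A scalar model of one 64-byte step, for orientation only (this code is not part of the patch):

/* One 64-byte block step: byte-at-a-time Adler-32 does s1 += c; s2 += s1;
 * per byte, which over a 64-byte block collapses to the two updates below. */
static void adler32_block64(uint32_t *s1, uint32_t *s2, const unsigned char *p) {
    uint32_t sum = 0, weighted = 0;
    for (int i = 0; i < 64; i++) {
        sum      += p[i];
        weighted += (uint32_t)(64 - i) * p[i];   /* weights 64..1, i.e. the dot2v vector */
    }
    *s2 += 64 * (*s1) + weighted;   /* must use the pre-block value of s1 */
    *s1 += sum;
}

Both sums still have to be reduced modulo BASE (65521) at least once every NMAX (5552) input bytes, which is what the k = MIN(len, NMAX) blocking inside the vector loops is for.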
