diff options
| author | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
|---|---|---|
| committer | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
| commit | 7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch) | |
| tree | b43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/x86 | |
| parent | b1e34e861b5d732afe828d58aad2c638135061fd (diff) | |
| parent | c2712b8a345191f6ed79558c089777df94590087 (diff) | |
| download | Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip | |
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip
git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd
git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/x86')
29 files changed, 4340 insertions, 0 deletions
diff --git a/neozip/arch/x86/Makefile.in b/neozip/arch/x86/Makefile.in new file mode 100644 index 0000000000..f756844a9f --- /dev/null +++ b/neozip/arch/x86/Makefile.in @@ -0,0 +1,176 @@ +# Makefile for zlib +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2 +AVX512VNNIFLAG=-mavx512vnni -mbmi2 +AVX2FLAG=-mavx2 -mbmi2 +SSE2FLAG=-msse2 +SSSE3FLAG=-mssse3 +SSE41FLAG=-msse4.1 +SSE42FLAG=-msse4.2 +PCLMULFLAG=-mpclmul +VPCLMULFLAG=-mvpclmulqdq +XSAVEFLAG=-mxsave +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +all: \ + x86_features.o x86_features.lo \ + adler32_avx2.o adler32_avx2.lo \ + adler32_avx512.o adler32_avx512.lo \ + adler32_avx512_vnni.o adler32_avx512_vnni.lo \ + adler32_sse42.o adler32_sse42.lo \ + adler32_ssse3.o adler32_ssse3.lo \ + chunkset_avx2.o chunkset_avx2.lo \ + chunkset_avx512.o chunkset_avx512.lo \ + chunkset_sse2.o chunkset_sse2.lo \ + chunkset_ssse3.o chunkset_ssse3.lo \ + compare256_avx2.o compare256_avx2.lo \ + compare256_avx512.o compare256_avx512.lo \ + compare256_sse2.o compare256_sse2.lo \ + crc32_chorba_sse2.o crc32_chorba_sse2.lo \ + crc32_chorba_sse41.o crc32_chorba_sse41.lo \ + crc32_pclmulqdq.o crc32_pclmulqdq.lo \ + crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \ + crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \ + slide_hash_avx2.o slide_hash_avx2.lo \ + slide_hash_sse2.o slide_hash_sse2.lo + +x86_features.o: + $(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c + +x86_features.lo: + $(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c + +chunkset_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c + +chunkset_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c + +chunkset_avx512.o: + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c + +chunkset_avx512.lo: + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c + +chunkset_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_ssse3.o: + $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c + +chunkset_ssse3.lo: + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c + +compare256_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c + +compare256_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c + +compare256_avx512.o: + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c + +compare256_avx512.lo: + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c + +compare256_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c + +compare256_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c + +crc32_chorba_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c + +crc32_chorba_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c + +crc32_chorba_sse41.o: + $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c + +crc32_chorba_sse41.lo: + $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c + +crc32_pclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c + +crc32_pclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c + +crc32_vpclmulqdq_avx2.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c + +crc32_vpclmulqdq_avx2.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c + +crc32_vpclmulqdq_avx512.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c + +crc32_vpclmulqdq_avx512.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c + +slide_hash_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c + +slide_hash_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c + +slide_hash_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c + +slide_hash_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c + +adler32_avx2.o: $(SRCDIR)/adler32_avx2.c + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c + +adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c + +adler32_avx512.o: $(SRCDIR)/adler32_avx512.c + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c + $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_sse42.o: $(SRCDIR)/adler32_sse42.c + $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c + +adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c + $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/x86/adler32_avx2.c b/neozip/arch/x86/adler32_avx2.c new file mode 100644 index 0000000000..d1811b254d --- /dev/null +++ b/neozip/arch/x86/adler32_avx2.c @@ -0,0 +1,172 @@ +/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Copyright (C) 2022 Adam Stylinski + * Authors: + * Brian Bockelman <bockelman@gmail.com> + * Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX2 + +#include "zbuild.h" +#include <immintrin.h> +#include "adler32_p.h" +#include "adler32_avx2_p.h" +#include "x86_intrins.h" + +extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); + } else if (len < 32) { + if (COPY) { + return adler32_copy_sse42(adler, dst, src, len); + } else { + return adler32_ssse3(adler, src, len); + } + } + + __m256i vs1, vs2, vs2_0; + + const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, + 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33); + const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m256i dot3v = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + vs2_0 = vs3; + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); + len -= k; + + while (k >= 64) { + __m256i vbuf = _mm256_loadu_si256((__m256i*)src); + __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32)); + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); + __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero); + + if (COPY) { + _mm256_storeu_si256((__m256i*)dst, vbuf); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0); + dst += 64; + } + + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts + __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s + vs1 = _mm256_add_epi32(vs1_sad2, vs1); + vs2 = _mm256_add_epi32(vsum2, vs2); + vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm256_add_epi32(vs2_0, vs2); + vs3 = _mm256_slli_epi32(vs3, 6); + vs2 = _mm256_add_epi32(vs3, vs2); + vs3 = _mm256_setzero_si256(); + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] ) + */ + __m256i vbuf = _mm256_loadu_si256((__m256i*)src); + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's + + if (COPY) { + _mm256_storeu_si256((__m256i*)dst, vbuf); + dst += 32; + } + + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + vs2 = _mm256_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + /* Defer the multiplication with 32 to outside of the loop */ + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + + /* The compiler is generating the following sequence for this integer modulus + * when done the scalar way, in GPRs: + + adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); + + mov $0x80078071,%edi // move magic constant into 32 bit register %edi + ... + vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi + mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax + imul %rdi,%rsi // do a signed multiplication with magic constant and vector element + shr $0x2f,%rsi // shift right by 47 + imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1 + sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above + ... + // repeats for each element with vpextract instructions + + This is tricky with AVX2 for a number of reasons: + 1.) There's no 64 bit multiplication instruction, but there is a sequence to get there + 2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate + back down to 32 bit precision later (there is in AVX512) + 3.) Full width integer multiplications aren't cheap + + We can, however, do a relatively cheap sequence for horizontal sums. + Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was + previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but + that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be + performed on the maximum possible inputs before overflow + */ + + + /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy + * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant). + * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly + * what the compiler is doing to avoid integer divisions. */ + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx2_p.h b/neozip/arch/x86/adler32_avx2_p.h new file mode 100644 index 0000000000..f0f8a4a887 --- /dev/null +++ b/neozip/arch/x86/adler32_avx2_p.h @@ -0,0 +1,32 @@ +/* adler32_avx2_p.h -- adler32 avx2 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_AVX2_P_H_ +#define ADLER32_AVX2_P_H_ + +#if defined(X86_AVX2) || defined(X86_AVX512VNNI) + +/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */ +static inline uint32_t hsum256(__m256i x) { + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), + _mm256_castsi256_si128(x)); + __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +static inline uint32_t partial_hsum256(__m256i x) { + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros */ + const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1); + __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec); + __m128i non_zero_sse = _mm256_castsi256_si128(non_zero); + __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} +#endif + +#endif diff --git a/neozip/arch/x86/adler32_avx512.c b/neozip/arch/x86/adler32_avx512.c new file mode 100644 index 0000000000..8a8e165bb9 --- /dev/null +++ b/neozip/arch/x86/adler32_avx512.c @@ -0,0 +1,102 @@ +/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512 + +#include "zbuild.h" +#include "adler32_p.h" +#include "arch_functions.h" +#include <immintrin.h> +#include "x86_intrins.h" +#include "adler32_avx512_p.h" + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 64) { + /* This handles the remaining copies, just call normal adler checksum after this */ + if (COPY && len) { + __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len)); + __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src); + _mm512_mask_storeu_epi8(dst, storemask, copy_vec); + } + + return adler32_avx2(adler, src, len); + } + + __m512i vbuf, vs1_0, vs3; + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + + while (len >= 64) { + __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + vs1_0 = vs1; + vs3 = _mm512_setzero_si512(); + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 64); + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf = _mm512_loadu_si512(src); + + if (COPY) { + _mm512_storeu_si512(dst, vbuf); + dst += 64; + } + + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vs1_sad, vs1); + vs3 = _mm512_add_epi32(vs3, vs1_0); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm512_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx512_p.h b/neozip/arch/x86/adler32_avx512_p.h new file mode 100644 index 0000000000..742269053c --- /dev/null +++ b/neozip/arch/x86/adler32_avx512_p.h @@ -0,0 +1,57 @@ +#ifndef AVX512_FUNCS_H +#define AVX512_FUNCS_H + +#include <immintrin.h> +#include <stdint.h> + +/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */ +#if defined(_MSC_VER) && !defined(_MM_K0_REG8) +# undef _mm512_extracti64x4_epi64 +# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1) +# undef _mm512_set1_epi16 +# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1) +# undef _mm512_maddubs_epi16 +# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2) +#endif + +/* Written because *_add_epi32(a) sets off ubsan */ +static inline uint32_t _mm512_reduce_add_epu32(__m512i x) { + __m256i a = _mm512_extracti64x4_epi64(x, 1); + __m256i b = _mm512_extracti64x4_epi64(x, 0); + + __m256i a_plus_b = _mm256_add_epi32(a, b); + __m128i c = _mm256_extracti128_si256(a_plus_b, 1); + __m128i d = _mm256_extracti128_si256(a_plus_b, 0); + __m128i c_plus_d = _mm_add_epi32(c, d); + + __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d); + __m128i sum2 = _mm_add_epi32(sum1, c_plus_d); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + + return _mm_cvtsi128_si32(sum4); +} + +static inline uint32_t partial_hsum(__m512i x) { + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros. Marking this const so the compiler stands + * a better chance of keeping this resident in a register through entire + * loop execution. We certainly have enough zmm registers (32) */ + const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, + 1, 1, 1, 1, 1, 1, 1, 1); + + __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x); + + /* From here, it's a simple 256 bit wide reduction sum */ + __m256i non_zero_avx = _mm512_castsi512_si256(non_zero); + + /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is + * pretty slow, much slower than the longer instruction sequence below */ + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1), + _mm256_castsi256_si128(non_zero_avx)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx512_vnni.c b/neozip/arch/x86/adler32_avx512_vnni.c new file mode 100644 index 0000000000..8bebffbf88 --- /dev/null +++ b/neozip/arch/x86/adler32_avx512_vnni.c @@ -0,0 +1,205 @@ +/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream + * Based on Brian Bockelman's AVX2 version + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512VNNI + +#include "zbuild.h" +#include "adler32_p.h" +#include "arch_functions.h" +#include <immintrin.h> +#include "x86_intrins.h" +#include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" + +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 32) + return adler32_ssse3(adler, src, len); + + if (len < 64) + return adler32_avx2(adler, src, len); + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; + + while (len >= 64) { + vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + size_t k = ALIGN_DOWN(MIN(len, NMAX), 64); + len -= k; + __m512i vs1_0 = vs1; + __m512i vs3 = _mm512_setzero_si512(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m512i vs2_1 = _mm512_setzero_si512(); + __m512i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 128) { + vbuf1 = _mm512_loadu_si512((__m512i*)src); + + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 128) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm512_loadu_si512((__m512i*)src); + vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64)); + src += 128; + k -= 128; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm512_add_epi32(vs3, vs1); + vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + vs2 = _mm512_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel; + } + + return adler; +} + +/* Use 256-bit vectors when copying because 512-bit variant is slower. */ +Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel_copy: + if (len < 32) { + /* This handles the remaining copies, just call normal adler checksum after this */ + __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len)); + __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); + _mm256_mask_storeu_epi8(dst, storemask, copy_vec); + + return adler32_ssse3(adler, src, len); + } + + const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + const __m256i zero = _mm256_setzero_si256(); + __m256i vs1, vs2; + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); + len -= k; + + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m256i vs2_1 = _mm256_setzero_si256(); + __m256i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 64) { + vbuf1 = _mm256_loadu_si256((__m256i*)src); + _mm256_storeu_si256((__m256i*)dst, vbuf1); + dst += 32; + + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm256_loadu_si256((__m256i*)src); + vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32)); + _mm256_storeu_si256((__m256i*)dst, vbuf0); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1); + dst += 64; + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm256_add_epi32(vs3, vs1); + vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + vs2 = _mm256_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel_copy; + } + + return adler; +} + +#endif diff --git a/neozip/arch/x86/adler32_sse42.c b/neozip/arch/x86/adler32_sse42.c new file mode 100644 index 0000000000..c2301213f0 --- /dev/null +++ b/neozip/arch/x86/adler32_sse42.c @@ -0,0 +1,117 @@ +/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE42 + +#include "zbuild.h" +#include "adler32_p.h" +#include "adler32_ssse3_p.h" + +#include <immintrin.h> + +Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1); + + __m128i vbuf, vbuf_0; + __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + v_sad_sum2, vsum2, vsum2_0; + __m128i zero = _mm_setzero_si128(); + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + + while (len >= 16) { + size_t k = ALIGN_DOWN(MIN(len, NMAX), 16); + len -= k; + + vs1 = _mm_cvtsi32_si128(adler0); + vs2 = _mm_cvtsi32_si128(adler1); + + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16)); + src += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + _mm_storeu_si128((__m128i*)dst, vbuf); + _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); + dst += 32; + + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + vs2 = _mm_add_epi32(vsum2, vs2); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + src += 16; + k -= 16; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + + _mm_storeu_si128((__m128i*)dst, vbuf); + dst += 16; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = hsum(vs2) % BASE; + } + + /* If this is true, there's fewer than 16 elements remaining */ + if (len) { + goto rem_peel; + } + + return adler0 | (adler1 << 16); +} + +#endif diff --git a/neozip/arch/x86/adler32_ssse3.c b/neozip/arch/x86/adler32_ssse3.c new file mode 100644 index 0000000000..702db50251 --- /dev/null +++ b/neozip/arch/x86/adler32_ssse3.c @@ -0,0 +1,149 @@ +/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSSE3 + +#include "zbuild.h" +#include "adler32_p.h" +#include "adler32_ssse3_p.h" + +#include <immintrin.h> + +Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); + + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + const __m128i zero = _mm_setzero_si128(); + + __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + vbuf_0, v_sad_sum2, vsum2, vsum2_0; + + /* If our buffer is unaligned (likely), make the determination whether + * or not there's enough of a buffer to consume to make the scalar, aligning + * additions worthwhile or if it's worth it to just eat the cost of an unaligned + * load. This is a pretty simple test, just test if len < 32 */ + size_t n = NMAX; + size_t k = 0; + + if (len < 32) { + /* Let's eat the cost of this one unaligned load so that + * we don't completely skip over the vectorization. Doing + * 16 bytes at a time unaligned is better than 16 + <= 15 + * sums */ + vbuf = _mm_loadu_si128((__m128i*)buf); + len -= 16; + buf += 16; + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs1_0 = vs1; + goto unaligned_jmp; + } + + size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len); + if (align_diff) { + adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0); + + buf += align_diff; + len -= align_diff; + n -= align_diff; + } + + while (len >= 16) { + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + k = ALIGN_DOWN(MIN(len, n), 16); + len -= k; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + vbuf_0 = _mm_load_si128((__m128i*)(buf + 16)); + buf += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + vs2 = _mm_add_epi32(vsum2, vs2); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + buf += 16; + k -= 16; + +unaligned_jmp: + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + /* We don't actually need to do a full horizontal sum, since psadbw is actually doing + * a partial reduction sum implicitly and only summing to integers in vector positions + * 0 and 2. This saves us some contention on the shuffle port(s) */ + adler = partial_hsum(vs1) % BASE; + sum2 = hsum(vs2) % BASE; + n = NMAX; + } + + /* Process tail (len < 16). */ + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0); +} + +/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */ +Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler = adler32_ssse3(adler, src, len); + memcpy(dst, src, len); + return adler; +} +#endif diff --git a/neozip/arch/x86/adler32_ssse3_p.h b/neozip/arch/x86/adler32_ssse3_p.h new file mode 100644 index 0000000000..d7ec3fe0d5 --- /dev/null +++ b/neozip/arch/x86/adler32_ssse3_p.h @@ -0,0 +1,29 @@ +/* adler32_ssse3_p.h -- adler32 ssse3 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_SSSE3_P_H_ +#define ADLER32_SSSE3_P_H_ + +#ifdef X86_SSSE3 + +#include <immintrin.h> +#include <stdint.h> + +static inline uint32_t partial_hsum(__m128i x) { + __m128i second_int = _mm_srli_si128(x, 8); + __m128i sum = _mm_add_epi32(x, second_int); + return _mm_cvtsi128_si32(sum); +} + +static inline uint32_t hsum(__m128i x) { + __m128i sum1 = _mm_unpackhi_epi64(x, x); + __m128i sum2 = _mm_add_epi32(x, sum1); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + return _mm_cvtsi128_si32(sum4); +} +#endif + +#endif diff --git a/neozip/arch/x86/chunkset_avx2.c b/neozip/arch/x86/chunkset_avx2.c new file mode 100644 index 0000000000..3e69a7bf66 --- /dev/null +++ b/neozip/arch/x86/chunkset_avx2.c @@ -0,0 +1,129 @@ +/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX2 + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include "arch/generic/chunk_256bit_perm_idx_lut.h" +#include <immintrin.h> +#include "x86_intrins.h" + +typedef __m256i chunk_t; +typedef __m128i halfchunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 +#define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi64x(zng_memread_8(from)); +} + +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* See explanation in chunkset_avx512.c */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else + *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm256_loadu_si256((__m256i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm256_storeu_si256((__m256i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + + /* See note in chunkset_ssse3.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1); + __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} + +static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { + /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately + * unlikely to be actually written or read from */ + return _mm256_zextsi128_si256(*chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_avx2 +#define CHUNKCOPY chunkcopy_avx2 +#define CHUNKUNROLL chunkunroll_avx2 +#define CHUNKMEMSET chunkmemset_avx2 +#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_avx2 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_avx512.c b/neozip/arch/x86/chunkset_avx512.c new file mode 100644 index 0000000000..60450c653b --- /dev/null +++ b/neozip/arch/x86/chunkset_avx512.c @@ -0,0 +1,186 @@ +/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512 + +#include "zbuild.h" +#include "zmemory.h" + +#include "arch/generic/chunk_256bit_perm_idx_lut.h" +#include <immintrin.h> +#include "x86_intrins.h" + +typedef __m256i chunk_t; +typedef __m128i halfchunk_t; +typedef __mmask32 mask_t; +typedef __mmask16 halfmask_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 +#define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK +#define HAVE_MASKED_READWRITE +#define HAVE_CHUNKCOPY +#define HAVE_HALFCHUNKCOPY + +static inline halfmask_t gen_half_mask(size_t len) { + return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len); +} + +static inline mask_t gen_mask(size_t len) { + return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len); +} + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi64x(zng_memread_8(from)); +} + +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where + * the load is dumped to the stack with an aligned move for this memory-register + * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to + * stack doesn't exist if compiled with optimizations. For the sake of working + * properly in a debugger, let's take the 2 cycle penalty */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else + *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm256_loadu_si256((__m256i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm256_storeu_si256((__m256i *)out, *chunk); +} + +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + + chunk_t chunk; + size_t rem = len % sizeof(chunk_t); + + if (len < sizeof(chunk_t)) { + mask_t rem_mask = gen_mask(rem); + chunk = _mm256_maskz_loadu_epi8(rem_mask, from); + _mm256_mask_storeu_epi8(out, rem_mask, chunk); + return out + rem; + } + + loadchunk(from, &chunk); + rem = (rem == 0) ? sizeof(chunk_t) : rem; + storechunk(out, &chunk); + out += rem; + from += rem; + len -= rem; + + while (len > 0) { + loadchunk(from, &chunk); + storechunk(out, &chunk); + out += sizeof(chunk_t); + from += sizeof(chunk_t); + len -= sizeof(chunk_t); + } + + return out; +} + +/* MSVC compiler decompression bug when optimizing for size */ +#if defined(_MSC_VER) && _MSC_VER < 1943 +# pragma optimize("", off) +#endif +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + *chunk_rem = lut_rem.remval; + + /* See the AVX2 implementation for more detailed comments. This is that + some masked + * loads to avoid an out of bounds read on the heap */ + + if (dist < 16) { + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + halfmask_t load_mask = gen_half_mask(dist); + __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else { + halfmask_t load_mask = gen_half_mask(dist - 16); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16)); + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE); + __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} +#if defined(_MSC_VER) && _MSC_VER < 1943 +# pragma optimize("", on) +#endif + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { + /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately + * unlikely to be actually written or read from */ + return _mm256_zextsi128_si256(*chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + halfmask_t load_mask = gen_half_mask(dist); + ret_vec = _mm_maskz_loadu_epi8(load_mask, buf); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + halfchunk_t chunk; + + size_t rem = len % sizeof(halfchunk_t); + if (rem == 0) { + rem = sizeof(halfchunk_t); + } + + halfmask_t rem_mask = gen_half_mask(rem); + chunk = _mm_maskz_loadu_epi8(rem_mask, from); + _mm_mask_storeu_epi8(out, rem_mask, chunk); + + return out + rem; +} + +#define CHUNKSIZE chunksize_avx512 +#define CHUNKUNROLL chunkunroll_avx512 +#define CHUNKMEMSET chunkmemset_avx512 +#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_avx512 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_sse2.c b/neozip/arch/x86/chunkset_sse2.c new file mode 100644 index 0000000000..633ab6e64f --- /dev/null +++ b/neozip/arch/x86/chunkset_sse2.c @@ -0,0 +1,50 @@ +/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE2 + +#include "zbuild.h" +#include "zmemory.h" + +#include <immintrin.h> + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi64x(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +#define CHUNKSIZE chunksize_sse2 +#define CHUNKCOPY chunkcopy_sse2 +#define CHUNKUNROLL chunkunroll_sse2 +#define CHUNKMEMSET chunkmemset_sse2 +#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_sse2 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_ssse3.c b/neozip/arch/x86/chunkset_ssse3.c new file mode 100644 index 0000000000..0bef7de811 --- /dev/null +++ b/neozip/arch/x86/chunkset_ssse3.c @@ -0,0 +1,72 @@ +/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSSE3 + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include <immintrin.h> +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi64x(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load uninitialized that we swizzle over + * in a vector register, anyway. If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = lut_rem.remval; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_ssse3 +#define CHUNKMEMSET chunkmemset_ssse3 +#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3 +#define CHUNKCOPY chunkcopy_ssse3 +#define CHUNKUNROLL chunkunroll_ssse3 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_ssse3 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c new file mode 100644 index 0000000000..5e2b1716cf --- /dev/null +++ b/neozip/arch/x86/compare256_avx2.c @@ -0,0 +1,61 @@ +/* compare256_avx2.c -- AVX2 version of compare256 + * Copyright Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_AVX2 + +#include <immintrin.h> +#ifdef _MSC_VER +# include <nmmintrin.h> +#endif + +static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + __m256i ymm_src0, ymm_src1, ymm_cmp; + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ + unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */ + + src0 += 32, src1 += 32, len += 32; + + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); + mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); + + src0 += 32, src1 += 32, len += 32; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_avx512.c b/neozip/arch/x86/compare256_avx512.c new file mode 100644 index 0000000000..f3105505cb --- /dev/null +++ b/neozip/arch/x86/compare256_avx512.c @@ -0,0 +1,87 @@ +/* compare256_avx512.c -- AVX512 version of compare256 + * Copyright (C) 2025 Hans Kristian Rosbach + * Based on AVX2 implementation by Mika T. Lindqvist + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_AVX512 + +#include <immintrin.h> +#ifdef _MSC_VER +# include <nmmintrin.h> +#endif + +static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) { + __m512i zmm_src0_4, zmm_src1_4; + __m512i zmm_src0_3, zmm_src1_3; + __m512i zmm_src0_2, zmm_src1_2; + __m512i zmm_src0_1, zmm_src1_1; + __m128i xmm_src0_0, xmm_src1_0; + uint64_t mask_1, mask_2, mask_3, mask_4; + uint32_t mask_0; + + // First do a 16byte round before increasing to 64bytes, this reduces the + // penalty for the short matches, and those are usually the most common ones. + // This requires us to overlap on the last round, giving a small penalty + // on matches of 192+ bytes (Still faster than AVX2 though). + + // 16 bytes + xmm_src0_0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1_0 = _mm_loadu_si128((__m128i*)src1); + mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); + if (mask_0 != 0x0000FFFF) + return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */ + + // 64 bytes + zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16)); + zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16)); + mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1); + if (mask_1 != 0xFFFFFFFFFFFFFFFF) + return 16 + zng_ctz64(~mask_1); + + // 64 bytes + zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80)); + zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80)); + mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2); + if (mask_2 != 0xFFFFFFFFFFFFFFFF) + return 80 + zng_ctz64(~mask_2); + + // 64 bytes + zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144)); + zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144)); + mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3); + if (mask_3 != 0xFFFFFFFFFFFFFFFF) + return 144 + zng_ctz64(~mask_3); + + // 64 bytes (overlaps the previous 16 bytes for fast tail processing) + zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192)); + zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192)); + mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4); + if (mask_4 != 0xFFFFFFFFFFFFFFFF) + return 192 + zng_ctz64(~mask_4); + + return 256; +} + +Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx512_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_avx512 +#define COMPARE256 compare256_avx512_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx512 +#define COMPARE256 compare256_avx512_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_sse2.c b/neozip/arch/x86/compare256_sse2.c new file mode 100644 index 0000000000..cfaff82cfa --- /dev/null +++ b/neozip/arch/x86/compare256_sse2.c @@ -0,0 +1,86 @@ +/* compare256_sse2.c -- SSE2 version of compare256 + * Copyright Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_SSE2 + +#include <emmintrin.h> + +static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) { + __m128i xmm_src0, xmm_src1, xmm_cmp; + + /* Do the first load unaligned, than all subsequent ones we have at least + * one aligned load. Sadly aligning both loads is probably unrealistic */ + xmm_src0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return zng_ctz32(~mask); + + const uint8_t *last0 = src0 + 240; + const uint8_t *last1 = src1 + 240; + + int align_offset = ((uintptr_t)src0) & 15; + int align_adv = 16 - align_offset; + uint32_t len = align_adv; + + src0 += align_adv; + src1 += align_adv; + + for (int i = 0; i < 15; ++i) { + xmm_src0 = _mm_load_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return len + zng_ctz32(~mask); + + len += 16, src0 += 16, src1 += 16; + } + + if (align_offset) { + xmm_src0 = _mm_loadu_si128((__m128i*)last0); + xmm_src1 = _mm_loadu_si128((__m128i*)last1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + if (mask != 0xFFFF) + return 240 + zng_ctz32(~mask); + } + + return 256; +} + +Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + return compare256_sse2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/crc32_chorba_sse2.c b/neozip/arch/x86/crc32_chorba_sse2.c new file mode 100644 index 0000000000..66191e046a --- /dev/null +++ b/neozip/arch/x86/crc32_chorba_sse2.c @@ -0,0 +1,872 @@ +#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE) + +#include "zbuild.h" +#include "crc32_chorba_p.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include <emmintrin.h> +#include "arch/x86/x86_intrins.h" +#include "arch_functions.h" + +#define READ_NEXT(in, off, a, b) do { \ + a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ + b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \ + } while (0); + +#define NEXT_ROUND(invec, a, b, c, d) do { \ + a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \ + b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \ + c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \ + d = _mm_srli_epi64(invec, 20); \ + } while (0); + +Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) { + /* The calling function ensured that this is aligned correctly */ + const uint64_t* input = (const uint64_t*)buf; + ALIGNED_(16) uint64_t final[9] = {0}; + uint64_t next1 = ~crc; + crc = 0; + uint64_t next2 = 0; + uint64_t next3 = 0; + uint64_t next4 = 0; + uint64_t next5 = 0; + + __m128i next12 = _mm_cvtsi64_si128(next1); + __m128i next34 = _mm_setzero_si128(); + __m128i next56 = _mm_setzero_si128(); + __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4; + + size_t i = 0; + + /* This is weird, doing for vs while drops 10% off the exec time */ + for (; (i + 256 + 40 + 32 + 32) < len; i += 32) { + __m128i in1in2, in3in4; + + /* + uint64_t chorba1 = input[i / sizeof(uint64_t)]; + uint64_t chorba2 = input[i / sizeof(uint64_t) + 1]; + uint64_t chorba3 = input[i / sizeof(uint64_t) + 2]; + uint64_t chorba4 = input[i / sizeof(uint64_t) + 3]; + uint64_t chorba5 = input[i / sizeof(uint64_t) + 4]; + uint64_t chorba6 = input[i / sizeof(uint64_t) + 5]; + uint64_t chorba7 = input[i / sizeof(uint64_t) + 6]; + uint64_t chorba8 = input[i / sizeof(uint64_t) + 7]; + */ + + const uint64_t *input_ptr = input + (i / sizeof(uint64_t)); + const __m128i *input_ptr_128 = (__m128i*)input_ptr; + __m128i chorba12 = _mm_load_si128(input_ptr_128++); + __m128i chorba34 = _mm_load_si128(input_ptr_128++); + __m128i chorba56 = _mm_load_si128(input_ptr_128++); + __m128i chorba78 = _mm_load_si128(input_ptr_128++); + + chorba12 = _mm_xor_si128(chorba12, next12); + chorba34 = _mm_xor_si128(chorba34, next34); + chorba56 = _mm_xor_si128(chorba56, next56); + chorba78 = _mm_xor_si128(chorba78, chorba12); + __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1)); + __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12), + _mm_castsi128_pd(chorba34), 1)); + /* + chorba1 ^= next1; + chorba2 ^= next2; + chorba3 ^= next3; + chorba4 ^= next4; + chorba5 ^= next5; + chorba7 ^= chorba1; + chorba8 ^= chorba2; + */ + i += 8 * 8; + + /* 0-3 */ + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + READ_NEXT(input, i, in1in2, in3in4); + __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba34xor); + /* + in1 ^= chorba3; + in2 ^= chorba4 ^ chorba1; + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + */ + + in3in4 = _mm_xor_si128(in3in4, ab1); + /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */ + __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)); + + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23)); + in3in4 = _mm_xor_si128(in3in4, chorba12); + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1; + in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + + /*out1 = a3 ^ b2 ^ c1; + out2 = b3 ^ c2 ^ d1 ^ a4;*/ + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + //out3 = b4 ^ c3 ^ d2; + //out4 = c4 ^ d3; + + //out5 = d4; + + /* + next1 = out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 4-7 */ + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + READ_NEXT(input, i, in1in2, in3in4); + + in1in2 = _mm_xor_si128(in1in2, next12); + in1in2 = _mm_xor_si128(in1in2, chorba78); + in1in2 = _mm_xor_si128(in1in2, chorba45); + in1in2 = _mm_xor_si128(in1in2, chorba34); + + /* + in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3; + in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4; + */ + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + + in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5; + in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6; + */ + in3in4 = _mm_xor_si128(in3in4, next34); + in3in4 = _mm_xor_si128(in3in4, ab1); + in3in4 = _mm_xor_si128(in3in4, chorba56); + __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + ///* + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, cd1); + + next12 = _mm_xor_si128(next12, a4_); + next12 = _mm_xor_si128(next12, next56); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + next34 = _mm_xor_si128(b4c4, cd3); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + //*/ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = b3 ^ c2 ^ d1 ^ a4; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 8-11 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1; + in2 ^= next2 ^ chorba8 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + + __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128()); + __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12); + in1in2 = _mm_xor_si128(in1in2, chorba80); + in1in2 = _mm_xor_si128(in1in2, chorba78); + in1in2 = _mm_xor_si128(in1in2, next12_chorba12); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /*in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3];*/ + in3in4 = _mm_xor_si128(next34, in3in4); + in3in4 = _mm_xor_si128(in3in4, ab1); + __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, chorba34); + in3in4 = _mm_xor_si128(in3in4, a2_); + + /* + in3 ^= next3 ^ a1 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(a4_, ab3); + next12 = _mm_xor_si128(next12, cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 12-15 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + */ + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, next12); + __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12); + in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12); + __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + in1in2 = _mm_xor_si128(in1in2, chorb1_); + + + /* + in1 ^= next1 ^ chorba5 ^ chorba1; + in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2; + */ + + in3in4 = _mm_xor_si128(next34, in3in4); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + ///* + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + //*/ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 16-19 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1; + in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2; + */ + ///* + READ_NEXT(input, i, in1in2, in3in4); + __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34)); + in1in2 = _mm_xor_si128(chorba1_, in1in2); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + //*/ + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + */ + ///* + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_)); + in3in4 = _mm_xor_si128(in3in4, next34); + //*/ + /* + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1; + */ + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 20-23 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1; + in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba80); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1; + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_)); + in3in4 = _mm_xor_si128(in3in4, chorba12); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + i += 32; + + /* 24-27 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1; + in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba80); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /*in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_)); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + i += 32; + + /* 28-31 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5; + in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6; + */ + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56)); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80)); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + */ + + /* + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + } + + for (; (i + 40 + 32) < len; i += 32) { + __m128i in1in2, in3in4; + + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4); + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, next12); + + /* + in1 ^=next1; + in2 ^=next2; + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1; + in4 ^= next4 ^ a2 ^ b1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + __m128i ab1_next34 = _mm_xor_si128(next34, ab1); + in3in4 = _mm_xor_si128(in3in4, ab1_next34); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + next12 = _mm_xor_si128(next12, next56); + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + } + + next1 = _mm_cvtsi128_si64(next12); + next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12)); + next3 = _mm_cvtsi128_si64(next34); + next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34)); + next5 = _mm_cvtsi128_si64(next56); + + /* Skip the call to memcpy */ + size_t copy_len = len - i; + __m128i *final128 = (__m128i*)final; + __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t)); + while (copy_len >= 64) { + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + copy_len -= 64; + } + + while (copy_len >= 16) { + _mm_store_si128(final128++, _mm_load_si128(input128++)); + copy_len -= 16; + } + + uint8_t *src_bytes = (uint8_t*)input128; + uint8_t *dst_bytes = (uint8_t*)final128; + while (copy_len--) { + *dst_bytes++ = *src_bytes++; + } + + final[0] ^= next1; + final[1] ^= next2; + final[2] ^= next3; + final[3] ^= next4; + final[4] ^= next5; + + /* We perform the same loop that braid_internal is doing but we'll skip + * the function call for this tiny tail */ + uint8_t *final_bytes = (uint8_t*)final; + size_t rem = len - i; + + while (rem--) { + crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8); + } + + return ~crc; +} + +Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) { + uintptr_t align_diff = ALIGN_DIFF(buf, 16); + if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT) + return crc32_braid(crc, buf, len); + + if (align_diff) { + crc = crc32_braid(crc, buf, align_diff); + len -= align_diff; + buf += align_diff; + } +#if !defined(WITHOUT_CHORBA) + if (len > CHORBA_LARGE_THRESHOLD) + return crc32_chorba_118960_nondestructive(crc, buf, len); +#endif + return chorba_small_nondestructive_sse2(crc, buf, len); +} + +Z_INTERNAL uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_chorba_sse2(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/x86/crc32_chorba_sse41.c b/neozip/arch/x86/crc32_chorba_sse41.c new file mode 100644 index 0000000000..6ef9612440 --- /dev/null +++ b/neozip/arch/x86/crc32_chorba_sse41.c @@ -0,0 +1,332 @@ +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) + +#include "zbuild.h" +#include "crc32_chorba_p.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include <emmintrin.h> +#include <smmintrin.h> +#include "arch/x86/x86_intrins.h" +#include "arch_functions.h" + +#define READ_NEXT(in, off, a, b) do { \ + a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ + b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \ + } while (0); + +#define NEXT_ROUND(invec, a, b, c, d) do { \ + a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \ + b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \ + c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \ + d = _mm_srli_epi64(invec, 20); \ + } while (0); + +#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \ + out0 = _mm_slli_si128(in0, shift); \ + out1 = _mm_alignr_epi8(in1, in0, shift); \ + out2 = _mm_alignr_epi8(in2, in1, shift); \ + out3 = _mm_alignr_epi8(in3, in2, shift); \ + out4 = _mm_srli_si128(in3, shift); \ + } while (0) + +#define STORE4(out0, out1, out2, out3, out) do { \ + _mm_store_si128(out++, out0); \ + _mm_store_si128(out++, out1); \ + _mm_store_si128(out++, out2); \ + _mm_store_si128(out++, out3); \ + } while (0) + +#define READ4(out0, out1, out2, out3, in) do { \ + out0 = _mm_load_si128(in++); \ + out1 = _mm_load_si128(in++); \ + out2 = _mm_load_si128(in++); \ + out3 = _mm_load_si128(in++); \ + } while (0) + +/* This is intentionally shifted one down to compensate for the deferred store from + * the last iteration */ +#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \ + out0 = _mm_xor_si128(in[1], xor0); \ + out1 = _mm_xor_si128(in[2], xor1); \ + out2 = _mm_xor_si128(in[3], xor2); \ + out3 = _mm_xor_si128(in[4], xor3); \ + } while (0) + +Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint8_t *buf, size_t len) { + /* The calling function ensured that this is aligned correctly */ + const uint64_t* input = (const uint64_t*)buf; + ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)]; + __m128i *bitbuffer_v = (__m128i*)bitbuffer; + const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer; + __m128i z = _mm_setzero_si128(); + + __m128i *bitbuf128 = &bitbuffer_v[64]; + __m128i *bitbuf144 = &bitbuffer_v[72]; + __m128i *bitbuf182 = &bitbuffer_v[91]; + __m128i *bitbuf210 = &bitbuffer_v[105]; + __m128i *bitbuf300 = &bitbuffer_v[150]; + __m128i *bitbuf0 = bitbuf128; + __m128i *inptr = (__m128i*)input; + + /* We only need to zero out the bytes between the 128'th value and the 144th + * that are actually read */ + __m128i *z_cursor = bitbuf128; + for (size_t i = 0; i < 2; ++i) { + STORE4(z, z, z, z, z_cursor); + } + + /* We only need to zero out the bytes between the 144'th value and the 182nd that + * are actually read */ + z_cursor = bitbuf144 + 8; + for (size_t i = 0; i < 11; ++i) { + _mm_store_si128(z_cursor++, z); + } + + /* We only need to zero out the bytes between the 182nd value and the 210th that + * are actually read. */ + z_cursor = bitbuf182; + for (size_t i = 0; i < 4; ++i) { + STORE4(z, z, z, z, z_cursor); + } + + /* We need to mix this in */ + __m128i init_crc = _mm_cvtsi64_si128(~crc); + crc = 0; + + size_t i = 0; + + /* Previous iteration runs carried over */ + __m128i buf144 = z; + __m128i buf182 = z; + __m128i buf210 = z; + + for (; i + 300*8+64 < len && i < 22 * 8; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + + READ4(in12, in34, in56, in78, inptr); + + if (i == 0) { + in12 = _mm_xor_si128(in12, init_crc); + } + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a = _mm_xor_si128(buf144, in_1); + + STORE4(a, in23, in45, in67, bitbuf144); + buf144 = in8_; + + __m128i e = _mm_xor_si128(buf182, in_1); + STORE4(e, in23, in45, in67, bitbuf182); + buf182 = in8_; + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len && i < 32 * 8; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + READ4(in12, in34, in56, in78, inptr); + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a = _mm_xor_si128(buf144, in_1); + + STORE4(a, in23, in45, in67, bitbuf144); + buf144 = in8_; + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len && i < 84 * 8; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + READ4(in12, in34, in56, in78, inptr); + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a, b, c, d; + a = _mm_xor_si128(buf144, in_1); + READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144); + STORE4(a, b, c, d, bitbuf144); + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + + if (i < 128 * 8) { + READ4(in12, in34, in56, in78, inptr); + } else { + in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + } + + // [0, 145, 183, 211] + + /* Pre Penryn CPUs the unpack should be faster */ + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a, b, c, d; + a = _mm_xor_si128(buf144, in_1); + READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144); + STORE4(a, b, c, d, bitbuf144); + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i n, o, p; + __m128i m = _mm_xor_si128(buf210, in_1); + + /* Couldn't tell you why but despite knowing that this is always false, + * removing this branch with GCC makes things significantly slower. Some + * loop bodies must be being joined or something */ + if (i < 84 * 8) { + n = in23; + o = in45; + p = in67; + buf210 = in8_; + } else { + READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210); + } + + STORE4(m, n, o, p, bitbuf210); + STORE4(in12, in34, in56, in78, bitbuf300); + } + + /* Second half of stores bubbled out */ + _mm_store_si128(bitbuf144, buf144); + _mm_store_si128(bitbuf182, buf182); + _mm_store_si128(bitbuf210, buf210); + + /* We also have to zero out the tail */ + size_t left_to_z = len - (300*8 + i); + __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8); + while (left_to_z >= 64) { + STORE4(z, z, z, z, bitbuf_tail); + left_to_z -= 64; + } + + while (left_to_z >= 16) { + _mm_store_si128(bitbuf_tail++, z); + left_to_z -= 16; + } + + uint8_t *tail_bytes = (uint8_t*)bitbuf_tail; + while (left_to_z--) { + *tail_bytes++ = 0; + } + + ALIGNED_(16) uint64_t final[9] = {0}; + __m128i next12, next34, next56; + next12 = z; + next34 = z; + next56 = z; + + for (; (i + 72 < len); i += 32) { + __m128i in1in2, in3in4; + __m128i in1in2_, in3in4_; + __m128i ab1, ab2, ab3, ab4; + __m128i cd1, cd2, cd3, cd4; + + READ_NEXT(input, i, in1in2, in3in4); + READ_NEXT(bitbuffer, i, in1in2_, in3in4_); + + in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12); + in3in4 = _mm_xor_si128(in3in4, in3in4_); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + __m128i a2_ = _mm_slli_si128(ab2, 8); + __m128i ab1_next34 = _mm_xor_si128(next34, ab1); + in3in4 = _mm_xor_si128(in3in4, ab1_next34); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8); + __m128i a4_ = _mm_slli_si128(ab4, 8); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_srli_si128(cd2, 8); + __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8); + next12 = _mm_xor_si128(next12, next56); + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + next56 = _mm_srli_si128(cd4, 8); + } + + memcpy(final, input+(i / sizeof(uint64_t)), len-i); + __m128i *final128 = (__m128i*)final; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12)); + ++final128; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34)); + ++final128; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56)); + + uint8_t *final_bytes = (uint8_t*)final; + + for (size_t j = 0; j < (len-i); j++) { + crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8); + } + return ~crc; +} + +Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) { + uintptr_t align_diff = ALIGN_DIFF(buf, 16); + if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT) + return crc32_braid(crc, buf, len); + + if (align_diff) { + crc = crc32_braid(crc, buf, align_diff); + len -= align_diff; + buf += align_diff; + } +#if !defined(WITHOUT_CHORBA) + if (len > CHORBA_LARGE_THRESHOLD) + return crc32_chorba_118960_nondestructive(crc, buf, len); +#endif + if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD) + return crc32_chorba_32768_nondestructive_sse41(crc, buf, len); + return chorba_small_nondestructive_sse2(crc, buf, len); +} + +Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_chorba_sse41(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/x86/crc32_pclmulqdq.c b/neozip/arch/x86/crc32_pclmulqdq.c new file mode 100644 index 0000000000..c8be1b43ba --- /dev/null +++ b/neozip/arch/x86/crc32_pclmulqdq.c @@ -0,0 +1,31 @@ +/* + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ + * instruction. + * + * A white paper describing this algorithm can be found at: + * doc/crc-pclmulqdq.pdf + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * Jim Guilford <james.guilford@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Erdinc Ozturk <erdinc.ozturk@intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_PCLMULQDQ_CRC + +#include "crc32_pclmulqdq_tpl.h" + +Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + return crc32_copy_impl(crc, dst, src, len, 1); +} +#endif diff --git a/neozip/arch/x86/crc32_pclmulqdq_tpl.h b/neozip/arch/x86/crc32_pclmulqdq_tpl.h new file mode 100644 index 0000000000..e4ea546afd --- /dev/null +++ b/neozip/arch/x86/crc32_pclmulqdq_tpl.h @@ -0,0 +1,708 @@ +/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding + * approach with the PCLMULQDQ and VPCMULQDQ instructions. + * + * A white paper describing this algorithm can be found at: + * doc/crc-pclmulqdq.pdf + * + * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support) + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * Jim Guilford <james.guilford@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Erdinc Ozturk <erdinc.ozturk@intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +#include <immintrin.h> +#include <wmmintrin.h> +#include <smmintrin.h> // _mm_extract_epi32 + +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include "crc32_p.h" +#include "x86_intrins.h" + +/* 512-bit VPCLMULQDQ path requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) +# if defined(_MSC_VER) && _MSC_VER < 1920 + /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */ +# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96) +# define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) +# define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) +# else +# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96) +# define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm) +# define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm) +# endif +# ifdef __AVX512VL__ +# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96) +# endif +#endif +/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */ +#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) +# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c) +#endif + +#ifndef z128_xor3_epi64 +# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c) +#endif + +static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + + *xmm_crc0 = *xmm_crc1; + *xmm_crc1 = *xmm_crc2; + *xmm_crc2 = *xmm_crc3; + *xmm_crc3 = _mm_xor_si128(x_low, x_high); +} + +static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + + *xmm_crc0 = *xmm_crc2; + *xmm_crc1 = *xmm_crc3; + *xmm_crc2 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc3 = _mm_xor_si128(x_low1, x_high1); +} + +static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + + *xmm_crc0 = *xmm_crc3; + *xmm_crc1 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc2 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc3 = _mm_xor_si128(x_low2, x_high2); +} + +static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01); + __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); + + *xmm_crc0 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc1 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc2 = _mm_xor_si128(x_low2, x_high2); + *xmm_crc3 = _mm_xor_si128(x_low3, x_high3); +} + +static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85); + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10); + __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01); + __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10); + + *xmm_crc0 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc1 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc2 = _mm_xor_si128(x_low2, x_high2); + *xmm_crc3 = _mm_xor_si128(x_low3, x_high3); +} + +/* 512-bit fold function requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) +static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3, + const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) { + __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01); + __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10); + __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01); + __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10); + __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01); + __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10); + __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01); + __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10); + + *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0); + *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1); + *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2); + *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3); +} +#endif +/* 256-bit fold function for VPCLMULQDQ without AVX-512 */ +#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) +static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3, + const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) { + __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01); + __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10); + __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01); + __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10); + __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01); + __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10); + __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01); + __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10); + + *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0); + *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1); + *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2); + *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3); +} +#endif + +Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + size_t copy_len = len; + if (len >= 16) { + /* Calculate 16-byte alignment offset */ + uintptr_t align_diff = ALIGN_DIFF(src, 16); + + /* If total length is less than (alignment bytes + 16), use the faster small method. + * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ + copy_len = len < align_diff + 16 ? len : align_diff; + } + + if (copy_len > 0) { + crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); + src += copy_len; + len -= copy_len; + if (COPY) { + dst += copy_len; + } + } + + if (len == 0) + return crc; + + const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); + __m128i xmm_crc1 = _mm_setzero_si128(); + __m128i xmm_crc2 = _mm_setzero_si128(); + __m128i xmm_crc3 = _mm_setzero_si128(); + + if (crc != 0) { + // Process the first 16 bytes and handle initial CRC + len -= 16; + xmm_t0 = _mm_load_si128((__m128i *)src); + src += 16; + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + dst += 16; + } + xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); + } + +/* 512-bit VPCLMULQDQ path requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) + if (len >= 256) { + len -= 256; + + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; + __m512i z_low0, z_high0; + const __m512i zmm_fold4 = _mm512_set4_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m512i zmm_fold16 = _mm512_set4_epi32( + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); + + zmm_crc0 = _mm512_loadu_si512((__m512i *)src); + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); + src += 256; + if (COPY) { + _mm512_storeu_si512((__m512i *)dst, zmm_crc0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); + dst += 256; + } + + // Fold existing xmm state into first 64 bytes + zmm_t0 = _mm512_castsi128_si512(xmm_crc0); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3); + + z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0); + + while (len >= 256) { + len -= 256; + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); + src += 256; + + fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16); + if (COPY) { + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); + dst += 256; + } + } + + // zmm_crc[0,1,2,3] -> zmm_crc0 + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1); + + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2); + + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3); + + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] + xmm_crc0 = z512_extracti64x2(zmm_crc0, 0); + xmm_crc1 = z512_extracti64x2(zmm_crc0, 1); + xmm_crc2 = z512_extracti64x2(zmm_crc0, 2); + xmm_crc3 = z512_extracti64x2(zmm_crc0, 3); + } +/* 256-bit VPCLMULQDQ path */ +#elif defined(X86_VPCLMULQDQ) + if (len >= 128) { + len -= 128; + + __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3; + __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3; + __m256i y_low0, y_high0; + const __m256i ymm_fold4 = _mm256_set_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596, + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m256i ymm_fold8 = _mm256_set_epi32( + 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880, + 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880); + + ymm_crc0 = _mm256_loadu_si256((__m256i *)src); + ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1); + ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2); + ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3); + src += 128; + if (COPY) { + _mm256_storeu_si256((__m256i *)dst, ymm_crc0); + _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1); + _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2); + _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3); + dst += 128; + } + + // Fold existing xmm state into first 32 bytes + ymm_t0 = _mm256_castsi128_si256(xmm_crc0); + ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1); + + y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); + y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); + ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0); + + ymm_t0 = _mm256_castsi128_si256(xmm_crc2); + ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1); + + y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); + y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); + ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0); + + while (len >= 128) { + len -= 128; + ymm_t0 = _mm256_loadu_si256((__m256i *)src); + ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1); + ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2); + ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3); + src += 128; + + fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8); + if (COPY) { + _mm256_storeu_si256((__m256i *)dst, ymm_t0); + _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1); + _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2); + _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3); + dst += 128; + } + } + + // Extract 8 x 128-bit lanes from 4 x 256-bit registers + __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0); + __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1); + __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1); + __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1); + __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2); + __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1); + __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3); + __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1); + + // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4) + __m128i x_low, x_high; + x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10); + xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4); + + x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10); + xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5); + + x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10); + xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6); + + x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10); + xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7); + } +#else + /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 + * We interleave the PCLMUL-base folds with 8x scaled generator + * polynomial copies; we read 8x QWORDS and then XOR them into + * the stream at the following offsets: 6, 9, 10, 16, 20, 22, + * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper + * as "generator_64_bits_unrolled_8" */ +#ifndef __AVX512VL__ + if (!COPY) { +#endif + while (len >= 512 + 64 + 16*8) { + __m128i chorba8 = _mm_load_si128((__m128i *)src); + __m128i chorba7 = _mm_load_si128((__m128i *)src + 1); + __m128i chorba6 = _mm_load_si128((__m128i *)src + 2); + __m128i chorba5 = _mm_load_si128((__m128i *)src + 3); + __m128i chorba4 = _mm_load_si128((__m128i *)src + 4); + __m128i chorba3 = _mm_load_si128((__m128i *)src + 5); + __m128i chorba2 = _mm_load_si128((__m128i *)src + 6); + __m128i chorba1 = _mm_load_si128((__m128i *)src + 7); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, chorba8); + _mm_storeu_si128((__m128i *)dst + 1, chorba7); + _mm_storeu_si128((__m128i *)dst + 2, chorba6); + _mm_storeu_si128((__m128i *)dst + 3, chorba5); + _mm_storeu_si128((__m128i *)dst + 4, chorba4); + _mm_storeu_si128((__m128i *)dst + 5, chorba3); + _mm_storeu_si128((__m128i *)dst + 6, chorba2); + _mm_storeu_si128((__m128i *)dst + 7, chorba1); + dst += 16*8; + } + + chorba2 = _mm_xor_si128(chorba2, chorba8); + chorba1 = _mm_xor_si128(chorba1, chorba7); + src += 16*8; + len -= 16*8; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + + fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0); + xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2); + xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 4); + xmm_t1 = _mm_load_si128((__m128i *)src + 5); + xmm_t2 = _mm_load_si128((__m128i *)src + 6); + xmm_t3 = _mm_load_si128((__m128i *)src + 7); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 8); + xmm_t1 = _mm_load_si128((__m128i *)src + 9); + xmm_t2 = _mm_load_si128((__m128i *)src + 10); + xmm_t3 = _mm_load_si128((__m128i *)src + 11); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0); + xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1); + xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2); + xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 12); + xmm_t1 = _mm_load_si128((__m128i *)src + 13); + xmm_t2 = _mm_load_si128((__m128i *)src + 14); + xmm_t3 = _mm_load_si128((__m128i *)src + 15); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 16); + xmm_t1 = _mm_load_si128((__m128i *)src + 17); + xmm_t2 = _mm_load_si128((__m128i *)src + 18); + xmm_t3 = _mm_load_si128((__m128i *)src + 19); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 20); + xmm_t1 = _mm_load_si128((__m128i *)src + 21); + xmm_t2 = _mm_load_si128((__m128i *)src + 22); + xmm_t3 = _mm_load_si128((__m128i *)src + 23); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 24); + xmm_t1 = _mm_load_si128((__m128i *)src + 25); + xmm_t2 = _mm_load_si128((__m128i *)src + 26); + xmm_t3 = _mm_load_si128((__m128i *)src + 27); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 28); + xmm_t1 = _mm_load_si128((__m128i *)src + 29); + xmm_t2 = _mm_load_si128((__m128i *)src + 30); + xmm_t3 = _mm_load_si128((__m128i *)src + 31); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2); + xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3); + + len -= 512; + src += 512; + } +#ifndef __AVX512VL__ + } +#endif + +#endif /* X86_VPCLMULQDQ */ + + while (len >= 64) { + len -= 64; + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + src += 64; + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); + } + + /* + * len = num bytes left - 64 + */ + if (len >= 48) { + len -= 48; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + src += 48; + + fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + dst += 48; + } + + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); + } else if (len >= 32) { + len -= 32; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + src += 32; + + fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + dst += 32; + } + + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + } else if (len >= 16) { + len -= 16; + xmm_t0 = _mm_load_si128((__m128i *)src); + src += 16; + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + dst += 16; + } + + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + } + + const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); + const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); + + /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */ + __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); + xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); + + __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); + xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); + + __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); + xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); + + /* Fold remaining bytes into the 128-bit state */ + if (len) { + const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); + const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + /* Create masks to shift bytes for partial input */ + __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); + __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); + + /* Shift out bytes from crc3 to make space for new data */ + __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); + xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); + + /* Insert the partial input into crc3 */ +#if defined(__AVX512BW__) && defined(__AVX512VL__) + __mmask16 k = (1 << len) - 1; + __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); + if (COPY) { + _mm_mask_storeu_epi8(dst, k, xmm_crc_part); + } +#else + __m128i xmm_crc_part = _mm_setzero_si128(); + memcpy(&xmm_crc_part, src, len); + if (COPY) { + memcpy(dst, src, len); + } +#endif + __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); + xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); + + /* Fold the bytes that were shifted out back into crc3 */ + __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01); + __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10); + xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high); + } + + /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */ + __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00); + __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10); + + x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf); + x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3); + + __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01); + __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10); + + crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2)); + + return ~crc; +} diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx2.c b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c new file mode 100644 index 0000000000..1cdef13b09 --- /dev/null +++ b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c @@ -0,0 +1,17 @@ +/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_VPCLMULQDQ_AVX2 + +#define X86_VPCLMULQDQ +#include "crc32_pclmulqdq_tpl.h" + +Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + return crc32_copy_impl(crc, dst, src, len, 1); +} +#endif diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx512.c b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c new file mode 100644 index 0000000000..a95a448f49 --- /dev/null +++ b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c @@ -0,0 +1,17 @@ +/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_VPCLMULQDQ_AVX512 + +#define X86_VPCLMULQDQ +#include "crc32_pclmulqdq_tpl.h" + +Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + return crc32_copy_impl(crc, dst, src, len, 1); +} +#endif diff --git a/neozip/arch/x86/slide_hash_avx2.c b/neozip/arch/x86/slide_hash_avx2.c new file mode 100644 index 0000000000..241ea305e3 --- /dev/null +++ b/neozip/arch/x86/slide_hash_avx2.c @@ -0,0 +1,48 @@ +/* + * AVX2 optimized hash slide, based on Intel's slide_sse implementation + * + * Copyright (C) 2017 Intel Corporation + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * Mika T. Lindqvist <postmaster@raasu.org> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX2 + +#include "zbuild.h" +#include "deflate.h" + +#include <immintrin.h> + +static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) { + table += entries; + table -= 32; + + do { + __m256i value1, value2, result1, result2; + + value1 = _mm256_load_si256((__m256i *)table); + value2 = _mm256_load_si256((__m256i *)(table+16)); + result1 = _mm256_subs_epu16(value1, wsize); + result2 = _mm256_subs_epu16(value2, wsize); + _mm256_store_si256((__m256i *)table, result1); + _mm256_store_si256((__m256i *)(table+16), result2); + + table -= 32; + entries -= 32; + } while (entries > 0); +} + +Z_INTERNAL void slide_hash_avx2(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize); + + slide_hash_chain(s->head, HASH_SIZE, ymm_wsize); + slide_hash_chain(s->prev, wsize, ymm_wsize); +} + +#endif diff --git a/neozip/arch/x86/slide_hash_sse2.c b/neozip/arch/x86/slide_hash_sse2.c new file mode 100644 index 0000000000..4aa8df5ee8 --- /dev/null +++ b/neozip/arch/x86/slide_hash_sse2.c @@ -0,0 +1,68 @@ +/* + * SSE optimized hash slide + * + * Copyright (C) 2017 Intel Corporation + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE2 + +#include "zbuild.h" +#include "deflate.h" + +#include <immintrin.h> +#include <assert.h> + +static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0, + uint32_t entries1, const __m128i wsize) { + uint32_t entries; + Pos *table; + __m128i value0, value1, result0, result1; + + int on_chain = 0; + +next_chain: + table = (on_chain) ? table1 : table0; + entries = (on_chain) ? entries1 : entries0; + + table += entries; + table -= 16; + + /* ZALLOC allocates this pointer unless the user chose a custom allocator. + * Our alloc function is aligned to 64 byte boundaries */ + do { + value0 = _mm_load_si128((__m128i *)table); + value1 = _mm_load_si128((__m128i *)(table + 8)); + result0 = _mm_subs_epu16(value0, wsize); + result1 = _mm_subs_epu16(value1, wsize); + _mm_store_si128((__m128i *)table, result0); + _mm_store_si128((__m128i *)(table + 8), result1); + + table -= 16; + entries -= 16; + } while (entries > 0); + + ++on_chain; + if (on_chain > 1) { + return; + } else { + goto next_chain; + } +} + +Z_INTERNAL void slide_hash_sse2(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + const __m128i xmm_wsize = _mm_set1_epi16((short)wsize); + + assert(((uintptr_t)s->head & 15) == 0); + assert(((uintptr_t)s->prev & 15) == 0); + + slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize); +} + +#endif diff --git a/neozip/arch/x86/x86_features.c b/neozip/arch/x86/x86_features.c new file mode 100644 index 0000000000..5eba18bf8a --- /dev/null +++ b/neozip/arch/x86/x86_features.c @@ -0,0 +1,128 @@ +/* x86_features.c - x86 feature check + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Author: + * Jim Kukunas + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_FEATURES + +#include "zbuild.h" +#include "x86_features.h" + +#if defined(HAVE_CPUID_MS) +# include <intrin.h> +#elif defined(HAVE_CPUID_GNU) +// Newer versions of GCC and clang come with cpuid.h +# include <cpuid.h> +# ifdef X86_HAVE_XSAVE_INTRIN +# if __GNUC__ == 8 +# include <xsaveintrin.h> +# else +# include <immintrin.h> +# endif +# endif +#endif + +static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) { +#if defined(HAVE_CPUID_MS) + unsigned int registers[4]; + __cpuid((int *)registers, info); + + *eax = registers[0]; + *ebx = registers[1]; + *ecx = registers[2]; + *edx = registers[3]; +#elif defined(HAVE_CPUID_GNU) + *eax = *ebx = *ecx = *edx = 0; + __cpuid(info, *eax, *ebx, *ecx, *edx); +#else + /* When using this fallback, the faster SSE/AVX code is disabled */ + *eax = *ebx = *ecx = *edx = 0; +#endif +} + +static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) { +#if defined(HAVE_CPUID_MS) + unsigned int registers[4]; + __cpuidex((int *)registers, info, subinfo); + + *eax = registers[0]; + *ebx = registers[1]; + *ecx = registers[2]; + *edx = registers[3]; +#elif defined(HAVE_CPUID_GNU) + *eax = *ebx = *ecx = *edx = 0; + __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx); +#else + /* When using this fallback, the faster SSE/AVX code is disabled */ + *eax = *ebx = *ecx = *edx = 0; +#endif +} + +static inline uint64_t xgetbv(unsigned int xcr) { +#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN) + return _xgetbv(xcr); +#elif defined(__GNUC__) + uint32_t eax, edx; + __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (uint64_t)(edx) << 32 | eax; +#else + /* When using this fallback, some of the faster code is disabled */ + return 0; +#endif +} + +void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) { + unsigned eax, ebx, ecx, edx; + unsigned maxbasic; + + cpuid(0, &maxbasic, &ebx, &ecx, &edx); + cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); + + features->has_sse2 = edx & 0x4000000; + features->has_ssse3 = ecx & 0x200; + features->has_sse41 = ecx & 0x80000; + features->has_sse42 = ecx & 0x100000; + features->has_pclmulqdq = ecx & 0x2; + + if (ecx & 0x08000000) { + uint64_t xfeature = xgetbv(0); + + features->has_os_save_ymm = ((xfeature & 0x06) == 0x06); + features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6); + } + + if (maxbasic >= 7) { + // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf + cpuidex(7, 0, &eax, &ebx, &ecx, &edx); + + // check BMI2 bit + features->has_bmi2 = ebx & 0x100; + + // check AVX2 bit if the OS supports saving YMM registers + if (features->has_os_save_ymm) { + features->has_avx2 = ebx & 0x20; + features->has_vpclmulqdq = ecx & 0x400; + } + + // check AVX512 bits if the OS supports saving ZMM registers + if (features->has_os_save_zmm) { + features->has_avx512f = ebx & 0x00010000; + if (features->has_avx512f) { + // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable + // AVX512(DQ,BW,VL). + features->has_avx512dq = ebx & 0x00020000; + features->has_avx512bw = ebx & 0x40000000; + features->has_avx512vl = ebx & 0x80000000; + } + features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \ + && features->has_avx512vl && features->has_bmi2; + features->has_avx512vnni = ecx & 0x800; + } + } +} + +#endif diff --git a/neozip/arch/x86/x86_features.h b/neozip/arch/x86/x86_features.h new file mode 100644 index 0000000000..2118b8e87a --- /dev/null +++ b/neozip/arch/x86/x86_features.h @@ -0,0 +1,30 @@ +/* x86_features.h -- check for CPU features + * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_FEATURES_H_ +#define X86_FEATURES_H_ + +struct x86_cpu_features { + int has_avx2; + int has_avx512f; + int has_avx512dq; + int has_avx512bw; + int has_avx512vl; + int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled. + int has_avx512vnni; + int has_bmi2; + int has_sse2; + int has_ssse3; + int has_sse41; + int has_sse42; + int has_pclmulqdq; + int has_vpclmulqdq; + int has_os_save_ymm; + int has_os_save_zmm; +}; + +void Z_INTERNAL x86_check_features(struct x86_cpu_features *features); + +#endif /* X86_FEATURES_H_ */ diff --git a/neozip/arch/x86/x86_functions.h b/neozip/arch/x86/x86_functions.h new file mode 100644 index 0000000000..881c6efe23 --- /dev/null +++ b/neozip/arch/x86/x86_functions.h @@ -0,0 +1,196 @@ +/* x86_functions.h -- x86 implementations for arch-specific functions. + * Copyright (C) 2013 Intel Corporation Jim Kukunas + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_FUNCTIONS_H_ +#define X86_FUNCTIONS_H_ + +#include "x86_natives.h" + +/* So great news, your compiler is broken and causes stack smashing. Rather than + * notching out its compilation we'll just remove the assignment in the functable. + * Further context: + * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */ +#if defined(_MSC_VER) && defined(ARCH_32BIT) && _MSC_VER >= 1920 && _MSC_VER <= 1929 +#define NO_CHORBA_SSE +#endif + +#ifdef X86_SSE2 +uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); +uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match); +void slide_hash_sse2(deflate_state *s); + +# if !defined(WITHOUT_CHORBA_SSE) + uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len); + uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); + uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len); +# endif +#endif + +#ifdef X86_SSSE3 +uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left); +void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); +#endif + +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) + uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len); + uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_SSE42 +uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_AVX2 +uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start); +uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match); +void slide_hash_avx2(deflate_state *s); +#endif +#ifdef X86_AVX512 +uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start); +uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match); +#endif +#ifdef X86_AVX512VNNI +uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef X86_PCLMULQDQ_CRC +uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_VPCLMULQDQ_AVX2 +uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef X86_VPCLMULQDQ_AVX512 +uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// X86 - SSE2 +# ifdef X86_SSE2_NATIVE +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_sse2 +# undef native_compare256 +# define native_compare256 compare256_sse2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_sse2 +# undef native_longest_match +# define native_longest_match longest_match_sse2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_sse2 +# if !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse2 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_chorba_sse2 +# endif +# undef native_slide_hash +# define native_slide_hash slide_hash_sse2 +# endif +// X86 - SSSE3 +# ifdef X86_SSSE3_NATIVE +# undef native_adler32 +# define native_adler32 adler32_ssse3 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_ssse3 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_ssse3 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_ssse3 +# endif +// X86 - SSE4.1 +# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse41 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_chorba_sse41 +# endif +// X86 - SSE4.2 +# ifdef X86_SSE42_NATIVE +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_sse42 +# endif +// X86 - PCLMUL +# ifdef X86_PCLMULQDQ_NATIVE +# undef native_crc32 +# define native_crc32 crc32_pclmulqdq +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_pclmulqdq +# endif +// X86 - AVX2 +# ifdef X86_AVX2_NATIVE +# undef native_adler32 +# define native_adler32 adler32_avx2 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx2 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_avx2 +# undef native_compare256 +# define native_compare256 compare256_avx2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_avx2 +# undef native_longest_match +# define native_longest_match longest_match_avx2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx2 +# undef native_slide_hash +# define native_slide_hash slide_hash_avx2 +# endif +// X86 - AVX512 (F,DQ,BW,Vl) +# ifdef X86_AVX512_NATIVE +# undef native_adler32 +# define native_adler32 adler32_avx512 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx512 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_avx512 +# undef native_compare256 +# define native_compare256 compare256_avx512 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_avx512 +# undef native_longest_match +# define native_longest_match longest_match_avx512 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx512 +// X86 - AVX512 (VNNI) +# ifdef X86_AVX512VNNI_NATIVE +# undef native_adler32 +# define native_adler32 adler32_avx512_vnni +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_avx512_vnni +# endif +# endif +// X86 - VPCLMULQDQ +# ifdef X86_VPCLMULQDQ_AVX512_NATIVE +# undef native_crc32 +# define native_crc32 crc32_vpclmulqdq_avx512 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_vpclmulqdq_avx512 +# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE) +# undef native_crc32 +# define native_crc32 crc32_vpclmulqdq_avx2 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_vpclmulqdq_avx2 +# endif +#endif + +#endif /* X86_FUNCTIONS_H_ */ diff --git a/neozip/arch/x86/x86_intrins.h b/neozip/arch/x86/x86_intrins.h new file mode 100644 index 0000000000..1d1df5eb11 --- /dev/null +++ b/neozip/arch/x86/x86_intrins.h @@ -0,0 +1,126 @@ +#ifndef X86_INTRINS_H +#define X86_INTRINS_H + +#ifdef __SSE2__ +#include <emmintrin.h> +#endif + +/* Unfortunately GCC didn't support these things until version 10. + * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3. + */ +#ifdef __AVX2__ +#include <immintrin.h> + +#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \ + || (defined(__apple_build_version__) && __apple_build_version__ < 9020039) +static inline __m256i _mm256_zextsi128_si256(__m128i a) { + __m128i r; + __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a)); + return _mm256_castsi128_si256(r); +} + +#ifdef __AVX512F__ +static inline __m512i _mm512_zextsi128_si512(__m128i a) { + __m128i r; + __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a)); + return _mm512_castsi128_si512(r); +} +#endif // __AVX512F__ +#endif // gcc/AppleClang version test + +#endif // __AVX2__ + +/* GCC <9 is missing some AVX512 intrinsics. + */ +#ifdef __AVX512F__ +#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9) +#include <immintrin.h> + +#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \ + ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3))) + +static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60, + char __q59, char __q58, char __q57, char __q56, + char __q55, char __q54, char __q53, char __q52, + char __q51, char __q50, char __q49, char __q48, + char __q47, char __q46, char __q45, char __q44, + char __q43, char __q42, char __q41, char __q40, + char __q39, char __q38, char __q37, char __q36, + char __q35, char __q34, char __q33, char __q32, + char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) { + return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56), + PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48), + PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40), + PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32), + PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24), + PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16), + PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08), + PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00)); +} + +#undef PACK + +#endif // gcc version test +#endif // __AVX512F__ + +/* Missing zero-extension AVX and AVX512 intrinsics. + * Fixed in Microsoft Visual Studio 2017 version 15.7 + * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737 + */ +#if defined(_MSC_VER) && _MSC_VER < 1914 +#ifdef __AVX2__ +static inline __m256i _mm256_zextsi128_si256(__m128i a) { + return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0); +} +#endif // __AVX2__ + +#ifdef __AVX512F__ +static inline __m512i _mm512_zextsi128_si512(__m128i a) { + return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0); +} +#endif // __AVX512F__ +#endif // defined(_MSC_VER) && _MSC_VER < 1914 + +/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */ +#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8) +# undef _mm512_extracti32x4_epi32 +# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1) +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +#include <intrin.h> +/* For whatever reason this intrinsic is 64 bit only with MSVC? + * While we don't have 64 bit GPRs, it should at least be able to move it to stack + * or shuffle it over 2 registers */ +#ifdef ARCH_32BIT +/* So, while we can't move directly to a GPR, hopefully this move to + * a stack resident variable doesn't equate to something awful */ +static inline int64_t _mm_cvtsi128_si64(__m128i a) { + union { __m128i v; int64_t i; } u; + u.v = a; + return u.i; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) { + return _mm_set_epi64x(0, a); +} +#endif +#endif + +#if defined(__GNUC__) && defined(ARCH_X86) && defined(ARCH_32BIT) && !defined(__clang__) +static inline int64_t _mm_cvtsi128_si64(__m128i a) { + union { __m128i v; int64_t i; } u; + u.v = a; + return u.i; +} +#define _mm_cvtsi64_si128(a) _mm_set_epi64x(0, a) +#endif + +#endif // include guard X86_INTRINS_H diff --git a/neozip/arch/x86/x86_natives.h b/neozip/arch/x86/x86_natives.h new file mode 100644 index 0000000000..a39b7a51f0 --- /dev/null +++ b/neozip/arch/x86/x86_natives.h @@ -0,0 +1,57 @@ +/* x86_natives.h -- x86 compile-time feature detection macros. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_NATIVES_H_ +#define X86_NATIVES_H_ + +#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT)) +# ifdef X86_SSE2 +# define X86_SSE2_NATIVE +# endif +#endif +#if defined(__SSSE3__) +# ifdef X86_SSSE3 +# define X86_SSSE3_NATIVE +# endif +#endif +#if defined(__SSE4_1__) +# ifdef X86_SSE41 +# define X86_SSE41_NATIVE +# endif +#endif +#if defined(__SSE4_2__) +# ifdef X86_SSE42 +# define X86_SSE42_NATIVE +# endif +#endif +#if defined(__PCLMUL__) +# ifdef X86_PCLMULQDQ_CRC +# define X86_PCLMULQDQ_NATIVE +# endif +#endif +#if defined(__AVX2__) +# ifdef X86_AVX2 +# define X86_AVX2_NATIVE +# endif +#endif +#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# ifdef X86_AVX512 +# define X86_AVX512_NATIVE +# endif +#endif +#if defined(__AVX512VNNI__) +# ifdef X86_AVX512VNNI +# define X86_AVX512VNNI_NATIVE +# endif +#endif +#if defined(__VPCLMULQDQ__) +# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE) +# define X86_VPCLMULQDQ_AVX2_NATIVE +# endif +# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE) +# define X86_VPCLMULQDQ_AVX512_NATIVE +# endif +#endif + +#endif /* X86_NATIVES_H_ */ |
