summaryrefslogtreecommitdiff
path: root/neozip/arch/x86
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/x86
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/x86')
-rw-r--r--neozip/arch/x86/Makefile.in176
-rw-r--r--neozip/arch/x86/adler32_avx2.c172
-rw-r--r--neozip/arch/x86/adler32_avx2_p.h32
-rw-r--r--neozip/arch/x86/adler32_avx512.c102
-rw-r--r--neozip/arch/x86/adler32_avx512_p.h57
-rw-r--r--neozip/arch/x86/adler32_avx512_vnni.c205
-rw-r--r--neozip/arch/x86/adler32_sse42.c117
-rw-r--r--neozip/arch/x86/adler32_ssse3.c149
-rw-r--r--neozip/arch/x86/adler32_ssse3_p.h29
-rw-r--r--neozip/arch/x86/chunkset_avx2.c129
-rw-r--r--neozip/arch/x86/chunkset_avx512.c186
-rw-r--r--neozip/arch/x86/chunkset_sse2.c50
-rw-r--r--neozip/arch/x86/chunkset_ssse3.c72
-rw-r--r--neozip/arch/x86/compare256_avx2.c61
-rw-r--r--neozip/arch/x86/compare256_avx512.c87
-rw-r--r--neozip/arch/x86/compare256_sse2.c86
-rw-r--r--neozip/arch/x86/crc32_chorba_sse2.c872
-rw-r--r--neozip/arch/x86/crc32_chorba_sse41.c332
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq.c31
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq_tpl.h708
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx2.c17
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx512.c17
-rw-r--r--neozip/arch/x86/slide_hash_avx2.c48
-rw-r--r--neozip/arch/x86/slide_hash_sse2.c68
-rw-r--r--neozip/arch/x86/x86_features.c128
-rw-r--r--neozip/arch/x86/x86_features.h30
-rw-r--r--neozip/arch/x86/x86_functions.h196
-rw-r--r--neozip/arch/x86/x86_intrins.h126
-rw-r--r--neozip/arch/x86/x86_natives.h57
29 files changed, 4340 insertions, 0 deletions
diff --git a/neozip/arch/x86/Makefile.in b/neozip/arch/x86/Makefile.in
new file mode 100644
index 0000000000..f756844a9f
--- /dev/null
+++ b/neozip/arch/x86/Makefile.in
@@ -0,0 +1,176 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# Toolchain and flag knobs; the configure script fills these in.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
+AVX2FLAG=-mavx2 -mbmi2
+SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
+SSE41FLAG=-msse4.1
+SSE42FLAG=-msse4.2
+PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# Utility targets are not files; mark them phony so a stray file with the
+# same name cannot silently shadow them.
+.PHONY: all mostlyclean clean distclean
+
+all: \
+	x86_features.o x86_features.lo \
+	adler32_avx2.o adler32_avx2.lo \
+	adler32_avx512.o adler32_avx512.lo \
+	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_avx512.o chunkset_avx512.lo \
+	chunkset_sse2.o chunkset_sse2.lo \
+	chunkset_ssse3.o chunkset_ssse3.lo \
+	compare256_avx2.o compare256_avx2.lo \
+	compare256_avx512.o compare256_avx512.lo \
+	compare256_sse2.o compare256_sse2.lo \
+	crc32_chorba_sse2.o crc32_chorba_sse2.lo \
+	crc32_chorba_sse41.o crc32_chorba_sse41.lo \
+	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+	crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \
+	crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \
+	slide_hash_avx2.o slide_hash_avx2.lo \
+	slide_hash_sse2.o slide_hash_sse2.lo
+
+# Every rule declares its source file as a prerequisite so that editing the
+# source actually triggers a rebuild.  Previously only the adler32_* rules
+# declared their sources; the remaining objects were never considered out
+# of date once built.
+x86_features.o: $(SRCDIR)/x86_features.c
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo: $(SRCDIR)/x86_features.c
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+chunkset_avx2.o: $(SRCDIR)/chunkset_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx2.lo: $(SRCDIR)/chunkset_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx512.o: $(SRCDIR)/chunkset_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo: $(SRCDIR)/chunkset_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_sse2.o: $(SRCDIR)/chunkset_sse2.c
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo: $(SRCDIR)/chunkset_sse2.c
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_ssse3.o: $(SRCDIR)/chunkset_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+chunkset_ssse3.lo: $(SRCDIR)/chunkset_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+compare256_avx2.o: $(SRCDIR)/compare256_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx2.lo: $(SRCDIR)/compare256_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx512.o: $(SRCDIR)/compare256_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_avx512.lo: $(SRCDIR)/compare256_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_sse2.o: $(SRCDIR)/compare256_sse2.c
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo: $(SRCDIR)/compare256_sse2.c
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+crc32_chorba_sse2.o: $(SRCDIR)/crc32_chorba_sse2.c
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse2.lo: $(SRCDIR)/crc32_chorba_sse2.c
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse41.o: $(SRCDIR)/crc32_chorba_sse41.c
+	$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_chorba_sse41.lo: $(SRCDIR)/crc32_chorba_sse41.c
+	$(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_pclmulqdq.o: $(SRCDIR)/crc32_pclmulqdq.c
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_pclmulqdq.lo: $(SRCDIR)/crc32_pclmulqdq.c
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_vpclmulqdq_avx2.o: $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx2.lo: $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx512.o: $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+crc32_vpclmulqdq_avx512.lo: $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+slide_hash_avx2.o: $(SRCDIR)/slide_hash_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_avx2.lo: $(SRCDIR)/slide_hash_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_sse2.o: $(SRCDIR)/slide_hash_sse2.c
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+slide_hash_sse2.lo: $(SRCDIR)/slide_hash_sse2.c
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/x86/adler32_avx2.c b/neozip/arch/x86/adler32_avx2.c
new file mode 100644
index 0000000000..d1811b254d
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2.c
@@ -0,0 +1,172 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include <immintrin.h>
+#include "adler32_p.h"
+#include "adler32_avx2_p.h"
+#include "x86_intrins.h"
+
+extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+
+/* Computes the Adler-32 checksum of src[0..len) with 256-bit vectors and,
+ * when COPY is non-zero, simultaneously copies the bytes to dst.  COPY is a
+ * compile-time constant, so the copy branches fold away in each of the two
+ * wrapper instantiations below.  adler is the running checksum in the usual
+ * packed (s2 << 16) | s1 form, which is also the form of the return value. */
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit component sums. */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    /* Small inputs — and the final tail when the main loop jumps back up
+     * here — are handed off to narrower implementations. */
+    if (len < 16) {
+        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+    } else if (len < 32) {
+        if (COPY) {
+            return adler32_copy_sse42(adler, dst, src, len);
+        } else {
+            return adler32_ssse3(adler, src, len);
+        }
+    }
+
+    __m256i vs1, vs2, vs2_0;
+
+    /* Descending positional weights 64..33 and 32..1 for the two 32-byte
+     * halves of a 64-byte iteration; dot3v then folds the 16-bit partial
+     * products into 32-bit lanes. */
+    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
+    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot3v = _mm256_set1_epi16(1);
+    const __m256i zero = _mm256_setzero_si256();
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        vs2_0 = vs3;
+
+        /* Process at most NMAX bytes (rounded down to a whole vector) per
+         * pass so the 32-bit lanes cannot overflow before the reduction. */
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+        len -= k;
+
+        while (k >= 64) {
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
+            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
+                dst += 64;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        /* Fold the second accumulator in and apply the deferred x64 (shift
+         * by 6) multiplier for the 64-byte-stride loop above. */
+        vs2 = _mm256_add_epi32(vs2_0, vs2);
+        vs3 = _mm256_slli_epi32(vs3, 6);
+        vs2 = _mm256_add_epi32(vs3, vs2);
+        vs3 = _mm256_setzero_si256();
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                dst += 32;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Defer the multiplication with 32 to outside of the loop */
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+
+        /* The compiler is generating the following sequence for this integer modulus
+         * when done the scalar way, in GPRs:
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov $0x80078071,%edi // move magic constant into 32 bit register %edi
+         ...
+         vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+         mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
+         imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+         shr $0x2f,%rsi // shift right by 47
+         imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, do a relatively cheap sequence for horizontal sums.
+         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+         performed on the maximum possible inputs before overflow
+         */
+
+
+        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+         * what the compiler is doing to avoid integer divisions. */
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process the remaining tail (len < 32). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+/* Checksum-only entry point: COPY == 0, so dst is unused. */
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+/* Checksum while copying len bytes from src to dst.
+ * NOTE(review): dst must have room for len bytes — confirm at call sites. */
+Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx2_p.h b/neozip/arch/x86/adler32_avx2_p.h
new file mode 100644
index 0000000000..f0f8a4a887
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library.
+ * Adds all eight 32-bit lanes of x and returns the scalar total. */
+static inline uint32_t hsum256(__m256i x) {
+    /* Fold 256 -> 128 -> 64 -> 32 bits with three vector adds. */
+    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+                                 _mm256_castsi256_si128(x));
+    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+/* Horizontal sum for vectors where only the even 32-bit lanes (0, 2, 4, 6)
+ * carry data — the layout produced by _mm256_sad_epu8 above — so only four
+ * lanes need to be gathered and added. */
+static inline uint32_t partial_hsum256(__m256i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros */
+    const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+    __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+    __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+    __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512.c b/neozip/arch/x86/adler32_avx512.c
new file mode 100644
index 0000000000..8a8e165bb9
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512.c
@@ -0,0 +1,102 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+
+/* Computes the Adler-32 checksum of src[0..len) with 512-bit vectors and,
+ * when COPY is non-zero, simultaneously copies the bytes to dst.  COPY is a
+ * compile-time constant, so each wrapper below compiles to a specialized
+ * copy/no-copy version. */
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit component sums. */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 64) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        if (COPY && len) {
+            /* Masked load/store copies exactly len (1..63) bytes without
+             * touching memory past either buffer. */
+            __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+            __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+            _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+        }
+
+        return adler32_avx2(adler, src, len);
+    }
+
+    __m512i vbuf, vs1_0, vs3;
+
+    /* Descending positional weights 64..1 (set_epi8 takes arguments from
+     * the highest lane down); dot3v folds the 16-bit partial products. */
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+    const __m512i dot3v = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+
+    while (len >= 64) {
+        __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        vs1_0 = vs1;
+        vs3 = _mm512_setzero_si512();
+
+        /* Process at most NMAX bytes (rounded down to a whole vector) per
+         * pass so the 32-bit lanes cannot overflow before the reduction. */
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+        len -= k;
+
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf = _mm512_loadu_si512(src);
+
+            if (COPY) {
+                _mm512_storeu_si512(dst, vbuf);
+                dst += 64;
+            }
+
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+            __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+            vs1 = _mm512_add_epi32(vs1_sad, vs1);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm512_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Apply the deferred x64 multiplier (shift by 6) for the 64-byte
+         * stride, once per NMAX pass instead of per iteration. */
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+/* Checksum-only entry point: COPY == 0, so dst is unused. */
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+/* Checksum while copying len bytes from src to dst.
+ * NOTE(review): dst must have room for len bytes — confirm at call sites. */
+Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_p.h b/neozip/arch/x86/adler32_avx512_p.h
new file mode 100644
index 0000000000..742269053c
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,57 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */
+#if defined(_MSC_VER) && !defined(_MM_K0_REG8)
+/* Re-map the affected intrinsics to their maskz forms with all-ones masks,
+ * which compute the same result through a different macro expansion. */
+# undef _mm512_extracti64x4_epi64
+# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1)
+# undef _mm512_set1_epi16
+# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1)
+# undef _mm512_maddubs_epi16
+# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2)
+#endif
+
+/* Written because *_add_epi32(a) sets off ubsan */
+/* Sums all sixteen 32-bit lanes of x by halving the vector width four
+ * times (512 -> 256 -> 128 -> 64 -> 32 bits).
+ * NOTE(review): the leading-underscore name mimics the intrinsic it
+ * replaces but is formally reserved in C — consider renaming. */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+    __m256i a = _mm512_extracti64x4_epi64(x, 1);
+    __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+    __m256i a_plus_b = _mm256_add_epi32(a, b);
+    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+    __m128i c_plus_d = _mm_add_epi32(c, d);
+
+    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+    return _mm_cvtsi128_si32(sum4);
+}
+
+/* Horizontal sum for vectors where only the even 32-bit lanes hold data
+ * (the layout produced by _mm512_sad_epu8): gathers the eight even lanes
+ * and reduces them to a scalar. */
+static inline uint32_t partial_hsum(__m512i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros. Marking this const so the compiler stands
+     * a better chance of keeping this resident in a register through entire
+     * loop execution. We certainly have enough zmm registers (32) */
+    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+                                               1, 1, 1, 1, 1, 1, 1, 1);
+
+    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+    /* From here, it's a simple 256 bit wide reduction sum */
+    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+     * pretty slow, much slower than the longer instruction sequence below */
+    __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+                                 _mm256_castsi256_si128(non_zero_avx));
+    __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_vnni.c b/neozip/arch/x86/adler32_avx512_vnni.c
new file mode 100644
index 0000000000..8bebffbf88
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,205 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+/* Computes the Adler-32 checksum of src[0..len) using the AVX512-VNNI
+ * dpbusd instruction, which fuses the byte-weighting multiply and the
+ * 32-bit accumulate into one step.  adler is the running checksum in the
+ * packed (s2 << 16) | s1 form, which is also the form returned. */
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit component sums. */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    /* Small inputs — and the final tail when the main loop jumps back up
+     * here — are handed off to narrower implementations. */
+    if (len < 32)
+        return adler32_ssse3(adler, src, len);
+
+    if (len < 64)
+        return adler32_avx2(adler, src, len);
+
+    /* Descending positional weights 64..1 (set_epi8 takes arguments from
+     * the highest lane down). */
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+    const __m512i zero = _mm512_setzero_si512();
+    __m512i vs1, vs2;
+
+    while (len >= 64) {
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        /* Cap each pass at NMAX bytes (rounded down to a whole vector) so
+         * the 32-bit lanes cannot overflow before the reduction. */
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+        len -= k;
+        __m512i vs1_0 = vs1;
+        __m512i vs3 = _mm512_setzero_si512();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m512i vs2_1 = _mm512_setzero_si512();
+        __m512i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 128) {
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+        while (k >= 128) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
+            k -= 128;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm512_add_epi32(vs3, vs1);
+            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Apply the deferred x64 multiplier (shift by 6) and fold in the
+         * second accumulator. */
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+        vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+/* Use 256-bit vectors when copying because 512-bit variant is slower. */
+/* Checksum while copying len bytes from src to dst, using the 256-bit
+ * dpbusd form.  NOTE(review): dst must have room for len bytes — confirm
+ * at call sites. */
+Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit component sums. */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        /* NOTE(review): if len == 0 and unsigned long is 32 bits wide, the
+         * shift by 32 below is undefined behavior — confirm callers never
+         * pass len == 0 (the AVX512 variant guards with `COPY && len`). */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+        return adler32_ssse3(adler, src, len);
+    }
+
+    /* Descending positional weights 32..1 (set_epi8 takes arguments from
+     * the highest lane down). */
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+
+        /* Cap each pass at NMAX bytes (rounded down to a whole vector) so
+         * the 32-bit lanes cannot overflow before the reduction. */
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+        len -= k;
+
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Apply the deferred x32 multiplier (shift by 5) and fold in the
+         * second accumulator. */
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 32). */
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler;
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_sse42.c b/neozip/arch/x86/adler32_sse42.c
new file mode 100644
index 0000000000..c2301213f0
--- /dev/null
+++ b/neozip/arch/x86/adler32_sse42.c
@@ -0,0 +1,117 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE42
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+/* Checksum while copying len bytes from src to dst using 128-bit vectors.
+ * adler is the running checksum in the packed (s2 << 16) | s1 form, which
+ * is also the form returned.
+ * NOTE(review): dst must have room for len bytes — confirm at call sites. */
+Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit component sums. */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    /* Sub-vector inputs — and the final tail when the main loop jumps back
+     * up here — are handled bytewise. */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1);
+
+    __m128i vbuf, vbuf_0;
+    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            v_sad_sum2, vsum2, vsum2_0;
+    __m128i zero = _mm_setzero_si128();
+    /* Descending positional weights 32..17 and 16..1 for the two 16-byte
+     * halves of a 32-byte iteration; dot3v folds the 16-bit products. */
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+
+    while (len >= 16) {
+        /* Process at most NMAX bytes (rounded down to a whole vector) per
+         * pass so the 32-bit lanes cannot overflow before the reduction. */
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 16);
+        len -= k;
+
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        /* Fold in the second accumulator and apply the deferred x32 (shift
+         * by 5) multiplier for the 32-byte-stride loop above. */
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+        }
+
+        /* Deferred x16 multiplier (shift by 4) for the 16-byte stride. */
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+    }
+
+    /* If this is true, there's fewer than 16 elements remaining */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3.c b/neozip/arch/x86/adler32_ssse3.c
new file mode 100644
index 0000000000..702db50251
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3.c
@@ -0,0 +1,149 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+ /* Byte weights for the sum2 term: 32..17 cover the first half of a 32 byte
+ * block, 16..1 cover the second half (and any standalone 16 byte block). */
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ /* madd against all-ones words widens the 16 bit partial products to 32 bit lanes */
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+ /* If our buffer is unaligned (likely), make the determination whether
+ * or not there's enough of a buffer to consume to make the scalar, aligning
+ * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+ * load. This is a pretty simple test, just test if len < 32 */
+ size_t n = NMAX;
+ size_t k = 0;
+
+ if (len < 32) {
+ /* Let's eat the cost of this one unaligned load so that
+ * we don't completely skip over the vectorization. Doing
+ * 16 bytes at a time unaligned is better than 16 + <= 15
+ * sums */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ /* jump straight into the 16-byte loop body with vbuf preloaded */
+ goto unaligned_jmp;
+ }
+
+ /* Scalar-sum up to the next 16 byte boundary so the loops below can use aligned loads */
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ if (align_diff) {
+ adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
+
+ while (len >= 16) {
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ /* n starts at NMAX: the byte budget consumed between the
+ * "% BASE" reductions below, keeping the 32 bit lanes from overflowing */
+ k = ALIGN_DOWN(MIN(len, n), 16);
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+ buf += 32;
+ k -= 32;
+
+ /* psadbw against zero sums the 16 bytes of each half into lanes 0/2 */
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ /* vs3 accumulates the previous iteration's vs1; its weight is applied after the loop */
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ /* Each accumulated vs1 counted 32 bytes, so it contributes 32x (<< 5) to sum2 */
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ /* Here each accumulated vs1 counted 16 bytes, hence the 16x (<< 4) weight */
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+ n = NMAX;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0);
+}
+
+/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
+Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ /* Checksum the source buffer, then mirror it into dst. */
+ const uint32_t checksum = adler32_ssse3(adler, src, len);
+ memcpy(dst, src, len);
+ return checksum;
+}
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3_p.h b/neozip/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 0000000000..d7ec3fe0d5
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+ /* psadbw deposits its partial sums in 32 bit lanes 0 and 2, so folding the
+ * upper 64 bits onto the lower half is the whole reduction. */
+ __m128i upper_pair = _mm_unpackhi_epi64(x, x);
+ __m128i folded = _mm_add_epi32(x, upper_pair);
+ return (uint32_t)_mm_cvtsi128_si32(folded);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ /* Full horizontal add of all four 32 bit lanes. */
+ __m128i swap_halves = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
+ __m128i pair_sums = _mm_add_epi32(x, swap_halves);
+ __m128i odd_lane = _mm_shuffle_epi32(pair_sums, _MM_SHUFFLE(2, 3, 0, 1));
+ __m128i total = _mm_add_epi32(pair_sums, odd_lane);
+ return (uint32_t)_mm_cvtsi128_si32(total);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx2.c b/neozip/arch/x86/chunkset_avx2.c
new file mode 100644
index 0000000000..3e69a7bf66
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx2.c
@@ -0,0 +1,129 @@
+/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+/* Broadcast a 2 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+/* Broadcast a 4 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+/* Broadcast an 8 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+/* Broadcast a 16 byte pattern from `from` into both 128 bit halves of the chunk. */
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* See explanation in chunkset_avx512.c */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+/* Unaligned 32 byte load of a chunk from `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+/* Unaligned 32 byte store of a chunk to `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+/* Build a 32 byte chunk that repeats a `dist` byte pattern starting at `buf`.
+ * dist >= 3 (perm_idx_lut index) and dist < 32 (unpoison length below);
+ * NOTE(review): presumably guaranteed by the chunkset_tpl.h caller -- confirm. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+ * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+ * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ /* Permute indices < 16 select from the first 16 source bytes (lane crossing) */
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
+/* Unaligned 16 byte load of a half chunk from `s`. */
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+/* Unaligned 16 byte store of a half chunk to `out`. */
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+/* 16 byte variant of GET_CHUNK_MAG. Note that chunk_rem comes from
+ * half_rem_vals here; lut_rem supplies only the permute-table index. */
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* See note in chunkset_ssse3.c: these bytes are deliberately read uninitialized */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_avx2
+#define CHUNKCOPY chunkcopy_avx2
+#define CHUNKUNROLL chunkunroll_avx2
+#define CHUNKMEMSET chunkmemset_avx2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx512.c b/neozip/arch/x86/chunkset_avx512.c
new file mode 100644
index 0000000000..60450c653b
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx512.c
@@ -0,0 +1,186 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+/* Mask with the low `len` bits set (len <= 16). bzhi saturates for large
+ * indices, avoiding the undefined shift a (1 << len) - 1 construction has. */
+static inline halfmask_t gen_half_mask(size_t len) {
+ return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len);
+}
+
+/* Mask with the low `len` bits set (len <= 32). */
+static inline mask_t gen_mask(size_t len) {
+ return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len);
+}
+
+/* Broadcast a 2 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+/* Broadcast a 4 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+/* Broadcast an 8 byte pattern from `from` across the whole 32 byte chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+/* Broadcast a 16 byte pattern from `from` into both 128 bit halves of the chunk. */
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where
+ * the load is dumped to the stack with an aligned move for this memory-register
+ * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to
+ * stack doesn't exist if compiled with optimizations. For the sake of working
+ * properly in a debugger, let's take the 2 cycle penalty */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+/* Unaligned 32 byte load of a chunk from `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+/* Unaligned 32 byte store of a chunk to `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+/* Copy `len` bytes from `from` to `out` in 32 byte chunks; returns the first
+ * byte past the copied region. Masked ops keep every access within `len`. */
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+
+ chunk_t chunk;
+ size_t rem = len % sizeof(chunk_t);
+
+ /* Short copy: a single masked load/store touches exactly `len` bytes. */
+ if (len < sizeof(chunk_t)) {
+ mask_t rem_mask = gen_mask(rem);
+ chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
+ _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ return out + rem;
+ }
+
+ /* Store a full chunk but advance by only `rem`: the remaining length becomes
+ * an exact multiple of the chunk size, so the loop below ends precisely at
+ * out + len (the first full store overlaps the next one by design). */
+ loadchunk(from, &chunk);
+ rem = (rem == 0) ? sizeof(chunk_t) : rem;
+ storechunk(out, &chunk);
+ out += rem;
+ from += rem;
+ len -= rem;
+
+ while (len > 0) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += sizeof(chunk_t);
+ from += sizeof(chunk_t);
+ len -= sizeof(chunk_t);
+ }
+
+ return out;
+}
+
+/* MSVC compiler decompression bug when optimizing for size */
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", off)
+#endif
+/* Build a 32 byte chunk repeating a `dist` byte pattern from `buf`.
+ * dist >= 3 (perm_idx_lut index); masked loads bound every heap read. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ *chunk_rem = lut_rem.remval;
+
+ /* See the AVX2 implementation for more detailed comments. This is that + some masked
+ * loads to avoid an out of bounds read on the heap */
+
+ if (dist < 16) {
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ halfmask_t load_mask = gen_half_mask(dist);
+ __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ halfmask_t load_mask = gen_half_mask(dist - 16);
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ /* Indices <= 15 pull from the first 16 source bytes (lane crossing);
+ * the masked shuffle blends those over ret_vec1 in one step. */
+ halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+ __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", on)
+#endif
+
+/* Unaligned 16 byte store of a half chunk to `out`. */
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+/* 16 byte variant of GET_CHUNK_MAG; the masked load bounds the read to `dist`
+ * bytes. chunk_rem comes from half_rem_vals, lut_rem only supplies the index. */
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ halfmask_t load_mask = gen_half_mask(dist);
+ ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+ halfchunk_t piece;
+
+ /* An exact multiple of the half chunk still moves a whole half chunk;
+ * anything else moves just the tail remainder. */
+ size_t tail = len % sizeof(halfchunk_t);
+ tail = (tail == 0) ? sizeof(halfchunk_t) : tail;
+
+ /* Masked load/store touch exactly `tail` bytes, so no out of bounds access. */
+ halfmask_t tail_mask = gen_half_mask(tail);
+ piece = _mm_maskz_loadu_epi8(tail_mask, from);
+ _mm_mask_storeu_epi8(out, tail_mask, piece);
+
+ return out + tail;
+}
+
+#define CHUNKSIZE chunksize_avx512
+#define CHUNKUNROLL chunkunroll_avx512
+#define CHUNKMEMSET chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_sse2.c b/neozip/arch/x86/chunkset_sse2.c
new file mode 100644
index 0000000000..633ab6e64f
--- /dev/null
+++ b/neozip/arch/x86/chunkset_sse2.c
@@ -0,0 +1,50 @@
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Broadcast a 2 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+/* Broadcast a 4 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+/* Broadcast an 8 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+/* Unaligned 16 byte load of a chunk from `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+/* Unaligned 16 byte store of a chunk to `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE chunksize_sse2
+#define CHUNKCOPY chunkcopy_sse2
+#define CHUNKUNROLL chunkunroll_sse2
+#define CHUNKMEMSET chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_sse2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_ssse3.c b/neozip/arch/x86/chunkset_ssse3.c
new file mode 100644
index 0000000000..0bef7de811
--- /dev/null
+++ b/neozip/arch/x86/chunkset_ssse3.c
@@ -0,0 +1,72 @@
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+/* Broadcast a 2 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+/* Broadcast a 4 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+/* Broadcast an 8 byte pattern from `from` across the whole 16 byte chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+/* Unaligned 16 byte load of a chunk from `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+/* Unaligned 16 byte store of a chunk to `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+/* Build a 16 byte chunk repeating a `dist` byte pattern from `buf`.
+ * dist >= 3 (perm_idx_lut index) and dist <= 16 (unpoison length below). */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ /* The permute table replicates the dist-byte pattern across the register */
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_ssse3
+#define CHUNKMEMSET chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
+#define CHUNKCOPY chunkcopy_ssse3
+#define CHUNKUNROLL chunkunroll_ssse3
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_ssse3
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c
new file mode 100644
index 0000000000..5e2b1716cf
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx2.c
@@ -0,0 +1,61 @@
+/* compare256_avx2.c -- AVX2 version of compare256
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX2
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+ /* Walk both buffers in 32 byte strides. On the first stride that is not an
+ * exact match, the inverted compare mask pinpoints the first differing byte;
+ * if every stride matches, all 256 bytes are equal. */
+ for (uint32_t len = 0; len < 256; len += 32) {
+ __m256i block0 = _mm256_loadu_si256((__m256i*)(src0 + len));
+ __m256i block1 = _mm256_loadu_si256((__m256i*)(src1 + len));
+ /* identical bytes compare to FF, differing bytes to 00 */
+ __m256i eq = _mm256_cmpeq_epi8(block0, block1);
+ unsigned mask = (unsigned)_mm256_movemask_epi8(eq);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* invert so the first mismatch is the lowest set bit */
+ }
+
+ return 256;
+}
+
+/* Exported entry point; forwards to the inlineable static implementation. */
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx512.c b/neozip/arch/x86/compare256_avx512.c
new file mode 100644
index 0000000000..f3105505cb
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx512.c
@@ -0,0 +1,87 @@
+/* compare256_avx512.c -- AVX512 version of compare256
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * Based on AVX2 implementation by Mika T. Lindqvist
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX512
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+/* Compare up to 256 bytes of src0/src1; returns the length of the common prefix
+ * (capped at 256). Uses one 16 byte probe then four 64 byte compares. */
+static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
+ __m512i zmm_src0_4, zmm_src1_4;
+ __m512i zmm_src0_3, zmm_src1_3;
+ __m512i zmm_src0_2, zmm_src1_2;
+ __m512i zmm_src0_1, zmm_src1_1;
+ __m128i xmm_src0_0, xmm_src1_0;
+ uint64_t mask_1, mask_2, mask_3, mask_4;
+ uint32_t mask_0;
+
+ // First do a 16byte round before increasing to 64bytes, this reduces the
+ // penalty for the short matches, and those are usually the most common ones.
+ // This requires us to overlap on the last round, giving a small penalty
+ // on matches of 192+ bytes (Still faster than AVX2 though).
+
+ // 16 bytes, covering [0, 16)
+ xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
+ if (mask_0 != 0x0000FFFF)
+ return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
+
+ // 64 bytes, covering [16, 80)
+ zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
+ zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
+ mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
+ if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+ return 16 + zng_ctz64(~mask_1);
+
+ // 64 bytes, covering [80, 144)
+ zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
+ zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
+ mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
+ if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+ return 80 + zng_ctz64(~mask_2);
+
+ // 64 bytes, covering [144, 208)
+ zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
+ zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
+ mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
+ if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+ return 144 + zng_ctz64(~mask_3);
+
+ // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
+ // covering [192, 256); bytes [192, 208) were already proven equal above,
+ // so any mismatch found here lands past the overlap.
+ zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
+ zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
+ mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
+ if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+ return 192 + zng_ctz64(~mask_4);
+
+ return 256;
+}
+
+/* Exported entry point; forwards to the inlineable static implementation. */
+Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx512_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_sse2.c b/neozip/arch/x86/compare256_sse2.c
new file mode 100644
index 0000000000..cfaff82cfa
--- /dev/null
+++ b/neozip/arch/x86/compare256_sse2.c
@@ -0,0 +1,86 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_SSE2
+
+#include <emmintrin.h>
+
+/* Compare up to 256 bytes of src0/src1; returns the length of the common prefix
+ * (capped at 256). After the first probe, src0 loads are 16 byte aligned. */
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned, then all subsequent ones we have at least
+ * one aligned load. Sadly aligning both loads is probably unrealistic */
+ xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ /* Final overlapping 16 byte pair at offset 240, used when the aligned loop
+ * below cannot reach byte 256 (i.e. src0 was not already aligned). */
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ /* 15 aligned iterations cover bytes [align_adv, align_adv + 240) */
+ for (int i = 0; i < 15; ++i) {
+ xmm_src0 = _mm_load_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ /* If unaligned, the loop stopped short of byte 256; the overlapping bytes in
+ * [240, align_adv + 240) were already proven equal, so any mismatch found
+ * here is correctly reported relative to offset 240. */
+ if (align_offset) {
+ xmm_src0 = _mm_loadu_si128((__m128i*)last0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)last1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+/* Exported entry point; forwards to the inlineable static implementation. */
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse2.c b/neozip/arch/x86/crc32_chorba_sse2.c
new file mode 100644
index 0000000000..66191e046a
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse2.c
@@ -0,0 +1,872 @@
+#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
/* Load two consecutive 16-byte vectors from the aligned uint64_t buffer `in`.
 * `off` is a byte offset (scaled by sizeof(uint64_t) to index `in`) and must
 * keep the loads 16-byte aligned.
 * Fix: no trailing semicolon after `while (0)` — with it, a call such as
 * `READ_NEXT(...);` expands to two statements, which breaks the macro inside
 * an unbraced if/else. The do/while(0) idiom requires the caller to supply
 * the terminating semicolon. */
#define READ_NEXT(in, off, a, b) do { \
    a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
    b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
    } while (0)
+
/* One Chorba folding round: from each 64-bit lane of `invec`, derive the four
 * shifted/XORed terms a..d (lane-wise 64-bit shifts). The shift amounts match
 * the scalar reference kept in comments in the functions below:
 *   a = (x << 17) ^ (x << 55)
 *   b = (x >> 47) ^ (x >> 9) ^ (x << 19)
 *   c = (x >> 45) ^ (x << 44)
 *   d = (x >> 20)
 * Fix: no trailing semicolon after `while (0)` so the macro expands to exactly
 * one statement (safe in unbraced if/else); the caller supplies the `;`. */
#define NEXT_ROUND(invec, a, b, c, d) do { \
    a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
    b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
    c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
    d = _mm_srli_epi64(invec, 20); \
    } while (0)
+
/*
 * Chorba-style CRC-32 folding over a 16-byte-aligned buffer, SSE2 variant.
 *
 * crc: running CRC in the usual zlib pre/post-inverted convention — it is
 *      folded into the stream via ~crc below and the result is returned
 *      as ~crc at the end.
 * buf: data start; the caller guarantees 16-byte alignment (see the
 *      dispatcher crc32_chorba_sse2 below).
 * len: number of bytes to process.
 *
 * The carried folding state is five 64-bit words, packed into the vectors
 * next12/next34/next56. The main loop consumes 320 bytes per iteration
 * (one 64-byte "chorba" block plus eight groups of four 64-bit words —
 * derived from the i increments: 8*8 + 7*32 + the loop's own 32). A
 * second loop folds 32 bytes at a time. The tail is copied into the local
 * `final` buffer, the carried words are XORed in, and the last bytes run
 * through the byte-wise crc_table loop. The scalar equivalents kept in
 * comments throughout are the reference for each vector sequence.
 */
Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
    /* The calling function ensured that this is aligned correctly */
    const uint64_t* input = (const uint64_t*)buf;
    ALIGNED_(16) uint64_t final[9] = {0};
    uint64_t next1 = ~crc;
    crc = 0;
    uint64_t next2 = 0;
    uint64_t next3 = 0;
    uint64_t next4 = 0;
    uint64_t next5 = 0;

    /* Vector-packed carry state: {next1,next2}, {next3,next4}, {next5,0}. */
    __m128i next12 = _mm_cvtsi64_si128(next1);
    __m128i next34 = _mm_setzero_si128();
    __m128i next56 = _mm_setzero_si128();
    __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;

    size_t i = 0;

    /* This is weird, doing for vs while drops 10% off the exec time */
    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;

        /*
        uint64_t chorba1 = input[i / sizeof(uint64_t)];
        uint64_t chorba2 = input[i / sizeof(uint64_t) + 1];
        uint64_t chorba3 = input[i / sizeof(uint64_t) + 2];
        uint64_t chorba4 = input[i / sizeof(uint64_t) + 3];
        uint64_t chorba5 = input[i / sizeof(uint64_t) + 4];
        uint64_t chorba6 = input[i / sizeof(uint64_t) + 5];
        uint64_t chorba7 = input[i / sizeof(uint64_t) + 6];
        uint64_t chorba8 = input[i / sizeof(uint64_t) + 7];
        */

        const uint64_t *input_ptr = input + (i / sizeof(uint64_t));
        const __m128i *input_ptr_128 = (__m128i*)input_ptr;
        __m128i chorba12 = _mm_load_si128(input_ptr_128++);
        __m128i chorba34 = _mm_load_si128(input_ptr_128++);
        __m128i chorba56 = _mm_load_si128(input_ptr_128++);
        __m128i chorba78 = _mm_load_si128(input_ptr_128++);

        chorba12 = _mm_xor_si128(chorba12, next12);
        chorba34 = _mm_xor_si128(chorba34, next34);
        chorba56 = _mm_xor_si128(chorba56, next56);
        chorba78 = _mm_xor_si128(chorba78, chorba12);
        /* shuffle_pd(x, y, 1) picks {x.hi, y.lo}: odd-offset word pairs. */
        __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
        __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
                                            _mm_castsi128_pd(chorba34), 1));
        /*
        chorba1 ^= next1;
        chorba2 ^= next2;
        chorba3 ^= next3;
        chorba4 ^= next4;
        chorba5 ^= next5;
        chorba7 ^= chorba1;
        chorba8 ^= chorba2;
        */
        i += 8 * 8;

        /* 0-3 */
        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        READ_NEXT(input, i, in1in2, in3in4);
        __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba34xor);
        /*
        in1 ^= chorba3;
        in2 ^= chorba4 ^ chorba1;
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);

        */

        in3in4 = _mm_xor_si128(in3in4, ab1);
        /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
        __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));

        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
        in3in4 = _mm_xor_si128(in3in4, chorba12);

        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
        in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));

        /*out1 = a3 ^ b2 ^ c1;
        out2 = b3 ^ c2 ^ d1 ^ a4;*/
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        //out3 = b4 ^ c3 ^ d2;
        //out4 = c4 ^ d3;

        //out5 = d4;

        /*
        next1 = out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 4-7 */
        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        READ_NEXT(input, i, in1in2, in3in4);

        in1in2 = _mm_xor_si128(in1in2, next12);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, chorba45);
        in1in2 = _mm_xor_si128(in1in2, chorba34);

        /*
        in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
        */

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];

        in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
        in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6;
        */
        in3in4 = _mm_xor_si128(in3in4, next34);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        in3in4 = _mm_xor_si128(in3in4, chorba56);
        __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        ///*
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, cd1);

        next12 = _mm_xor_si128(next12, a4_);
        next12 = _mm_xor_si128(next12, next56);
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        next34 = _mm_xor_si128(b4c4, cd3);
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
        //*/

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = b3 ^ c2 ^ d1 ^ a4;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 8-11 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
        in2 ^= next2 ^ chorba8 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);

        __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
        __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, next12_chorba12);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];*/
        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, chorba34);
        in3in4 = _mm_xor_si128(in3in4, a2_);

        /*
        in3 ^= next3 ^ a1 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba4;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */


        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(a4_, ab3);
        next12 = _mm_xor_si128(next12, cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 12-15 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);
        __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
        __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorb1_);


        /*
        in1 ^= next1 ^ chorba5 ^ chorba1;
        in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;

        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
        */

        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        ///*
        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
        //*/

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 16-19 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
        in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
        */
        ///*
        READ_NEXT(input, i, in1in2, in3in4);
        __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
        in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
        in1in2 = _mm_xor_si128(chorba1_, in1in2);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
        //*/

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        */
        ///*
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
        in3in4 = _mm_xor_si128(in3in4, next34);
        //*/
        /*
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
        */
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 20-23 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
        in3in4 = _mm_xor_si128(in3in4, chorba12);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        i += 32;

        /* 24-27 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
        in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */
        i += 32;

        /* 28-31 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
        in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
        */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;
        */

        /*
        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
    }

    /* Lighter loop: fold 32 bytes (four 64-bit words) per iteration, no
     * chorba block. */
    for (; (i + 40 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;

        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4);
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);

        /*
        in1 ^=next1;
        in2 ^=next2;
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1;
        in4 ^= next4 ^ a2 ^ b1;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*

        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        next12 = _mm_xor_si128(next12, next56);
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
    }

    /* Move the vector-packed carry state back into scalars for the tail. */
    next1 = _mm_cvtsi128_si64(next12);
    next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12));
    next3 = _mm_cvtsi128_si64(next34);
    next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34));
    next5 = _mm_cvtsi128_si64(next56);

    /* Skip the call to memcpy */
    size_t copy_len = len - i;
    __m128i *final128 = (__m128i*)final;
    __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t));
    while (copy_len >= 64) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 64;
    }

    while (copy_len >= 16) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 16;
    }

    uint8_t *src_bytes = (uint8_t*)input128;
    uint8_t *dst_bytes = (uint8_t*)final128;
    while (copy_len--) {
        *dst_bytes++ = *src_bytes++;
    }

    /* Fold the five carried words into the start of the copied tail. */
    final[0] ^= next1;
    final[1] ^= next2;
    final[2] ^= next3;
    final[3] ^= next4;
    final[4] ^= next5;

    /* We perform the same loop that braid_internal is doing but we'll skip
     * the function call for this tiny tail */
    uint8_t *final_bytes = (uint8_t*)final;
    size_t rem = len - i;

    while (rem--) {
        crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
    }

    return ~crc;
}
+
+Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+ uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+ return crc32_braid(crc, buf, len);
+
+ if (align_diff) {
+ crc = crc32_braid(crc, buf, align_diff);
+ len -= align_diff;
+ buf += align_diff;
+ }
+#if !defined(WITHOUT_CHORBA)
+ if (len > CHORBA_LARGE_THRESHOLD)
+ return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+ return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+Z_INTERNAL uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_chorba_sse2(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse41.c b/neozip/arch/x86/crc32_chorba_sse41.c
new file mode 100644
index 0000000000..6ef9612440
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse41.c
@@ -0,0 +1,332 @@
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
/* Load two consecutive 16-byte vectors from the aligned uint64_t buffer `in`.
 * `off` is a byte offset (scaled by sizeof(uint64_t) to index `in`) and must
 * keep the loads 16-byte aligned.
 * Fix: no trailing semicolon after `while (0)` — with it, `READ_NEXT(...);`
 * expands to two statements and breaks inside an unbraced if/else; the
 * do/while(0) idiom leaves the terminating `;` to the caller. */
#define READ_NEXT(in, off, a, b) do { \
    a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
    b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
    } while (0)
+
/* One Chorba folding round: from each 64-bit lane of `invec`, derive the four
 * shifted/XORed terms a..d (lane-wise 64-bit shifts):
 *   a = (x << 17) ^ (x << 55)
 *   b = (x >> 47) ^ (x >> 9) ^ (x << 19)
 *   c = (x >> 45) ^ (x << 44)
 *   d = (x >> 20)
 * Fix: no trailing semicolon after `while (0)` so the macro expands to exactly
 * one statement (safe in unbraced if/else); the caller supplies the `;`. */
#define NEXT_ROUND(invec, a, b, c, d) do { \
    a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
    b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
    c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
    d = _mm_srli_epi64(invec, 20); \
    } while (0)
+
/* Shift the 64-byte block in0..in3 (in0 holding the lowest lanes) upward by
 * `shift` bytes, spreading the result across five outputs out0..out4: out0
 * receives the zero-filled low bytes, out4 the bytes pushed out the top.
 * `shift` must be a compile-time constant (it is the palignr immediate).
 * Callers below use shift == 8, i.e. a one-64-bit-word stagger. */
#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
    out0 = _mm_slli_si128(in0, shift); \
    out1 = _mm_alignr_epi8(in1, in0, shift); \
    out2 = _mm_alignr_epi8(in2, in1, shift); \
    out3 = _mm_alignr_epi8(in3, in2, shift); \
    out4 = _mm_srli_si128(in3, shift); \
    } while (0)
+
/* Store four vectors through the (lvalue) cursor `out`, advancing it past
 * all four aligned 16-byte slots; the counterpart of READ4. */
#define STORE4(out0, out1, out2, out3, out) do { \
    _mm_store_si128(out++, out0); \
    _mm_store_si128(out++, out1); \
    _mm_store_si128(out++, out2); \
    _mm_store_si128(out++, out3); \
    } while (0)
+
/* Load four consecutive aligned vectors through the (lvalue) cursor `in`,
 * advancing it past all four 16-byte slots. */
#define READ4(out0, out1, out2, out3, in) do { \
    out0 = _mm_load_si128(in++); \
    out1 = _mm_load_si128(in++); \
    out2 = _mm_load_si128(in++); \
    out3 = _mm_load_si128(in++); \
    } while (0)
+
/* This is intentionally shifted one down to compensate for the deferred store from
 * the last iteration */
/* Reads in[1..4] (NOT in[0..3]) XORed with xor0..xor3; `in` itself is not
 * advanced — the caller's following STORE4 writes through the same cursor
 * (see the usage pattern in the loops below). */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
    out0 = _mm_xor_si128(in[1], xor0); \
    out1 = _mm_xor_si128(in[2], xor1); \
    out2 = _mm_xor_si128(in[3], xor2); \
    out3 = _mm_xor_si128(in[4], xor3); \
    } while (0)
+
+Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+ /* The calling function ensured that this is aligned correctly */
+ const uint64_t* input = (const uint64_t*)buf;
+ ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+ __m128i *bitbuffer_v = (__m128i*)bitbuffer;
+ const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+ __m128i z = _mm_setzero_si128();
+
+ __m128i *bitbuf128 = &bitbuffer_v[64];
+ __m128i *bitbuf144 = &bitbuffer_v[72];
+ __m128i *bitbuf182 = &bitbuffer_v[91];
+ __m128i *bitbuf210 = &bitbuffer_v[105];
+ __m128i *bitbuf300 = &bitbuffer_v[150];
+ __m128i *bitbuf0 = bitbuf128;
+ __m128i *inptr = (__m128i*)input;
+
+ /* We only need to zero out the bytes between the 128'th value and the 144th
+ * that are actually read */
+ __m128i *z_cursor = bitbuf128;
+ for (size_t i = 0; i < 2; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We only need to zero out the bytes between the 144'th value and the 182nd that
+ * are actually read */
+ z_cursor = bitbuf144 + 8;
+ for (size_t i = 0; i < 11; ++i) {
+ _mm_store_si128(z_cursor++, z);
+ }
+
+ /* We only need to zero out the bytes between the 182nd value and the 210th that
+ * are actually read. */
+ z_cursor = bitbuf182;
+ for (size_t i = 0; i < 4; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We need to mix this in */
+ __m128i init_crc = _mm_cvtsi64_si128(~crc);
+ crc = 0;
+
+ size_t i = 0;
+
+ /* Previous iteration runs carried over */
+ __m128i buf144 = z;
+ __m128i buf182 = z;
+ __m128i buf210 = z;
+
+ for (; i + 300*8+64 < len && i < 22 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ READ4(in12, in34, in56, in78, inptr);
+
+ if (i == 0) {
+ in12 = _mm_xor_si128(in12, init_crc);
+ }
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e = _mm_xor_si128(buf182, in_1);
+ STORE4(e, in23, in45, in67, bitbuf182);
+ buf182 = in8_;
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 32 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 84 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ if (i < 128 * 8) {
+ READ4(in12, in34, in56, in78, inptr);
+ } else {
+ in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ }
+
+ // [0, 145, 183, 211]
+
+ /* Pre Penryn CPUs the unpack should be faster */
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i n, o, p;
+ __m128i m = _mm_xor_si128(buf210, in_1);
+
+ /* Couldn't tell you why but despite knowing that this is always false,
+ * removing this branch with GCC makes things significantly slower. Some
+ * loop bodies must be being joined or something */
+ if (i < 84 * 8) {
+ n = in23;
+ o = in45;
+ p = in67;
+ buf210 = in8_;
+ } else {
+ READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
+ }
+
+ STORE4(m, n, o, p, bitbuf210);
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ /* Second half of stores bubbled out */
+ _mm_store_si128(bitbuf144, buf144);
+ _mm_store_si128(bitbuf182, buf182);
+ _mm_store_si128(bitbuf210, buf210);
+
+ /* We also have to zero out the tail */
+ size_t left_to_z = len - (300*8 + i);
+ __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
+ while (left_to_z >= 64) {
+ STORE4(z, z, z, z, bitbuf_tail);
+ left_to_z -= 64;
+ }
+
+ while (left_to_z >= 16) {
+ _mm_store_si128(bitbuf_tail++, z);
+ left_to_z -= 16;
+ }
+
+ uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
+ while (left_to_z--) {
+ *tail_bytes++ = 0;
+ }
+
+ ALIGNED_(16) uint64_t final[9] = {0};
+ __m128i next12, next34, next56;
+ next12 = z;
+ next34 = z;
+ next56 = z;
+
+ for (; (i + 72 < len); i += 32) {
+ __m128i in1in2, in3in4;
+ __m128i in1in2_, in3in4_;
+ __m128i ab1, ab2, ab3, ab4;
+ __m128i cd1, cd2, cd3, cd4;
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
+
+ in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
+ in3in4 = _mm_xor_si128(in3in4, in3in4_);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ __m128i a2_ = _mm_slli_si128(ab2, 8);
+ __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+ in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
+ __m128i a4_ = _mm_slli_si128(ab4, 8);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_srli_si128(cd2, 8);
+ __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
+ next12 = _mm_xor_si128(next12, next56);
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_srli_si128(cd4, 8);
+ }
+
+ memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+ __m128i *final128 = (__m128i*)final;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
+
+ uint8_t *final_bytes = (uint8_t*)final;
+
+ for (size_t j = 0; j < (len-i); j++) {
+ crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
+ }
+ return ~crc;
+}
+
+/* Top-level Chorba CRC-32 dispatcher for SSE4.1-capable CPUs.
+ *
+ *   crc: running CRC-32 state
+ *   buf: input bytes
+ *   len: number of input bytes
+ *
+ * Short inputs (or inputs where aligning would consume most of the buffer)
+ * go to the scalar braid implementation; otherwise the unaligned head is
+ * consumed scalar until buf is 16-byte aligned, then a Chorba variant is
+ * selected purely by remaining length. */
+Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+    uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+    /* Too short to amortize the SIMD setup cost */
+    if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+        return crc32_braid(crc, buf, len);
+
+    if (align_diff) {
+        /* Scalar pass over the unaligned prefix */
+        crc = crc32_braid(crc, buf, align_diff);
+        len -= align_diff;
+        buf += align_diff;
+    }
+#if !defined(WITHOUT_CHORBA)
+    if (len > CHORBA_LARGE_THRESHOLD)
+        return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+    if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+        return crc32_chorba_32768_nondestructive_sse41(crc, buf, len);
+    return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+/* CRC-32 + copy variant: checksums src, then copies the bytes to dst.
+ * The two passes are separate here (not fused as in the PCLMUL template). */
+Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_chorba_sse41(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq.c b/neozip/arch/x86/crc32_pclmulqdq.c
new file mode 100644
index 0000000000..c8be1b43ba
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq.c
@@ -0,0 +1,31 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#include "crc32_pclmulqdq_tpl.h"
+
+/* CRC-32 without copy: dst is NULL and COPY == 0, so the compiler drops
+ * every store-to-dst path from the shared template. */
+Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Fused CRC-32 + memcpy: COPY == 1 enables the store-to-dst path inside
+ * the shared template, so src is read only once. */
+Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq_tpl.h b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
new file mode 100644
index 0000000000..e4ea546afd
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
@@ -0,0 +1,708 @@
+/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding
+ * approach with the PCLMULQDQ and VPCLMULQDQ instructions.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support)
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+#include <immintrin.h>
+#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
+
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+#include "x86_intrins.h"
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+# if defined(_MSC_VER) && _MSC_VER < 1920
+ /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm)
+# else
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm)
+# endif
+# ifdef __AVX512VL__
+# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96)
+# endif
+#endif
+/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c)
+#endif
+
+#ifndef z128_xor3_epi64
+# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+#endif
+
+/* Advance the 4x128-bit CRC state by one 16-byte block: both 64-bit halves
+ * of crc0 are carry-less multiplied against the fold-by-4 constants and
+ * XORed together to become the new crc3, while crc1..crc3 shift down one
+ * slot each. */
+static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+    __m128i x_low = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+
+    *xmm_crc0 = *xmm_crc1;
+    *xmm_crc1 = *xmm_crc2;
+    *xmm_crc2 = *xmm_crc3;
+    *xmm_crc3 = _mm_xor_si128(x_low, x_high);
+}
+
+/* Advance the 4x128-bit CRC state by two 16-byte blocks: crc0 and crc1 are
+ * folded (clmul by the fold-by-4 constants) into the new crc2/crc3, while
+ * the old crc2/crc3 shift down into crc0/crc1. */
+static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+    __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+    __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+
+    *xmm_crc0 = *xmm_crc2;
+    *xmm_crc1 = *xmm_crc3;
+    *xmm_crc2 = _mm_xor_si128(x_low0, x_high0);
+    *xmm_crc3 = _mm_xor_si128(x_low1, x_high1);
+}
+
+/* Advance the 4x128-bit CRC state by three 16-byte blocks: crc0..crc2 are
+ * folded into the new crc1..crc3, and the old crc3 shifts down to crc0. */
+static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+    __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+    __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+    __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+
+    *xmm_crc0 = *xmm_crc3;
+    *xmm_crc1 = _mm_xor_si128(x_low0, x_high0);
+    *xmm_crc2 = _mm_xor_si128(x_low1, x_high1);
+    *xmm_crc3 = _mm_xor_si128(x_low2, x_high2);
+}
+
+/* Fold all four 128-bit lanes in place by a full 64-byte stride.  The
+ * caller XORs the next 64 bytes of input into the lanes afterwards. */
+static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+    __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+    __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+    __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+    __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+
+    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+/* Fold all four 128-bit lanes in place using dedicated fold-by-12
+ * constants (presumably a 12-block / 192-byte stride, matching its use in
+ * the Chorba loop after the 128-byte read-ahead -- see caller).  Same
+ * structure as fold_4 but with the local xmm_fold12 constant. */
+static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
+    __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
+    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
+    __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
+    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
+    __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
+    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
+    __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
+    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);
+
+    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+/* 512-bit fold function requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+/* 512-bit fold step: folds each of the four 512-bit accumulators by the
+ * fold-by-16 constants and XORs in the four freshly loaded 512-bit input
+ * blocks (zmm_t0..zmm_t3) in a single 3-way XOR (ternarylogic 0x96). */
+static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
+    const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) {
+    __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
+    __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10);
+    __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01);
+    __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10);
+    __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01);
+    __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10);
+    __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01);
+    __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10);
+
+    *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0);
+    *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1);
+    *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2);
+    *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
+}
+#endif
+/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+/* 256-bit fold step (VPCLMULQDQ without AVX-512): folds each of the four
+ * 256-bit accumulators by the fold-by-8 constants and XORs in the four
+ * freshly loaded 256-bit input blocks (ymm_t0..ymm_t3). */
+static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
+    const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) {
+    __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
+    __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
+    __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
+    __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
+    __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
+    __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
+    __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
+    __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
+
+    *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
+    *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
+    *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
+    *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
+}
+#endif
+
+/* Shared PCLMULQDQ/VPCLMULQDQ CRC-32 template: computes the CRC of
+ * src[0..len) and, when COPY is nonzero, simultaneously copies the bytes
+ * to dst.  COPY is a compile-time constant at every call site, so the
+ * copy branches fold away; callers pass dst == NULL when COPY == 0.
+ * The wide (512/256-bit) paths are selected at compile time by
+ * X86_VPCLMULQDQ / __AVX512F__; otherwise a Chorba-interleaved 128-bit
+ * path is used. */
+Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    size_t copy_len = len;
+    if (len >= 16) {
+        /* Calculate 16-byte alignment offset */
+        uintptr_t align_diff = ALIGN_DIFF(src, 16);
+
+        /* If total length is less than (alignment bytes + 16), use the faster small method.
+         * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
+        copy_len = len < align_diff + 16 ? len : align_diff;
+    }
+
+    if (copy_len > 0) {
+        /* Scalar head: unaligned prefix and/or short buffers */
+        crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY);
+        src += copy_len;
+        len -= copy_len;
+        if (COPY) {
+            dst += copy_len;
+        }
+    }
+
+    if (len == 0)
+        return crc;
+
+    /* Fold-by-4 (64-byte stride) carry-less multiply constants */
+    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+
+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+    __m128i xmm_crc1 = _mm_setzero_si128();
+    __m128i xmm_crc2 = _mm_setzero_si128();
+    __m128i xmm_crc3 = _mm_setzero_si128();
+
+    if (crc != 0) {
+        // Process the first 16 bytes and handle initial CRC
+        len -= 16;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        src += 16;
+
+        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            dst += 16;
+        }
+        /* XOR the incoming CRC state into the low lane of the new data */
+        xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
+    }
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+    if (len >= 256) {
+        len -= 256;
+
+        __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+        __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+        __m512i z_low0, z_high0;
+        const __m512i zmm_fold4 = _mm512_set4_epi32(
+            0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+        const __m512i zmm_fold16 = _mm512_set4_epi32(
+            0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+        /* Prime the four 512-bit accumulators with the first 256 bytes */
+        zmm_crc0 = _mm512_loadu_si512((__m512i *)src);
+        zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+        zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+        zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+        src += 256;
+        if (COPY) {
+            _mm512_storeu_si512((__m512i *)dst, zmm_crc0);
+            _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+            _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+            _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+            dst += 256;
+        }
+
+        // Fold existing xmm state into first 64 bytes
+        zmm_t0 = _mm512_castsi128_si512(xmm_crc0);
+        zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1);
+        zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2);
+        zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3);
+
+        z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01);
+        z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10);
+        zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0);
+
+        /* Bulk loop: fold-by-16 over 256 bytes per iteration */
+        while (len >= 256) {
+            len -= 256;
+            zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+            zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+            zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+            zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+            src += 256;
+
+            fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16);
+            if (COPY) {
+                _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+                _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+                _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+                _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+                dst += 256;
+            }
+        }
+
+        // zmm_crc[0,1,2,3] -> zmm_crc0
+        z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+        z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+        zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1);
+
+        z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+        z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+        zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2);
+
+        z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+        z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+        zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3);
+
+        // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+        xmm_crc0 = z512_extracti64x2(zmm_crc0, 0);
+        xmm_crc1 = z512_extracti64x2(zmm_crc0, 1);
+        xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
+        xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
+    }
+/* 256-bit VPCLMULQDQ path */
+#elif defined(X86_VPCLMULQDQ)
+    if (len >= 128) {
+        len -= 128;
+
+        __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
+        __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
+        __m256i y_low0, y_high0;
+        const __m256i ymm_fold4 = _mm256_set_epi32(
+            0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
+            0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+        const __m256i ymm_fold8 = _mm256_set_epi32(
+            0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
+            0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
+
+        /* Prime the four 256-bit accumulators with the first 128 bytes */
+        ymm_crc0 = _mm256_loadu_si256((__m256i *)src);
+        ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1);
+        ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2);
+        ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3);
+        src += 128;
+        if (COPY) {
+            _mm256_storeu_si256((__m256i *)dst, ymm_crc0);
+            _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1);
+            _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2);
+            _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3);
+            dst += 128;
+        }
+
+        // Fold existing xmm state into first 32 bytes
+        ymm_t0 = _mm256_castsi128_si256(xmm_crc0);
+        ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1);
+
+        y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+        y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+        ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
+
+        ymm_t0 = _mm256_castsi128_si256(xmm_crc2);
+        ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1);
+
+        y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+        y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+        ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
+
+        /* Bulk loop: fold-by-8 over 128 bytes per iteration */
+        while (len >= 128) {
+            len -= 128;
+            ymm_t0 = _mm256_loadu_si256((__m256i *)src);
+            ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1);
+            ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2);
+            ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3);
+            src += 128;
+
+            fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
+            if (COPY) {
+                _mm256_storeu_si256((__m256i *)dst, ymm_t0);
+                _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1);
+                _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2);
+                _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3);
+                dst += 128;
+            }
+        }
+
+        // Extract 8 x 128-bit lanes from 4 x 256-bit registers
+        __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
+        __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
+        __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
+        __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
+        __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
+        __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
+        __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
+        __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
+
+        // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
+        __m128i x_low, x_high;
+        x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
+        xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
+
+        x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
+        xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
+
+        x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
+        xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
+
+        x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
+        xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
+    }
+#else
+    /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
+     * We interleave the PCLMUL-base folds with 8x scaled generator
+     * polynomial copies; we read 8x QWORDS and then XOR them into
+     * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
+     * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
+     * as "generator_64_bits_unrolled_8" */
+#ifndef __AVX512VL__
+    if (!COPY) {
+#endif
+    while (len >= 512 + 64 + 16*8) {
+        /* Read the 128-byte Chorba look-ahead block */
+        __m128i chorba8 = _mm_load_si128((__m128i *)src);
+        __m128i chorba7 = _mm_load_si128((__m128i *)src + 1);
+        __m128i chorba6 = _mm_load_si128((__m128i *)src + 2);
+        __m128i chorba5 = _mm_load_si128((__m128i *)src + 3);
+        __m128i chorba4 = _mm_load_si128((__m128i *)src + 4);
+        __m128i chorba3 = _mm_load_si128((__m128i *)src + 5);
+        __m128i chorba2 = _mm_load_si128((__m128i *)src + 6);
+        __m128i chorba1 = _mm_load_si128((__m128i *)src + 7);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, chorba8);
+            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
+            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
+            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
+            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
+            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
+            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
+            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
+            dst += 16*8;
+        }
+
+        chorba2 = _mm_xor_si128(chorba2, chorba8);
+        chorba1 = _mm_xor_si128(chorba1, chorba7);
+        src += 16*8;
+        len -= 16*8;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+
+        fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        /* XOR pattern per the Chorba offset schedule (see comment above) */
+        xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0);
+        xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1);
+        xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2);
+        xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 4);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 5);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 6);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 7);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1);
+        xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2);
+        xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 8);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 9);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 10);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 11);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0);
+        xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1);
+        xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2);
+        xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 12);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 13);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 14);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 15);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1);
+        xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2);
+        xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 16);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 17);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 18);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 19);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1);
+        xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2);
+        xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 20);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 21);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 22);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 23);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1);
+        xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2);
+        xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 24);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 25);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 26);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 27);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1);
+        xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2);
+        xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3);
+
+        xmm_t0 = _mm_load_si128((__m128i *)src + 28);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 29);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 30);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 31);
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0);
+        xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1);
+        xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2);
+        xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3);
+
+        len -= 512;
+        src += 512;
+    }
+#ifndef __AVX512VL__
+    }
+#endif
+
+#endif /* X86_VPCLMULQDQ */
+
+    /* Bulk fold-by-4: consume 64 bytes per iteration */
+    while (len >= 64) {
+        len -= 64;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+        src += 64;
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+            dst += 64;
+        }
+
+        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+    }
+
+    /*
+     * At this point len < 64: fold in the remaining 48/32/16-byte chunk
+     */
+    if (len >= 48) {
+        len -= 48;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        src += 48;
+
+        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+            dst += 48;
+        }
+
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+    } else if (len >= 32) {
+        len -= 32;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        src += 32;
+
+        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+            dst += 32;
+        }
+
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+    } else if (len >= 16) {
+        len -= 16;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        src += 16;
+
+        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+        if (COPY) {
+            _mm_storeu_si128((__m128i *)dst, xmm_t0);
+            dst += 16;
+        }
+
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+    }
+
+    /* k1/k2 fold-by-1 constants and Barrett reduction constants */
+    const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
+    const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641);
+
+    /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */
+    __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01);
+    __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10);
+    xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0);
+
+    __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01);
+    __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10);
+    xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1);
+
+    __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01);
+    __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10);
+    xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2);
+
+    /* Fold remaining bytes into the 128-bit state (here 1 <= len <= 15) */
+    if (len) {
+        const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+        const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+        /* Create masks to shift bytes for partial input */
+        __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16));
+        __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
+
+        /* Shift out bytes from crc3 to make space for new data */
+        __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl);
+        xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr);
+
+        /* Insert the partial input into crc3 */
+#if defined(__AVX512BW__) && defined(__AVX512VL__)
+        __mmask16 k = (1 << len) - 1;
+        __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src);
+        if (COPY) {
+            _mm_mask_storeu_epi8(dst, k, xmm_crc_part);
+        }
+#else
+        __m128i xmm_crc_part = _mm_setzero_si128();
+        memcpy(&xmm_crc_part, src, len);
+        if (COPY) {
+            memcpy(dst, src, len);
+        }
+#endif
+        __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned);
+
+        /* Fold the bytes that were shifted out back into crc3 */
+        __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
+        __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
+        xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
+    }
+
+    /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
+    __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
+    __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);
+
+    x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
+    x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);
+
+    __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
+    __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);
+
+    crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));
+
+    return ~crc;
+}
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx2.c b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
new file mode 100644
index 0000000000..1cdef13b09
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX2
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Compute the CRC-32 of [buf, buf+len) using the shared PCLMULQDQ template
+ * built with VPCLMULQDQ/AVX2 codegen.  dst == NULL and copy == 0 select the
+ * compute-only path of crc32_copy_impl. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Same CRC-32 computation as crc32_vpclmulqdq_avx2, but additionally copies
+ * src into dst while folding (copy == 1 enables the copy path in the
+ * shared template). */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx512.c b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
new file mode 100644
index 0000000000..a95a448f49
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX512
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Compute the CRC-32 of [buf, buf+len) using the shared PCLMULQDQ template
+ * built with VPCLMULQDQ/AVX-512 codegen.  dst == NULL and copy == 0 select
+ * the compute-only path of crc32_copy_impl. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Same CRC-32 computation as crc32_vpclmulqdq_avx512, but additionally
+ * copies src into dst while folding (copy == 1 enables the copy path in
+ * the shared template). */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/slide_hash_avx2.c b/neozip/arch/x86/slide_hash_avx2.c
new file mode 100644
index 0000000000..241ea305e3
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_avx2.c
@@ -0,0 +1,48 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+
+/* Subtract wsize from every 16-bit entry of table, saturating at zero.
+ * Saturating subtraction clamps positions that fall outside the new window
+ * to 0 ("no match") in a single branch-free step.  The table is walked
+ * backwards, 32 entries (two 256-bit registers) per iteration.
+ * NOTE(review): assumes entries is a nonzero multiple of 32 (entries is
+ * unsigned, so a remainder would wrap and over-run the table) and that
+ * table is 32-byte aligned as required by the aligned load/store
+ * intrinsics -- confirm against the allocator's alignment guarantee. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    /* Position at the start of the last 32-entry chunk. */
+    table += entries;
+    table -= 32;
+
+    do {
+        __m256i value1, value2, result1, result2;
+
+        value1 = _mm256_load_si256((__m256i *)table);
+        value2 = _mm256_load_si256((__m256i *)(table+16));
+        result1 = _mm256_subs_epu16(value1, wsize);
+        result2 = _mm256_subs_epu16(value2, wsize);
+        _mm256_store_si256((__m256i *)table, result1);
+        _mm256_store_si256((__m256i *)(table+16), result2);
+
+        table -= 32;
+        entries -= 32;
+    } while (entries > 0);
+}
+
+/* Slide the deflate hash tables after the window advances by w_size bytes:
+ * every stored position in s->head and s->prev is reduced by w_size, with
+ * now-out-of-window positions saturating to 0. */
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size into all sixteen 16-bit lanes for the vector subtract. */
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    /* head has HASH_SIZE entries; prev has one entry per window byte. */
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/slide_hash_sse2.c b/neozip/arch/x86/slide_hash_sse2.c
new file mode 100644
index 0000000000..4aa8df5ee8
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,68 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+/* Subtract wsize from every 16-bit entry of two tables (table0/entries0,
+ * then table1/entries1), saturating at zero.  Each table is walked
+ * backwards, 16 entries (two 128-bit registers) per iteration.  The
+ * goto-based structure runs the same inlined loop body over both tables
+ * in sequence.
+ * NOTE(review): assumes each entries count is a nonzero multiple of 16
+ * (unsigned wrap otherwise) and that both tables are 16-byte aligned for
+ * the aligned load/store intrinsics -- see the allocator comment below. */
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+                                    uint32_t entries1, const __m128i wsize) {
+    uint32_t entries;
+    Pos *table;
+    __m128i value0, value1, result0, result1;
+
+    /* 0 = first pass over table0, 1 = second pass over table1. */
+    int on_chain = 0;
+
+next_chain:
+    table = (on_chain) ? table1 : table0;
+    entries = (on_chain) ? entries1 : entries0;
+
+    /* Position at the start of the last 16-entry chunk. */
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        value0 = _mm_load_si128((__m128i *)table);
+        value1 = _mm_load_si128((__m128i *)(table + 8));
+        result0 = _mm_subs_epu16(value0, wsize);
+        result1 = _mm_subs_epu16(value1, wsize);
+        _mm_store_si128((__m128i *)table, result0);
+        _mm_store_si128((__m128i *)(table + 8), result1);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+
+    /* Done after the second table has been processed. */
+    ++on_chain;
+    if (on_chain > 1) {
+        return;
+    } else {
+        goto next_chain;
+    }
+}
+
+/* Slide the deflate hash tables after the window advances by w_size bytes:
+ * every stored position in s->head and s->prev is reduced by w_size, with
+ * now-out-of-window positions saturating to 0. */
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size into all eight 16-bit lanes for the vector subtract. */
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    /* Aligned loads/stores below require 16-byte aligned tables. */
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    /* head has HASH_SIZE entries; prev has one entry per window byte. */
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.c b/neozip/arch/x86/x86_features.c
new file mode 100644
index 0000000000..5eba18bf8a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.c
@@ -0,0 +1,128 @@
+/* x86_features.c - x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ * Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_FEATURES
+
+#include "zbuild.h"
+#include "x86_features.h"
+
+#if defined(HAVE_CPUID_MS)
+# include <intrin.h>
+#elif defined(HAVE_CPUID_GNU)
+// Newer versions of GCC and clang come with cpuid.h
+# include <cpuid.h>
+# ifdef X86_HAVE_XSAVE_INTRIN
+# if __GNUC__ == 8
+# include <xsaveintrin.h>
+# else
+# include <immintrin.h>
+# endif
+# endif
+#endif
+
+/* Execute CPUID for leaf `info` (subleaf 0 implied) and return the four
+ * result registers through the out parameters.  With neither MSVC nor GNU
+ * cpuid support, all registers read as 0, so no optional features are
+ * detected and only generic code paths get used. */
+static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero so outputs are defined even if the leaf is unsupported. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Execute CPUID for leaf `info` with an explicit subleaf `subinfo`
+ * (needed e.g. for leaf 7) and return the four result registers.
+ * Same zeroed fallback behavior as cpuid() above. */
+static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero so outputs are defined even if the leaf is unsupported. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Read extended control register `xcr` via XGETBV (caller must have
+ * verified CPUID.1:ECX.OSXSAVE first).  Used to query XCR0 and learn
+ * which register state the OS saves on context switch. */
+static inline uint64_t xgetbv(unsigned int xcr) {
+#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
+    return _xgetbv(xcr);
+#elif defined(__GNUC__)
+    uint32_t eax, edx;
+    /* Raw opcode bytes for `xgetbv`, for assemblers predating the mnemonic. */
+    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
+    return (uint64_t)(edx) << 32 | eax;
+#else
+    /* When using this fallback, some of the faster code is disabled */
+    return 0;
+#endif
+}
+
+/* Detect x86 CPU features via CPUID and fill in `features`.
+ * NOTE(review): fields such as has_os_save_ymm are only written when the
+ * corresponding CPUID bit is present but are read unconditionally below,
+ * so the caller is presumed to zero-initialize the struct -- confirm at
+ * the call site. */
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    /* Leaf 0: highest supported basic leaf; leaf 1: feature bits. */
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    features->has_sse2 = edx & 0x4000000;        /* EDX bit 26 */
+    features->has_ssse3 = ecx & 0x200;           /* ECX bit 9 */
+    features->has_sse41 = ecx & 0x80000;         /* ECX bit 19 */
+    features->has_sse42 = ecx & 0x100000;        /* ECX bit 20 */
+    features->has_pclmulqdq = ecx & 0x2;         /* ECX bit 1 */
+
+    /* ECX bit 27 = OSXSAVE: XGETBV is available, so ask the OS what
+     * register state it saves (XCR0). */
+    if (ecx & 0x08000000) {
+        uint64_t xfeature = xgetbv(0);
+
+        /* 0x06 = SSE+AVX (YMM) state; 0xe6 adds opmask/ZMM state. */
+        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
+        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
+    }
+
+    if (maxbasic >= 7) {
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI2 bit
+        features->has_bmi2 = ebx & 0x100;
+
+        // check AVX2 bit if the OS supports saving YMM registers
+        if (features->has_os_save_ymm) {
+            features->has_avx2 = ebx & 0x20;           /* EBX bit 5 */
+            features->has_vpclmulqdq = ecx & 0x400;    /* ECX bit 10 */
+        }
+
+        // check AVX512 bits if the OS supports saving ZMM registers
+        if (features->has_os_save_zmm) {
+            features->has_avx512f = ebx & 0x00010000;  /* EBX bit 16 */
+            if (features->has_avx512f) {
+                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
+                // AVX512(DQ,BW,VL).
+                features->has_avx512dq = ebx & 0x00020000;   /* EBX bit 17 */
+                features->has_avx512bw = ebx & 0x40000000;   /* EBX bit 30 */
+                features->has_avx512vl = ebx & 0x80000000;   /* EBX bit 31 */
+            }
+            /* BMI2 is folded in: the AVX-512 code paths here also use it. */
+            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
+                    && features->has_avx512vl && features->has_bmi2;
+            features->has_avx512vnni = ecx & 0x800;    /* ECX bit 11 */
+        }
+    }
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.h b/neozip/arch/x86/x86_features.h
new file mode 100644
index 0000000000..2118b8e87a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.h
@@ -0,0 +1,30 @@
+/* x86_features.h -- check for CPU features
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
+
+/* CPU feature flags filled in by x86_check_features().  Each field is
+ * nonzero when the feature is present (and, for AVX/AVX-512 features,
+ * usable given the OS-saved register state). */
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512f;
+    int has_avx512dq;
+    int has_avx512bw;
+    int has_avx512vl;
+    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
+    int has_avx512vnni;
+    int has_bmi2;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse41;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;   // OS saves YMM state (XCR0 SSE+AVX bits set)
+    int has_os_save_zmm;   // OS saves ZMM/opmask state as well
+};
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
+
+#endif /* X86_FEATURES_H_ */
diff --git a/neozip/arch/x86/x86_functions.h b/neozip/arch/x86/x86_functions.h
new file mode 100644
index 0000000000..881c6efe23
--- /dev/null
+++ b/neozip/arch/x86/x86_functions.h
@@ -0,0 +1,196 @@
+/* x86_functions.h -- x86 implementations for arch-specific functions.
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FUNCTIONS_H_
+#define X86_FUNCTIONS_H_
+
+#include "x86_natives.h"
+
+/* So great news, your compiler is broken and causes stack smashing. Rather than
+ * notching out its compilation we'll just remove the assignment in the functable.
+ * Further context:
+ * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
+#if defined(_MSC_VER) && defined(ARCH_32BIT) && _MSC_VER >= 1920 && _MSC_VER <= 1929
+#define NO_CHORBA_SSE
+#endif
+
+#ifdef X86_SSE2
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
+
+# if !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+ uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+# endif
+#endif
+
+#ifdef X86_SSSE3
+uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left);
+void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_SSE42
+uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_AVX2
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
+#endif
+#ifdef X86_AVX512
+uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
+#endif
+#ifdef X86_AVX512VNNI
+uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_PCLMULQDQ_CRC
+uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX2
+uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// X86 - SSE2
+# ifdef X86_SSE2_NATIVE
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse2
+# endif
+# undef native_slide_hash
+# define native_slide_hash slide_hash_sse2
+# endif
+// X86 - SSSE3
+# ifdef X86_SSSE3_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_ssse3
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_ssse3
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_ssse3
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_ssse3
+# endif
+// X86 - SSE4.1
+# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse41
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse41
+# endif
+// X86 - SSE4.2
+# ifdef X86_SSE42_NATIVE
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_sse42
+# endif
+// X86 - PCLMUL
+# ifdef X86_PCLMULQDQ_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_pclmulqdq
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_pclmulqdq
+# endif
+// X86 - AVX2
+# ifdef X86_AVX2_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx2
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx2
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
+# undef native_slide_hash
+# define native_slide_hash slide_hash_avx2
+# endif
+// X86 - AVX512 (F,DQ,BW,VL)
+# ifdef X86_AVX512_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx512
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
+// X86 - AVX512 (VNNI)
+# ifdef X86_AVX512VNNI_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512_vnni
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512_vnni
+# endif
+# endif
+// X86 - VPCLMULQDQ
+# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx512
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx512
+# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx2
+# endif
+#endif
+
+#endif /* X86_FUNCTIONS_H_ */
diff --git a/neozip/arch/x86/x86_intrins.h b/neozip/arch/x86/x86_intrins.h
new file mode 100644
index 0000000000..1d1df5eb11
--- /dev/null
+++ b/neozip/arch/x86/x86_intrins.h
@@ -0,0 +1,126 @@
+#ifndef X86_INTRINS_H
+#define X86_INTRINS_H
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+/* Unfortunately GCC didn't support these things until version 10.
+ * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
+ */
+#ifdef __AVX2__
+#include <immintrin.h>
+
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
+ || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
+/* Zero-extend a 128-bit vector into a 256-bit one for compilers missing
+ * the intrinsic.  The asm vmovdqa forces a real move whose VEX encoding
+ * zeroes the upper lanes, unlike the cast alone which leaves them undefined. */
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+    __m128i r;
+    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+    return _mm256_castsi128_si256(r);
+}
+
+#ifdef __AVX512F__
+/* Zero-extend a 128-bit vector into a 512-bit one for compilers missing
+ * the intrinsic; same forced-move trick as the 256-bit variant. */
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+    __m128i r;
+    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+    return _mm512_castsi128_si512(r);
+}
+#endif // __AVX512F__
+#endif // gcc/AppleClang version test
+
+#endif // __AVX2__
+
+/* GCC <9 is missing some AVX512 intrinsics.
+ */
+#ifdef __AVX512F__
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
+#include <immintrin.h>
+
+#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
+ ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
+
+/* Byte-granular _mm512_set_epi8 for GCC < 9, which lacks it: pack each
+ * group of four byte arguments into one 32-bit lane (via PACK above) and
+ * defer to _mm512_set_epi32.  Argument order matches the real intrinsic:
+ * __q63 is the most significant byte, __q00 the least. */
+static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
+                                      char __q59, char __q58, char __q57, char __q56,
+                                      char __q55, char __q54, char __q53, char __q52,
+                                      char __q51, char __q50, char __q49, char __q48,
+                                      char __q47, char __q46, char __q45, char __q44,
+                                      char __q43, char __q42, char __q41, char __q40,
+                                      char __q39, char __q38, char __q37, char __q36,
+                                      char __q35, char __q34, char __q33, char __q32,
+                                      char __q31, char __q30, char __q29, char __q28,
+                                      char __q27, char __q26, char __q25, char __q24,
+                                      char __q23, char __q22, char __q21, char __q20,
+                                      char __q19, char __q18, char __q17, char __q16,
+                                      char __q15, char __q14, char __q13, char __q12,
+                                      char __q11, char __q10, char __q09, char __q08,
+                                      char __q07, char __q06, char __q05, char __q04,
+                                      char __q03, char __q02, char __q01, char __q00) {
+    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
+                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
+                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
+                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
+                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
+                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
+                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
+                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
+}
+
+#undef PACK
+
+#endif // gcc version test
+#endif // __AVX512F__
+
+/* Missing zero-extension AVX and AVX512 intrinsics.
+ * Fixed in Microsoft Visual Studio 2017 version 15.7
+ * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#ifdef __AVX2__
+/* MSVC < 15.7 shim: zero-extend by inserting the 128-bit value into the
+ * low lane of an all-zero 256-bit register. */
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
+}
+#endif // __AVX2__
+
+#ifdef __AVX512F__
+/* MSVC < 15.7 shim: zero-extend by inserting the 128-bit value into the
+ * low lane of an all-zero 512-bit register. */
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
+}
+#endif // __AVX512F__
+#endif // defined(_MSC_VER) && _MSC_VER < 1914
+
+/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
+#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
+# undef _mm512_extracti32x4_epi32
+# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+/* For whatever reason this intrinsic is 64 bit only with MSVC?
+ * While we don't have 64 bit GPRs, it should at least be able to move it to stack
+ * or shuffle it over 2 registers */
+#ifdef ARCH_32BIT
+/* So, while we can't move directly to a GPR, hopefully this move to
+ * a stack resident variable doesn't equate to something awful */
+/* Extract the low 64 bits of a 128-bit vector on 32-bit MSVC, where the
+ * real intrinsic is unavailable: spill to a stack union and read back the
+ * first 8 bytes (the low lane on little-endian x86). */
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+    union { __m128i v; int64_t i; } u;
+    u.v = a;
+    return u.i;
+}
+
+/* Inverse operation: place a 64-bit value into the low lane, upper lane zero. */
+static inline __m128i _mm_cvtsi64_si128(int64_t a) {
+    return _mm_set_epi64x(0, a);
+}
+#endif
+#endif
+
+#if defined(__GNUC__) && defined(ARCH_X86) && defined(ARCH_32BIT) && !defined(__clang__)
+/* Same low-64-bit extraction shim for 32-bit GCC targets (the 64-bit
+ * intrinsic is not available there); union read returns the low lane. */
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+    union { __m128i v; int64_t i; } u;
+    u.v = a;
+    return u.i;
+}
+#define _mm_cvtsi64_si128(a) _mm_set_epi64x(0, a)
+#endif
+
+#endif // include guard X86_INTRINS_H
diff --git a/neozip/arch/x86/x86_natives.h b/neozip/arch/x86/x86_natives.h
new file mode 100644
index 0000000000..a39b7a51f0
--- /dev/null
+++ b/neozip/arch/x86/x86_natives.h
@@ -0,0 +1,57 @@
+/* x86_natives.h -- x86 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_NATIVES_H_
+#define X86_NATIVES_H_
+
+#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT))
+# ifdef X86_SSE2
+# define X86_SSE2_NATIVE
+# endif
+#endif
+#if defined(__SSSE3__)
+# ifdef X86_SSSE3
+# define X86_SSSE3_NATIVE
+# endif
+#endif
+#if defined(__SSE4_1__)
+# ifdef X86_SSE41
+# define X86_SSE41_NATIVE
+# endif
+#endif
+#if defined(__SSE4_2__)
+# ifdef X86_SSE42
+# define X86_SSE42_NATIVE
+# endif
+#endif
+#if defined(__PCLMUL__)
+# ifdef X86_PCLMULQDQ_CRC
+# define X86_PCLMULQDQ_NATIVE
+# endif
+#endif
+#if defined(__AVX2__)
+# ifdef X86_AVX2
+# define X86_AVX2_NATIVE
+# endif
+#endif
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+# ifdef X86_AVX512
+# define X86_AVX512_NATIVE
+# endif
+#endif
+#if defined(__AVX512VNNI__)
+# ifdef X86_AVX512VNNI
+# define X86_AVX512VNNI_NATIVE
+# endif
+#endif
+#if defined(__VPCLMULQDQ__)
+# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE)
+# define X86_VPCLMULQDQ_AVX2_NATIVE
+# endif
+# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE)
+# define X86_VPCLMULQDQ_AVX512_NATIVE
+# endif
+#endif
+
+#endif /* X86_NATIVES_H_ */