summaryrefslogtreecommitdiff
path: root/neozip/arch
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch')
-rw-r--r--neozip/arch/.gitignore2
-rw-r--r--neozip/arch/arm/Makefile.in86
-rw-r--r--neozip/arch/arm/acle_intrins.h90
-rw-r--r--neozip/arch/arm/adler32_neon.c346
-rw-r--r--neozip/arch/arm/arm_features.c334
-rw-r--r--neozip/arch/arm/arm_features.h19
-rw-r--r--neozip/arch/arm/arm_functions.h75
-rw-r--r--neozip/arch/arm/arm_natives.h31
-rw-r--r--neozip/arch/arm/chunkset_neon.c81
-rw-r--r--neozip/arch/arm/compare256_neon.c56
-rw-r--r--neozip/arch/arm/crc32_armv8.c81
-rw-r--r--neozip/arch/arm/crc32_armv8_p.h103
-rw-r--r--neozip/arch/arm/crc32_armv8_pmull_eor3.c366
-rw-r--r--neozip/arch/arm/neon_intrins.h79
-rw-r--r--neozip/arch/arm/slide_hash_armv6.c49
-rw-r--r--neozip/arch/arm/slide_hash_neon.c48
-rw-r--r--neozip/arch/generic/Makefile.in68
-rw-r--r--neozip/arch/generic/adler32_c.c55
-rw-r--r--neozip/arch/generic/chunk_128bit_perm_idx_lut.h26
-rw-r--r--neozip/arch/generic/chunk_256bit_perm_idx_lut.h47
-rw-r--r--neozip/arch/generic/chunk_permute_table.h53
-rw-r--r--neozip/arch/generic/chunkset_c.c40
-rw-r--r--neozip/arch/generic/compare256_c.c88
-rw-r--r--neozip/arch/generic/compare256_p.h0
-rw-r--r--neozip/arch/generic/crc32_braid_c.c213
-rw-r--r--neozip/arch/generic/crc32_chorba_c.c1275
-rw-r--r--neozip/arch/generic/generic_functions.h64
-rw-r--r--neozip/arch/generic/slide_hash_c.c52
-rw-r--r--neozip/arch/loongarch/Makefile.in99
-rw-r--r--neozip/arch/loongarch/adler32_lasx.c154
-rw-r--r--neozip/arch/loongarch/adler32_lsx.c147
-rw-r--r--neozip/arch/loongarch/chunkset_lasx.c126
-rw-r--r--neozip/arch/loongarch/chunkset_lsx.c74
-rw-r--r--neozip/arch/loongarch/compare256_lasx.c60
-rw-r--r--neozip/arch/loongarch/compare256_lsx.c88
-rw-r--r--neozip/arch/loongarch/crc32_la.c71
-rw-r--r--neozip/arch/loongarch/lasxintrin_ext.h61
-rw-r--r--neozip/arch/loongarch/loongarch_features.c31
-rw-r--r--neozip/arch/loongarch/loongarch_features.h19
-rw-r--r--neozip/arch/loongarch/loongarch_functions.h86
-rw-r--r--neozip/arch/loongarch/loongarch_natives.h25
-rw-r--r--neozip/arch/loongarch/lsxintrin_ext.h33
-rw-r--r--neozip/arch/loongarch/slide_hash_lasx.c49
-rw-r--r--neozip/arch/loongarch/slide_hash_lsx.c54
-rw-r--r--neozip/arch/power/Makefile.in93
-rw-r--r--neozip/arch/power/adler32_power8.c160
-rw-r--r--neozip/arch/power/adler32_vmx.c168
-rw-r--r--neozip/arch/power/chunkset_power8.c50
-rw-r--r--neozip/arch/power/compare256_power9.c68
-rw-r--r--neozip/arch/power/crc32_constants.h1123
-rw-r--r--neozip/arch/power/crc32_power8.c593
-rw-r--r--neozip/arch/power/power_features.c54
-rw-r--r--neozip/arch/power/power_features.h18
-rw-r--r--neozip/arch/power/power_functions.h74
-rw-r--r--neozip/arch/power/power_intrins.h61
-rw-r--r--neozip/arch/power/power_natives.h27
-rw-r--r--neozip/arch/power/slide_hash_power8.c12
-rw-r--r--neozip/arch/power/slide_hash_vmx.c10
-rw-r--r--neozip/arch/power/slide_ppc_tpl.h44
-rw-r--r--neozip/arch/riscv/Makefile.in72
-rw-r--r--neozip/arch/riscv/README.md45
-rw-r--r--neozip/arch/riscv/adler32_rvv.c119
-rw-r--r--neozip/arch/riscv/chunkset_rvv.c126
-rw-r--r--neozip/arch/riscv/compare256_rvv.c48
-rw-r--r--neozip/arch/riscv/crc32_zbc.c103
-rw-r--r--neozip/arch/riscv/riscv_features.c99
-rw-r--r--neozip/arch/riscv/riscv_features.h19
-rw-r--r--neozip/arch/riscv/riscv_functions.h60
-rw-r--r--neozip/arch/riscv/riscv_natives.h19
-rw-r--r--neozip/arch/riscv/slide_hash_rvv.c33
-rw-r--r--neozip/arch/s390/Makefile.in48
-rw-r--r--neozip/arch/s390/README.md265
-rw-r--r--neozip/arch/s390/crc32-vx.c232
-rw-r--r--neozip/arch/s390/dfltcc_common.h119
-rw-r--r--neozip/arch/s390/dfltcc_deflate.c390
-rw-r--r--neozip/arch/s390/dfltcc_deflate.h58
-rw-r--r--neozip/arch/s390/dfltcc_detail.h274
-rw-r--r--neozip/arch/s390/dfltcc_inflate.c195
-rw-r--r--neozip/arch/s390/dfltcc_inflate.h67
-rw-r--r--neozip/arch/s390/s390_features.c18
-rw-r--r--neozip/arch/s390/s390_features.h14
-rw-r--r--neozip/arch/s390/s390_functions.h33
-rw-r--r--neozip/arch/s390/s390_natives.h14
-rwxr-xr-xneozip/arch/s390/self-hosted-builder/actions-runner62
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh52
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile47
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner.service18
-rwxr-xr-xneozip/arch/s390/self-hosted-builder/entrypoint30
-rw-r--r--neozip/arch/x86/Makefile.in176
-rw-r--r--neozip/arch/x86/adler32_avx2.c172
-rw-r--r--neozip/arch/x86/adler32_avx2_p.h32
-rw-r--r--neozip/arch/x86/adler32_avx512.c102
-rw-r--r--neozip/arch/x86/adler32_avx512_p.h57
-rw-r--r--neozip/arch/x86/adler32_avx512_vnni.c205
-rw-r--r--neozip/arch/x86/adler32_sse42.c117
-rw-r--r--neozip/arch/x86/adler32_ssse3.c149
-rw-r--r--neozip/arch/x86/adler32_ssse3_p.h29
-rw-r--r--neozip/arch/x86/chunkset_avx2.c129
-rw-r--r--neozip/arch/x86/chunkset_avx512.c186
-rw-r--r--neozip/arch/x86/chunkset_sse2.c50
-rw-r--r--neozip/arch/x86/chunkset_ssse3.c72
-rw-r--r--neozip/arch/x86/compare256_avx2.c61
-rw-r--r--neozip/arch/x86/compare256_avx512.c87
-rw-r--r--neozip/arch/x86/compare256_sse2.c86
-rw-r--r--neozip/arch/x86/crc32_chorba_sse2.c872
-rw-r--r--neozip/arch/x86/crc32_chorba_sse41.c332
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq.c31
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq_tpl.h708
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx2.c17
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx512.c17
-rw-r--r--neozip/arch/x86/slide_hash_avx2.c48
-rw-r--r--neozip/arch/x86/slide_hash_sse2.c68
-rw-r--r--neozip/arch/x86/x86_features.c128
-rw-r--r--neozip/arch/x86/x86_features.h30
-rw-r--r--neozip/arch/x86/x86_functions.h196
-rw-r--r--neozip/arch/x86/x86_intrins.h126
-rw-r--r--neozip/arch/x86/x86_natives.h57
117 files changed, 14578 insertions, 0 deletions
diff --git a/neozip/arch/.gitignore b/neozip/arch/.gitignore
new file mode 100644
index 0000000000..2c3af0a08c
--- /dev/null
+++ b/neozip/arch/.gitignore
@@ -0,0 +1,2 @@
+# ignore Makefiles; they're all automatically generated
+Makefile
diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in
new file mode 100644
index 0000000000..d0bfe0e172
--- /dev/null
+++ b/neozip/arch/arm/Makefile.in
@@ -0,0 +1,86 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ARMV8FLAG=
+PMULLEOR3FLAG=
+NEONFLAG=
+ARMV6FLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ adler32_neon.o adler32_neon.lo \
+ arm_features.o arm_features.lo \
+ chunkset_neon.o chunkset_neon.lo \
+ compare256_neon.o compare256_neon.lo \
+ crc32_armv8.o crc32_armv8.lo \
+ crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \
+ slide_hash_neon.o slide_hash_neon.lo \
+ slide_hash_armv6.o slide_hash_armv6.lo \
+
+adler32_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_armv8.o:
+ $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8.lo:
+ $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8_pmull_eor3.o:
+ $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+crc32_armv8_pmull_eor3.lo:
+ $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+slide_hash_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_armv6.o:
+ $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+ $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h
new file mode 100644
index 0000000000..16f5e2c77c
--- /dev/null
+++ b/neozip/arch/arm/acle_intrins.h
@@ -0,0 +1,90 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+# include <arm_acle.h>
+#endif
+
+#ifdef ARM_CRC32
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+# define Z_TARGET_CRC Z_TARGET("+crc")
+#else
+# define Z_TARGET_CRC
+#endif
+#ifdef ARM_PMULL_EOR3
+# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3")
+#else
+# define Z_TARGET_PMULL_EOR3
+#endif
+
+#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER)
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ uint32_t __c;
+ __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#else
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32);
+}
+#endif
+#endif
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+ uint16x2_t __c;
+ __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c
new file mode 100644
index 0000000000..48532e6cd1
--- /dev/null
+++ b/neozip/arch/arm/adler32_neon.c
@@ -0,0 +1,346 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "adler32_p.h"
+
+static const uint16_t ALIGNED_(64) taps[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57,
+ 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1 };
+
+Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += 64;
+
+ /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+ * bit instruction, we'll have to make due summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0);
+ hsum.val[1] = vpaddlq_u8(d1);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0));
+ s2_1 = vaddw_high_u8(s2_1, d0);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d1));
+ s2_3 = vaddw_high_u8(s2_3, d1);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d2));
+ s2_5 = vaddw_high_u8(s2_5, d2);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d3));
+ s2_7 = vaddw_high_u8(s2_7, d3);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ vst1q_u8(dst, d0);
+ dst += 16;
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
+
+ /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+ * bit instruction, we'll have to make due summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+ hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+ s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+ s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ uint32_t pair[2];
+
+ /* Split Adler-32 into component sums, it can be supplied by
+ * the caller sites (e.g. in a PNG file).
+ */
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* If memory is not SIMD aligned, do scalar sums to an aligned
+ * offset, provided that doing so doesn't completely eliminate
+ * SIMD operation. Aligned loads are still faster on ARM, even
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we use fallback to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
+ size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+ size_t n = NMAX_ALIGNED32;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
+
+ if (COPY)
+ dst += align_diff;
+ src += align_diff;
+ len -= align_diff;
+ n = ALIGN_DOWN(n - align_diff, 32);
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ if (COPY)
+ NEON_accum32_copy(pair, dst, src, n >> 4);
+ else
+ NEON_accum32(pair, src, n >> 4);
+
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n >> 4) << 4;
+ src += k;
+ if (COPY)
+ dst += k;
+ len -= k;
+ n = NMAX_ALIGNED32;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return adler32_copy_impl(adler, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ adler = adler32_neon(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+#endif
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c
new file mode 100644
index 0000000000..8f179526ef
--- /dev/null
+++ b/neozip/arch/arm/arm_features.c
@@ -0,0 +1,334 @@
+#ifdef ARM_FEATURES
+
+#include "zbuild.h"
+#include "arm_features.h"
+
+#if defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+# ifdef ARM_ASM_HWCAP
+# include <asm/hwcap.h>
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# ifndef ID_AA64ISAR0_CRC32_VAL
+# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# include <machine/cpu.h>
+# include <sys/sysctl.h>
+# include <sys/types.h>
+#elif defined(__APPLE__)
+# if !defined(_DARWIN_C_SOURCE)
+# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+# endif
+# include <sys/sysctl.h>
+#elif defined(_WIN32)
+# include <windows.h>
+#endif
+
+static int arm_has_crc32(void) {
+ int has_crc32 = 0;
+#if defined(__ARM_FEATURE_CRC32)
+ /* Compile-time check */
+ has_crc32 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_crc32 = (hwcap & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ has_crc32 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#endif
+ return has_crc32;
+}
+
+static int arm_has_pmull(void) {
+ int has_pmull = 0;
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
+ /* Compile-time check */
+ has_pmull = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_AES) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* Check for AES feature as PMULL is part of crypto extension */
+ has_pmull = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ /* Windows checks for crypto/AES support */
+# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE
+ has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_pmull;
+}
+
+static int arm_has_eor3(void) {
+ int has_eor3 = 0;
+#if defined(__ARM_FEATURE_SHA3)
+ /* Compile-time check */
+ has_eor3 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */
+# ifdef HWCAP2_SHA3
+ has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP2_SHA3
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_eor3 = (hwcap & HWCAP_SHA3) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */
+# ifdef ID_AA64ISAR0_SHA3_VAL
+ has_eor3 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_SHA3_BASE;
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# ifdef ID_AA64ISAR0_SHA3
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL;
+ }
+# endif
+#elif defined(__APPLE__)
+ /* All Apple Silicon (M1+) has SHA3/EOR3 support */
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ /* Fallback to legacy name for older macOS versions */
+ if (!has_eor3) {
+ size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ }
+#elif defined(_WIN32)
+# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE
+ has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_eor3;
+}
+
+/* AArch64 has neon. */
+#ifdef ARCH_32BIT
+static inline int arm_has_neon(void) {
+ int has_neon = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+ /* Compile-time check */
+ has_neon = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_ARM_NEON
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+# elif defined(HWCAP_NEON)
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_NEON
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_neon = (hwcap & HWCAP_NEON) != 0;
+# endif
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+ has_neon = 1; /* Always supported */
+# endif
+#endif
+ return has_neon;
+}
+#endif
+
+/* AArch64 does not have ARMv6 SIMD. */
+#ifdef ARCH_32BIT
+static inline int arm_has_simd(void) {
+ int has_simd = 0;
+#if defined(__ARM_FEATURE_SIMD32)
+ /* Compile-time check for ARMv6 SIMD */
+ has_simd = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ const char *platform = (const char *)getauxval(AT_PLATFORM);
+ has_simd = platform
+ && (strncmp(platform, "v6l", 3) == 0
+ || strncmp(platform, "v7l", 3) == 0
+ || strncmp(platform, "v8l", 3) == 0);
+#endif
+ return has_simd;
+}
+#endif
+
+#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32)
+/* MIDR_EL1 bit field definitions */
+#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24)
+#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4)
+
+/* ARM CPU Implementer IDs */
+#define ARM_IMPLEMENTER_ARM 0x41
+#define ARM_IMPLEMENTER_QUALCOMM 0x51
+#define ARM_IMPLEMENTER_APPLE 0x61
+
+/* ARM CPU Part Numbers */
+
+/* Cortex-X series - Multiple PMULL lanes */
+#define ARM_PART_CORTEX_X1 0xd44
+#define ARM_PART_CORTEX_X1C 0xd4c
+#define ARM_PART_CORTEX_X2 0xd48
+#define ARM_PART_CORTEX_X3 0xd4e
+#define ARM_PART_CORTEX_X4 0xd82
+#define ARM_PART_CORTEX_X925 0xd85
+
+/* Neoverse V/N2 series - Multiple PMULL lanes */
+#define ARM_PART_NEOVERSE_N2 0xd49
+#define ARM_PART_NEOVERSE_V1 0xd40
+#define ARM_PART_NEOVERSE_V2 0xd4f
+#define ARM_PART_NEOVERSE_V3 0xd8e
+
+/* Snapdragon X Elite/Plus - Custom core */
+#define QUALCOMM_PART_ORYON 0x001
+
+static inline int arm_has_cpuid(void) {
+ int has_cpuid = 0;
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0;
+# elif defined(HWCAP2_CPUID)
+ has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_cpuid = (hwcap & HWCAP_CPUID) != 0;
+# endif
+#endif
+ return has_cpuid;
+}
+#endif
+
+/* Determine if CPU has fast PMULL (multiple execution units) */
+static inline int arm_cpu_has_fast_pmull(void) {
+ int has_fast_pmull = 0;
+#if defined(__APPLE__)
+ /* On macOS, all Apple Silicon has fast PMULL */
+ has_fast_pmull = 1;
+#elif defined(ARCH_64BIT) && !defined(_WIN32)
+ /* We need CPUID feature to read MIDR register */
+ if (!arm_has_cpuid())
+ return has_fast_pmull;
+
+ uint64_t midr;
+ __asm__ ("mrs %0, midr_el1" : "=r" (midr));
+
+ uint32_t implementer = MIDR_IMPLEMENTOR(midr);
+ uint32_t part = MIDR_PARTNUM(midr);
+
+ if (implementer == ARM_IMPLEMENTER_APPLE) {
+ /* All Apple Silicon (M1+) have fast PMULL */
+ has_fast_pmull = 1;
+ } else if (implementer == ARM_IMPLEMENTER_ARM) {
+ /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */
+ switch (part) {
+ case ARM_PART_CORTEX_X1:
+ case ARM_PART_CORTEX_X1C:
+ case ARM_PART_CORTEX_X2:
+ case ARM_PART_CORTEX_X3:
+ case ARM_PART_CORTEX_X4:
+ case ARM_PART_CORTEX_X925:
+ case ARM_PART_NEOVERSE_N2:
+ case ARM_PART_NEOVERSE_V1:
+ case ARM_PART_NEOVERSE_V2:
+ case ARM_PART_NEOVERSE_V3:
+ has_fast_pmull = 1;
+ }
+ } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) {
+ /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */
+ if (part == QUALCOMM_PART_ORYON)
+ has_fast_pmull = 1;
+ }
+#endif
+ return has_fast_pmull;
+}
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#ifdef ARCH_64BIT
+ features->has_simd = 0; /* never available */
+ features->has_neon = 1; /* always available */
+#else
+ features->has_simd = arm_has_simd();
+ features->has_neon = arm_has_neon();
+#endif
+ features->has_crc32 = arm_has_crc32();
+ features->has_pmull = arm_has_pmull();
+ features->has_eor3 = arm_has_eor3();
+ features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull();
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h
new file mode 100644
index 0000000000..2f17a9ddf0
--- /dev/null
+++ b/neozip/arch/arm/arm_features.h
@@ -0,0 +1,19 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_
+
+/* Runtime-detected ARM CPU capability flags (0 = absent, nonzero = present),
+ * filled in by arm_check_features(). */
+struct arm_cpu_features {
+ int has_simd; /* ARMv6 SIMD32 instructions (32-bit ARM only) */
+ int has_neon; /* NEON/Advanced SIMD vector unit */
+ int has_crc32; /* ARMv8 CRC32 instructions */
+ int has_pmull; /* polynomial multiply (PMULL) */
+ int has_eor3; /* SHA3 extension 3-way XOR (EOR3) */
+ int has_fast_pmull; /* PMULL on multiple execution units (see arm_features.c) */
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_FEATURES_H_ */
diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h
new file mode 100644
index 0000000000..bc77adb977
--- /dev/null
+++ b/neozip/arch/arm/arm_functions.h
@@ -0,0 +1,75 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#include "arm_natives.h"
+
+/* Forward declarations for the ARM-optimized implementations in this directory;
+ * each group is only compiled when the matching ARM_* build flag is set. */
+#ifdef ARM_NEON
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
+void slide_hash_neon(deflate_state *s);
+#endif
+
+#ifdef ARM_CRC32
+uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef ARM_PMULL_EOR3
+uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef ARM_SIMD
+void slide_hash_armv6(deflate_state *s);
+#endif
+
+/* With runtime CPU detection disabled, bind the generic native_* entry points
+ * directly to the best implementation guaranteed at compile time (the *_NATIVE
+ * macros come from arm_natives.h). Later groups override earlier ones, so the
+ * PMULL+EOR3 CRC wins over plain ARMv8 CRC32 when both are native. */
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// ARM - SIMD
+# ifdef ARM_SIMD_NATIVE
+# undef native_slide_hash
+# define native_slide_hash slide_hash_armv6
+# endif
+// ARM - NEON
+# ifdef ARM_NEON_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_neon
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_neon
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
+# undef native_slide_hash
+# define native_slide_hash slide_hash_neon
+# endif
+// ARM - CRC32
+# ifdef ARM_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8
+# endif
+// ARM - PMULL EOR3
+# ifdef ARM_PMULL_EOR3_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8_pmull_eor3
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8_pmull_eor3
+# endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h
new file mode 100644
index 0000000000..311e33e958
--- /dev/null
+++ b/neozip/arch/arm/arm_natives.h
@@ -0,0 +1,31 @@
+/* arm_natives.h -- ARM compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_NATIVES_H_
+#define ARM_NATIVES_H_
+
+/* Each *_NATIVE macro is defined only when BOTH the feature is enabled in
+ * this build (ARM_* flag) AND the compiler guarantees the instructions are
+ * available on the target (__ARM_FEATURE_* predefines), so no runtime
+ * detection is needed for it. */
+#if defined(__ARM_FEATURE_SIMD32)
+# ifdef ARM_SIMD
+# define ARM_SIMD_NATIVE
+# endif
+#endif
+/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT)
+# ifdef ARM_NEON
+# define ARM_NEON_NATIVE
+# endif
+#endif
+/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */
+/* NOTE(review): the 801 comparison relies on the ACLE convention where
+ * __ARM_ARCH encodes major*100+minor (e.g. 801 = v8.1) -- older compilers
+ * that define __ARM_ARCH as just 8 fall back to __ARM_FEATURE_CRC32. */
+#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801)
+# ifdef ARM_CRC32
+# define ARM_CRC32_NATIVE
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)
+# ifdef ARM_PMULL_EOR3
+# define ARM_PMULL_EOR3_NATIVE
+# endif
+#endif
+
+#endif /* ARM_NATIVES_H_ */
diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c
new file mode 100644
index 0000000000..a891f10fa5
--- /dev/null
+++ b/neozip/arch/arm/chunkset_neon.c
@@ -0,0 +1,81 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+#include "neon_intrins.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+/* One "chunk" is a single 128-bit NEON register. */
+typedef uint8x16_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+/* Broadcast a 2-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from)));
+}
+
+/* Broadcast a 4-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from)));
+}
+
+/* Broadcast an 8-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from)));
+}
+
+#define CHUNKSIZE chunksize_neon
+#define CHUNKCOPY chunkcopy_neon
+#define CHUNKUNROLL chunkunroll_neon
+#define CHUNKMEMSET chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+/* Unaligned 16-byte load of a chunk. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = vld1q_u8(s);
+}
+
+/* Unaligned 16-byte store of a chunk. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ vst1q_u8(out, *chunk);
+}
+
+/* Build a chunk that repeats the dist-byte pattern at 'buf' via a permute LUT
+ * (dist in 3..15; smaller power-of-two dists use the chunkmemset_* helpers).
+ * *chunk_rem is set from the LUT's remval -- presumably the leftover pattern
+ * offset for the next chunk; see chunkset_tpl.h for how it is consumed. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 16 - dist);
+
+ /* This version of table is only available on aarch64 */
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+ uint8x16_t ret_vec = vld1q_u8(buf);
+
+ uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
+ return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+ /* 32-bit ARM lacks vqtbl1q: do the table lookup in two 64-bit halves. */
+ uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+ perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+ perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
+ a = vld1_u8(buf);
+ b = vld1_u8(buf + 8);
+ ret0 = vtbl1_u8(a, perm_vec0);
+ uint8x8x2_t ab;
+ ab.val[0] = a;
+ ab.val[1] = b;
+ ret1 = vtbl2_u8(ab, perm_vec1);
+ return vcombine_u8(ret0, ret1);
+#endif
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_neon
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c
new file mode 100644
index 0000000000..4ced9fc9ca
--- /dev/null
+++ b/neozip/arch/arm/compare256_neon.c
@@ -0,0 +1,56 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON)
+#include "neon_intrins.h"
+
+/* Return the number of leading bytes (0..256) that match between src0 and
+ * src1. Both buffers must have at least 256 readable bytes: the loop always
+ * compares 16 bytes per iteration. The first mismatching byte is located via
+ * the trailing-zero count of the XOR of each 64-bit lane (assumes
+ * little-endian lane/byte order -- TODO confirm for big-endian targets). */
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint8x16_t a, b, cmp;
+ uint64_t lane;
+
+ a = vld1q_u8(src0);
+ b = vld1q_u8(src1);
+
+ /* Nonzero bytes in 'cmp' mark positions where the inputs differ. */
+ cmp = veorq_u8(a, b);
+
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
+ len += 8;
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
+ len += 8;
+
+ src0 += 16, src1 += 16;
+ } while (len < 256);
+
+ return 256;
+}
+
+/* Exported non-inline wrapper used by the function dispatch tables. */
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c
new file mode 100644
index 0000000000..59f2b65009
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8.c
@@ -0,0 +1,81 @@
+/* crc32_armv8.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_CRC32
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Shared body for crc32_armv8 and crc32_copy_armv8: compute CRC-32 with the
+ * ARMv8 CRC instructions, optionally copying src to dst as it goes.
+ * COPY is a compile-time constant; forced inlining lets the compiler drop the
+ * dead copy branches in the non-copy instantiation. Takes/returns the
+ * conventional (complemented) CRC value. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len,
+ const int COPY) {
+ uint32_t c = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ if (COPY)
+ *dst = *src;
+ c = __crc32b(c, *src);
+ return ~c;
+ }
+
+ /* Align to 8-byte boundary for tail processing */
+ uintptr_t align_diff = ALIGN_DIFF(src, 8);
+ if (align_diff)
+ c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY);
+
+ /* Main loop: 64 bytes per iteration; src is 8-byte aligned here, so the
+ * uint64_t loads are aligned. */
+ while (len >= 64) {
+ uint64_t d0 = *(const uint64_t *)src;
+ uint64_t d1 = *(const uint64_t *)(src + 8);
+ uint64_t d2 = *(const uint64_t *)(src + 16);
+ uint64_t d3 = *(const uint64_t *)(src + 24);
+ uint64_t d4 = *(const uint64_t *)(src + 32);
+ uint64_t d5 = *(const uint64_t *)(src + 40);
+ uint64_t d6 = *(const uint64_t *)(src + 48);
+ uint64_t d7 = *(const uint64_t *)(src + 56);
+
+ if (COPY) {
+ memcpy(dst, &d0, 8);
+ memcpy(dst + 8, &d1, 8);
+ memcpy(dst + 16, &d2, 8);
+ memcpy(dst + 24, &d3, 8);
+ memcpy(dst + 32, &d4, 8);
+ memcpy(dst + 40, &d5, 8);
+ memcpy(dst + 48, &d6, 8);
+ memcpy(dst + 56, &d7, 8);
+ dst += 64;
+ }
+
+ c = __crc32d(c, d0);
+ c = __crc32d(c, d1);
+ c = __crc32d(c, d2);
+ c = __crc32d(c, d3);
+ c = __crc32d(c, d4);
+ c = __crc32d(c, d5);
+ c = __crc32d(c, d6);
+ c = __crc32d(c, d7);
+
+ src += 64;
+ len -= 64;
+ }
+
+ /* Tail handles the final <64 bytes and applies the ending complement. */
+ return crc32_armv8_tail(c, dst, src, len, COPY);
+}
+
+/* CRC-32 of buf[0..len) using ARMv8 CRC instructions. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ crc = crc32_armv8(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h
new file mode 100644
index 0000000000..e72c4c0ad1
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_p.h
@@ -0,0 +1,103 @@
+/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CRC32_ARMV8_P_H
+#define CRC32_ARMV8_P_H
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+
+/* Consume up to align_diff leading bytes so *buf reaches the caller's desired
+ * alignment, folding them into the CRC (and copying to *dst when COPY).
+ * Steps are taken smallest-first (1, 2, 4, 8 bytes per the bits of
+ * align_diff), so each wider load below is naturally aligned.
+ * Advances *buf/*dst and decrements *len in place; returns the updated
+ * working CRC (NOT complemented). */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf,
+ size_t *len, uintptr_t align_diff, const int COPY) {
+ if (*len && (align_diff & 1)) {
+ uint8_t val = **buf;
+ if (COPY) {
+ **dst = val;
+ *dst += 1;
+ }
+ crc = __crc32b(crc, val);
+ *buf += 1;
+ *len -= 1;
+ }
+
+ if (*len >= 2 && (align_diff & 2)) {
+ uint16_t val = *((uint16_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 2);
+ *dst += 2;
+ }
+ crc = __crc32h(crc, val);
+ *buf += 2;
+ *len -= 2;
+ }
+
+ if (*len >= 4 && (align_diff & 4)) {
+ uint32_t val = *((uint32_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 4);
+ *dst += 4;
+ }
+ crc = __crc32w(crc, val);
+ *buf += 4;
+ *len -= 4;
+ }
+
+ if (*len >= 8 && (align_diff & 8)) {
+ uint64_t val = *((uint64_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 8);
+ *dst += 8;
+ }
+ crc = __crc32d(crc, val);
+ *buf += 8;
+ *len -= 8;
+ }
+
+ return crc;
+}
+
+/* Fold the remaining bytes (any length) into the CRC, 8/4/2/1 bytes at a
+ * time, copying to dst when COPY. Takes the working CRC and returns the
+ * FINAL complemented CRC-32 value -- callers must not complement again. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf,
+ size_t len, const int COPY) {
+ while (len >= 8) {
+ uint64_t val = *((uint64_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 8);
+ dst += 8;
+ }
+ crc = __crc32d(crc, val);
+ buf += 8;
+ len -= 8;
+ }
+
+ if (len & 4) {
+ uint32_t val = *((uint32_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 4);
+ dst += 4;
+ }
+ crc = __crc32w(crc, val);
+ buf += 4;
+ }
+
+ if (len & 2) {
+ uint16_t val = *((uint16_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 2);
+ dst += 2;
+ }
+ crc = __crc32h(crc, val);
+ buf += 2;
+ }
+
+ if (len & 1) {
+ uint8_t val = *buf;
+ if (COPY)
+ *dst = val;
+ crc = __crc32b(crc, val);
+ }
+
+ return ~crc;
+}
+
+#endif /* CRC32_ARMV8_P_H */
diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
new file mode 100644
index 0000000000..e0d5bf043b
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
@@ -0,0 +1,366 @@
+/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension)
+ * Copyright (C) 2025 Peter Cawley
+ * https://github.com/corsix/fast-crc32
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions.
+ * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter.
+ */
+
+#ifdef ARM_PMULL_EOR3
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "acle_intrins.h"
+#include "neon_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+/* (MSVC lacks the scalar-poly64 overload, hence the lane-extraction variant.) */
+static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
+#ifdef _MSC_VER
+ return vreinterpretq_u64_p128(vmull_p64(
+ vget_low_p64(vreinterpret_p64_u64(a)),
+ vget_low_p64(vreinterpret_p64_u64(b))));
+#else
+ return vreinterpretq_u64_p128(vmull_p64(
+ vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0),
+ vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0)));
+#endif
+}
+
+/* Carryless multiply high 64 bits: a[1] * b[1] */
+static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
+ return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
+}
+
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
+#ifdef _MSC_VER
+ return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b)));
+#else
+ return vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b));
+#endif
+}
+
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+/* Used by crc_shift() to combine independently computed CRC lanes.
+ * Algorithm from corsix/fast-crc32 (see file header): decompose n into
+ * squarings (vmull_p8 self-multiply) plus CRC-instruction reductions. */
+static uint32_t xnmodp(uint64_t n) {
+ uint64_t stack = ~(uint64_t)1;
+ uint32_t acc, low;
+ for (; n > 191; n = (n >> 1) - 16) {
+ stack = (stack << 1) + (n & 1);
+ }
+ stack = ~stack;
+ acc = ((uint32_t)0x80000000) >> (n & 31);
+ for (n >>= 5; n; --n) {
+ /* __crc32w(acc, 0) advances acc by 32 zero bits mod P. */
+ acc = __crc32w(acc, 0);
+ }
+ while ((low = stack & 1), stack >>= 1) {
+ /* Square acc in GF(2)[x] via polynomial self-multiplication. */
+ poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
+ uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
+ acc = __crc32d(0, y << low);
+ }
+ return acc;
+}
+
+/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */
+/* nbytes >= 5 keeps the exponent nbytes*8 - 33 non-negative. */
+static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
+ Assert(nbytes >= 5, "crc_shift requires nbytes >= 5");
+ return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
+}
+
+/* Shared body for the PMULL+EOR3 CRC-32 (optionally copying src to dst).
+ * COPY is a compile-time constant removed by inlining. Phases:
+ * 1) byte-align src to 16 via crc32_armv8_align;
+ * 2) len >= 192: 3 scalar CRC lanes + 9 PMULL-folded vector lanes,
+ * 192 bytes/iteration, then tree-reduce and recombine via crc_shift;
+ * 3) len >= 80: 3-way parallel scalar CRC, 24 bytes/iteration;
+ * 4) crc32_armv8_tail for the remainder (applies the final complement). */
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+ size_t len, const int COPY) {
+ uint32_t crc0 = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ if (COPY)
+ *dst = *src;
+ crc0 = __crc32b(crc0, *src);
+ return ~crc0;
+ }
+
+ /* Align to 16-byte boundary for vector path */
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
+ if (align_diff)
+ crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
+
+ /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
+ if (len >= 192) {
+ size_t blk = len / 192; /* Number of 192-byte blocks */
+ size_t klen = blk * 16; /* Scalar stride per CRC lane */
+ const uint8_t *end = src + len;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + klen;
+ const uint8_t *src2 = src + klen * 2;
+ const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */
+ uint32_t crc1 = 0, crc2 = 0;
+ uint64x2_t vc0, vc1, vc2;
+ uint64_t vc;
+
+ /* Load first 9 vector chunks (144 bytes) */
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
+ uint64x2_t k;
+ /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
+ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
+
+ /* Per-region dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ uint8_t *dst_v = NULL;
+
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ dst_v = dst + klen * 3;
+ vst1q_u8(dst_v, vreinterpretq_u8_u64(x0));
+ vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1));
+ vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2));
+ vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3));
+ vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4));
+ vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5));
+ vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6));
+ vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7));
+ vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8));
+ dst_v += 144;
+ }
+ srcv += 144;
+
+ /* Fold 9 vectors + 3-way parallel scalar CRC */
+ if (blk > 1) {
+ /* Only form a limit pointer when we have at least 2 blocks. */
+ const uint8_t *limit = src0 + klen - 32;
+ while (src0 <= limit) {
+ /* Fold all 9 vector lanes using PMULL */
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+ y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
+
+ /* EOR3: combine hi*k, lo*k, and new data in one instruction */
+ {
+ uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128);
+ uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128);
+ uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128);
+ uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128);
+ uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128);
+ uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128);
+ uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128);
+ uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128);
+ uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128);
+ if (COPY) {
+ vst1q_u8(dst_v, vreinterpretq_u8_u64(d0));
+ vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1));
+ vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2));
+ vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3));
+ vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4));
+ vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5));
+ vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6));
+ vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7));
+ vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8));
+ dst_v += 144;
+ }
+ x0 = veor3q_u64(x0, y0, d0);
+ x1 = veor3q_u64(x1, y1, d1);
+ x2 = veor3q_u64(x2, y2, d2);
+ x3 = veor3q_u64(x3, y3, d3);
+ x4 = veor3q_u64(x4, y4, d4);
+ x5 = veor3q_u64(x5, y5, d5);
+ x6 = veor3q_u64(x6, y6, d6);
+ x7 = veor3q_u64(x7, y7, d7);
+ x8 = veor3q_u64(x8, y8, d8);
+ }
+
+ /* 3-way parallel scalar CRC (16 bytes each) */
+ {
+ uint64_t s0a = *(const uint64_t*)src0;
+ uint64_t s0b = *(const uint64_t*)(src0 + 8);
+ uint64_t s1a = *(const uint64_t*)src1;
+ uint64_t s1b = *(const uint64_t*)(src1 + 8);
+ uint64_t s2a = *(const uint64_t*)src2;
+ uint64_t s2b = *(const uint64_t*)(src2 + 8);
+ if (COPY) {
+ memcpy(dst0, &s0a, 8);
+ memcpy(dst0 + 8, &s0b, 8);
+ dst0 += 16;
+ memcpy(dst1, &s1a, 8);
+ memcpy(dst1 + 8, &s1b, 8);
+ dst1 += 16;
+ memcpy(dst2, &s2a, 8);
+ memcpy(dst2 + 8, &s2b, 8);
+ dst2 += 16;
+ }
+ crc0 = __crc32d(crc0, s0a);
+ crc0 = __crc32d(crc0, s0b);
+ crc1 = __crc32d(crc1, s1a);
+ crc1 = __crc32d(crc1, s1b);
+ crc2 = __crc32d(crc2, s2a);
+ crc2 = __crc32d(crc2, s2b);
+ }
+ src0 += 16;
+ src1 += 16;
+ src2 += 16;
+ srcv += 144;
+ }
+ }
+
+ /* Reduce 9 vectors to 1 using tree reduction */
+ /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
+ { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ x0 = veor3q_u64(x0, y0, x1);
+ x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
+
+ /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ x0 = veor3q_u64(x0, y0, x1);
+ x2 = veor3q_u64(x2, y2, x3);
+ x4 = veor3q_u64(x4, y4, x5);
+ x6 = veor3q_u64(x6, y6, x7);
+
+ /* Step 3: fold pairs (x0,x2), (x4,x6) */
+ { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ x0 = veor3q_u64(x0, y0, x2);
+ x4 = veor3q_u64(x4, y4, x6);
+
+ /* Step 4: final fold (x0, x4) -> x0 */
+ { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ x0 = veor3q_u64(x0, y0, x4);
+
+ /* Process final scalar chunk */
+ {
+ uint64_t s0a = *(const uint64_t*)src0;
+ uint64_t s0b = *(const uint64_t*)(src0 + 8);
+ uint64_t s1a = *(const uint64_t*)src1;
+ uint64_t s1b = *(const uint64_t*)(src1 + 8);
+ uint64_t s2a = *(const uint64_t*)src2;
+ uint64_t s2b = *(const uint64_t*)(src2 + 8);
+ if (COPY) {
+ memcpy(dst0, &s0a, 8);
+ memcpy(dst0 + 8, &s0b, 8);
+ memcpy(dst1, &s1a, 8);
+ memcpy(dst1 + 8, &s1b, 8);
+ memcpy(dst2, &s2a, 8);
+ memcpy(dst2 + 8, &s2b, 8);
+ }
+ crc0 = __crc32d(crc0, s0a);
+ crc0 = __crc32d(crc0, s0b);
+ crc1 = __crc32d(crc1, s1a);
+ crc1 = __crc32d(crc1, s1b);
+ crc2 = __crc32d(crc2, s2a);
+ crc2 = __crc32d(crc2, s2b);
+ }
+
+ /* Shift and combine 3 scalar CRCs */
+ /* Each lane's CRC is advanced by the number of bytes that follow its
+ * region in the stream, then all lanes are XORed together. */
+ vc0 = crc_shift(crc0, klen * 2 + blk * 144);
+ vc1 = crc_shift(crc1, klen + blk * 144);
+ vc2 = crc_shift(crc2, blk * 144);
+ vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
+
+ /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
+ crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
+ crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
+ if (COPY)
+ dst += blk * 192;
+ src = srcv;
+ len = end - srcv;
+ }
+
+ /* 3-way scalar CRC (24 bytes/iter) */
+ if (len >= 80) {
+ size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */
+ const uint8_t *buf0 = src;
+ const uint8_t *buf1 = src + klen;
+ const uint8_t *buf2 = src + klen * 2;
+ uint32_t crc1 = 0, crc2 = 0;
+ uint64x2_t vc0, vc1;
+ uint64_t vc;
+
+ /* Per-lane dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ }
+
+ /* 3-way parallel scalar CRC */
+ do {
+ uint64_t v0 = *(const uint64_t*)buf0;
+ uint64_t v1 = *(const uint64_t*)buf1;
+ uint64_t v2 = *(const uint64_t*)buf2;
+ if (COPY) {
+ memcpy(dst0, &v0, 8);
+ dst0 += 8;
+ memcpy(dst1, &v1, 8);
+ dst1 += 8;
+ memcpy(dst2, &v2, 8);
+ dst2 += 8;
+ }
+ crc0 = __crc32d(crc0, v0);
+ crc1 = __crc32d(crc1, v1);
+ crc2 = __crc32d(crc2, v2);
+ buf0 += 8;
+ buf1 += 8;
+ buf2 += 8;
+ len -= 24;
+ } while (len >= 32);
+
+ /* Combine the 3 CRCs */
+ vc0 = crc_shift(crc0, klen * 2 + 8);
+ vc1 = crc_shift(crc1, klen + 8);
+ vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
+
+ /* Process final 8 bytes with combined CRC */
+ crc0 = crc2;
+ {
+ uint64_t vf = *(const uint64_t*)buf2;
+ if (COPY)
+ memcpy(dst2, &vf, 8);
+ crc0 = __crc32d(crc0, vf ^ vc);
+ }
+ src = buf2 + 8;
+ len -= 8;
+ if (COPY)
+ dst = dst2 + 8;
+ }
+
+ /* Process remaining bytes */
+ return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+/* CRC-32 of buf[0..len) using the PMULL+EOR3 kernel. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ crc = crc32_armv8_pmull_eor3(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h
new file mode 100644
index 0000000000..449916e0b7
--- /dev/null
+++ b/neozip/arch/arm/neon_intrins.h
@@ -0,0 +1,79 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT)
+/* arm64_neon.h is MSVC specific */
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+/* Saturating-subtract b from each of the four vectors in a (used by the
+ * NEON slide_hash). do/while avoids dangling-statement macro pitfalls. */
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+ out.val[0] = vqsubq_u16(a.val[0], b); \
+ out.val[1] = vqsubq_u16(a.val[1], b); \
+ out.val[2] = vqsubq_u16(a.val[2], b); \
+ out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \
+ (!defined(__clang_major__) || __clang_major__ < 20)
+/* Clang versions before 20 have too strict of an
+ * alignment requirement (:256) for x4 NEON intrinsics */
+# undef ARM_NEON_HASLD4
+# undef vld1q_u16_x4
+# undef vld1q_u8_x4
+# undef vst1q_u16_x4
+# endif
+
+# ifndef ARM_NEON_HASLD4
+
+/* Fallbacks for the x4 load/store intrinsics on toolchains without LD4/ST4
+ * support: four plain 128-bit loads/stores. */
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+ uint16x8x4_t ret;
+ ret.val[0] = vld1q_u16(a);
+ ret.val[1] = vld1q_u16(a+8);
+ ret.val[2] = vld1q_u16(a+16);
+ ret.val[3] = vld1q_u16(a+24);
+ return ret;
+}
+
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+ uint8x16x4_t ret;
+ ret.val[0] = vld1q_u8(a);
+ ret.val[1] = vld1q_u8(a+16);
+ ret.val[2] = vld1q_u8(a+32);
+ ret.val[3] = vld1q_u8(a+48);
+ return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+ vst1q_u16(p, a.val[0]);
+ vst1q_u16(p + 8, a.val[1]);
+ vst1q_u16(p + 16, a.val[2]);
+ vst1q_u16(p + 24, a.val[3]);
+}
+# endif // HASLD4 check
+
+/* *_ex variants: loads/stores with an alignment hint in bits (e.g. 128).
+ * NOTE(review): guarded out on MSVC -- presumably MSVC ships its own *_ex
+ * intrinsics; confirm before extending this list. */
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c
new file mode 100644
index 0000000000..b241e6c5e6
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,49 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_SIMD
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* Subtract wsize from every 16-bit entry with unsigned saturation (UQSUB16),
+ * so entries older than the window clamp to 0 instead of wrapping.
+ * Processes 8 entries (four 2x16-bit words) per iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ Z_REGISTER uint16x2_t v;
+ uint16x2_t p0, p1, p2, p3;
+ Z_REGISTER size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+ /* Replicate wsize into both 16-bit halves of the 32-bit SIMD word. */
+ v = wsize | (wsize << 16);
+
+ n = size / (sizeof(uint16x2_t) * 4);
+ do {
+ p0 = *((const uint16x2_t *)(table));
+ p1 = *((const uint16x2_t *)(table+2));
+ p2 = *((const uint16x2_t *)(table+4));
+ p3 = *((const uint16x2_t *)(table+6));
+ p0 = __uqsub16(p0, v);
+ p1 = __uqsub16(p1, v);
+ p2 = __uqsub16(p2, v);
+ p3 = __uqsub16(p3, v);
+ *((uint16x2_t *)(table)) = p0;
+ *((uint16x2_t *)(table+2)) = p1;
+ *((uint16x2_t *)(table+4)) = p2;
+ *((uint16x2_t *)(table+6)) = p3;
+ table += 8;
+ } while (--n);
+}
+
+/* Rebase both deflate hash tables after the window slides by w_size. */
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c
new file mode 100644
index 0000000000..2f9e94a33d
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_neon.c
@@ -0,0 +1,48 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* Subtract wsize from every 16-bit entry with unsigned saturation
+ * (vqsubq_u16), so stale entries clamp to 0 instead of wrapping.
+ * Processes 64 entries (two x4 groups of 128-bit vectors) per iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ Z_REGISTER uint16x8_t v;
+ uint16x8x4_t p0, p1;
+ Z_REGISTER size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ /* NOTE(review): precedence makes this (size % 16) * 8 == 0, i.e. a 16-byte
+ * check; likely intended size % (sizeof(uint16x8_t) * 8) == 0 as written
+ * with parentheses in slide_hash_armv6.c -- the two tests accept different
+ * sizes, confirm which was meant. */
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+ v = vdupq_n_u16(wsize);
+
+ n = size / (sizeof(uint16x8_t) * 8);
+ do {
+ p0 = vld1q_u16_x4_ex(table, 256);
+ p1 = vld1q_u16_x4_ex(table+32, 256);
+ vqsubq_u16_x4_x1(p0, p0, v);
+ vqsubq_u16_x4_x1(p1, p1, v);
+ vst1q_u16_x4_ex(table, p0, 256);
+ vst1q_u16_x4_ex(table+32, p1, 256);
+ table += 64;
+ } while (--n);
+}
+
+/* Rebase both deflate hash tables after the window slides by w_size. */
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/generic/Makefile.in b/neozip/arch/generic/Makefile.in
new file mode 100644
index 0000000000..1d9cc4df5b
--- /dev/null
+++ b/neozip/arch/generic/Makefile.in
@@ -0,0 +1,68 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# Toolchain variables are filled in by the configure step.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# Build both regular objects (.o) and shared/PIC objects (.lo, compiled
+# with SFLAGS) for every generic (portable C) kernel.
+all: \
+	adler32_c.o adler32_c.lo \
+	chunkset_c.o chunkset_c.lo \
+	compare256_c.o compare256_c.lo \
+	crc32_braid_c.o crc32_braid_c.lo \
+	crc32_chorba_c.o crc32_chorba_c.lo \
+	slide_hash_c.o slide_hash_c.lo
+
+
+adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+
+crc32_chorba_c.lo: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+
+slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/generic/adler32_c.c b/neozip/arch/generic/adler32_c.c
new file mode 100644
index 0000000000..84c946f452
--- /dev/null
+++ b/neozip/arch/generic/adler32_c.c
@@ -0,0 +1,55 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* Scalar Adler-32 over len bytes of buf. Keeps the two component sums
+ * separately (low half in `adler`, high half in `sum2`), unrolls 8 or 16
+ * byte-sums per inner iteration, and defers the modulo to once per
+ * NMAX-byte block; the tail is finished by adler32_copy_tail. */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16; /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8; /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            ADLER_DO16(adler, sum2, buf); /* 16 sums unrolled */
+            buf += 16;
+#else
+            ADLER_DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0);
+}
+
+/* Checksum-and-copy variant: computes the checksum with the functable's
+ * adler32 implementation, then copies src to dst. */
+Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = FUNCTABLE_CALL(adler32)(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
diff --git a/neozip/arch/generic/chunk_128bit_perm_idx_lut.h b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
new file mode 100644
index 0000000000..6e5098bf26
--- /dev/null
+++ b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
@@ -0,0 +1,26 @@
+/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_
+#define CHUNK_128BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
+/* Indexed by (dist - 3): each entry gives a byte offset into permute_table
+ * plus a remainder value consumed by the chunkmemset code. Entries marked
+ * "don't care" correspond to dists that are never looked up here
+ * (presumably handled without permutation -- see chunk_permute_table.h). */
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_256bit_perm_idx_lut.h b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
new file mode 100644
index 0000000000..796a7df120
--- /dev/null
+++ b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
@@ -0,0 +1,47 @@
+/* chunk_256bit_perm_idx_lut.h - shared AVX512/AVX2/LASX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CHUNK_256BIT_PERM_IDX_LUT_H_
+#define CHUNK_256BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6, 9},  /* 23 */
+    {11 * 32 + 16 * 7, 8},  /* 24 */
+    {11 * 32 + 16 * 8, 7},  /* 25 */
+    {11 * 32 + 16 * 9, 6},  /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+/* Remainder values for the half-register (128-bit) path; indexed by
+ * (dist - 3) like perm_idx_lut -- presumably consumed by the templated
+ * chunkmemset code (not visible here); confirm against chunkset_tpl.h. */
+static const uint16_t half_rem_vals[13] = {
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_permute_table.h b/neozip/arch/generic/chunk_permute_table.h
new file mode 100644
index 0000000000..bad66ccc77
--- /dev/null
+++ b/neozip/arch/generic/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
+/* Rows for dist 3..15 are 32 bytes (a full 256-bit shuffle mask); rows for
+ * dist 17..31 are 16 bytes and use absolute indices >= 16 so the consumer
+ * can derive a blend mask, as explained in the comment below. */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
+     * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
+     * this is what we're dealt.
+     */
+
+    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
+/* Pairs a byte offset into permute_table with a remainder value used by the
+ * chunkmemset implementations that include this header. */
+typedef struct lut_rem_pair_s {
+    uint16_t idx;    /* byte offset into permute_table for this dist */
+    uint16_t remval; /* remainder value -- semantics defined by the consumer */
+} lut_rem_pair;
+
+#endif
diff --git a/neozip/arch/generic/chunkset_c.c b/neozip/arch/generic/chunkset_c.c
new file mode 100644
index 0000000000..ff9b1cb5fb
--- /dev/null
+++ b/neozip/arch/generic/chunkset_c.c
@@ -0,0 +1,40 @@
+/* chunkset.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+/* Generic (portable C) chunk: a single 64-bit word. */
+typedef uint64_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Broadcast a 4-byte pattern into both 32-bit halves of the chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp = zng_memread_4(from);
+    *chunk = tmp | ((chunk_t)tmp << 32);
+}
+
+/* An 8-byte pattern fills the chunk exactly. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = zng_memread_8(from);
+}
+
+/* Load one chunk from memory (unaligned-safe via zng_memread_8). */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = zng_memread_8(s);
+}
+
+/* Store one chunk to memory (unaligned-safe via zng_memwrite_8). */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    zng_memwrite_8(out, *chunk);
+}
+
+/* Names under which the templates instantiate the generic C kernels. */
+#define CHUNKSIZE chunksize_c
+#define CHUNKCOPY chunkcopy_c
+#define CHUNKUNROLL chunkunroll_c
+#define CHUNKMEMSET chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_c
+
+#include "inffast_tpl.h"
diff --git a/neozip/arch/generic/compare256_c.c b/neozip/arch/generic/compare256_c.c
new file mode 100644
index 0000000000..6934a55565
--- /dev/null
+++ b/neozip/arch/generic/compare256_c.c
@@ -0,0 +1,88 @@
+/* compare256.c -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison for hardware without unaligned loads */
+/* Returns the length (0..256) of the common prefix of src0 and src1,
+ * comparing one byte at a time in groups of eight. */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (src0[0] != src1[0])
+            return len;
+        if (src0[1] != src1[1])
+            return len + 1;
+        if (src0[2] != src1[2])
+            return len + 2;
+        if (src0[3] != src1[3])
+            return len + 3;
+        if (src0[4] != src1[4])
+            return len + 4;
+        if (src0[5] != src1[5])
+            return len + 5;
+        if (src0[6] != src1[6])
+            return len + 6;
+        if (src0[7] != src1[7])
+            return len + 7;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* 64-bit integer comparison for hardware with unaligned loads */
+/* Returns the length (0..256) of the common prefix of src0 and src1.
+ * XOR of two 8-byte words is nonzero iff they differ; converting the
+ * difference to little-endian makes the first differing byte occupy the
+ * lowest-order bits, so ctz/8 yields its byte index within the word. */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint64_t sv = zng_memread_8(src0);
+        uint64_t mv = zng_memread_8(src1);
+        uint64_t diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+
+        /* Unrolled second iteration: 16 bytes checked per loop pass. */
+        sv = zng_memread_8(src0);
+        mv = zng_memread_8(src1);
+        diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Select the widest comparison the target's OPTIMAL_CMP setting allows. */
+#if OPTIMAL_CMP == 8
+#  define COMPARE256 compare256_8_static
+#else
+#  define COMPARE256 compare256_64_static
+#endif
+
+/* Exported fallback entry points (only built when all fallbacks are wanted). */
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_64_static(src0, src1);
+}
+#endif
+
+/* Generic entry point dispatching to the selected static implementation. */
+Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
+    return COMPARE256(src0, src1);
+}
+
+// Generate longest_match_c
+#define LONGEST_MATCH longest_match_c
+#include "match_tpl.h"
+
+// Generate longest_match_slow_c
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_c
+#include "match_tpl.h"
diff --git a/neozip/arch/generic/compare256_p.h b/neozip/arch/generic/compare256_p.h
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/neozip/arch/generic/compare256_p.h
diff --git a/neozip/arch/generic/crc32_braid_c.c b/neozip/arch/generic/crc32_braid_c.c
new file mode 100644
index 0000000000..bda4a249bb
--- /dev/null
+++ b/neozip/arch/generic/crc32_braid_c.c
@@ -0,0 +1,213 @@
+/* crc32_braid.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+
+/*
+ A CRC of a message is computed on BRAID_N braids of words in the message, where
+ each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
+ three running sparse CRCs are calculated respectively on each braid, at these
+ indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
+ This is done starting at a word boundary, and continues until as many blocks of
+ BRAID_N * BRAID_W bytes as are available have been processed. The results are
+ combined into a single CRC at the end. For this code, BRAID_N must be in the
+ range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
+ if desired by adding more #if blocks, extending the patterns apparent in the code.
+ In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N
+ value is increased.
+
+ BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
+ on a given processor. The choices for BRAID_N and BRAID_W below were based on
+ testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
+ POWER9, and MIPS64 Octeon II processors.
+ The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
+ The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
+ They were all tested with either gcc or clang, all using the -O3 optimization
+ level. Your mileage may vary.
+*/
+
+/* ========================================================================= */
+#ifdef BRAID_W
+/*
+  Return the CRC of the BRAID_W bytes in the word_t data, taking the
+  least-significant byte of the word as the first byte of data, without any pre
+  or post conditioning. This is used to combine the CRCs of each braid.
+ */
+# if BYTE_ORDER == LITTLE_ENDIAN
+/* Little-endian: fold bytes out of the low end of the word. */
+static uint32_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data >> 8) ^ crc_table[data & 0xff];
+    return (uint32_t)data;
+}
+# elif BYTE_ORDER == BIG_ENDIAN
+/* Big-endian: fold bytes out of the high end, using the big-endian table. */
+static z_word_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data << 8) ^
+            crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
+    return data;
+}
+# endif /* BYTE_ORDER */
+#endif /* BRAID_W */
+
+/* ========================================================================= */
+/* Compute the CRC-32 of buf[0..len-1], continuing from crc.
+ * Aligns to a z_word_t boundary, then runs BRAID_N interleaved per-braid
+ * CRCs over blocks of BRAID_N*BRAID_W bytes, combines them with crc_word,
+ * and finishes any tail with crc32_copy_small. */
+Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* Pre-condition the CRC register (inverted, zlib-style). */
+    crc = ~crc;
+
+#ifdef BRAID_W
+    /* If provided enough bytes, do a braided CRC calculation. */
+    if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
+        size_t blks;
+        z_word_t const *words;
+        int k;
+
+        /* Compute the CRC up to a z_word_t boundary. */
+        size_t align_diff = (size_t)MIN(ALIGN_DIFF(buf, BRAID_W), len);
+        if (align_diff) {
+            crc = crc32_copy_small(crc, NULL, buf, align_diff, BRAID_W - 1, 0);
+            len -= align_diff;
+            buf += align_diff;
+        }
+
+        /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */
+        blks = len / (BRAID_N * BRAID_W);
+        len -= blks * BRAID_N * BRAID_W;
+        words = (z_word_t const *)buf;
+
+        z_word_t crc0, word0, comb;
+#if BRAID_N > 1
+        z_word_t crc1, word1;
+#if BRAID_N > 2
+        z_word_t crc2, word2;
+#if BRAID_N > 3
+        z_word_t crc3, word3;
+#if BRAID_N > 4
+        z_word_t crc4, word4;
+#if BRAID_N > 5
+        z_word_t crc5, word5;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Initialize the CRC for each braid. */
+        crc0 = Z_WORD_FROM_LE(crc);
+#if BRAID_N > 1
+        crc1 = 0;
+#if BRAID_N > 2
+        crc2 = 0;
+#if BRAID_N > 3
+        crc3 = 0;
+#if BRAID_N > 4
+        crc4 = 0;
+#if BRAID_N > 5
+        crc5 = 0;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
+        while (--blks) {
+            /* Load the word for each braid into registers. */
+            word0 = crc0 ^ words[0];
+#if BRAID_N > 1
+            word1 = crc1 ^ words[1];
+#if BRAID_N > 2
+            word2 = crc2 ^ words[2];
+#if BRAID_N > 3
+            word3 = crc3 ^ words[3];
+#if BRAID_N > 4
+            word4 = crc4 ^ words[4];
+#if BRAID_N > 5
+            word5 = crc5 ^ words[5];
+#endif
+#endif
+#endif
+#endif
+#endif
+            words += BRAID_N;
+
+            /* Compute and update the CRC for each word. The loop should get unrolled. */
+            crc0 = BRAID_TABLE[0][word0 & 0xff];
+#if BRAID_N > 1
+            crc1 = BRAID_TABLE[0][word1 & 0xff];
+#if BRAID_N > 2
+            crc2 = BRAID_TABLE[0][word2 & 0xff];
+#if BRAID_N > 3
+            crc3 = BRAID_TABLE[0][word3 & 0xff];
+#if BRAID_N > 4
+            crc4 = BRAID_TABLE[0][word4 & 0xff];
+#if BRAID_N > 5
+            crc5 = BRAID_TABLE[0][word5 & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            for (k = 1; k < BRAID_W; k++) {
+                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
+#if BRAID_N > 1
+                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
+#if BRAID_N > 2
+                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
+#if BRAID_N > 3
+                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
+#if BRAID_N > 4
+                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
+#if BRAID_N > 5
+                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            }
+        }
+
+        /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
+        comb = crc_word(crc0 ^ words[0]);
+#if BRAID_N > 1
+        comb = crc_word(crc1 ^ words[1] ^ comb);
+#if BRAID_N > 2
+        comb = crc_word(crc2 ^ words[2] ^ comb);
+#if BRAID_N > 3
+        comb = crc_word(crc3 ^ words[3] ^ comb);
+#if BRAID_N > 4
+        comb = crc_word(crc4 ^ words[4] ^ comb);
+#if BRAID_N > 5
+        comb = crc_word(crc5 ^ words[5] ^ comb);
+#endif
+#endif
+#endif
+#endif
+#endif
+        words += BRAID_N;
+        Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
+        crc = (uint32_t)Z_WORD_FROM_LE(comb);
+
+        /* Update the pointer to the remaining bytes to process. */
+        buf = (const unsigned char *)words;
+    }
+
+#endif /* BRAID_W */
+
+    /* Complete the computation of the CRC on any remaining bytes. */
+    /* NOTE(review): BRAID_N and BRAID_W are referenced here outside the
+     * #ifdef BRAID_W guard above -- confirm both are always defined (e.g. in
+     * crc32_braid_p.h) even when the braided path is compiled out. */
+    return ~crc32_copy_small(crc, NULL, buf, len, (BRAID_N * BRAID_W) - 1, 0);
+}
+
+/* Checksum-and-copy variant: computes the braided CRC over src, then
+ * copies src to dst. */
+Z_INTERNAL uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_braid(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
diff --git a/neozip/arch/generic/crc32_chorba_c.c b/neozip/arch/generic/crc32_chorba_c.c
new file mode 100644
index 0000000000..693972da11
--- /dev/null
+++ b/neozip/arch/generic/crc32_chorba_c.c
@@ -0,0 +1,1275 @@
+#include "zbuild.h"
+#include "zendian.h"
+#if defined(__EMSCRIPTEN__)
+# include "zutil_p.h"
+#endif
+#include "zmemory.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "generic_functions.h"
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Scratch buffer: 16K chorba words, expressed also in chorba words and in
+ * 64-bit quadwords for the different access paths below. */
+#define bitbuffer_size_bytes (16 * 1024 * sizeof(chorba_word_t))
+#define bitbuffer_size_zwords (bitbuffer_size_bytes / sizeof(chorba_word_t))
+#define bitbuffer_size_qwords (bitbuffer_size_bytes / sizeof(uint64_t))
+
+/* When the chorba word is not 64-bit, reading the word buffer through
+ * uint64_t* would violate strict aliasing; GCC's may_alias attribute makes
+ * such accesses well-defined. Plain uint64_t otherwise. */
+#if defined(HAVE_MAY_ALIAS) && CHORBA_W != 8
+    typedef uint64_t __attribute__ ((__may_alias__)) uint64a_t;
+#else
+    typedef uint64_t uint64a_t;
+#endif
+
+/**
+ * Implements the Chorba algorithm for CRC32 computation (https://arxiv.org/abs/2412.16398).
+ *
+ * This implementation processes data in three phases:
+ * 1. Initial pass: Zeros out bitbuffer
+ * 2. Intermediate pass: Processes half the values
+ * 3. Main pass: Processes remaining data
+ *
+ * @param crc Initial CRC value
+ * @param input Input data buffer
+ * @param len Length of input data
+ * @return Computed CRC32 value
+ *
+ * @note Requires minimum input size of 118960 + 512 bytes
+ * @note Uses 128KB temporary buffer
+ */
+Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+#if defined(__EMSCRIPTEN__)
+ chorba_word_t *bitbuffer = (chorba_word_t*)zng_alloc(bitbuffer_size_bytes);
+#else
+ ALIGNED_(16) chorba_word_t bitbuffer[bitbuffer_size_zwords];
+#endif
+ const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+ uint64a_t *bitbuffer_qwords = (uint64a_t*)bitbuffer;
+ /* The calling function ensured that this is aligned correctly */
+ const chorba_word_t* input = (const chorba_word_t*)buf;
+ const uint64a_t* input_qwords = (const uint64a_t*)buf;
+
+ size_t i = 0;
+
+ chorba_word_t next1 = CHORBA_WORD_FROM_LE(~crc);
+
+ chorba_word_t next2 = 0;
+ chorba_word_t next3 = 0;
+ chorba_word_t next4 = 0;
+ chorba_word_t next5 = 0;
+ chorba_word_t next6 = 0;
+ chorba_word_t next7 = 0;
+ chorba_word_t next8 = 0;
+ chorba_word_t next9 = 0;
+ chorba_word_t next10 = 0;
+ chorba_word_t next11 = 0;
+ chorba_word_t next12 = 0;
+ chorba_word_t next13 = 0;
+ chorba_word_t next14 = 0;
+ chorba_word_t next15 = 0;
+ chorba_word_t next16 = 0;
+ chorba_word_t next17 = 0;
+ chorba_word_t next18 = 0;
+ chorba_word_t next19 = 0;
+ chorba_word_t next20 = 0;
+ chorba_word_t next21 = 0;
+ chorba_word_t next22 = 0;
+ crc = 0;
+
+ // do a first pass to zero out bitbuffer
+ for (; i < (14848 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16;
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17;
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18;
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19;
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20;
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21;
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22;
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23;
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24;
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25;
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ // one intermediate pass where we pull half the values
+ for (; i < (14880 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ for (; (i + (14870 + 64) * sizeof(chorba_word_t)) < len; i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1 ^ bitbuffer[in_offset + 0];
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2 ^ bitbuffer[in_offset + 1];
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3 ^ bitbuffer[in_offset + 2];
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4 ^ bitbuffer[in_offset + 3];
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5 ^ bitbuffer[in_offset + 4];
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6 ^ bitbuffer[in_offset + 5];
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7 ^ bitbuffer[in_offset + 6];
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1 ^ bitbuffer[in_offset + 7];
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2 ^ bitbuffer[in_offset + 8];
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3 ^ bitbuffer[in_offset + 9];
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4 ^ bitbuffer[in_offset + 10];
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5 ^ bitbuffer[in_offset + 11];
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6 ^ bitbuffer[in_offset + 12];
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7 ^ bitbuffer[in_offset + 13];
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8 ^ bitbuffer[in_offset + 14];
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9 ^ bitbuffer[in_offset + 15];
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10 ^ bitbuffer[in_offset + 16];
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11 ^ bitbuffer[in_offset + 17];
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12 ^ bitbuffer[in_offset + 18];
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13 ^ bitbuffer[in_offset + 19];
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14 ^ bitbuffer[in_offset + 20];
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15 ^ bitbuffer[in_offset + 21];
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ bitbuffer[(i / sizeof(chorba_word_t) + 0) % bitbuffer_size_zwords] ^= next1;
+ bitbuffer[(i / sizeof(chorba_word_t) + 1) % bitbuffer_size_zwords] ^= next2;
+ bitbuffer[(i / sizeof(chorba_word_t) + 2) % bitbuffer_size_zwords] ^= next3;
+ bitbuffer[(i / sizeof(chorba_word_t) + 3) % bitbuffer_size_zwords] ^= next4;
+ bitbuffer[(i / sizeof(chorba_word_t) + 4) % bitbuffer_size_zwords] ^= next5;
+ bitbuffer[(i / sizeof(chorba_word_t) + 5) % bitbuffer_size_zwords] ^= next6;
+ bitbuffer[(i / sizeof(chorba_word_t) + 6) % bitbuffer_size_zwords] ^= next7;
+ bitbuffer[(i / sizeof(chorba_word_t) + 7) % bitbuffer_size_zwords] ^= next8;
+ bitbuffer[(i / sizeof(chorba_word_t) + 8) % bitbuffer_size_zwords] ^= next9;
+ bitbuffer[(i / sizeof(chorba_word_t) + 9) % bitbuffer_size_zwords] ^= next10;
+ bitbuffer[(i / sizeof(chorba_word_t) + 10) % bitbuffer_size_zwords] ^= next11;
+ bitbuffer[(i / sizeof(chorba_word_t) + 11) % bitbuffer_size_zwords] ^= next12;
+ bitbuffer[(i / sizeof(chorba_word_t) + 12) % bitbuffer_size_zwords] ^= next13;
+ bitbuffer[(i / sizeof(chorba_word_t) + 13) % bitbuffer_size_zwords] ^= next14;
+ bitbuffer[(i / sizeof(chorba_word_t) + 14) % bitbuffer_size_zwords] ^= next15;
+ bitbuffer[(i / sizeof(chorba_word_t) + 15) % bitbuffer_size_zwords] ^= next16;
+ bitbuffer[(i / sizeof(chorba_word_t) + 16) % bitbuffer_size_zwords] ^= next17;
+ bitbuffer[(i / sizeof(chorba_word_t) + 17) % bitbuffer_size_zwords] ^= next18;
+ bitbuffer[(i / sizeof(chorba_word_t) + 18) % bitbuffer_size_zwords] ^= next19;
+ bitbuffer[(i / sizeof(chorba_word_t) + 19) % bitbuffer_size_zwords] ^= next20;
+ bitbuffer[(i / sizeof(chorba_word_t) + 20) % bitbuffer_size_zwords] ^= next21;
+ bitbuffer[(i / sizeof(chorba_word_t) + 21) % bitbuffer_size_zwords] ^= next22;
+
+ for (int j = 14870; j < 14870 + 64; j++) {
+ bitbuffer[(j + (i / sizeof(chorba_word_t))) % bitbuffer_size_zwords] = 0;
+ }
+
+ uint64_t next1_64 = 0;
+ uint64_t next2_64 = 0;
+ uint64_t next3_64 = 0;
+ uint64_t next4_64 = 0;
+ uint64_t next5_64 = 0;
+ uint64_t final[9] = {0};
+
+ for (; (i + 72 < len); i += 32) {
+ uint64_t in1;
+ uint64_t in2;
+ uint64_t in3;
+ uint64_t in4;
+ uint64_t a1, a2, a3, a4;
+ uint64_t b1, b2, b3, b4;
+ uint64_t c1, c2, c3, c4;
+ uint64_t d1, d2, d3, d4;
+
+ uint64_t out1;
+ uint64_t out2;
+ uint64_t out3;
+ uint64_t out4;
+ uint64_t out5;
+
+ in1 = input_qwords[i / sizeof(uint64_t)] ^ bitbuffer_qwords[(i / sizeof(uint64_t)) % bitbuffer_size_qwords];
+ in2 = input_qwords[i / sizeof(uint64_t) + 1] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 1) % bitbuffer_size_qwords];
+ in1 = Z_U64_FROM_LE(in1) ^ next1_64;
+ in2 = Z_U64_FROM_LE(in2) ^ next2_64;
+
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+
+ in3 = input_qwords[i / sizeof(uint64_t) + 2] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 2) % bitbuffer_size_qwords];
+ in4 = input_qwords[i / sizeof(uint64_t) + 3] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 3) % bitbuffer_size_qwords];
+ in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
+ in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1_64 = next5_64 ^ out1;
+ next2_64 = out2;
+ next3_64 = out3;
+ next4_64 = out4;
+ next5_64 = out5;
+
+ }
+
+ memcpy(final, input_qwords + (i / sizeof(uint64_t)), len-i);
+ final[0] ^= Z_U64_TO_LE(next1_64);
+ final[1] ^= Z_U64_TO_LE(next2_64);
+ final[2] ^= Z_U64_TO_LE(next3_64);
+ final[3] ^= Z_U64_TO_LE(next4_64);
+ final[4] ^= Z_U64_TO_LE(next5_64);
+
+ uint8_t *final_bytes = (uint8_t*)final;
+
+ for (size_t j = 0; j < (len-i); j++) {
+ crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i) % bitbuffer_size_bytes]) & 0xff] ^ (crc >> 8);
+ }
+
+#if defined(__EMSCRIPTEN__)
+ zng_free(bitbuffer);
+#endif
+ return ~crc;
+}
+
+# if CHORBA_W == 8
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Mid-size variant: scatters the message into a 32 KiB on-stack bit buffer at
+ * the generator-polynomial tap offsets, folds the remaining aligned qwords with
+ * the 4-qword shift/XOR kernel, then finishes the (<= 72 byte) tail with the
+ * byte-at-a-time CRC lookup table.  The caller's buffer is never written.
+ * NOTE(review): callers are expected to route only lengths in the medium range
+ * here (see crc32_chorba) so the scatter indices stay inside bitbuffer. */
+Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t* input = (const uint64_t*)buf;
+    uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+    const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+    memset(bitbuffer, 0, 32768);
+    /* Seed the polynomial state with the pre-conditioned incoming CRC. */
+    bitbuffer[0] = Z_U64_TO_LE(~crc);
+
+    crc = 0;
+
+    size_t i = 0;
+
+    /* Scatter phase: fold each 8-qword (64-byte) group with whatever earlier
+     * iterations deposited at its position, then propagate it forward at the
+     * tap distances (qword offsets +145, +183, +211 are XORed; +300 is a plain
+     * store since nothing has been written that far ahead yet).  The loop
+     * bound keeps the farthest store (i/8 + 7 + 300) inside bitbuffer. */
+    for(; i + 300*8+64 < len; i += 64) {
+        uint64_t in1, in2, in3, in4;
+        uint64_t in5, in6, in7, in8;
+        size_t in_offset = (i/8);
+
+        in1 = input[i / sizeof(uint64_t) + 0] ^ bitbuffer[in_offset + 0];
+        in2 = input[i / sizeof(uint64_t) + 1] ^ bitbuffer[in_offset + 1];
+        in3 = input[i / sizeof(uint64_t) + 2] ^ bitbuffer[in_offset + 2];
+        in4 = input[i / sizeof(uint64_t) + 3] ^ bitbuffer[in_offset + 3];
+        in5 = input[i / sizeof(uint64_t) + 4] ^ bitbuffer[in_offset + 4];
+        in6 = input[i / sizeof(uint64_t) + 5] ^ bitbuffer[in_offset + 5];
+        in7 = input[i / sizeof(uint64_t) + 6] ^ bitbuffer[in_offset + 6];
+        in8 = input[i / sizeof(uint64_t) + 7] ^ bitbuffer[in_offset + 7];
+
+        // [0, 145, 183, 211]
+
+        bitbuffer[(i/8 + 0 + 145)] ^= in1;
+        bitbuffer[(i/8 + 1 + 145)] ^= in2;
+        bitbuffer[(i/8 + 2 + 145)] ^= in3;
+        bitbuffer[(i/8 + 3 + 145)] ^= in4;
+        bitbuffer[(i/8 + 4 + 145)] ^= in5;
+        bitbuffer[(i/8 + 5 + 145)] ^= in6;
+        bitbuffer[(i/8 + 6 + 145)] ^= in7;
+        bitbuffer[(i/8 + 7 + 145)] ^= in8;
+
+        bitbuffer[(i/8 + 0 + 183)] ^= in1;
+        bitbuffer[(i/8 + 1 + 183)] ^= in2;
+        bitbuffer[(i/8 + 2 + 183)] ^= in3;
+        bitbuffer[(i/8 + 3 + 183)] ^= in4;
+        bitbuffer[(i/8 + 4 + 183)] ^= in5;
+        bitbuffer[(i/8 + 5 + 183)] ^= in6;
+        bitbuffer[(i/8 + 6 + 183)] ^= in7;
+        bitbuffer[(i/8 + 7 + 183)] ^= in8;
+
+        bitbuffer[(i/8 + 0 + 211)] ^= in1;
+        bitbuffer[(i/8 + 1 + 211)] ^= in2;
+        bitbuffer[(i/8 + 2 + 211)] ^= in3;
+        bitbuffer[(i/8 + 3 + 211)] ^= in4;
+        bitbuffer[(i/8 + 4 + 211)] ^= in5;
+        bitbuffer[(i/8 + 5 + 211)] ^= in6;
+        bitbuffer[(i/8 + 6 + 211)] ^= in7;
+        bitbuffer[(i/8 + 7 + 211)] ^= in8;
+
+        /* Plain store: offset +300 is beyond every earlier write. */
+        bitbuffer[(i/8 + 0 + 300)] = in1;
+        bitbuffer[(i/8 + 1 + 300)] = in2;
+        bitbuffer[(i/8 + 2 + 300)] = in3;
+        bitbuffer[(i/8 + 3 + 300)] = in4;
+        bitbuffer[(i/8 + 4 + 300)] = in5;
+        bitbuffer[(i/8 + 5 + 300)] = in6;
+        bitbuffer[(i/8 + 6 + 300)] = in7;
+        bitbuffer[(i/8 + 7 + 300)] = in8;
+    }
+
+    uint64_t next1_64 = 0;
+    uint64_t next2_64 = 0;
+    uint64_t next3_64 = 0;
+    uint64_t next4_64 = 0;
+    uint64_t next5_64 = 0;
+    uint64_t final[9] = {0};
+
+    /* Fold phase: reduce the remaining aligned data (input XOR pending
+     * bitbuffer state) 4 qwords at a time.  The shift pairs below are the
+     * qword-granular representation of multiplying by the CRC-32 polynomial
+     * (see the Chorba paper); carries spill into next1_64..next5_64. */
+    for (; (i + 72 < len); i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = input[i / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t))];
+        in2 = input[(i + 8) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 1)];
+        in1 = Z_U64_FROM_LE(in1) ^ next1_64;
+        in2 = Z_U64_FROM_LE(in2) ^ next2_64;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        /* in3/in4 already absorb the low spill (a1, a2^b1) from in1/in2. */
+        in3 = input[(i + 16) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 2)];
+        in4 = input[(i + 24) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 3)];
+        in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
+        in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1_64 = next5_64 ^ out1;
+        next2_64 = out2;
+        next3_64 = out3;
+        next4_64 = out4;
+        next5_64 = out5;
+
+    }
+
+    /* Tail: the fold loop exits with len - i <= 72, which fits final[9]. */
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1_64);
+    final[1] ^= Z_U64_TO_LE(next2_64);
+    final[2] ^= Z_U64_TO_LE(next3_64);
+    final[3] ^= Z_U64_TO_LE(next4_64);
+    final[4] ^= Z_U64_TO_LE(next5_64);
+
+    uint8_t *final_bytes = (uint8_t*)final;
+
+    /* Byte-wise table finish, still folding in the pending bitbuffer bytes. */
+    for (size_t j = 0; j < (len-i); j++) {
+        crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
+    }
+
+    return ~crc;
+}
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Small-buffer variant (64-bit words, no side bitbuffer): an unrolled
+ * 8-stanza pipeline where a 64-byte "chorba" block is pre-loaded and its
+ * contribution is XORed into the following 256 bytes with a per-stanza mask
+ * (the stanza comments 0-3 .. 28-31 index the 4-qword groups).  Each stanza
+ * uses the same shift/XOR kernel; carries travel in next1..next5.  The
+ * caller's buffer is never written; the tail is handed to crc32_braid. */
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t* input = (const uint64_t*)buf;
+    uint64_t final[9] = {0};
+    /* Pre-conditioned CRC enters the pipeline as the first carry word. */
+    uint64_t next1 = ~crc;
+    crc = 0;
+    uint64_t next2 = 0;
+    uint64_t next3 = 0;
+    uint64_t next4 = 0;
+    uint64_t next5 = 0;
+
+    size_t i = 0;
+
+    /* This is weird, doing for vs while drops 10% off the exec time */
+    /* Each full iteration consumes 64 (chorba preload) + 8*32 bytes; the
+     * bound reserves enough input for that plus the fold loop's look-ahead. */
+    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        /* Load the 8-qword chorba block, folding in pending carries;
+         * chorba7/chorba8 also absorb chorba1/chorba2 (tap at distance 6). */
+        uint64_t chorba1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        uint64_t chorba2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+        uint64_t chorba3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3;
+        uint64_t chorba4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4;
+        uint64_t chorba5 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 4]) ^ next5;
+        uint64_t chorba6 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 5]);
+        uint64_t chorba7 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 6]) ^ chorba1;
+        uint64_t chorba8 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 7]) ^ chorba2;
+
+        i += 8 * 8;
+
+        /* 0-3 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ chorba4 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ a1 ^ chorba5 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ a2 ^ b1 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        /* First stanza: all prior carries were consumed by the chorba load,
+         * so next1 takes out1 directly (no next5 fold here). */
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 4-7 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba4 ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba4;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba6 ^ chorba5;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba7 ^ chorba6;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 8-11 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 12-15 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 16-19 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 20-23 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 24-27 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 28-31 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba6 ^ chorba5;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba7 ^ chorba6;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    /* Plain 4-qword fold (no chorba block) until < 72 bytes remain. */
+    for (; (i + 40 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    /* Tail (<= 72 bytes, fits final[9]): fold in the carries and let the
+     * braid implementation finish the remainder. */
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1);
+    final[1] ^= Z_U64_TO_LE(next2);
+    final[2] ^= Z_U64_TO_LE(next3);
+    final[3] ^= Z_U64_TO_LE(next4);
+    final[4] ^= Z_U64_TO_LE(next5);
+
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+
+#else // CHORBA_W == 8
+
+/* 32-bit word variant of the small Chorba kernel (used when CHORBA_W != 8):
+ * folds 10 uint32 words (40 bytes) per iteration using per-word shift pairs;
+ * carries travel in next1..next10, and the tail is finished by crc32_braid.
+ * The caller's buffer is never written. */
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint32_t* input = (const uint32_t*)buf;
+    uint32_t final[20] = {0};
+
+    /* Pre-conditioned CRC enters the pipeline as the first carry word. */
+    uint32_t next1 = ~crc;
+    crc = 0;
+    uint32_t next2 = 0;
+    uint32_t next3 = 0;
+    uint32_t next4 = 0;
+    uint32_t next5 = 0;
+    uint32_t next6 = 0;
+    uint32_t next7 = 0;
+    uint32_t next8 = 0;
+    uint32_t next9 = 0;
+    uint32_t next10 = 0;
+
+    size_t i = 0;
+    /* 40 bytes consumed per iteration; keep 40 bytes of look-ahead so the
+     * tail always fits final[20] (80 bytes). */
+    for (; i + 80 < len; i += 40) {
+        uint32_t in1;
+        uint32_t in2;
+        uint32_t in3;
+        uint32_t in4;
+        uint32_t in5;
+        uint32_t in6;
+        uint32_t in7;
+        uint32_t in8;
+        uint32_t in9;
+        uint32_t in10;
+
+        /* x1..x7 are the shifted partial products of each input word (the
+         * 32-bit split of the polynomial multiply; note x5 is unused/absent). */
+        uint32_t a1, a2, a3, a4, a6, a7;
+        uint32_t b1, b2, b3, b4, b6, b7;
+        uint32_t c1, c2, c3, c4, c6, c7;
+        uint32_t d1, d2, d3, d4, d6, d7;
+        uint32_t e1, e2, e3, e4, e6, e7;
+        uint32_t f1, f2, f3, f4, f6, f7;
+        uint32_t g1, g2, g3, g4, g6, g7;
+        uint32_t h1, h2, h3, h4, h6, h7;
+        uint32_t i1, i2, i3, i4, i6, i7;
+        uint32_t j1, j2, j3, j4, j6, j7;
+
+        uint32_t out1;
+        uint32_t out2;
+        uint32_t out3;
+        uint32_t out4;
+        uint32_t out5;
+        uint32_t out6;
+        uint32_t out7;
+        uint32_t out8;
+        uint32_t out9;
+        uint32_t out10;
+
+        in1 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 0]) ^ next1;
+        in2 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 1]) ^ next2;
+        in3 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 2]) ^ next3;
+        in4 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 3]) ^ next4;
+
+        a1 = (in1 << 17);
+        a2 = (in1 >> 15) ^ (in1 << 23);
+        a3 = (in1 >> 9) ^ (in1 << 19);
+        a4 = (in1 >> 13);
+        a6 = (in1 << 12);
+        a7 = (in1 >> 20);
+
+        b1 = (in2 << 17);
+        b2 = (in2 >> 15) ^ (in2 << 23);
+        b3 = (in2 >> 9) ^ (in2 << 19);
+        b4 = (in2 >> 13);
+        b6 = (in2 << 12);
+        b7 = (in2 >> 20);
+
+        c1 = (in3 << 17);
+        c2 = (in3 >> 15) ^ (in3 << 23);
+        c3 = (in3 >> 9) ^ (in3 << 19);
+        c4 = (in3 >> 13);
+        c6 = (in3 << 12);
+        c7 = (in3 >> 20);
+
+        d1 = (in4 << 17);
+        d2 = (in4 >> 15) ^ (in4 << 23);
+        d3 = (in4 >> 9) ^ (in4 << 19);
+        d4 = (in4 >> 13);
+        d6 = (in4 << 12);
+        d7 = (in4 >> 20);
+
+        /* Words 5-8 absorb the low spill of words 1-4 as they load. */
+        in5 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 4]) ^ next5 ^ a1;
+        in6 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 5]) ^ next6 ^ a2 ^ b1;
+        in7 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 6]) ^ next7 ^ a3 ^ b2 ^ c1;
+        in8 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 7]) ^ next8 ^ a4 ^ b3 ^ c2 ^ d1;
+
+        e1 = (in5 << 17);
+        e2 = (in5 >> 15) ^ (in5 << 23);
+        e3 = (in5 >> 9) ^ (in5 << 19);
+        e4 = (in5 >> 13);
+        e6 = (in5 << 12);
+        e7 = (in5 >> 20);
+
+        f1 = (in6 << 17);
+        f2 = (in6 >> 15) ^ (in6 << 23);
+        f3 = (in6 >> 9) ^ (in6 << 19);
+        f4 = (in6 >> 13);
+        f6 = (in6 << 12);
+        f7 = (in6 >> 20);
+
+        g1 = (in7 << 17);
+        g2 = (in7 >> 15) ^ (in7 << 23);
+        g3 = (in7 >> 9) ^ (in7 << 19);
+        g4 = (in7 >> 13);
+        g6 = (in7 << 12);
+        g7 = (in7 >> 20);
+
+        h1 = (in8 << 17);
+        h2 = (in8 >> 15) ^ (in8 << 23);
+        h3 = (in8 >> 9) ^ (in8 << 19);
+        h4 = (in8 >> 13);
+        h6 = (in8 << 12);
+        h7 = (in8 >> 20);
+
+        in9 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 8]) ^ next9 ^ b4 ^ c3 ^ d2 ^ e1;
+        in10 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 9]) ^ next10 ^ a6 ^ c4 ^ d3 ^ e2 ^ f1;
+
+        i1 = (in9 << 17);
+        i2 = (in9 >> 15) ^ (in9 << 23);
+        i3 = (in9 >> 9) ^ (in9 << 19);
+        i4 = (in9 >> 13);
+        i6 = (in9 << 12);
+        i7 = (in9 >> 20);
+
+        j1 = (in10 << 17);
+        j2 = (in10 >> 15) ^ (in10 << 23);
+        j3 = (in10 >> 9) ^ (in10 << 19);
+        j4 = (in10 >> 13);
+        j6 = (in10 << 12);
+        j7 = (in10 >> 20);
+
+        /* Diagonal accumulation of the remaining partial products; out1..out10
+         * become next iteration's carries. */
+        out1 = a7 ^ b6 ^ d4 ^ e3 ^ f2 ^ g1;
+        out2 = b7 ^ c6 ^ e4 ^ f3 ^ g2 ^ h1;
+        out3 = c7 ^ d6 ^ f4 ^ g3 ^ h2 ^ i1;
+        out4 = d7 ^ e6 ^ g4 ^ h3 ^ i2 ^ j1;
+        out5 = e7 ^ f6 ^ h4 ^ i3 ^ j2;
+        out6 = f7 ^ g6 ^ i4 ^ j3;
+        out7 = g7 ^ h6 ^ j4;
+        out8 = h7 ^ i6;
+        out9 = i7 ^ j6;
+        out10 = j7;
+
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        next6 = out6;
+        next7 = out7;
+        next8 = out8;
+        next9 = out9;
+        next10 = out10;
+
+    }
+
+    /* Tail (<= 80 bytes, fits final[20]): fold in the carries, then let the
+     * braid implementation finish. */
+    memcpy(final, input+(i/sizeof(uint32_t)), len-i);
+    final[0] ^= Z_U32_TO_LE(next1);
+    final[1] ^= Z_U32_TO_LE(next2);
+    final[2] ^= Z_U32_TO_LE(next3);
+    final[3] ^= Z_U32_TO_LE(next4);
+    final[4] ^= Z_U32_TO_LE(next5);
+    final[5] ^= Z_U32_TO_LE(next6);
+    final[6] ^= Z_U32_TO_LE(next7);
+    final[7] ^= Z_U32_TO_LE(next8);
+    final[8] ^= Z_U32_TO_LE(next9);
+    final[9] ^= Z_U32_TO_LE(next10);
+
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+#endif // CHORBA_W == 8
+
+/* Top-level Chorba dispatcher: aligns the buffer to 8 bytes with crc32_braid,
+ * then routes by length to the small / 32 KiB / 118960-qword kernels.  All
+ * kernels are non-destructive and require the 8-byte alignment established
+ * here.  Very short inputs go straight to crc32_braid. */
+Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) {
+    uintptr_t align_diff = ALIGN_DIFF(buf, 8);
+    /* Too small for Chorba to pay off once alignment is accounted for. */
+    if (len <= align_diff + CHORBA_SMALL_THRESHOLD)
+        return crc32_braid(crc, buf, len);
+
+    /* Consume the misaligned prefix bytewise so the kernels see an
+     * 8-byte-aligned pointer. */
+    if (align_diff) {
+        crc = crc32_braid(crc, buf, align_diff);
+        len -= align_diff;
+        buf += align_diff;
+    }
+    if (len > CHORBA_LARGE_THRESHOLD)
+        return crc32_chorba_118960_nondestructive(crc, buf, len);
+#if CHORBA_W == 8
+    /* 64-bit word build: medium lengths use the 32 KiB bitbuffer kernel. */
+    if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+        return crc32_chorba_32768_nondestructive(crc, buf, len);
+    return crc32_chorba_small_nondestructive(crc, buf, len);
+#else
+    return crc32_chorba_small_nondestructive_32bit(crc, buf, len);
+#endif
+}
+
+/* Combined CRC + copy entry point: checksums src first (crc32_chorba never
+ * writes the buffer), then copies src to dst.  memcpy semantics apply:
+ * dst and src must not overlap. */
+uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_chorba(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
diff --git a/neozip/arch/generic/generic_functions.h b/neozip/arch/generic/generic_functions.h
new file mode 100644
index 0000000000..c150a2f010
--- /dev/null
+++ b/neozip/arch/generic/generic_functions.h
@@ -0,0 +1,64 @@
+/* generic_functions.h -- generic C implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+#include "deflate.h"
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*adler32_copy_func)(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);
+typedef uint32_t (*crc32_copy_func)(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+typedef void (*slide_hash_func)(deflate_state *s);
+
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left);
+
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+
+#ifndef WITHOUT_CHORBA
+ uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t longest_match_c(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_c(deflate_state *const s, uint32_t cur_match);
+
+void slide_hash_c(deflate_state *s);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Generic code
+# define native_adler32 adler32_c
+# define native_adler32_copy adler32_copy_c
+# define native_chunkmemset_safe chunkmemset_safe_c
+#ifndef WITHOUT_CHORBA
+# define native_crc32 crc32_chorba
+# define native_crc32_copy crc32_copy_chorba
+#else
+# define native_crc32 crc32_braid
+# define native_crc32_copy crc32_copy_braid
+#endif
+# define native_inflate_fast inflate_fast_c
+# define native_slide_hash slide_hash_c
+# define native_longest_match longest_match_c
+# define native_longest_match_slow longest_match_slow_c
+# define native_compare256 compare256_c
+#endif
+
+#endif
diff --git a/neozip/arch/generic/slide_hash_c.c b/neozip/arch/generic/slide_hash_c.c
new file mode 100644
index 0000000000..8345b9e36b
--- /dev/null
+++ b/neozip/arch/generic/slide_hash_c.c
@@ -0,0 +1,52 @@
+/* slide_hash.c -- slide hash table C implementation
+ *
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+/* ===========================================================================
+ * Slide the hash table when sliding the window down (could be avoided with 32
+ * bit values at the expense of memory usage). We slide even when level == 0 to
+ * keep the hash table consistent if we switch back to level > 0 later.
+ */
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+#ifdef NOT_TWEAK_COMPILER
+ table += entries;
+ do {
+ unsigned m;
+ m = *--table;
+ *table = (Pos)(m >= wsize ? m-wsize : 0);
+ /* If entries is not on any hash chain, prev[entries] is garbage but
+ * its value will never be used.
+ */
+ } while (--entries);
+#else
+ {
+ /* At the time of this change, gcc (4.8.*) isn't able to vectorize
+ * this hot loop using saturated-subtraction on x86-64 architecture.
+ * To avoid this defect, we can change the loop such that
+ * o. the pointer advances forward, and
+ * o. demote the variable 'm' to be local to the loop, and
+ * choose type "Pos" (instead of 'unsigned int') for the
+ * variable to avoid unnecessary zero-extension.
+ */
+ unsigned int i;
+ Pos *q = table;
+ for (i = 0; i < entries; i++) {
+ Pos m = *q;
+ Pos t = (Pos)wsize;
+ *q++ = (Pos)(m >= t ? m-t: 0);
+ }
+ }
+#endif /* NOT_TWEAK_COMPILER */
+}
+
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_c_chain(s->prev, wsize, wsize);
+}
diff --git a/neozip/arch/loongarch/Makefile.in b/neozip/arch/loongarch/Makefile.in
new file mode 100644
index 0000000000..86baed1553
--- /dev/null
+++ b/neozip/arch/loongarch/Makefile.in
@@ -0,0 +1,99 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+LSXFLAG=-mlsx
+LASXFLAG=-mlasx
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ loongarch_features.o loongarch_features.lo \
+ crc32_la.o crc32_la.lo \
+ adler32_lasx.o adler32_lasx.lo \
+ adler32_lsx.o adler32_lsx.lo \
+ chunkset_lasx.o chunkset_lasx.lo \
+ chunkset_lsx.o chunkset_lsx.lo \
+ compare256_lasx.o compare256_lasx.lo \
+ compare256_lsx.o compare256_lsx.lo \
+ slide_hash_lasx.o slide_hash_lasx.lo \
+ slide_hash_lsx.o slide_hash_lsx.lo
+
+loongarch_features.o: $(SRCDIR)/loongarch_features.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
+
+loongarch_features.lo: $(SRCDIR)/loongarch_features.c
+ $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
+
+crc32_la.o: $(SRCDIR)/crc32_la.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+
+crc32_la.lo: $(SRCDIR)/crc32_la.c
+ $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+
+adler32_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c
+
+adler32_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c
+
+adler32_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c
+
+adler32_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c
+
+chunkset_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
+chunkset_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
+chunkset_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
+chunkset_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
+compare256_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+compare256_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+slide_hash_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+slide_hash_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/loongarch/adler32_lasx.c b/neozip/arch/loongarch/adler32_lasx.c
new file mode 100644
index 0000000000..a7268e73ff
--- /dev/null
+++ b/neozip/arch/loongarch/adler32_lasx.c
@@ -0,0 +1,154 @@
+/* adler32_lasx.c -- compute the Adler-32 checksum of a data stream, based on Intel AVX2 implementation
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+
+/* 32 bit horizontal sum */
+static inline uint32_t hsum256(__m256i x) {
+ __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+ __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+ __m256i sum3 = __lasx_xvadd_w(sum2, __lasx_xvbsrl_v(sum2, 4));
+ return (uint32_t)__lasx_xvpickve2gr_wu(sum3, 0);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+ __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+ __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+ return (uint32_t)__lasx_xvpickve2gr_wu(sum2, 0);
+}
+
+extern uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16) {
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+ } else if (len < 32) {
+ if (COPY) {
+ return adler32_copy_lsx(adler, dst, src, len);
+ } else {
+ return adler32_lsx(adler, src, len);
+ }
+ }
+
+ __m256i vs1, vs2, vs2_0;
+
+ const __m256i dot2v = (__m256i)((v32i8){ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+ 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33 });
+ const __m256i dot2v_0 = (__m256i)((v32i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 });
+ const __m256i dot3v = __lasx_xvreplgr2vr_h(1);
+ const __m256i zero = __lasx_xvldi(0);
+
+ while (len >= 32) {
+ vs1 = __lasx_xvinsgr2vr_w(zero, adler0, 0);
+ vs2 = __lasx_xvinsgr2vr_w(zero, adler1, 0);
+
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = __lasx_xvldi(0);
+ vs2_0 = vs3;
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+ len -= k;
+
+ while (k >= 64) {
+ __m256i vbuf = __lasx_xvld(src, 0);
+ __m256i vbuf_0 = __lasx_xvld(src, 32);
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = lasx_sad_bu(vbuf, zero);
+ __m256i vs1_sad2 = lasx_sad_bu(vbuf_0, zero);
+
+ if (COPY) {
+ __lasx_xvst(vbuf, dst, 0);
+ __lasx_xvst(vbuf_0, dst, 32);
+ dst += 64;
+ }
+
+ vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+ vs3 = __lasx_xvadd_w(vs3, vs1_0);
+ __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+ __m256i v_short_sum2_0 = lasx_maddubs_w_h(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+ __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+ __m256i vsum2_0 = lasx_madd_w_h(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+ vs1 = __lasx_xvadd_w(vs1_sad2, vs1);
+ vs2 = __lasx_xvadd_w(vsum2, vs2);
+ vs2_0 = __lasx_xvadd_w(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = __lasx_xvadd_w(vs2_0, vs2);
+ vs3 = __lasx_xvslli_w(vs3, 6);
+ vs2 = __lasx_xvadd_w(vs3, vs2);
+ vs3 = __lasx_xvldi(0);
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ __m256i vbuf = __lasx_xvld(src, 0);
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = lasx_sad_bu(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+ if (COPY) {
+ __lasx_xvst(vbuf, dst, 0);
+ dst += 32;
+ }
+
+ vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+ vs3 = __lasx_xvadd_w(vs3, vs1_0);
+ __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+ __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+ vs2 = __lasx_xvadd_w(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ /* Defer the multiplication with 32 to outside of the loop */
+ vs3 = __lasx_xvslli_w(vs3, 5);
+ vs2 = __lasx_xvadd_w(vs2, vs3);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/adler32_lsx.c b/neozip/arch/loongarch/adler32_lsx.c
new file mode 100644
index 0000000000..389f74c683
--- /dev/null
+++ b/neozip/arch/loongarch/adler32_lsx.c
@@ -0,0 +1,147 @@
+/* adler32_lsx.c -- compute the Adler-32 checksum of a data stream, based on Intel SSE4.2 implementation
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = __lsx_vbsrl_v(x, 8);
+ __m128i sum = __lsx_vadd_w(x, second_int);
+ return __lsx_vpickve2gr_w(sum, 0);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = __lsx_vilvh_d(x, x);
+ __m128i sum2 = __lsx_vadd_w(x, sum1);
+ __m128i sum3 = __lsx_vshuf4i_w(sum2, 0x01);
+ __m128i sum4 = __lsx_vadd_w(sum2, sum3);
+ return __lsx_vpickve2gr_w(sum4, 0);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16)
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = __lsx_vldi(0);
+ const __m128i dot2v = (__m128i)((v16i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 });
+ const __m128i dot2v_0 = (__m128i)((v16i8){ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 });
+ const __m128i dot3v = __lsx_vreplgr2vr_h(1);
+ size_t k;
+
+ while (len >= 16) {
+
+ k = ALIGN_DOWN(MIN(len, NMAX), 16);
+ len -= k;
+
+ vs1 = __lsx_vinsgr2vr_w(zero, adler0, 0);
+ vs2 = __lsx_vinsgr2vr_w(zero, adler1, 0);
+
+ vs3 = __lsx_vldi(0);
+ vs2_0 = __lsx_vldi(0);
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = __lsx_vld(src, 0);
+ vbuf_0 = __lsx_vld(src, 16);
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = lsx_sad_bu(vbuf, zero);
+ v_sad_sum2 = lsx_sad_bu(vbuf_0, zero);
+
+ if (COPY) {
+ __lsx_vst(vbuf, dst, 0);
+ __lsx_vst(vbuf_0, dst, 16);
+ dst += 32;
+ }
+
+ v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v), __lsx_vmulwod_h_bu_b(vbuf, dot2v));
+ v_short_sum2_0 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf_0, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf_0, dot2v_0));
+
+ vs1 = __lsx_vadd_w(v_sad_sum1, vs1);
+ vs3 = __lsx_vadd_w(vs1_0, vs3);
+
+ vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v);
+ vsum2_0 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2_0, dot3v), v_short_sum2_0, dot3v);
+ vs1 = __lsx_vadd_w(v_sad_sum2, vs1);
+ vs2 = __lsx_vadd_w(vsum2, vs2);
+ vs2_0 = __lsx_vadd_w(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = __lsx_vadd_w(vs2_0, vs2);
+ vs3 = __lsx_vslli_w(vs3, 5);
+ vs2 = __lsx_vadd_w(vs3, vs2);
+ vs3 = __lsx_vldi(0);
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = __lsx_vld(src, 0);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = lsx_sad_bu(vbuf, zero);
+ v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf, dot2v_0));
+
+ vs1 = __lsx_vadd_w(v_sad_sum1, vs1);
+ vs3 = __lsx_vadd_w(vs1_0, vs3);
+ vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v);
+ vs2 = __lsx_vadd_w(vsum2, vs2);
+ vs1_0 = vs1;
+
+ if (COPY) {
+ __lsx_vst(vbuf, dst, 0);
+ dst += 16;
+ }
+ }
+
+ vs3 = __lsx_vslli_w(vs3, 4);
+ vs2 = __lsx_vadd_w(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, there's fewer than 16 elements remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
+
+Z_INTERNAL uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/chunkset_lasx.c b/neozip/arch/loongarch/chunkset_lasx.c
new file mode 100644
index 0000000000..905704172d
--- /dev/null
+++ b/neozip/arch/loongarch/chunkset_lasx.c
@@ -0,0 +1,126 @@
+/* chunkset_lasx.c -- LASX inline functions to copy small data chunks, based on Intel AVX2 implementation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+#include "lsxintrin_ext.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ *chunk = lasx_broadcast_128(__lsx_vld(from, 0));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = __lasx_xvld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ __lasx_xvst(*chunk, out, 0);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+ * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+ * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
+ __m128i ret_vec0 = __lsx_vld(buf, 0);
+ ret_vec = __lasx_concat_128(ret_vec0, ret_vec0);
+ ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
+ } else {
+ __m128i ret_vec0 = __lsx_vld(buf, 0);
+ __m128i ret_vec1 = __lsx_vld(buf, 16);
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = __lsx_vld(permute_table + lut_rem.idx, 0);
+ __m128i xlane_permutes = __lsx_vslt_b(perm_vec1, __lsx_vreplgr2vr_b(16));
+ __m128i xlane_res = lsx_shuffle_b(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = __lasx_concat_128(ret_vec0, latter_half);
+ }
+
+ return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ __lsx_vst(*chunk, out, 0);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return lasx_zext_128(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = __lsx_vld(buf, 0);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+ ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lasx
+#define CHUNKCOPY chunkcopy_lasx
+#define CHUNKUNROLL chunkunroll_lasx
+#define CHUNKMEMSET chunkmemset_lasx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lasx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lasx
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/chunkset_lsx.c b/neozip/arch/loongarch/chunkset_lsx.c
new file mode 100644
index 0000000000..23dabfba51
--- /dev/null
+++ b/neozip/arch/loongarch/chunkset_lsx.c
@@ -0,0 +1,74 @@
+/* chunkset_lsx.c -- LSX inline functions to copy small data chunks, based on Intel SSSE3 implementation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ __lsx_vst(*chunk, out, 0);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = __lsx_vld(buf, 0);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+ ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lsx
+#define CHUNKMEMSET chunkmemset_lsx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lsx
+#define CHUNKCOPY chunkcopy_lsx
+#define CHUNKUNROLL chunkunroll_lsx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lsx
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/compare256_lasx.c b/neozip/arch/loongarch/compare256_lasx.c
new file mode 100644
index 0000000000..d61d6e57b3
--- /dev/null
+++ b/neozip/arch/loongarch/compare256_lasx.c
@@ -0,0 +1,60 @@
+/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef LOONGARCH_LASX
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
+
+ src0 += 32, src1 += 32, len += 32;
+
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
+ mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
+
+ src0 += 32, src1 += 32, len += 32;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lasx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/compare256_lsx.c b/neozip/arch/loongarch/compare256_lsx.c
new file mode 100644
index 0000000000..4afd261e76
--- /dev/null
+++ b/neozip/arch/loongarch/compare256_lsx.c
@@ -0,0 +1,88 @@
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef LOONGARCH_LSX
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned, then for all subsequent ones we have at least
+ * one aligned load. Sadly aligning both loads is probably unrealistic */
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ for (int i = 0; i < 15; i++) {
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ if (align_offset) {
+ xmm_src0 = __lsx_vld(last0, 0);
+ xmm_src1 = __lsx_vld(last1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/crc32_la.c b/neozip/arch/loongarch/crc32_la.c
new file mode 100644
index 0000000000..f1bd314e65
--- /dev/null
+++ b/neozip/arch/loongarch/crc32_la.c
@@ -0,0 +1,71 @@
+/* crc32_la.c - LoongArch version of crc32
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_CRC
+
+#include "zbuild.h"
+
+#include <larchintrin.h>
+
+Z_INTERNAL uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len) {
+ uint32_t c = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c);
+ c = ~c;
+ return c;
+ }
+
+ uintptr_t align_diff = ALIGN_DIFF(buf, 8);
+ if (align_diff) {
+ if (len && (align_diff & 1)) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf++), (int)c);
+ len--;
+ }
+
+ if (len >= 2 && (align_diff & 2)) {
+ c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c);
+ buf += 2;
+ len -= 2;
+ }
+
+ if (len >= 4 && (align_diff & 4)) {
+ c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c);
+ len -= 4;
+ buf += 4;
+ }
+
+ }
+
+ while (len >= 8) {
+ c = (uint32_t)__crc_w_d_w((long int)*((uint64_t*)buf), (int)c);
+ len -= 8;
+ buf += 8;
+ }
+
+ if (len & 4) {
+ c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c);
+ buf += 4;
+ }
+
+ if (len & 2) {
+ c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c);
+ buf += 2;
+ }
+
+ if (len & 1) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c);
+ }
+
+ c = ~c;
+ return c;
+}
+
+Z_INTERNAL uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_loongarch64(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/loongarch/lasxintrin_ext.h b/neozip/arch/loongarch/lasxintrin_ext.h
new file mode 100644
index 0000000000..b1e72cff86
--- /dev/null
+++ b/neozip/arch/loongarch/lasxintrin_ext.h
@@ -0,0 +1,61 @@
+/* lasxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LASXINTRIN_EXT_H
+#define LASXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+
+static inline __m256i lasx_zext_128(__m128i src) {
+#ifdef __loongarch_asx_sx_conv
+ return __lasx_insert_128_lo(__lasx_xvldi(0), src);
+#else
+ __m256i dest = __lasx_xvldi(0);
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x30\n" : "=f"(dest) : "0"(dest), "f"(src));
+ return dest;
+#endif
+}
+
+#ifndef __loongarch_asx_sx_conv
+static inline __m256i __lasx_concat_128(__m128i lo, __m128i hi) {
+ __m256i dest;
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x02\n" : "=f"(dest) : "0"(lo), "f"(hi));
+ return dest;
+}
+#endif
+
+static inline __m256i lasx_broadcast_128(__m128i in) {
+ return __lasx_concat_128(in, in);
+}
+
+static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {
+ __m256i tmp = __lasx_xvabsd_bu(a, b);
+ tmp = __lasx_xvhaddw_hu_bu(tmp, tmp);
+ tmp = __lasx_xvhaddw_wu_hu(tmp, tmp);
+ return __lasx_xvhaddw_du_wu(tmp, tmp);
+}
+
+static inline __m256i lasx_maddubs_w_h(__m256i a, __m256i b) {
+ return __lasx_xvsadd_h(__lasx_xvmulwod_h_bu_b(a, b), __lasx_xvmulwev_h_bu_b(a, b));
+}
+
+static inline __m256i lasx_madd_w_h(__m256i a, __m256i b) {
+ return __lasx_xvmaddwod_w_h(__lasx_xvmulwev_w_h(a, b), a, b);
+}
+
+static inline int lasx_movemask_b(__m256i v) {
+ v = __lasx_xvmskltz_b(v);
+ return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
+}
+
+/* See: lsx_shuffle_b */
+static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+ __m256i msb_mask = __lasx_xvslti_b(b, 0);
+ __m256i dst = __lasx_xvshuf_b(a, a, __lasx_xvandi_b(b, 0xF));
+ return __lasx_xvand_v(dst, __lasx_xvnor_v(msb_mask, msb_mask));
+}
+
+#endif // include guard LASXINTRIN_EXT_H
diff --git a/neozip/arch/loongarch/loongarch_features.c b/neozip/arch/loongarch/loongarch_features.c
new file mode 100644
index 0000000000..bedf8499f7
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_features.c
@@ -0,0 +1,31 @@
+/* loongarch_features.c -- check for LoongArch features.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_FEATURES
+
+#include "zbuild.h"
+#include "loongarch_features.h"
+
+#include <larchintrin.h>
+
+/*
+ * https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html
+ *
+ * Word number Bit number Annotation Implication
+ * 0x1 25 CRC 1 indicates support for CRC instruction
+ * 0x1 6 LSX 1 indicates support for 128-bit vector extension
+ * 0x1 7 LASX 1 indicates support for 256-bit vector expansion
+ */
+
+/* Read CPUCFG word 0x1 and record whether the CRC32 instructions (bit 25),
+ * the 128-bit LSX extension (bit 6) and the 256-bit LASX extension (bit 7)
+ * are available.  Fields are nonzero (not necessarily 1) when supported. */
+void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features) {
+    unsigned int w1 = __cpucfg(0x1);
+    features->has_crc = w1 & 0x2000000;  /* bit 25: CRC */
+    features->has_lsx = w1 & 0x40;       /* bit 6: LSX */
+    features->has_lasx = w1 & 0x80;      /* bit 7: LASX */
+}
+
+#endif
diff --git a/neozip/arch/loongarch/loongarch_features.h b/neozip/arch/loongarch/loongarch_features.h
new file mode 100644
index 0000000000..27c90b14b3
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_features.h
@@ -0,0 +1,19 @@
+/* loongarch_features.h -- check for LoongArch features.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_FEATURES_H_
+#define LOONGARCH_FEATURES_H_
+
+/* Runtime CPU feature flags filled in by loongarch_check_features().
+ * Each field is nonzero when the corresponding extension is available. */
+struct loongarch_cpu_features {
+    int has_crc;   /* CRC32 instructions */
+    int has_lsx;   /* 128-bit LSX vector extension */
+    int has_lasx;  /* 256-bit LASX vector extension */
+};
+
+void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features);
+
+#endif /* LOONGARCH_FEATURES_H_ */
diff --git a/neozip/arch/loongarch/loongarch_functions.h b/neozip/arch/loongarch/loongarch_functions.h
new file mode 100644
index 0000000000..0ec8bd66d7
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_functions.h
@@ -0,0 +1,86 @@
+/* loongarch_functions.h -- LoongArch implementations for arch-specific functions.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_FUNCTIONS_H_
+#define LOONGARCH_FUNCTIONS_H_
+
+#include "loongarch_natives.h"
+
+#ifdef LOONGARCH_CRC
+uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef LOONGARCH_LSX
+uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
+uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lsx(deflate_state *s);
+#endif
+
+#ifdef LOONGARCH_LASX
+uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
+uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lasx(deflate_state *s);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// LOONGARCH - CRC32
+# ifdef LOONGARCH_CRC_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_loongarch64
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_loongarch64
+# endif
+# ifdef LOONGARCH_LSX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_lsx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_lsx
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_compare256
+# define native_compare256 compare256_lsx
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_lsx
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
+# endif
+# ifdef LOONGARCH_LASX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_lasx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_lasx
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_lasx
+# undef native_compare256
+# define native_compare256 compare256_lasx
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_lasx
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
+# endif
+#endif
+
+#endif /* LOONGARCH_FUNCTIONS_H_ */
diff --git a/neozip/arch/loongarch/loongarch_natives.h b/neozip/arch/loongarch/loongarch_natives.h
new file mode 100644
index 0000000000..35f6d3c7bd
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_natives.h
@@ -0,0 +1,25 @@
+/* loongarch_natives.h -- LoongArch compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_NATIVES_H_
+#define LOONGARCH_NATIVES_H_
+
+#if defined(__loongarch__)
+// All known CPUs have crc instructions
+# ifdef LOONGARCH_CRC
+# define LOONGARCH_CRC_NATIVE
+# endif
+#endif
+#if defined(__loongarch_sx)
+# ifdef LOONGARCH_LSX
+# define LOONGARCH_LSX_NATIVE
+# endif
+#endif
+#if defined(__loongarch_asx)
+# ifdef LOONGARCH_LASX
+# define LOONGARCH_LASX_NATIVE
+# endif
+#endif
+
+#endif /* LOONGARCH_NATIVES_H_ */
diff --git a/neozip/arch/loongarch/lsxintrin_ext.h b/neozip/arch/loongarch/lsxintrin_ext.h
new file mode 100644
index 0000000000..0a0503b9f9
--- /dev/null
+++ b/neozip/arch/loongarch/lsxintrin_ext.h
@@ -0,0 +1,33 @@
+/* lsxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LSXINTRIN_EXT_H
+#define LSXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+
+
+/* Sum of absolute differences of unsigned bytes: |a-b| per byte, then the
+ * widening horizontal adds fold the byte differences into each 64-bit
+ * lane (x86 psadbw-style result). */
+static inline __m128i lsx_sad_bu(__m128i a, __m128i b) {
+    __m128i tmp = __lsx_vabsd_bu(a, b);
+    tmp = __lsx_vhaddw_hu_bu(tmp, tmp);
+    tmp = __lsx_vhaddw_wu_hu(tmp, tmp);
+    return __lsx_vhaddw_du_wu(tmp, tmp);
+}
+
+/* Gather the sign bit of each of the 16 bytes into the low 16 bits of an
+ * int (vmskltz_b leaves the mask in element 0). */
+static inline int lsx_movemask_b(__m128i v) {
+    return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
+}
+
+/* Byte shuffle with x86 pshufb semantics: result[i] = a[b[i] & 0xF], or
+ * zero when b[i] has its most significant bit set. */
+static inline __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    /* most significant bit is set - negative 8-bit integer */
+    __m128i msb_mask = __lsx_vslti_b(b, 0);
+
+    /* shuffle, clear msb in indices vector b */
+    __m128i dst = __lsx_vshuf_b(a, a, __lsx_vandi_b(b, 0xF));
+
+    /* invert and apply mask - clear dst-element if b-msb is set */
+    return __lsx_vand_v(dst, __lsx_vnor_v(msb_mask, msb_mask));
+}
+
+#endif // include guard LSXINTRIN_EXT_H
diff --git a/neozip/arch/loongarch/slide_hash_lasx.c b/neozip/arch/loongarch/slide_hash_lasx.c
new file mode 100644
index 0000000000..f464779090
--- /dev/null
+++ b/neozip/arch/loongarch/slide_hash_lasx.c
@@ -0,0 +1,49 @@
+/*
+ * LASX optimized hash slide, based on Intel AVX2 implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lasxintrin.h>
+
+/* Subtract wsize from every 16-bit entry of table with unsigned
+ * saturation (entries smaller than wsize become 0), walking backwards
+ * from the end, 32 entries (two 32-byte vectors) per iteration.
+ * NOTE(review): entries must be a nonzero multiple of 32 or the unsigned
+ * counter wraps -- holds for HASH_SIZE and w_size in practice; confirm. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 32;
+
+    do {
+        __m256i value1, value2, result1, result2;
+
+        value1 = __lasx_xvld(table, 0);
+        value2 = __lasx_xvld(table, 32);  /* byte offset: second 16 entries */
+        result1 = __lasx_xvssub_hu(value1, wsize);
+        result2 = __lasx_xvssub_hu(value2, wsize);
+        __lasx_xvst(result1, table, 0);
+        __lasx_xvst(result2, table, 32);
+
+        table -= 32;
+        entries -= 32;
+    } while (entries > 0);
+}
+
+/* Slide the hash head and prev chains after the window advanced by
+ * w_size bytes: every stored position is reduced by w_size, saturating
+ * at 0, so positions that fell out of the window become 0. */
+Z_INTERNAL void slide_hash_lasx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size to every 16-bit lane. */
+    const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/slide_hash_lsx.c b/neozip/arch/loongarch/slide_hash_lsx.c
new file mode 100644
index 0000000000..f4c94ea70d
--- /dev/null
+++ b/neozip/arch/loongarch/slide_hash_lsx.c
@@ -0,0 +1,54 @@
+/*
+ * LSX optimized hash slide, based on Intel SSE implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lsxintrin.h>
+#include <assert.h>
+
+/* Subtract wsize from every 16-bit entry of table with unsigned
+ * saturation, walking backwards from the end, 16 entries (two 16-byte
+ * vectors) per iteration.
+ * NOTE(review): entries must be a nonzero multiple of 16 or the unsigned
+ * counter wraps -- holds for HASH_SIZE and w_size in practice; confirm. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        __m128i value0, value1, result0, result1;
+
+        value0 = __lsx_vld(table, 0);
+        value1 = __lsx_vld(table, 16);  /* byte offset: second 8 entries */
+        result0 = __lsx_vssub_hu(value0, wsize);
+        result1 = __lsx_vssub_hu(value1, wsize);
+        __lsx_vst(result0, table, 0);
+        __lsx_vst(result1, table, 16);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+/* Slide the hash head and prev chains after the window advanced by
+ * w_size bytes: every stored position is reduced by w_size, saturating
+ * at 0, so positions that fell out of the window become 0. */
+Z_INTERNAL void slide_hash_lsx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size to every 16-bit lane. */
+    const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize);
+
+    /* Both tables are expected to be 16-byte aligned by the allocator. */
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, HASH_SIZE, xmm_wsize);
+    slide_hash_chain(s->prev, wsize, xmm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/power/Makefile.in b/neozip/arch/power/Makefile.in
new file mode 100644
index 0000000000..e2bec5e510
--- /dev/null
+++ b/neozip/arch/power/Makefile.in
@@ -0,0 +1,93 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
+PPCFLAGS=-maltivec
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: power_features.o \
+ power_features.lo \
+ adler32_power8.o \
+ adler32_power8.lo \
+ adler32_vmx.o \
+ adler32_vmx.lo \
+ chunkset_power8.o \
+ chunkset_power8.lo \
+ compare256_power9.o \
+ compare256_power9.lo \
+ crc32_power8.o \
+ crc32_power8.lo \
+ slide_hash_power8.o \
+ slide_hash_power8.lo \
+ slide_hash_vmx.o \
+ slide_hash_vmx.lo
+
+power_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+power_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+adler32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_vmx.o:
+ $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+ $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+chunkset_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+compare256_power9.o:
+ $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+ $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+crc32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+slide_hash_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+# Use $(PPCFLAGS) like every other rule in this Makefile; the ${...} form
+# is equivalent in make but was inconsistent with the rest of the file.
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/power/adler32_power8.c b/neozip/arch/power/adler32_power8.c
new file mode 100644
index 0000000000..39b3cf399c
--- /dev/null
+++ b/neozip/arch/power/adler32_power8.c
@@ -0,0 +1,160 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 processed 1 byte at a time, then on the first iteration s1
+ * is s1_0 (_n means iteration n), the initial value of adler - at start
+ * _0 is 1 unless the initial adler value is different than 1. So
+ * s1_1 = s1_0 + c[0] after the first calculation. For the next iteration
+ * s1_2 = s1_1 + c[1] and so on. Hence, for iteration N,
+ * s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
+ * N-1*c[1] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N-bit at time we can do this at once.
+ *
+ * Since VSX supports 16-byte vector operations, we can process
+ * 16 bytes at a time; using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+/* Vector across sum unsigned int (saturate). */
+/* After the two rotate-and-add steps every 32-bit lane of the result
+ * holds the sum of all four lanes of __a; callers read lane 0.
+ * NOTE(review): vec_add is modular, not saturating -- the "(saturate)"
+ * in the comment above appears historical. */
+static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+    __b = vec_sld(__a, __a, 8);
+    __b = vec_add(__b, __a);
+    __a = vec_sld(__b, __b, 4);
+    __a = vec_add(__a, __b);
+
+    return __a;
+}
+
+/* Core VSX Adler-32: processes 16 bytes per vector step and reduces
+ * modulo BASE once per NMAX-byte block, following the formulas in the
+ * file header.  Short inputs (len < 64) and tails are delegated to the
+ * scalar helper adler32_copy_tail. */
+Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0);
+
+    /* This is faster than VSX code for len < 64. */
+    if (len < 64)
+        return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0);
+
+    /* Use POWER VSX instructions for len >= 64. */
+    const vector unsigned int v_zeros = { 0 };
+    /* Taps (16-i+1) for the weighted s2 sum over one 16-byte vector. */
+    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+                                        6, 5, 4, 3, 2, 1};
+    const vector unsigned char vsh = vec_splat_u8(4);
+    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+    vector unsigned int vs1 = { 0 };
+    vector unsigned int vs2 = { 0 };
+    /* Accumulates s1 before each 16-byte step; shifted left by 4 at the
+     * end of a block it supplies the 16*s1 term of the s2 formula. */
+    vector unsigned int vs1_save = { 0 };
+    vector unsigned int vsum1, vsum2;
+    vector unsigned char vbuf;
+    int n;
+
+    /* Seed lane 0 with the incoming scalar sums. */
+    vs1[0] = s1;
+    vs2[0] = s2;
+
+    /* Do length bigger than NMAX in blocks of NMAX size. */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;
+        do {
+            vbuf = vec_xl(0, (unsigned char *) buf);
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+            /* sum(i=1 to 16) buf[i]*(16-i+1). */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1. */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums. */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        } while (--n);
+        /* Once each block of NMAX size. */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+
+        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
+        vs1[0] = vs1[0] % BASE;
+        /* vs2[0] = s2_i + 16*s1_save +
+           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
+        vs2[0] = vs2[0] % BASE;
+
+        /* Keep only lane 0; the reduced sums restart the next block. */
+        vs1 = vec_and(vs1, vmask);
+        vs2 = vec_and(vs2, vmask);
+        vs1_save = v_zeros;
+    }
+
+    /* len is less than NMAX one modulo is needed. */
+    if (len >= 16) {
+        while (len >= 16) {
+            len -= 16;
+
+            vbuf = vec_xl(0, (unsigned char *) buf);
+
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+            /* sum(i=1 to 16) buf[i]*(16-i+1). */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1. */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums. */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        }
+        /* Since the size will be always less than NMAX we do this once. */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+    }
+    /* Copy result back to s1, s2 (mod 65521). */
+    s1 = vs1[0] % BASE;
+    s2 = vs2[0] % BASE;
+
+    /* Process tail (len < 16). */
+    return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0);
+}
+
+/* Public entry point: checksum only, no copy. */
+Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
+    return adler32_impl(adler, buf, len);
+}
+
+/* VSX/VMX stores can have higher latency than optimized memcpy on POWER8+ */
+/* Checksum-and-copy: compute the adler32 of buf, then copy buf to dst. */
+Z_INTERNAL uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len) {
+    adler = adler32_impl(adler, buf, len);
+    memcpy(dst, buf, len);
+    return adler;
+}
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/adler32_vmx.c b/neozip/arch/power/adler32_vmx.c
new file mode 100644
index 0000000000..5171bab35b
--- /dev/null
+++ b/neozip/arch/power/adler32_vmx.c
@@ -0,0 +1,168 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+#define vmx_zero() (vec_splat_u32(0))
+
+/* Accumulate the Adler-32 pair s[0]=s1, s[1]=s2 over len 16-byte blocks
+ * of buf (len counts vectors, not bytes -- see the n / 16 in the caller).
+ * buf is expected to be 16-byte aligned, since vec_ld is used.  Several
+ * independent accumulator chains are interleaved to hide vector ALU
+ * latency; they are merged at the end. */
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    /* Different taps for the separable components of sums */
+    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+    vector unsigned int adacc, s2acc;
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+#if BYTE_ORDER == LITTLE_ENDIAN
+    s2acc = vec_sro(pair_vec, shift_vec);
+#else
+    s2acc = vec_slo(pair_vec, shift_vec);
+#endif
+
+    vector unsigned int zero = vmx_zero();
+    vector unsigned int s3acc = zero;
+    vector unsigned int s3acc_0 = zero;
+    vector unsigned int adacc_prev = adacc;
+    vector unsigned int adacc_prev_0 = zero;
+
+    vector unsigned int s2acc_0 = zero;
+    vector unsigned int s2acc_1 = zero;
+    vector unsigned int s2acc_2 = zero;
+
+    /* Maintain a running sum of a second half, this might help us break yet another
+     * data dependency bubble in the sum */
+    vector unsigned int adacc_0 = zero;
+
+    /* Main loop eats 4 vectors (64 bytes) per iteration. */
+    int num_iter = len / 4;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+        vector unsigned char d2 = vec_ld(32, buf);
+        vector unsigned char d3 = vec_ld(48, buf);
+
+        /* The core operation of the loop, basically
+         * what is being unrolled below */
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_prev);
+        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+        s2acc = vec_msum(t0, d0, s2acc);
+
+        /* interleave dependent sums in here */
+        adacc_0 = vec_sum4s(d1, adacc_0);
+        s2acc_0 = vec_msum(t1, d1, s2acc_0);
+        adacc = vec_sum4s(d2, adacc);
+        s2acc_1 = vec_msum(t2, d2, s2acc_1);
+        s2acc_2 = vec_msum(t3, d3, s2acc_2);
+        adacc_0 = vec_sum4s(d3, adacc_0);
+
+        adacc_prev = adacc;
+        adacc_prev_0 = adacc_0;
+        buf += 64;
+    }
+
+    adacc = vec_add(adacc, adacc_0);
+    s3acc = vec_add(s3acc, s3acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+    /* Handle the remaining 1-3 vectors one at a time. */
+    if (rem) {
+        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+        while (rem--) {
+            vector unsigned char d0 = vec_ld(0, buf);
+            adacc = vec_sum4s(d0, adacc);
+            s3acc = vec_add(s3acc, adacc_prev);
+            s2acc = vec_msum(t3, d0, s2acc);
+            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+            buf += 16;
+        }
+    }
+
+
+    /* Sum up independent second sums */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s2acc_2 = vec_add(s2acc_1, s2acc_2);
+    s2acc = vec_add(s2acc, s2acc_2);
+
+    s2acc = vec_add(s2acc, s3acc);
+
+    /* Fold the four 32-bit lanes of each sum, then write s1 and s2 back. */
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
+}
+
+/* Compute the Adler-32 checksum of buf with VMX/AltiVec: short inputs go
+ * straight to the scalar tail helper; otherwise buf is advanced to a
+ * 16-byte boundary and fed to vmx_accum32 in slices of at most NMAX
+ * bytes, so both sums are reduced mod BASE before they can overflow. */
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+    /* Split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+    /* s1/s2 pair in a 16-byte-aligned buffer so vec_ld/vec_ste work. */
+    uint32_t pair[4] ALIGNED_(16);
+    pair[0] = adler;
+    pair[1] = sum2;
+    pair[2] = 0;
+    pair[3] = 0;
+
+    // Align buffer
+    size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+    size_t n = NMAX;
+    if (align_diff) {
+        adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+        buf += align_diff;
+        len -= align_diff;
+        /* the alignment bytes count against the first NMAX slice */
+        n -= align_diff;
+    }
+
+    while (len >= 16) {
+        n = MIN(len, n);
+
+        /* vmx_accum32 takes its length in 16-byte blocks */
+        vmx_accum32(pair, buf, n / 16);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        size_t k = (n / 16) * 16;
+        buf += k;
+        len -= k;
+        n = NMAX;
+    }
+
+    /* Process tail (len < 16). */
+    return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
+}
+
+/* VMX stores can have higher latency than optimized memcpy */
+/* Checksum-and-copy: compute the adler32 of src, then copy src to dst. */
+Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = adler32_vmx(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
+#endif
diff --git a/neozip/arch/power/chunkset_power8.c b/neozip/arch/power/chunkset_power8.c
new file mode 100644
index 0000000000..f9855e677e
--- /dev/null
+++ b/neozip/arch/power/chunkset_power8.c
@@ -0,0 +1,50 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <altivec.h>
+
+typedef vector unsigned char chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Fill the 16-byte chunk with the 2-byte pattern read from `from`. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
+}
+
+/* Fill the 16-byte chunk with the 4-byte pattern read from `from`. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
+}
+
+/* Fill the 16-byte chunk with the 8-byte pattern read from `from`. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
+}
+
+/* Load 16 bytes from s; vec_xl permits unaligned addresses. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+/* Store 16 bytes to out; vec_xst permits unaligned addresses. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_power8
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/compare256_power9.c b/neozip/arch/power/compare256_power9.c
new file mode 100644
index 0000000000..99c3b0b6d1
--- /dev/null
+++ b/neozip/arch/power/compare256_power9.c
@@ -0,0 +1,68 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "zendian.h"
+
+#include <altivec.h>
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
+#if BYTE_ORDER == LITTLE_ENDIAN
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#else
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+#else
+# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
+#endif
+
+/* Return the number of leading bytes (0..256) that match between src0
+ * and src1, comparing 16 bytes per iteration.  Both buffers must have at
+ * least 256 readable bytes, since a full vector is loaded before the
+ * mismatch position is known. */
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        vsrc0 = *((vector unsigned char *)src0);
+        vsrc1 = *((vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Since the index of matching bytes will contain only zeroes
+         * on vc (since we used cmpne), counting the number of consecutive
+         * bytes where LSB == 0 is the same as counting the length of the match. */
+        zng_vec_vctzlsbb(vc, cmplen);
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Exported wrapper around the static implementation. */
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_power9_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/crc32_constants.h b/neozip/arch/power/crc32_constants.h
new file mode 100644
index 0000000000..8c8f2153b6
--- /dev/null
+++ b/neozip/arch/power/crc32_constants.h
@@ -0,0 +1,1123 @@
+/* Constants table used by crc32_power8.c
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * This file was automatically generated, DO NOT EDIT IT MANUALLY.
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zendian.h"
+#include "zbuild.h"
+
+/* Reduce 262144 kbits to 1024 bits */
+static const __vector unsigned long long vcrc_const[255] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x0000000099ea94a8, 0x00000001651797d2 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x00000000945a8420, 0x0000000021e0d56c },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x0000000030762706, 0x000000000f95ecaa },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001a52fc582, 0x00000001ebd224ac },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x00000001a4a7167a, 0x000000000ccb97ca },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x000000000c18249a, 0x00000001006ec8a8 },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x00000000a924ae7c, 0x000000014f58f196 },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001e12ccc12, 0x00000001a7192ca6 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x00000000a0b9d4ac, 0x000000019a64bab2 },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000095e8ddfe, 0x0000000014f4ed2e },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x00000000233fddc4, 0x000000011092b6a2 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000001b4529b62, 0x00000000c8a1629c },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x00000001a7fa0e64, 0x000000017bf32e8e },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001b5334592, 0x00000001f8cc6582 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000011f8ee1b4, 0x000000008631ddf0 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000006252e632, 0x000000007e5a76d0 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x00000000ab973e84, 0x000000002b09b31c },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x000000007734f5ec, 0x00000001b2df1f84 },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x000000007c547798, 0x00000001d6f56afc },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x000000007ec40210, 0x00000001b9b5e70c },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x00000001ab1695a8, 0x0000000034b626d2 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x0000000090494bba, 0x000000014c53479a },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001123fb816, 0x00000001a6d179a4 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x00000001e188c74c, 0x000000015abd16b4 },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000001c2d3451c, 0x00000000018f9852 },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x00000000f55cf1ca, 0x000000001fb3084a },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000001a0531540, 0x00000000c53dfb04 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x0000000132cd7ebc, 0x00000000e10c9ad6 },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000073ab7f36, 0x0000000025aa994a },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x0000000041aed1c2, 0x00000000fa3a74c4 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000136c53800, 0x0000000033eb3f40 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x0000000126835a30, 0x000000017193f296 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x000000006241b502, 0x0000000043f6c86a },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x00000000d5196ad4, 0x000000016b513ec6 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x000000009cfa769a, 0x00000000c8f25b4e },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000000920e5df4, 0x00000001a45048ec },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x0000000169dc310e, 0x000000000c441004 },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x0000000009fc331c, 0x000000000e17cad6 },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x000000010d94a81e, 0x00000001253ae964 },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x0000000027a20ab2, 0x00000001d7c88ebc },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x0000000114f87504, 0x00000001e7ca913a },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x000000004b076d96, 0x0000000033ed078a },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000da4d1e74, 0x00000000e1839c78 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x000000001b81f672, 0x00000001322b267e },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x000000009367c988, 0x00000000638231b6 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001717214ca, 0x00000001ee7f16f4 },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x000000009f47d820, 0x0000000117d9924a },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x000000010d9a47d2, 0x00000000e1a9e0c4 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000000a696c58c, 0x00000001403731dc },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x000000002aa28ec6, 0x00000001a5ea9682 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x00000001fe18fd9a, 0x0000000101c5c578 },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x000000019d4fc1ae, 0x00000000dddf6494 },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000001ba0e3dea, 0x00000000f1c3db28 },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x0000000074b59a5e, 0x000000013112fb9c },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000f2b5ea98, 0x00000000b680b906 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x0000000187132676, 0x000000001a282932 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x000000010a8c6ad4, 0x0000000089406e7e },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001e21dfe70, 0x00000001def6be8c },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x00000001da0050e4, 0x0000000075258728 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x00000000772172ae, 0x000000019536090a },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000e47724aa, 0x00000000f2455bfc },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000003cd63ac4, 0x000000018c40baf4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x00000001bf47d352, 0x000000004cd390d4 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x000000018dc1d708, 0x00000001e4ece95a },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000002d4620a4, 0x000000001a3ee918 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x0000000058fd1740, 0x000000007c652fb8 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x00000000dadd9bfc, 0x000000011c67842c },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000001ea2140be, 0x00000000254f759c },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000009de128ba, 0x000000007ece94ca },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x000000013ac3aa8e, 0x0000000038f258c2 },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x0000000099980562, 0x00000001cdf17b00 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x00000001c1579c86, 0x000000011f882c16 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000068dbbf94, 0x0000000100093fc8 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x000000004509fb04, 0x00000001cd684f16 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x00000001202f6398, 0x000000004bc6a70a },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000013aea243e, 0x000000004fc7e8e4 },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x00000001b4052ae6, 0x0000000130103f1c },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x00000001cd2a0ae8, 0x0000000111b0024c },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x00000001fe4aa8b4, 0x000000010b3079da },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x00000001d1559a42, 0x000000010192bcc2 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x00000001f3e05ecc, 0x0000000074838d50 },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x0000000104ddd2cc, 0x000000001b20f520 },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x000000015393153c, 0x0000000050c3590a },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x0000000057e942c6, 0x00000000b41cac8e },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000012c633850, 0x000000000c72cc78 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x00000000ebcaae4c, 0x0000000030cdb032 },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013ee532a6, 0x000000013e09fc32 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x00000001bf0cbc7e, 0x000000001ed624d2 },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000d50b7a5a, 0x00000000781aee1a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x0000000002fca6e8, 0x00000001c4d8348c },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x000000007af40044, 0x0000000057a40336 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000016178744, 0x0000000085544940 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000014c177458, 0x000000019cd21e80 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000011b6ddf04, 0x000000013eb95bc0 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001f3e29ccc, 0x00000001dfc9fdfc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x0000000135ae7562, 0x00000000cd028bc2 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000190ef812c, 0x0000000090db8c44 },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x0000000067a2c786, 0x000000010010a4ce },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x0000000048b9496c, 0x00000001c8f4c72c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000015a422de6, 0x000000001c26170c },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000001ef0e3640, 0x00000000e3fccf68 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000001006d2d26, 0x00000000d513ed24 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000001170d56d6, 0x00000000141beada },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x00000000a5fb613c, 0x000000011071aea0 },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x0000000040bbf7fc, 0x000000012e19080a },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x000000016ac3a5b2, 0x0000000100ecf826 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x00000000abf16230, 0x0000000069b09412 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x00000001ebe23fac, 0x0000000122297bac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x000000008b6a0894, 0x00000000e9e4b068 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x00000001288ea478, 0x000000004b38651a },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x000000016619c442, 0x00000001468360e2 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x0000000086230038, 0x00000000121c2408 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x000000017746a756, 0x00000000da7e7d08 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x0000000191b8f8f8, 0x00000001058d7652 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000008e167708, 0x000000014a098a90 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000148b22d54, 0x0000000020dbe72e },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x0000000044ba2c3c, 0x000000011e7323e8 },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000b54d2b52, 0x00000000d5d4bf94 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000005a4fd8a, 0x0000000199d8746c },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x0000000139f9fc46, 0x00000000ce9ca8a0 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x000000015a1fa824, 0x00000000136edece },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000000a61ae4c, 0x000000019b92a068 },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000145e9113e, 0x0000000071d62206 },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x000000006a348448, 0x00000000dfc50158 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x000000004d80a08c, 0x00000001517626bc },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x000000014b6837a0, 0x0000000148d1e4fa },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x000000016896a7fc, 0x0000000094d8266e },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x000000014f187140, 0x00000000606c5e34 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019581b9da, 0x000000019766beaa },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001091bc984, 0x00000001d80c506c },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001067223c, 0x000000001e73837c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x00000001ab16ea02, 0x0000000064d587de },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x000000013c4598a8, 0x00000000f4a507b0 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x00000000b3735430, 0x0000000040e342fc },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001bb3fc0c0, 0x00000001d5ad9c3a },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x00000001570ae19c, 0x0000000094a691a4 },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001ea910712, 0x00000001271ecdfa },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x0000000167127128, 0x000000009e54475a },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x0000000019e790a2, 0x00000000c9c099ee },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000003788f710, 0x000000009a2f736c },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000001682a160e, 0x00000000bb9f4996 },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x000000007f0ebd2e, 0x00000001db688050 },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x000000002b032080, 0x00000000e9b10af4 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x00000000cfd1664a, 0x000000012d4545e4 },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x00000000aa1181c2, 0x000000000361139c },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000000ddd08002, 0x00000001a5a1a3a8 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x00000000e8dd0446, 0x000000006844e0b0 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000001bbd94a00, 0x00000000c3762f28 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000000ab6cd180, 0x00000001d26287a2 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x0000000031803ce2, 0x00000001f6f0bba8 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x0000000024f40b0c, 0x000000002ffabd62 },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000001ba1d9834, 0x00000000fb4516b8 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x0000000104de61aa, 0x000000018cfa961c },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x0000000113e40d46, 0x000000019e588d52 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001415598a0, 0x00000001180f0bbc },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000bf6c8c90, 0x00000000e1d9177a },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x00000001788b0504, 0x0000000105abc27c },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x0000000038385d02, 0x00000000972e4a58 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x00000001b6c83844, 0x0000000183499a5e },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x0000000051061a8a, 0x00000001c96a8cca },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x000000017351388a, 0x00000001a1a5b60c },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x0000000132928f92, 0x00000000e4b6ac9c },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000000e6b4f48a, 0x00000001807e7f5a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x0000000039d15e90, 0x000000017a7e3bc8 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000312d6074, 0x00000000d73975da },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017bbb2cc4, 0x000000017375d038 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x000000016ded3e18, 0x00000000193680bc },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000f1638b16, 0x00000000999b06f6 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001d38b9ecc, 0x00000001f685d2b8 },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x000000018b8d09dc, 0x00000001f4ecbed2 },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000e7bc27d2, 0x00000000ba16f1a0 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x00000000275e1e96, 0x0000000115aceac4 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000000e2e3031e, 0x00000001aeff6292 },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x00000001041c84d8, 0x000000009640124c },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x00000000706ce672, 0x0000000114f41f02 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000015d5070da, 0x000000009c5f3586 },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x0000000038f9493a, 0x00000001878275fa },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000a3348a76, 0x00000000ddc42ce8 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x00000001ad0aab92, 0x0000000181d2c73a },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x000000019e85f712, 0x0000000141c9320a },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000005a871e76, 0x000000015235719a },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x000000017249c662, 0x00000000be27d804 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000003a084712, 0x000000006242d45a },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x00000000ed438478, 0x000000009a53638e },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000000abac34cc, 0x00000001001ecfb6 },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000005f35ef3e, 0x000000016d7c2d64 },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x0000000047d6608c, 0x00000001d0ce46c0 },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x000000002d01470e, 0x0000000124c907b4 },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000158bbc7b0, 0x0000000018a555ca },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x00000000c0a23e8e, 0x000000006b0980bc },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x00000001ebd85c88, 0x000000008bbba964 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x000000019ee20bb2, 0x00000001070a5a1e },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x00000001acabf2d6, 0x000000002204322a },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000001b7963d56, 0x00000000a27524d0 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x000000017bffa1fe, 0x0000000020b1e4ba },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x000000001f15333e, 0x0000000032cc27fc },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x000000018593129e, 0x0000000044dd22b8 },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x000000019cb32602, 0x00000000dffc9e0a },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x0000000142b05cc8, 0x00000001b7a0ed14 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000001be49e7a4, 0x00000000c7842488 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x0000000108f69d6c, 0x00000001c02a4fee },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000006c0971f0, 0x000000003c273778 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x000000005b16467a, 0x00000001d63f8894 },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x00000001551a628e, 0x000000006be557d6 },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000019e42ea92, 0x000000006a7806ea },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000012fa83ff2, 0x000000016155aa0c },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x000000011ca9cde0, 0x00000000908650ac },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000c8e5cd74, 0x00000000aa5a8084 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000096c27f0c, 0x0000000191bb500a },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x000000002baed926, 0x0000000064e9bed0 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000017c8de8d2, 0x000000009444f302 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x00000000d43d6068, 0x000000019db07d3c },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000000cb2c4b26, 0x00000001359e3e6e },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x0000000145b8da26, 0x00000001e4f10dd2 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x000000018fff4b08, 0x0000000124f5735e },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000150b58ed0, 0x0000000124760a4c },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x00000001549f39bc, 0x000000000f1fc186 },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000ef4d2f42, 0x00000000150e4cc4 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x00000001b1468572, 0x000000002a6204e8 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x000000013d7403b2, 0x00000000beb1d432 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x00000001a4681842, 0x0000000135f3f1f0 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000167714492, 0x0000000074fe2232 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x00000001e599099a, 0x000000001ac6e2ba },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x00000000fe128194, 0x0000000013fca91e },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000077e8b990, 0x0000000183f4931e },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000001a267f63a, 0x00000000b6d9b4e4 },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000001945c245a, 0x00000000b5188656 },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000149002e76, 0x0000000027a81a84 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x00000001bb8310a4, 0x0000000125699258 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x000000019ec60bcc, 0x00000001b23de796 },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x000000012d8590ae, 0x00000000fe4365dc },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x0000000065b00684, 0x00000000c68f497a },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x000000015e5aeadc, 0x00000000fbf521ee },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x00000000b77ff2b0, 0x000000015eac3378 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000188da2ff6, 0x0000000134914b90 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000063da929a, 0x0000000016335cfe },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x00000001389caa80, 0x000000010372d10c },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000013db599d2, 0x000000015097b908 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x0000000122505a86, 0x00000001227a7572 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000016bd72746, 0x000000009a8f75c0 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000001c3faf1d4, 0x00000000682c77a2 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000001111c826c, 0x00000000231f091c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x00000000153e9fb2, 0x000000007d4439f2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000002b1f7b60, 0x000000017e221efc },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x00000000b1dba570, 0x0000000167457c38 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000001f6397b76, 0x00000000bdf081c4 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x0000000156335214, 0x000000016286d6b0 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000001d70e3986, 0x00000000c84f001c },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x000000003701a774, 0x0000000064efe7c0 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x00000000ac81ef72, 0x000000000ac2d904 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x0000000133212464, 0x00000000fd226d14 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x00000000e4e45610, 0x000000011cfd42e0 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000000c1bd370, 0x000000016e5a5678 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001a7b9e7a6, 0x00000001d888fe22 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x000000007d657a10, 0x00000001af77fcd4 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x00000001651797d2, 0x0000000099ea94a8 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x0000000021e0d56c, 0x00000000945a8420 },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x000000000f95ecaa, 0x0000000030762706 },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001ebd224ac, 0x00000001a52fc582 },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x000000000ccb97ca, 0x00000001a4a7167a },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x00000001006ec8a8, 0x000000000c18249a },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x000000014f58f196, 0x00000000a924ae7c },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001a7192ca6, 0x00000001e12ccc12 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x000000019a64bab2, 0x00000000a0b9d4ac },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000014f4ed2e, 0x0000000095e8ddfe },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x000000011092b6a2, 0x00000000233fddc4 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000000c8a1629c, 0x00000001b4529b62 },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x000000017bf32e8e, 0x00000001a7fa0e64 },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001f8cc6582, 0x00000001b5334592 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000008631ddf0, 0x000000011f8ee1b4 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000007e5a76d0, 0x000000006252e632 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x000000002b09b31c, 0x00000000ab973e84 },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x00000001b2df1f84, 0x000000007734f5ec },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x00000001d6f56afc, 0x000000007c547798 },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x00000001b9b5e70c, 0x000000007ec40210 },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x0000000034b626d2, 0x00000001ab1695a8 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x000000014c53479a, 0x0000000090494bba },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001a6d179a4, 0x00000001123fb816 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x000000015abd16b4, 0x00000001e188c74c },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000000018f9852, 0x00000001c2d3451c },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x000000001fb3084a, 0x00000000f55cf1ca },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000000c53dfb04, 0x00000001a0531540 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x00000000e10c9ad6, 0x0000000132cd7ebc },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000025aa994a, 0x0000000073ab7f36 },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x00000000fa3a74c4, 0x0000000041aed1c2 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000033eb3f40, 0x0000000136c53800 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x000000017193f296, 0x0000000126835a30 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x0000000043f6c86a, 0x000000006241b502 },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x000000016b513ec6, 0x00000000d5196ad4 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x00000000c8f25b4e, 0x000000009cfa769a },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000001a45048ec, 0x00000000920e5df4 },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x000000000c441004, 0x0000000169dc310e },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x000000000e17cad6, 0x0000000009fc331c },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x00000001253ae964, 0x000000010d94a81e },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x00000001d7c88ebc, 0x0000000027a20ab2 },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x00000001e7ca913a, 0x0000000114f87504 },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x0000000033ed078a, 0x000000004b076d96 },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000e1839c78, 0x00000000da4d1e74 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x00000001322b267e, 0x000000001b81f672 },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x00000000638231b6, 0x000000009367c988 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001ee7f16f4, 0x00000001717214ca },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x0000000117d9924a, 0x000000009f47d820 },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x00000000e1a9e0c4, 0x000000010d9a47d2 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000001403731dc, 0x00000000a696c58c },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x00000001a5ea9682, 0x000000002aa28ec6 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x0000000101c5c578, 0x00000001fe18fd9a },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x00000000dddf6494, 0x000000019d4fc1ae },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000000f1c3db28, 0x00000001ba0e3dea },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x000000013112fb9c, 0x0000000074b59a5e },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000b680b906, 0x00000000f2b5ea98 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x000000001a282932, 0x0000000187132676 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x0000000089406e7e, 0x000000010a8c6ad4 },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001def6be8c, 0x00000001e21dfe70 },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x0000000075258728, 0x00000001da0050e4 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x000000019536090a, 0x00000000772172ae },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000f2455bfc, 0x00000000e47724aa },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000018c40baf4, 0x000000003cd63ac4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x000000004cd390d4, 0x00000001bf47d352 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x00000001e4ece95a, 0x000000018dc1d708 },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000001a3ee918, 0x000000002d4620a4 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x000000007c652fb8, 0x0000000058fd1740 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x000000011c67842c, 0x00000000dadd9bfc },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000000254f759c, 0x00000001ea2140be },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000007ece94ca, 0x000000009de128ba },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x0000000038f258c2, 0x000000013ac3aa8e },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x00000001cdf17b00, 0x0000000099980562 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x000000011f882c16, 0x00000001c1579c86 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000100093fc8, 0x0000000068dbbf94 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x00000001cd684f16, 0x000000004509fb04 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x000000004bc6a70a, 0x00000001202f6398 },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000004fc7e8e4, 0x000000013aea243e },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x0000000130103f1c, 0x00000001b4052ae6 },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x0000000111b0024c, 0x00000001cd2a0ae8 },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x000000010b3079da, 0x00000001fe4aa8b4 },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x000000010192bcc2, 0x00000001d1559a42 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x0000000074838d50, 0x00000001f3e05ecc },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x000000001b20f520, 0x0000000104ddd2cc },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x0000000050c3590a, 0x000000015393153c },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x00000000b41cac8e, 0x0000000057e942c6 },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000000c72cc78, 0x000000012c633850 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x0000000030cdb032, 0x00000000ebcaae4c },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013e09fc32, 0x000000013ee532a6 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x000000001ed624d2, 0x00000001bf0cbc7e },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000781aee1a, 0x00000000d50b7a5a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x00000001c4d8348c, 0x0000000002fca6e8 },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x0000000057a40336, 0x000000007af40044 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000085544940, 0x0000000016178744 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000019cd21e80, 0x000000014c177458 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000013eb95bc0, 0x000000011b6ddf04 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001dfc9fdfc, 0x00000001f3e29ccc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x00000000cd028bc2, 0x0000000135ae7562 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000090db8c44, 0x0000000190ef812c },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x000000010010a4ce, 0x0000000067a2c786 },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x00000001c8f4c72c, 0x0000000048b9496c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000001c26170c, 0x000000015a422de6 },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000000e3fccf68, 0x00000001ef0e3640 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000000d513ed24, 0x00000001006d2d26 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000000141beada, 0x00000001170d56d6 },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x000000011071aea0, 0x00000000a5fb613c },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x000000012e19080a, 0x0000000040bbf7fc },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x0000000100ecf826, 0x000000016ac3a5b2 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x0000000069b09412, 0x00000000abf16230 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x0000000122297bac, 0x00000001ebe23fac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x00000000e9e4b068, 0x000000008b6a0894 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x000000004b38651a, 0x00000001288ea478 },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x00000001468360e2, 0x000000016619c442 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x00000000121c2408, 0x0000000086230038 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x00000000da7e7d08, 0x000000017746a756 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x00000001058d7652, 0x0000000191b8f8f8 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000014a098a90, 0x000000008e167708 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000020dbe72e, 0x0000000148b22d54 },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x000000011e7323e8, 0x0000000044ba2c3c },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000d5d4bf94, 0x00000000b54d2b52 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000199d8746c, 0x0000000005a4fd8a },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x00000000ce9ca8a0, 0x0000000139f9fc46 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x00000000136edece, 0x000000015a1fa824 },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000019b92a068, 0x000000000a61ae4c },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000071d62206, 0x0000000145e9113e },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x00000000dfc50158, 0x000000006a348448 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x00000001517626bc, 0x000000004d80a08c },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x0000000148d1e4fa, 0x000000014b6837a0 },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x0000000094d8266e, 0x000000016896a7fc },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x00000000606c5e34, 0x000000014f187140 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019766beaa, 0x000000019581b9da },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001d80c506c, 0x00000001091bc984 },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001e73837c, 0x000000001067223c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x0000000064d587de, 0x00000001ab16ea02 },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x00000000f4a507b0, 0x000000013c4598a8 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x0000000040e342fc, 0x00000000b3735430 },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001d5ad9c3a, 0x00000001bb3fc0c0 },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x0000000094a691a4, 0x00000001570ae19c },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001271ecdfa, 0x00000001ea910712 },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x000000009e54475a, 0x0000000167127128 },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x00000000c9c099ee, 0x0000000019e790a2 },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000009a2f736c, 0x000000003788f710 },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000000bb9f4996, 0x00000001682a160e },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x00000001db688050, 0x000000007f0ebd2e },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x00000000e9b10af4, 0x000000002b032080 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x000000012d4545e4, 0x00000000cfd1664a },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x000000000361139c, 0x00000000aa1181c2 },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000001a5a1a3a8, 0x00000000ddd08002 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x000000006844e0b0, 0x00000000e8dd0446 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000000c3762f28, 0x00000001bbd94a00 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000001d26287a2, 0x00000000ab6cd180 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x00000001f6f0bba8, 0x0000000031803ce2 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x000000002ffabd62, 0x0000000024f40b0c },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000000fb4516b8, 0x00000001ba1d9834 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x000000018cfa961c, 0x0000000104de61aa },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x000000019e588d52, 0x0000000113e40d46 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001180f0bbc, 0x00000001415598a0 },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000e1d9177a, 0x00000000bf6c8c90 },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x0000000105abc27c, 0x00000001788b0504 },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x00000000972e4a58, 0x0000000038385d02 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x0000000183499a5e, 0x00000001b6c83844 },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x00000001c96a8cca, 0x0000000051061a8a },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x00000001a1a5b60c, 0x000000017351388a },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x00000000e4b6ac9c, 0x0000000132928f92 },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000001807e7f5a, 0x00000000e6b4f48a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x000000017a7e3bc8, 0x0000000039d15e90 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000d73975da, 0x00000000312d6074 },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017375d038, 0x000000017bbb2cc4 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x00000000193680bc, 0x000000016ded3e18 },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000999b06f6, 0x00000000f1638b16 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001f685d2b8, 0x00000001d38b9ecc },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x00000001f4ecbed2, 0x000000018b8d09dc },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000ba16f1a0, 0x00000000e7bc27d2 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x0000000115aceac4, 0x00000000275e1e96 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000001aeff6292, 0x00000000e2e3031e },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x000000009640124c, 0x00000001041c84d8 },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x0000000114f41f02, 0x00000000706ce672 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000009c5f3586, 0x000000015d5070da },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x00000001878275fa, 0x0000000038f9493a },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000ddc42ce8, 0x00000000a3348a76 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x0000000181d2c73a, 0x00000001ad0aab92 },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x0000000141c9320a, 0x000000019e85f712 },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000015235719a, 0x000000005a871e76 },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x00000000be27d804, 0x000000017249c662 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000006242d45a, 0x000000003a084712 },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x000000009a53638e, 0x00000000ed438478 },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000001001ecfb6, 0x00000000abac34cc },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000016d7c2d64, 0x000000005f35ef3e },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x00000001d0ce46c0, 0x0000000047d6608c },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x0000000124c907b4, 0x000000002d01470e },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000018a555ca, 0x0000000158bbc7b0 },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x000000006b0980bc, 0x00000000c0a23e8e },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x000000008bbba964, 0x00000001ebd85c88 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x00000001070a5a1e, 0x000000019ee20bb2 },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x000000002204322a, 0x00000001acabf2d6 },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000000a27524d0, 0x00000001b7963d56 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x0000000020b1e4ba, 0x000000017bffa1fe },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x0000000032cc27fc, 0x000000001f15333e },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x0000000044dd22b8, 0x000000018593129e },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x00000000dffc9e0a, 0x000000019cb32602 },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x00000001b7a0ed14, 0x0000000142b05cc8 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000000c7842488, 0x00000001be49e7a4 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x00000001c02a4fee, 0x0000000108f69d6c },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000003c273778, 0x000000006c0971f0 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x00000001d63f8894, 0x000000005b16467a },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x000000006be557d6, 0x00000001551a628e },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000006a7806ea, 0x000000019e42ea92 },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000016155aa0c, 0x000000012fa83ff2 },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x00000000908650ac, 0x000000011ca9cde0 },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000aa5a8084, 0x00000000c8e5cd74 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000191bb500a, 0x0000000096c27f0c },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x0000000064e9bed0, 0x000000002baed926 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000009444f302, 0x000000017c8de8d2 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x000000019db07d3c, 0x00000000d43d6068 },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000001359e3e6e, 0x00000000cb2c4b26 },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x00000001e4f10dd2, 0x0000000145b8da26 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x0000000124f5735e, 0x000000018fff4b08 },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000124760a4c, 0x0000000150b58ed0 },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x000000000f1fc186, 0x00000001549f39bc },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000150e4cc4, 0x00000000ef4d2f42 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x000000002a6204e8, 0x00000001b1468572 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x00000000beb1d432, 0x000000013d7403b2 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x0000000135f3f1f0, 0x00000001a4681842 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000074fe2232, 0x0000000167714492 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x000000001ac6e2ba, 0x00000001e599099a },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x0000000013fca91e, 0x00000000fe128194 },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000183f4931e, 0x0000000077e8b990 },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000000b6d9b4e4, 0x00000001a267f63a },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000000b5188656, 0x00000001945c245a },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000027a81a84, 0x0000000149002e76 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x0000000125699258, 0x00000001bb8310a4 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x00000001b23de796, 0x000000019ec60bcc },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x00000000fe4365dc, 0x000000012d8590ae },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x00000000c68f497a, 0x0000000065b00684 },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x00000000fbf521ee, 0x000000015e5aeadc },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x000000015eac3378, 0x00000000b77ff2b0 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000134914b90, 0x0000000188da2ff6 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000016335cfe, 0x0000000063da929a },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x000000010372d10c, 0x00000001389caa80 },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000015097b908, 0x000000013db599d2 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x00000001227a7572, 0x0000000122505a86 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000009a8f75c0, 0x000000016bd72746 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000000682c77a2, 0x00000001c3faf1d4 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000000231f091c, 0x00000001111c826c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x000000007d4439f2, 0x00000000153e9fb2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000017e221efc, 0x000000002b1f7b60 },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x0000000167457c38, 0x00000000b1dba570 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000000bdf081c4, 0x00000001f6397b76 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x000000016286d6b0, 0x0000000156335214 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000000c84f001c, 0x00000001d70e3986 },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x0000000064efe7c0, 0x000000003701a774 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x000000000ac2d904, 0x00000000ac81ef72 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x00000000fd226d14, 0x0000000133212464 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x000000011cfd42e0, 0x00000000e4e45610 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000016e5a5678, 0x000000000c1bd370 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001d888fe22, 0x00000001a7b9e7a6 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x00000001af77fcd4, 0x000000007d657a10 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+
+static const __vector unsigned long long vcrc_short_const[16] ALIGNED_(16) = { /* loaded at byte offset (256 - len); one 16-byte vpmsumw constant per 16-byte chunk */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0x99168a18ec447f11, 0xed837b2613e8221e },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xe23e954e8fd2cd3c, 0xc8acdd8147b9ce5a },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0x92f8befe6b1d2b53, 0xd9ad6d87d4277e25 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xf38a3556291ea462, 0xc10ec5e033fbca3b },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0x974ac56262b6ca4b, 0xc0b55b0e82e02e2f },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x855712b3784d2a56, 0x71aa1df0e172334d },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xa5abe9f80eaee722, 0xfee3053e3969324d },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0x1fa0943ddb54814c, 0xf44779b93eb2bd08 },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xa53ff440d7bbfe6a, 0xf5449b3f00cc3374 },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0xebe7e3566325605c, 0x6f8346e1d777606e },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xc65a272ce5b592b8, 0xe3ab4f2ac0b95347 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0x5705a9ca4721589f, 0xaa2215ea329ecc11 },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0xe3720acb88d14467, 0x1ed8f66ed95efd26 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0xba1aca0315141c31, 0x78ed02d5a700e96a },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xad2a31b3ed627dae, 0xba8ccbe832b39da3 },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0x6655004fa06a2517, 0xedb88320b1e6b092 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0xed837b2613e8221e, 0x99168a18ec447f11 },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xc8acdd8147b9ce5a, 0xe23e954e8fd2cd3c },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0xd9ad6d87d4277e25, 0x92f8befe6b1d2b53 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xc10ec5e033fbca3b, 0xf38a3556291ea462 },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0xc0b55b0e82e02e2f, 0x974ac56262b6ca4b },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x71aa1df0e172334d, 0x855712b3784d2a56 },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xfee3053e3969324d, 0xa5abe9f80eaee722 },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0xf44779b93eb2bd08, 0x1fa0943ddb54814c },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xf5449b3f00cc3374, 0xa53ff440d7bbfe6a },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0x6f8346e1d777606e, 0xebe7e3566325605c },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xe3ab4f2ac0b95347, 0xc65a272ce5b592b8 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0xaa2215ea329ecc11, 0x5705a9ca4721589f },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0x1ed8f66ed95efd26, 0xe3720acb88d14467 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0x78ed02d5a700e96a, 0xba1aca0315141c31 },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xba8ccbe832b39da3, 0xad2a31b3ed627dae },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0xedb88320b1e6b092, 0x6655004fa06a2517 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Barrett constants */
+/* 33 bit reflected Barrett constant m - (4^32)/n */
+
+static const __vector unsigned long long v_Barrett_const[2] ALIGNED_(16) = {
+ /* x^64 div p(x) */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ { 0x00000001f7011641, 0x0000000000000000 }, /* m: floor(x^64 / p(x)), the 33-bit Barrett multiplier */
+ { 0x00000001db710641, 0x0000000000000000 } /* n: the reflected polynomial p(x) used in the second multiply */
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ { 0x0000000000000000, 0x00000001f7011641 }, /* m: floor(x^64 / p(x)), the 33-bit Barrett multiplier */
+ { 0x0000000000000000, 0x00000001db710641 } /* n: the reflected polynomial p(x) used in the second multiply */
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
diff --git a/neozip/arch/power/crc32_power8.c b/neozip/arch/power/crc32_power8.c
new file mode 100644
index 0000000000..a7a2fb7435
--- /dev/null
+++ b/neozip/arch/power/crc32_power8.c
@@ -0,0 +1,593 @@
+/* crc32 for POWER8 using VSX instructions
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Author: Rogerio Alves <rogealve@br.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ */
+
+#ifdef POWER8_VSX_CRC32
+
+#include "zbuild.h"
+#include "zendian.h"
+
+#include "crc32_constants.h"
+#include "crc32_braid_tbl.h"
+
+#include "power_intrins.h"
+
+#define MAX_SIZE 32768 /* checksum in blocks of at most 32 kB (see header comment) */
+#define VMX_ALIGN 16 /* vector register width in bytes; vec_ld wants 16-byte alignment */
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+/* Scalar fallback: fold each byte of p[0..len) into crc via the 256-entry lookup table. */
+static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
+ for (unsigned long i = 0; i < len; i++)
+ crc = crc_table[(crc ^ p[i]) & 0xff] ^ (crc >> 8);
+ return crc;
+}
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
+
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) { /* CRC-32 of p[0.._len) using the VSX vpmsum bulk path */
+ unsigned int prealign;
+ unsigned int tail;
+
+ unsigned long len = (unsigned long) _len;
+
+ crc ^= 0xffffffff; /* CRC-32 convention: start from inverted state */
+
+ if (len < VMX_ALIGN + VMX_ALIGN_MASK) { /* too short to guarantee one aligned 16-byte chunk: all scalar */
+ crc = crc32_align(crc, p, len);
+ goto out;
+ }
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) { /* consume bytes scalar until p is 16-byte aligned */
+ prealign = (unsigned int)ALIGN_DIFF(p, VMX_ALIGN);
+ crc = crc32_align(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ crc = __crc32_vpmsum(crc, p, ALIGN_DOWN(len, VMX_ALIGN)); /* vectorized bulk: aligned multiple of 16 bytes */
+
+ tail = len & VMX_ALIGN_MASK; /* 0..15 trailing bytes left over from the vector pass */
+ if (tail) {
+ p += ALIGN_DOWN(len, VMX_ALIGN);
+ crc = crc32_align(crc, p, tail);
+ }
+
+out:
+ crc ^= 0xffffffff; /* final inversion, matching the pre-conditioning above */
+
+ return crc;
+}
+
+/* Checksum src[0..len) and copy it to dst; returns the updated CRC. */
+Z_INTERNAL uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ const uint32_t result = crc32_power8(crc, src, len);
+ memcpy(dst, src, len);
+ return result;
+}
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that forwarding is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define BYTESWAP_DATA /* loads must be byte-reversed on big-endian hosts */
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
+#else
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL }; /* Byte reverse permute constant BE. */
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc) /* expands to nothing: no byte swap needed */
+#endif
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
+
+ const __vector unsigned long long vzero = {0,0};
+ const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+ const __vector unsigned long long vmask_32bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+
+ const __vector unsigned long long vmask_64bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+ __vector unsigned long long vcrc;
+
+ __vector unsigned long long vconst1, vconst2;
+
+ /* vdata0-vdata7 will contain our data (p). */
+ __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+ /* v0-v7 will contain our checksums */
+ __vector unsigned long long v0 = {0,0};
+ __vector unsigned long long v1 = {0,0};
+ __vector unsigned long long v2 = {0,0};
+ __vector unsigned long long v3 = {0,0};
+ __vector unsigned long long v4 = {0,0};
+ __vector unsigned long long v5 = {0,0};
+ __vector unsigned long long v6 = {0,0};
+ __vector unsigned long long v7 = {0,0};
+
+
+ /* Vector auxiliary variables. */
+ __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+ unsigned int offset; /* Constant table offset. */
+
+ unsigned long i; /* Counter. */
+ unsigned long chunks;
+
+ unsigned long block_size;
+ int next_block = 0;
+
+ /* Align by 128 bits. The last 128 bit block will be processed at end. */
+ unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+
+ vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
+
+ /* Short version. */
+ if (len < 256) {
+ /* Calculate where in the constant table we need to start. */
+ offset = 256 - len;
+
+ vconst1 = vec_ld(offset, vcrc_short_const);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+ /* xor initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+
+ for (i = 16; i < len; i += 16) {
+ vconst1 = vec_ld(offset + i, vcrc_short_const);
+ vdata0 = vec_ld(i, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+ }
+ } else {
+
+ /* Load initial values. */
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ /* xor in initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ p = (char *)p + 128;
+
+ do {
+ /* Checksum in blocks of MAX_SIZE. */
+ block_size = length;
+ if (block_size > MAX_SIZE) {
+ block_size = MAX_SIZE;
+ }
+
+ length = length - block_size;
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ offset = (MAX_SIZE/8) - (block_size/8);
+ /* We reduce our final 128 bytes in a separate step */
+ chunks = (block_size/128)-1;
+
+ vconst1 = vec_ld(offset, vcrc_const);
+
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+
+ if (chunks > 1) {
+ offset += 16;
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+
+ /*
+ * main loop. Each iteration calculates the CRC for a 128-byte
+ * block.
+ */
+ for (i = 0; i < chunks-2; i++) {
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+ GROUP_ENDING_NOP;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst2);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst2);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
+ vdata2, (__vector unsigned long long)vconst2);
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst2);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+ }
+
+ /* First cool down */
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ }/* else */
+
+ /* Second cool down. */
+ v0 = vec_xor(v0, va0);
+ v1 = vec_xor(v1, va1);
+ v2 = vec_xor(v2, va2);
+ v3 = vec_xor(v3, va3);
+ v4 = vec_xor(v4, va4);
+ v5 = vec_xor(v5, va5);
+ v6 = vec_xor(v6, va6);
+ v7 = vec_xor(v7, va7);
+
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
+ (__vector unsigned char)vzero, 4);
+ v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
+ (__vector unsigned char)vzero, 4);
+ v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
+ (__vector unsigned char)vzero, 4);
+ v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
+ (__vector unsigned char)vzero, 4);
+ v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
+ (__vector unsigned char)vzero, 4);
+ v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
+ (__vector unsigned char)vzero, 4);
+ v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
+ (__vector unsigned char)vzero, 4);
+
+ /* xor with the last 1024 bits. */
+ va0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(va0, va0, va0, vperm_const);
+
+ va1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(va1, va1, va1, vperm_const);
+
+ va2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(va2, va2, va2, vperm_const);
+
+ va3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(va3, va3, va3, vperm_const);
+
+ va4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(va4, va4, va4, vperm_const);
+
+ va5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(va5, va5, va5, vperm_const);
+
+ va6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(va6, va6, va6, vperm_const);
+
+ va7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(va7, va7, va7, vperm_const);
+
+ p = (char *)p + 128;
+
+ vdata0 = vec_xor(v0, va0);
+ vdata1 = vec_xor(v1, va1);
+ vdata2 = vec_xor(v2, va2);
+ vdata3 = vec_xor(v3, va3);
+ vdata4 = vec_xor(v4, va4);
+ vdata5 = vec_xor(v5, va5);
+ vdata6 = vec_xor(v6, va6);
+ vdata7 = vec_xor(v7, va7);
+
+ /* Check if we have more blocks to process */
+ next_block = 0;
+ if (length != 0) {
+ next_block = 1;
+
+ /* zero v0-v7 */
+ v0 = vec_xor(v0, v0);
+ v1 = vec_xor(v1, v1);
+ v2 = vec_xor(v2, v2);
+ v3 = vec_xor(v3, v3);
+ v4 = vec_xor(v4, v4);
+ v5 = vec_xor(v5, v5);
+ v6 = vec_xor(v6, v6);
+ v7 = vec_xor(v7, v7);
+ }
+ length = length + 128;
+
+ } while (next_block);
+
+ /* Calculate how many bytes we have left. */
+ length = (len & 127);
+
+ /* Calculate where in (short) constant table we need to start. */
+ offset = 128 - length;
+
+ v0 = vec_ld(offset, vcrc_short_const);
+ v1 = vec_ld(offset + 16, vcrc_short_const);
+ v2 = vec_ld(offset + 32, vcrc_short_const);
+ v3 = vec_ld(offset + 48, vcrc_short_const);
+ v4 = vec_ld(offset + 64, vcrc_short_const);
+ v5 = vec_ld(offset + 80, vcrc_short_const);
+ v6 = vec_ld(offset + 96, vcrc_short_const);
+ v7 = vec_ld(offset + 112, vcrc_short_const);
+
+ offset += 128;
+
+ v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+ v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+ v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+ v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+ v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+ v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+ v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+ v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+ /* Now reduce the tail (0-112 bytes). */
+ for (i = 0; i < length; i+=16) {
+ vdata0 = vec_ld(i,(__vector unsigned long long*)p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ va0 = vec_ld(offset + i,vcrc_short_const);
+ va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+ v0 = vec_xor(v0, va0);
+ }
+
+ /* xor all parallel chunks together. */
+ v0 = vec_xor(v0, v1);
+ v2 = vec_xor(v2, v3);
+ v4 = vec_xor(v4, v5);
+ v6 = vec_xor(v6, v7);
+
+ v0 = vec_xor(v0, v2);
+ v4 = vec_xor(v4, v6);
+
+ v0 = vec_xor(v0, v4);
+ }
+
+ /* Barrett Reduction */
+ vconst1 = vec_ld(0, v_Barrett_const);
+ vconst2 = vec_ld(16, v_Barrett_const);
+
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)v0, 8);
+ v0 = vec_xor(v1,v0);
+
+ /* shift left one bit */
+ __vector unsigned char vsht_splat = vec_splat_u8 (1);
+ v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+
+ v0 = vec_and(v0, vmask_64bit);
+
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+
+ /* bottom 32 bits of a */
+ v1 = vec_and(v0, vmask_32bit);
+
+ /* ma */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst1);
+
+ /* bottom 32bits of ma */
+ v1 = vec_and(v1, vmask_32bit);
+ /* qn */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst2);
+ /* a - qn, subtraction is xor in GF(2) */
+ v0 = vec_xor (v0, v1);
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+
+ /* shift result into top 64 bits of */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return v0[0];
+#else
+ return v0[1];
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.c b/neozip/arch/power/power_features.c
new file mode 100644
index 0000000000..148f30a974
--- /dev/null
+++ b/neozip/arch/power/power_features.c
@@ -0,0 +1,54 @@
+/* power_features.c - POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(PPC_FEATURES) || defined(POWER_FEATURES)
+
+#include "zbuild.h"
+#include "power_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#endif
+#ifdef POWER_NEED_AUXVEC_H
+# include <linux/auxvec.h>
+#endif
+#ifdef __FreeBSD__
+# include <machine/cpu.h>
+#endif
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
+#ifdef PPC_FEATURES
+ unsigned long hwcap;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+ hwcap = getauxval(AT_HWCAP);
+#endif
+
+ if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ features->has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
+ unsigned long hwcap2;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
+ hwcap2 = getauxval(AT_HWCAP2);
+#endif
+
+#ifdef POWER8_VSX
+ if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ features->has_arch_2_07 = 1;
+#endif
+#ifdef POWER9
+ if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ features->has_arch_3_00 = 1;
+#endif
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.h b/neozip/arch/power/power_features.h
new file mode 100644
index 0000000000..1ff51de5dd
--- /dev/null
+++ b/neozip/arch/power/power_features.h
@@ -0,0 +1,18 @@
+/* power_features.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FEATURES_H_
+#define POWER_FEATURES_H_
+
+struct power_cpu_features {
+ int has_altivec;
+ int has_arch_2_07;
+ int has_arch_3_00;
+};
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features);
+
+#endif /* POWER_FEATURES_H_ */
diff --git a/neozip/arch/power/power_functions.h b/neozip/arch/power/power_functions.h
new file mode 100644
index 0000000000..ccc7754a4c
--- /dev/null
+++ b/neozip/arch/power/power_functions.h
@@ -0,0 +1,74 @@
+/* power_functions.h -- POWER implementations for arch-specific functions.
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FUNCTIONS_H_
+#define POWER_FUNCTIONS_H_
+
+#include "power_natives.h"
+
+#ifdef PPC_VMX
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_vmx(deflate_state *s);
+#endif
+
+#ifdef POWER8_VSX
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_power8(deflate_state *s);
+void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef POWER9
+uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Power - VMX
+# ifdef PPC_VMX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_vmx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_vmx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_vmx
+# endif
+// Power8 - VSX
+# ifdef POWER8_VSX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_power8
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_power8
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_power8
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_power8
+# undef native_slide_hash
+# define native_slide_hash slide_hash_power8
+# endif
+# ifdef POWER8_VSX_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_power8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_power8
+# endif
+// Power9
+# ifdef POWER9_NATIVE
+# undef native_compare256
+# define native_compare256 compare256_power9
+# undef native_longest_match
+# define native_longest_match longest_match_power9
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_power9
+# endif
+#endif
+
+#endif /* POWER_FUNCTIONS_H_ */
diff --git a/neozip/arch/power/power_intrins.h b/neozip/arch/power/power_intrins.h
new file mode 100644
index 0000000000..3efcfb9722
--- /dev/null
+++ b/neozip/arch/power/power_intrins.h
@@ -0,0 +1,61 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ * Daniel Black <daniel@linux.vnet.ibm.com>
+ * Rogerio Alves <rogealve@br.ibm.com>
+ * Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_INTRINS_H
+#define POWER_INTRINS_H
+
+#include <altivec.h>
+
+#if defined (__clang__)
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
+#endif
+#ifndef __builtin_crypto_vpmsumd
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
+#endif
+
+#ifdef __VSX__
+static inline __vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b) {
+ return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+#endif
+
+#endif
+
+/* There's no version of this that operates over unsigned and if casted, it does
+ * sign extension. Let's write an endian independent version and hope the compiler
+ * eliminates creating another zero idiom for the zero value if one exists locally */
+static inline vector unsigned short vec_unpackl(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergel(zero, a);
+#else
+ return (vector unsigned short)vec_mergel(a, zero);
+#endif
+}
+
+static inline vector unsigned short vec_unpackh(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergeh(zero, a);
+#else
+ return (vector unsigned short)vec_mergeh(a, zero);
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_natives.h b/neozip/arch/power/power_natives.h
new file mode 100644
index 0000000000..59ec8a8aed
--- /dev/null
+++ b/neozip/arch/power/power_natives.h
@@ -0,0 +1,27 @@
+/* power_natives.h -- POWER compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_NATIVES_H_
+#define POWER_NATIVES_H_
+
+#if defined(__ALTIVEC__)
+# ifdef PPC_VMX
+# define PPC_VMX_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR8) && defined(__VSX__)
+# ifdef POWER8_VSX
+# define POWER8_VSX_NATIVE
+# endif
+# ifdef POWER8_VSX_CRC32
+# define POWER8_VSX_CRC32_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR9)
+# ifdef POWER9
+# define POWER9_NATIVE
+# endif
+#endif
+
+#endif /* POWER_NATIVES_H_ */
diff --git a/neozip/arch/power/slide_hash_power8.c b/neozip/arch/power/slide_hash_power8.c
new file mode 100644
index 0000000000..d01e0acd56
--- /dev/null
+++ b/neozip/arch/power/slide_hash_power8.c
@@ -0,0 +1,12 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#define SLIDE_PPC slide_hash_power8
+#include "slide_ppc_tpl.h"
+
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/slide_hash_vmx.c b/neozip/arch/power/slide_hash_vmx.c
new file mode 100644
index 0000000000..5a87ef7d9a
--- /dev/null
+++ b/neozip/arch/power/slide_hash_vmx.c
@@ -0,0 +1,10 @@
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX
+
+#define SLIDE_PPC slide_hash_vmx
+#include "slide_ppc_tpl.h"
+
+#endif /* PPC_VMX */
diff --git a/neozip/arch/power/slide_ppc_tpl.h b/neozip/arch/power/slide_ppc_tpl.h
new file mode 100644
index 0000000000..24629b4039
--- /dev/null
+++ b/neozip/arch/power/slide_ppc_tpl.h
@@ -0,0 +1,44 @@
+/* Optimized slide_hash for PowerPC processors
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ const vector unsigned short vmx_wsize = vec_splats(wsize);
+ Pos *p = table;
+
+ do {
+ /* Do the pointer arithmetic early to hopefully overlap the vector unit */
+ Pos *q = p;
+ p += 32;
+ vector unsigned short value0, value1, value2, value3;
+ vector unsigned short result0, result1, result2, result3;
+
+ value0 = vec_ld(0, q);
+ value1 = vec_ld(16, q);
+ value2 = vec_ld(32, q);
+ value3 = vec_ld(48, q);
+ result0 = vec_subs(value0, vmx_wsize);
+ result1 = vec_subs(value1, vmx_wsize);
+ result2 = vec_subs(value2, vmx_wsize);
+ result3 = vec_subs(value3, vmx_wsize);
+ vec_st(result0, 0, q);
+ vec_st(result1, 16, q);
+ vec_st(result2, 32, q);
+ vec_st(result3, 48, q);
+
+ entries -= 32;
+ } while (entries);
+}
+
+void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
diff --git a/neozip/arch/riscv/Makefile.in b/neozip/arch/riscv/Makefile.in
new file mode 100644
index 0000000000..43176eee6e
--- /dev/null
+++ b/neozip/arch/riscv/Makefile.in
@@ -0,0 +1,72 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# Copyright (C) 2025 Yin Tong <yintong.ustc@bytedance.com>, ByteDance
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+RVVFLAG=
+RVVZBCFLAG=
+ZBCFLAG=
+
+all: \
+ riscv_features.o riscv_features.lo \
+ adler32_rvv.o adler32_rvv.lo \
+ chunkset_rvv.o chunkset_rvv.lo \
+ compare256_rvv.o compare256_rvv.lo \
+ slide_hash_rvv.o slide_hash_rvv.lo \
+ crc32_zbc.o crc32_zbc.lo
+
+riscv_features.o: $(SRCDIR)/riscv_features.c
+ $(CC) $(CFLAGS) $(RVVZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+riscv_features.lo: $(SRCDIR)/riscv_features.c
+ $(CC) $(SFLAGS) $(RVVZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+adler32_rvv.o: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+adler32_rvv.lo: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+chunkset_rvv.o: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+chunkset_rvv.lo: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+compare256_rvv.o: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+compare256_rvv.lo: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+slide_hash_rvv.o: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+slide_hash_rvv.lo: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+crc32_zbc.o: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(CFLAGS) $(ZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+crc32_zbc.lo: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(SFLAGS) $(ZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/riscv/README.md b/neozip/arch/riscv/README.md
new file mode 100644
index 0000000000..013095c373
--- /dev/null
+++ b/neozip/arch/riscv/README.md
@@ -0,0 +1,45 @@
+# Building RISC-V Target with Cmake #
+
+> **Warning**
+> Runtime rvv detection (using `hwcap`) requires linux kernel 6.5 or newer.
+>
+> When running on older kernels, we fall back to compile-time detection, potentially this can cause crashes if rvv is enabled at compile but not supported by the target cpu.
+> Therefore if older kernel support is needed, rvv should be disabled if the target cpu does not support it.
+## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
+
+If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
+
+```bash
+./prepare_riscv_toolchain_qemu.sh
+```
+
+After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
+`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
+
+You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
+
+## Cross-Compile for RISC-V Target ##
+
+```bash
+cmake -G Ninja -B ./build-riscv \
+ -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
+ -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
+ -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ -D QEMU_PATH={QEMU_PATH} \
+ .
+
+cmake --build ./build-riscv
+```
+
+Disable the option if there is no RVV support:
+```
+-D WITH_RVV=OFF
+```
+
+## Run Unittests on User Mode QEMU ##
+
+```bash
+cd ./build-riscv && ctest --verbose
+```
diff --git a/neozip/arch/riscv/adler32_rvv.c b/neozip/arch/riscv/adler32_rvv.c
new file mode 100644
index 0000000000..e446189302
--- /dev/null
+++ b/neozip/arch/riscv/adler32_rvv.c
@@ -0,0 +1,119 @@
+/* adler32_rvv.c - RVV version of adler32
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <riscv_vector.h>
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ size_t left = len;
+ size_t vl = __riscv_vsetvlmax_e8m1();
+ vl = MIN(vl, 256);
+ vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint16m2_t v_buf16_accu;
+
+ /*
+ * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+ * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+ * accumulators to boost performance.
+ *
+ * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
+ * vl > 256 (255 * 256 <= UINT16_MAX).
+ *
+ * We accumulate 8-bit data into a 16-bit accumulator and then
+ * move the data into the 32-bit accumulator at the last iteration.
+ */
+ size_t block_size = (256 / vl) * vl;
+ size_t nmax_limit = (NMAX / block_size);
+ size_t cnt = 0;
+ while (left >= block_size) {
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t subprob = block_size;
+ while (subprob > 0) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ subprob -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+ left -= block_size;
+ /* do modulo once each block of NMAX size */
+ if (++cnt >= nmax_limit) {
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
+ cnt = 0;
+ }
+ }
+ /* the left len <= 256 now, we can use 16-bit accum safely */
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t res = left;
+ while (left >= vl) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ left -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+ vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+ vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+ vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+ v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+ vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+ uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;
+
+ sum2 += (sum2_sum + adler * ((len - left) % BASE));
+
+ vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+ uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE;
+
+ adler += adler_sum;
+
+ sum2 %= BASE;
+ adler %= BASE;
+
+ /* Process tail (left < 256). */
+ return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+ return adler32_copy_impl(adler, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/chunkset_rvv.c b/neozip/arch/riscv/chunkset_rvv.c
new file mode 100644
index 0000000000..cd8ed3cfd2
--- /dev/null
+++ b/neozip/arch/riscv/chunkset_rvv.c
@@ -0,0 +1,126 @@
+/* chunkset_rvv.c - RVV version of chunkset
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+
+#include <riscv_vector.h>
+
+/*
+ * RISC-V glibc may enable an RVV-optimized memcpy at runtime via IFUNC,
+ * so we prefer using a large chunk size and copying as much memory as possible.
+ */
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Broadcast one uint<elen>_t value read from `from` across the entire `chunk`
+ * using RVV stores at LMUL=4; `elen` is the element width in bits (16/32/64).
+ * Loops on vsetvl so it also works when the chunk exceeds one vector. */
+#define CHUNK_MEMSET_RVV_IMPL(from, chunk, elen) \
+do { \
+    size_t vl, len = sizeof(*chunk) / sizeof(uint##elen##_t); \
+    uint##elen##_t val = *(uint##elen##_t*)from; \
+    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
+    do { \
+        vl = __riscv_vsetvl_e##elen##m4(len); \
+        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
+        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
+        len -= vl; chunk_p += vl; \
+    } while (len > 0); \
+} while (0)
+
+/* We don't have a 32-byte datatype for RISC-V arch. */
+typedef struct chunk_s {
+ uint64_t data[4];
+} chunk_t;
+
+/* Fill `chunk` with repeated copies of the 2-byte value at `from`. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 16);
+}
+
+/* Fill `chunk` with repeated copies of the 4-byte value at `from`. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 32);
+}
+
+/* Fill `chunk` with repeated copies of the 8-byte value at `from`. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 64);
+}
+
+/* Load one chunk_t (4 x uint64_t = 32 bytes) from possibly-unaligned memory `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    memcpy(chunk->data, (uint8_t *)s, sizeof(*chunk));
+}
+
+/* Store one chunk_t (32 bytes) to possibly-unaligned memory `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    memcpy(out, chunk->data, sizeof(*chunk));
+}
+
+#define CHUNKSIZE chunksize_rvv
+#define CHUNKCOPY chunkcopy_rvv
+#define CHUNKUNROLL chunkunroll_rvv
+#define CHUNKMEMSET chunkmemset_rvv
+#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
+
+#define HAVE_CHUNKCOPY
+
+/*
+ * Assuming that the length is non-zero, and that `from` lags `out` by at least
+ * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
+ *
+ * We load/store a single chunk once in the `CHUNKCOPY`.
+ * However, RISC-V glibc may enable an RVV-optimized memcpy at runtime via IFUNC,
+ * so we prefer to copy as much memory as possible at once to take advantage of RVV.
+ *
+ * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot,
+ * but we still copy as much memory as possible for some conditions.
+ *
+ * case 1: out - from >= len (no overlap)
+ * We can use memcpy to copy `len` size once
+ * because the memory layout would be the same.
+ *
+ * case 2: overlap
+ * We copy N chunks using memcpy at once, aiming to achieve our goal:
+ * to copy as much memory as possible.
+ *
+ * After using a single memcpy to copy N chunks, we have to use series of
+ * loadchunk and storechunk to ensure the result is correct.
+ */
+/* Copy `len` (> 0) bytes from `from` to `out` and return out + len.
+ * See the case analysis in the comment block above: a non-overlapping copy
+ * is done with a single memcpy; an overlapping copy is done in memcpy calls
+ * whose source and destination regions never overlap. */
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    size_t dist = out - from;
+
+    /* Case 1: no overlap - copy everything in one shot.
+     * (Previously this also advanced `from`, a dead store; removed.) */
+    if (out < from || dist >= len) {
+        memcpy(out, from, len);
+        return out + len;
+    }
+
+    /* Case 2: overlap. Emit one full chunk, then advance by `align` so the
+     * remaining length becomes a multiple of sizeof(chunk_t). */
+    size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+    memcpy(out, from, sizeof(chunk_t));
+    out += align;
+    from += align;
+    len -= align;
+
+    /* Copy `dist` bytes rounded down to whole chunks per call: since
+     * out - from == dist and vl <= dist, each memcpy is non-overlapping. */
+    size_t vl = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
+    while (len > dist) {
+        memcpy(out, from, vl);
+        out += vl;
+        from += vl;
+        len -= vl;
+    }
+
+    /* Remaining tail, if any. */
+    if (len > 0) {
+        memcpy(out, from, len);
+        out += len;
+    }
+    return out;
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_rvv
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/riscv/compare256_rvv.c b/neozip/arch/riscv/compare256_rvv.c
new file mode 100644
index 0000000000..edb18a3766
--- /dev/null
+++ b/neozip/arch/riscv/compare256_rvv.c
@@ -0,0 +1,48 @@
+/* compare256_rvv.c - RVV version of compare256
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+/* Return the number of leading bytes (0..256) that are equal between src0
+ * and src1. Each iteration compares one vector's worth of bytes (e8, LMUL=4);
+ * vfirst returns the index of the first mismatching lane, or -1 if none. */
+static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    size_t vl;
+    long found_diff;
+    do {
+        vl = __riscv_vsetvl_e8m4(256 - len);
+        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
+        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
+        if (found_diff >= 0)
+            return len + (uint32_t)found_diff;
+        src0 += vl, src1 += vl, len += vl;
+    } while (len < 256);
+
+    /* All 256 bytes matched. */
+    return 256;
+}
+
+/* Externally visible entry point; forwards to the static implementation,
+ * which is also inlined into the match templates below via COMPARE256. */
+Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_rvv_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/crc32_zbc.c b/neozip/arch/riscv/crc32_zbc.c
new file mode 100644
index 0000000000..cf52279b80
--- /dev/null
+++ b/neozip/arch/riscv/crc32_zbc.c
@@ -0,0 +1,103 @@
+/* crc32_zbc.c - RISCV Zbc version of crc32
+ * Copyright (C) 2025 ByteDance. All rights reserved.
+ * Contributed by Yin Tong <yintong.ustc@bytedance.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_CRC32_ZBC
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul
+#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul
+
+#define CONSTANT_R3 0x1751997d0ULL
+#define CONSTANT_R4 0x0ccaa009eULL
+#define CONSTANT_R5 0x163cd6124ULL
+#define MASK32 0xFFFFFFFF
+#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL
+#define CONSTANT_RU 0x1F7011641ULL
+
+/* Zbc carry-less multiply: low 64 bits of the 128-bit product of a and b. */
+static inline uint64_t clmul(uint64_t a, uint64_t b) {
+    uint64_t res;
+    __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+    return res;
+}
+
+/* Zbc carry-less multiply high: upper 64 bits of the 128-bit product. */
+static inline uint64_t clmulh(uint64_t a, uint64_t b) {
+    uint64_t res;
+    __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+    return res;
+}
+
+/* Fold `len` bytes at `buf` into `crc` using carry-less multiplication,
+ * then reduce the 128-bit remainder to 32 bits via Barrett reduction.
+ * The caller passes a pre-inverted crc and inverts the result again
+ * (see crc32_riscv64_zbc). Reads buf64[0..1] unconditionally, so the
+ * caller must guarantee len >= 16; it also strips len to a multiple of
+ * 16 beforehand. */
+Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) {
+    const uint64_t *buf64 = (const uint64_t *)buf;
+    uint64_t low = buf64[0] ^ crc;
+    uint64_t high = buf64[1];
+
+    if (len < 16)
+        goto finish_fold;
+    len -= 16;
+    buf64 += 2;
+
+    // process each 16-byte block
+    while (len >= 16) {
+        uint64_t t2 = clmul(CONSTANT_R4, high);
+        uint64_t t3 = clmulh(CONSTANT_R4, high);
+
+        uint64_t t0_new = clmul(CONSTANT_R3, low);
+        uint64_t t1_new = clmulh(CONSTANT_R3, low);
+
+        // Combine the results and XOR with new data
+        low = t0_new ^ t2;
+        high = t1_new ^ t3;
+        low ^= buf64[0];
+        high ^= buf64[1];
+
+        buf64 += 2;
+        len -= 16;
+    }
+
+finish_fold:
+    // Fold the 128-bit result into 64 bits
+    uint64_t fold_t3 = clmulh(low, CONSTANT_R4);
+    uint64_t fold_t2 = clmul(low, CONSTANT_R4);
+    low = high ^ fold_t2;
+    high = fold_t3;
+
+    // Combine the low and high parts and perform polynomial reduction
+    uint64_t combined = (low >> 32) | ((high & MASK32) << 32);
+    uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined;
+
+    // Barrett reduction step
+    uint64_t barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32;
+    barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL);
+    uint64_t final = barrett ^ reduced_low;
+
+    // Return the high 32 bits as the final CRC
+    return (uint32_t)(final >> 32);
+}
+
+/* Compute the CRC-32 of buf[0..len) using the Zbc carry-less multiply
+ * extension. Buffers shorter than CLMUL_MIN_LEN fall back entirely to the
+ * braid implementation. Otherwise the first (len % 16) bytes are consumed
+ * by crc32_braid so that the CLMUL kernel only sees a length that is a
+ * multiple of the 16-byte chunk size. */
+Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) {
+    if (len < CLMUL_MIN_LEN) {
+        return crc32_braid(crc, buf, len);
+    }
+
+    uint64_t unaligned_length = len % CLMUL_CHUNK_LEN;
+    if (unaligned_length) {
+        crc = crc32_braid(crc, buf, unaligned_length);
+        buf += unaligned_length;
+        len -= unaligned_length;
+    }
+
+    /* CRC-32 is computed over the inverted value; invert around the kernel. */
+    crc = crc32_clmul_impl(~crc, buf, len);
+    return ~crc;
+}
+
+/* Compute the CRC-32 of src[0..len) and copy the data to dst.
+ * The checksum is computed first, then the bytes are copied with memcpy. */
+Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_riscv64_zbc(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
+#endif
diff --git a/neozip/arch/riscv/riscv_features.c b/neozip/arch/riscv/riscv_features.c
new file mode 100644
index 0000000000..b23f10a699
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.c
@@ -0,0 +1,99 @@
+#ifdef RISCV_FEATURES
+
+#define _DEFAULT_SOURCE 1 /* For syscall() */
+
+#include "zbuild.h"
+#include "riscv_features.h"
+
+#include <sys/utsname.h>
+
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+#endif
+
+#if defined(__linux__) && defined(HAVE_ASM_HWPROBE_H)
+# include <asm/hwprobe.h>
+# include <sys/syscall.h> /* For __NR_riscv_hwprobe */
+# include <unistd.h> /* For syscall() */
+#endif
+
+#define ISA_V_HWCAP (1 << ('v' - 'a'))
+#define ISA_ZBC_HWCAP (1 << 29)
+
+/* Detect RVV/Zbc via the riscv_hwprobe(2) syscall.
+ * Returns 1 when the probe succeeded and *features was filled in,
+ * 0 when the syscall or the hwprobe keys are unavailable (either at
+ * build time, via the #if guard, or at runtime, via the syscall result). */
+static int riscv_check_features_runtime_hwprobe(struct riscv_cpu_features *features) {
+#if defined(__NR_riscv_hwprobe) && defined(RISCV_HWPROBE_KEY_IMA_EXT_0)
+    struct riscv_hwprobe probes[] = {
+        {RISCV_HWPROBE_KEY_IMA_EXT_0, 0},
+    };
+    int ret;
+    unsigned i;
+
+    ret = syscall(__NR_riscv_hwprobe, probes, sizeof(probes) / sizeof(probes[0]), 0, NULL, 0);
+
+    if (ret != 0) {
+        /* Kernel does not support hwprobe */
+        return 0;
+    }
+
+    /* The extension bit macros may not exist in older headers, so each
+     * lookup is guarded individually. */
+    for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
+        switch (probes[i].key) {
+        case RISCV_HWPROBE_KEY_IMA_EXT_0:
+# ifdef RISCV_HWPROBE_IMA_V
+            features->has_rvv = !!(probes[i].value & RISCV_HWPROBE_IMA_V);
+# endif
+# ifdef RISCV_HWPROBE_EXT_ZBC
+            features->has_zbc = !!(probes[i].value & RISCV_HWPROBE_EXT_ZBC);
+# endif
+            break;
+        }
+    }
+
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* Fallback detection via getauxval(AT_HWCAP). The 'V' single-letter
+ * extension has a well-defined hwcap bit (ISA_V_HWCAP).
+ * NOTE(review): ISA_ZBC_HWCAP is bit 29, which is not a documented
+ * single-letter extension bit in the Linux riscv hwcap ABI -- confirm
+ * against the target kernel. Returns 1 if hwcap was consulted, 0 if
+ * unavailable on this platform. */
+static int riscv_check_features_runtime_hwcap(struct riscv_cpu_features *features) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+
+    features->has_rvv = hw_cap & ISA_V_HWCAP;
+    features->has_zbc = hw_cap & ISA_ZBC_HWCAP;
+
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* Populate *features, preferring the precise hwprobe interface and
+ * falling back to AT_HWCAP when hwprobe is unavailable. */
+static void riscv_check_features_runtime(struct riscv_cpu_features *features) {
+    if (riscv_check_features_runtime_hwprobe(features))
+        return;
+
+    riscv_check_features_runtime_hwcap(features);
+}
+
+/* Public feature-detection entry point: run the kernel-assisted probes,
+ * then sanity-check the RVV result by executing vsetvli on this CPU. */
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
+    riscv_check_features_runtime(features);
+#ifdef RISCV_RVV
+    if (features->has_rvv) {
+        size_t e8m1_vec_len;
+        intptr_t vtype_reg_val;
+        // Check that a vuint8m1_t vector is at least 16 bytes and that tail
+        // agnostic and mask agnostic mode are supported
+        //
+        __asm__ volatile(
+            "vsetvli %0, zero, e8, m1, ta, ma\n\t"
+            "csrr %1, vtype"
+            : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));
+
+        // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
+        // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
+        // bytes
+        features->has_rvv = (vtype_reg_val >= 0 && e8m1_vec_len >= 16);
+    }
+#endif
+}
+
+#endif
diff --git a/neozip/arch/riscv/riscv_features.h b/neozip/arch/riscv/riscv_features.h
new file mode 100644
index 0000000000..42855a1b6b
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.h
@@ -0,0 +1,19 @@
+/* riscv_features.h -- check for riscv features.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FEATURES_H_
+#define RISCV_FEATURES_H_
+
+/* Runtime-detected RISC-V CPU capabilities (filled by riscv_check_features). */
+struct riscv_cpu_features {
+    int has_rvv; /* nonzero if the Vector extension is usable */
+    int has_zbc; /* nonzero if the Zbc carry-less multiply extension is present */
+};
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
+
+#endif /* RISCV_FEATURES_H_ */
diff --git a/neozip/arch/riscv/riscv_functions.h b/neozip/arch/riscv/riscv_functions.h
new file mode 100644
index 0000000000..89120ffabf
--- /dev/null
+++ b/neozip/arch/riscv/riscv_functions.h
@@ -0,0 +1,60 @@
+/* riscv_functions.h -- RISCV implementations for arch-specific functions.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FUNCTIONS_H_
+#define RISCV_FUNCTIONS_H_
+
+#include "riscv_natives.h"
+
+#ifdef RISCV_RVV
+uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t longest_match_rvv(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_rvv(deflate_state *const s, uint32_t cur_match);
+void slide_hash_rvv(deflate_state *s);
+void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef RISCV_CRC32_ZBC
+uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// RISCV - RVV
+# ifdef RISCV_RVV_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_rvv
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_rvv
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_rvv
+# undef native_compare256
+# define native_compare256 compare256_rvv
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_rvv
+# undef native_longest_match
+# define native_longest_match longest_match_rvv
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_rvv
+# undef native_slide_hash
+# define native_slide_hash slide_hash_rvv
+# endif
+// RISCV - CRC32
+# ifdef RISCV_ZBC_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_riscv64_zbc
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_riscv64_zbc
+# endif
+#endif
+
+#endif /* RISCV_FUNCTIONS_H_ */
diff --git a/neozip/arch/riscv/riscv_natives.h b/neozip/arch/riscv/riscv_natives.h
new file mode 100644
index 0000000000..38d7aba648
--- /dev/null
+++ b/neozip/arch/riscv/riscv_natives.h
@@ -0,0 +1,19 @@
+/* riscv_natives.h -- RISCV compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_NATIVES_H_
+#define RISCV_NATIVES_H_
+
+#if defined(__riscv_v) && defined(__linux__)
+# ifdef RISCV_RVV
+# define RISCV_RVV_NATIVE
+# endif
+#endif
+#if defined(__riscv_zbc)
+# ifdef RISCV_CRC32_ZBC
+# define RISCV_ZBC_NATIVE
+# endif
+#endif
+
+#endif /* RISCV_NATIVES_H_ */
diff --git a/neozip/arch/riscv/slide_hash_rvv.c b/neozip/arch/riscv/slide_hash_rvv.c
new file mode 100644
index 0000000000..e794c38204
--- /dev/null
+++ b/neozip/arch/riscv/slide_hash_rvv.c
@@ -0,0 +1,33 @@
+/* slide_hash_rvv.c - RVV version of slide_hash
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+/* Subtract `wsize` from every entry of `table` using a saturating
+ * unsigned subtract (vssubu), so entries below wsize clamp to zero
+ * instead of wrapping around. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    size_t vl;
+    while (entries > 0) {
+        vl = __riscv_vsetvl_e16m4(entries);
+        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
+        vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
+        __riscv_vse16_v_u16m4(table, v_diff, vl);
+        table += vl, entries -= vl;
+    }
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides
+ * by w_size bytes. */
+Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/s390/Makefile.in b/neozip/arch/s390/Makefile.in
new file mode 100644
index 0000000000..e994157df2
--- /dev/null
+++ b/neozip/arch/s390/Makefile.in
@@ -0,0 +1,48 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# The variables below are left empty here; presumably they are filled in
+# by the configure script (this is a Makefile.in template).
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+VGFMAFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+s390_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+s390_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+dfltcc_deflate.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_deflate.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_inflate.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+dfltcc_inflate.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+crc32-vx.o:
+ $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+crc32-vx.lo:
+ $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/s390/README.md b/neozip/arch/s390/README.md
new file mode 100644
index 0000000000..c56ffd7654
--- /dev/null
+++ b/neozip/arch/s390/README.md
@@ -0,0 +1,265 @@
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:
+
+ $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
+ $ make
+
+or
+
+ $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
+ $ make
+
+When built like this, zlib-ng will compress using hardware on level 1
+and using software on all other levels. Decompression will always happen
+in hardware. In order to enable hardware compression for levels 1-6
+(i.e. to make it used by default) one could add
+`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
+
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
+`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every single zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software, or, in case
+this is not possible, gracefully fail.
+
+# Code structure
+
+All SystemZ-specific code lives in `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and window - which must be
+4k-aligned and is always 64k large, is managed using the `PAD_WINDOW()`,
+`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW` and `DEFLATE_ADJUST_WINDOW_SIZE()`
+and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match, therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has different compression ratio
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums, therefore, whenever it's used, software checksumming is
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
+macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
+* Translating between software and hardware data formats, e.g.
+ `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+ `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+ is not the case, then it returns `0`, making `deflate()` use some
+ other function in order to compress in software. Otherwise it returns
+ `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+ when explicitly instructed to do so by the software. Furthermore,
+ whether to use fixed or dynamic Huffman tables must also be determined
+ by the software. Since looking at data in order to gather statistics
+ would negate performance benefits, the following approach is used: the
+ first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+ block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+ dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+ instructs DFLTCC to write EOBS, however, certain conditions need to be
+ met: input data length must be non-zero or Continuation Flag must be
+ set. To put this in simpler terms, DFLTCC will silently refuse to
+ write EOBS if this is the only thing that it is asked to do. Since the
+ code has to be able to emit EOBS in software anyway, in order to avoid
+ tricky corner cases Block Closing Control is never used. Whether to
+ write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+ must perform various additional actions when a block or a stream ends.
+ `dfltcc_deflate()` informs `deflate()` about this using
+ `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+ and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+ and Sub-Byte Boundary. Certain fields cannot be translated and must
+ persist untouched in the parameter block between calls, for example,
+ Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+ quite intertwined and pervasive. The general idea here is that the
+ code must not do anything in software - whether explicitly by e.g.
+ calling `send_eobs()`, or implicitly - by returning to `deflate()`
+ with certain return and `*result` values, when Continuation Flag is
+ set.
+* Ending streams. When a new block is started and flush mode is
+ `Z_FINISH`, Block Header Final parameter block bit is used to mark
+ this block as final. However, sometimes an empty final block is
+ needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+ refuse to do this. The general idea of DFLTCC implementation is to
+ rely as much as possible on the existing code. Here in order to do
+ this, the code pretends that it does not support DFLTCC, which makes
+ `deflate()` call a software compression function, which writes an
+ empty final block. Whether this is required is controlled by
+ `need_empty_block` variable.
+* Error handling. This is simply converting
+ Operation-Ending-Supplemental Code to string. Errors can only happen
+ due to things like memory corruption, and therefore they don't affect
+ the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+ Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+ block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+ the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+ `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+ and vice versa. For example, `whave` and History Length or `wnext` and
+ History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+ and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+ Operation-Ending-Supplemental Code to string conversion. Unlike
+ deflate, errors may happen due to bad inputs, therefore they are
+ propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given complexity of DFLTCC machine instruction, it is not clear whether
+QEMU TCG will ever support it. At the time of writing, one has to have
+access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
+DFLTCC is a non-privileged instruction, neither special VM/LPAR
+configuration nor root are required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`.
+Future updates to actions-runner might need an updated patch. The .NET
+version-number patch is kept in its own file to avoid having to change
+the main patch constantly.
+
+## Configuring the builder.
+
+### Install prerequisites.
+```
+sudo dnf install podman
+```
+
+### Create a config file, needs github personal access token.
+Access token needs permissions; Repo Admin RW, Org Self-hosted runners RW.
+For details, consult
+https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository
+
+#### Create file /etc/actions-runner:
+```
+REPO=<owner>/<name>
+PAT_TOKEN=<github_pat_***>
+```
+
+#### Set permissions on /etc/actions-runner:
+```
+chmod 600 /etc/actions-runner
+```
+
+### Add actions-runner service.
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Autostart actions-runner.
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+### Add auto-rebuild cronjob
+```
+sudo cp self-hosted-builder/actions-runner-rebuild.sh /etc/cron.weekly/
+chmod +x /etc/cron.weekly/actions-runner-rebuild.sh
+```
+
+## Building / Rebuilding the container
+```
+sudo /etc/cron.weekly/actions-runner-rebuild.sh
+```
diff --git a/neozip/arch/s390/crc32-vx.c b/neozip/arch/s390/crc32-vx.c
new file mode 100644
index 0000000000..ba00f9a370
--- /dev/null
+++ b/neozip/arch/s390/crc32-vx.c
@@ -0,0 +1,232 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#ifdef S390_CRC32_VX
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
+ /*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1
+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1
+ * R5 = [(x64 mod P'(x) << 32)]' << 1
+ * R6 = [(x32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barret reduction constant, u', is defined as
+ * the bit reversal of floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+ const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
+ const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
+ const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
+ const uv2di r5 = {0, 0x163CD6124}; /* R5 */
+ const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
+ const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ uv2di v0 = {0, 0};
+ v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+ uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+ uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+ uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+ v1 ^= v0;
+ buf += 64;
+ len -= 64;
+
+ while (len >= 64) {
+ /* Load the next 64-byte data chunk */
+ uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+ uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+ uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+ uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate result
+ * is then folded (accumulated) with the next data chunk in PART1 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+ v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+ v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+ v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+ buf += 64;
+ len -= 64;
+ }
+
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+ * and R4 and accumulating the next 128-bit chunk until a single 128-bit
+ * value remains.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+ while (len >= 16) {
+ /* Load next data chunk */
+ v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+ /* Fold next data chunk */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ v0 = vec_srb(r4r3, (uv2di)v9);
+ v0[0] = 1;
+
+ /*
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded
+ */
+ v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this by a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ v9 = vec_insert((unsigned char)0x20, v9, 7);
+ v2 = vec_srb(v1, (uv2di)v9);
+ v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
+ v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+ /*
+ * Apply a Barret reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ v2 = vec_unpackl((uv4si)v1);
+ v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ v2 = vec_unpackl((uv4si)v2);
+ v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+ return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
+    /* Too small for the vector kernel plus worst-case alignment overhead:
+     * use the software braid implementation for everything. */
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+        return crc32_braid(crc, buf, len);
+
+    /* Handle a misaligned head in software to reach a 16-byte boundary. */
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {
+        size_t head = (size_t)ALIGN_DIFF(buf, VX_ALIGNMENT);
+        crc = crc32_braid(crc, buf, head);
+        buf += head;
+        len -= head;
+    }
+
+    /* Vector kernel over the aligned middle; it operates on the inverted
+     * CRC form, hence the pre/post complement. */
+    size_t vx_len = ALIGN_DOWN(len, VX_ALIGNMENT);
+    size_t tail = len & VX_ALIGN_MASK;
+    crc = ~crc32_le_vgfm_16(~crc, buf, vx_len);
+
+    /* Finish any tail bytes in software. */
+    return tail ? crc32_braid(crc, buf + vx_len, tail) : crc;
+}
+
+Z_INTERNAL uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    /* Checksum first, then duplicate src into dst; keeping this order means
+     * the CRC input is read before the copy touches memory. */
+    uint32_t result = crc32_s390_vx(crc, src, len);
+    memcpy(dst, src, len);
+    return result;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_common.h b/neozip/arch/s390/dfltcc_common.h
new file mode 100644
index 0000000000..a6527ab5df
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_common.h
@@ -0,0 +1,119 @@
+#ifndef DFLTCC_COMMON_H
+#define DFLTCC_COMMON_H
+
+#include "zutil.h"
+
+/*
+   Parameter Block for Query Available Functions.
+ */
+struct dfltcc_qaf_param {
+    char fns[16]; /* Available-functions bit vector (tested with is_bit_set) */
+    char reserved1[8];
+    char fmts[2]; /* Available parameter-block formats bit vector */
+    char reserved2[6];
+} ALIGNED_(8);
+
+/*
+   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
+
+   The field layout is dictated by the DFLTCC instruction; the size/offset
+   static_asserts in dfltcc_detail.h verify it.  Do not reorder or repack.
+ */
+struct dfltcc_param_v0 {
+    uint16_t pbvn; /* Parameter-Block-Version Number */
+    uint8_t mvn; /* Model-Version Number */
+    uint8_t ribm; /* Reserved for IBM use */
+    uint32_t reserved32 : 31;
+    uint32_t cf : 1; /* Continuation Flag */
+    uint8_t reserved64[8];
+    uint32_t nt : 1; /* New Task */
+    uint32_t reserved129 : 1;
+    uint32_t cvt : 1; /* Check Value Type */
+    uint32_t reserved131 : 1;
+    uint32_t htt : 1; /* Huffman-Table Type */
+    uint32_t bcf : 1; /* Block-Continuation Flag */
+    uint32_t bcc : 1; /* Block Closing Control */
+    uint32_t bhf : 1; /* Block Header Final */
+    uint32_t reserved136 : 1;
+    uint32_t reserved137 : 1;
+    uint32_t dhtgc : 1; /* DHT Generation Control */
+    uint32_t reserved139 : 5;
+    uint32_t reserved144 : 5;
+    uint32_t sbb : 3; /* Sub-Byte Boundary */
+    uint8_t oesc; /* Operation-Ending-Supplemental Code */
+    uint32_t reserved160 : 12;
+    uint32_t ifs : 4; /* Incomplete-Function Status */
+    uint16_t ifl; /* Incomplete-Function Length */
+    uint8_t reserved192[8];
+    uint8_t reserved256[8];
+    uint8_t reserved320[4];
+    uint16_t hl; /* History Length */
+    uint32_t reserved368 : 1;
+    uint16_t ho : 15; /* History Offset */
+    uint32_t cv; /* Check Value */
+    uint32_t eobs : 15; /* End-of-block Symbol */
+    uint32_t reserved431: 1;
+    uint8_t eobl : 4; /* End-of-block Length */
+    uint32_t reserved436 : 12;
+    uint32_t reserved448 : 4;
+    uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
+ Length */
+    uint8_t reserved464[6];
+    uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8]; /* Reserved for IBM use */
+    uint8_t csb[1152]; /* Continuation-State Buffer */
+} ALIGNED_(8);
+
+/*
+   Extension of inflate_state and deflate_state.
+ */
+struct dfltcc_state {
+    struct dfltcc_param_v0 param; /* Parameter block. */
+    struct dfltcc_qaf_param af; /* Available functions. */
+    char msg[64]; /* Buffer for strm->msg */
+};
+
+typedef struct {
+    struct dfltcc_state common;
+    uint16_t level_mask; /* Levels on which to use DFLTCC */
+    uint32_t block_size; /* New block each X bytes */
+    size_t block_threshold; /* New block after total_in > X */
+    uint32_t dht_threshold; /* New block only if avail_in >= X */
+} arch_deflate_state;
+
+typedef struct {
+    struct dfltcc_state common;
+} arch_inflate_state;
+
+/*
+   History buffer size.
+ */
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+/*
+   Sizes of deflate block parts.
+ */
+#define DFLTCC_BLOCK_HEADER_BITS 3
+#define DFLTCC_HLITS_COUNT_BITS 5
+#define DFLTCC_HDISTS_COUNT_BITS 5
+#define DFLTCC_HCLENS_COUNT_BITS 4
+#define DFLTCC_MAX_HCLENS 19
+#define DFLTCC_HCLEN_BITS 3
+#define DFLTCC_MAX_HLITS 286
+#define DFLTCC_MAX_HDISTS 30
+#define DFLTCC_MAX_HLIT_HDIST_BITS 7
+#define DFLTCC_MAX_SYMBOL_BITS 16
+#define DFLTCC_MAX_EOBS_BITS 15
+#define DFLTCC_MAX_PADDING_BITS 7
+
+/* Worst-case compressed size, in bytes, of source_len input bytes: header and
+ * table overhead plus up to DFLTCC_MAX_SYMBOL_BITS per input byte. */
+#define DEFLATE_BOUND_COMPLEN(source_len) \
+    ((DFLTCC_BLOCK_HEADER_BITS + \
+      DFLTCC_HLITS_COUNT_BITS + \
+      DFLTCC_HDISTS_COUNT_BITS + \
+      DFLTCC_HCLENS_COUNT_BITS + \
+      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
+      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
+      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
+      DFLTCC_MAX_EOBS_BITS + \
+      DFLTCC_MAX_PADDING_BITS) >> 3)
diff --git a/neozip/arch/s390/dfltcc_deflate.c b/neozip/arch/s390/dfltcc_deflate.c
new file mode 100644
index 0000000000..5cbd700c64
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_deflate.c
@@ -0,0 +1,390 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
+
+/*
+ Use the following commands to build zlib-ng with DFLTCC compression support:
+
+ $ ./configure --with-dfltcc-deflate
+ or
+
+ $ cmake -DWITH_DFLTCC_DEFLATE=1 .
+
+ and then
+
+ $ make
+*/
+
+#ifdef S390_DFLTCC_DEFLATE
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "trees_emit.h"
+#include "dfltcc_deflate.h"
+#include "dfltcc_detail.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
+    /* Reset the DFLTCC portion of the deflate state and restore the default
+     * tuning knobs. */
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+
+    dfltcc_reset_state(&arch->common);
+
+    /* Default tuning parameters */
+    arch->level_mask = DFLTCC_LEVEL_MASK;
+    arch->block_size = DFLTCC_BLOCK_SIZE;
+    arch->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
+    arch->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
+}
+
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                                 int reproducible) {
+    /* Report whether DFLTCC-CMPR can be used for the given settings. */
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *arch = &state->arch;
+
+    /* Settings the hardware cannot honor */
+    if (reproducible
+        || (arch->level_mask & (1 << level)) == 0
+        || window_bits != HB_BITS
+        || (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY))
+        return 0;
+
+    /* Required hardware functions and parameter-block format */
+    return is_bit_set(arch->common.af.fns, DFLTCC_GDHT)
+        && is_bit_set(arch->common.af.fns, DFLTCC_CMPR)
+        && is_bit_set(arch->common.af.fmts, DFLTCC_FMT0);
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
+    /* Check the stream's current settings against the hardware capabilities. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    return dfltcc_can_deflate_with_params(strm, state->level, W_BITS(state),
+                                          state->strategy, state->reproducible);
+}
+
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
+    /* Generate a dynamic Huffman table from the upcoming input.  avail_in is
+     * passed as a local copy so the stream's byte count is left untouched. */
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t sample_len = strm->avail_in;
+
+    dfltcc(DFLTCC_GDHT, &state->arch.common.param, NULL, NULL, &strm->next_in, &sample_len, NULL);
+}
+
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
+    /* Run DFLTCC-CMPR over the stream buffers, then fold the consumed and
+     * produced byte counts back into the stream totals. */
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
+                &state->arch.common.param, &strm->next_out, &out_left,
+                &strm->next_in, &in_left, state->window);
+    strm->total_in += (strm->avail_in - in_left);
+    strm->total_out += (strm->avail_out - out_left);
+    strm->avail_in = in_left;
+    strm->avail_out = out_left;
+    return cc;
+}
+
+/* Emit the End-of-block Symbol through the software bit buffer and flush it
+ * to next_out.  param->eobs keeps the code in the high-order bits of its
+ * 15-bit field (hence the >> (15 - eobl)), and bi_reverse converts it to the
+ * LSB-first order that send_bits expects. */
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    send_bits(state, bi_reverse((uint16_t)(param->eobs >> (15 - param->eobl)), param->eobl),
+              param->eobl, state->bi_buf, state->bi_valid);
+
+    flush_pending_inline(strm);
+    if (state->pending != 0) {
+        /* The remaining data is located in pending_out[0:pending]. If someone
+         * calls put_byte() - this might happen in deflate() - the byte will be
+         * placed into pending_buf[pending], which is incorrect. Move the
+         * remaining data to the beginning of pending_buf so that put_byte() is
+         * usable again.
+         */
+        memmove(state->pending_buf, state->pending_out, state->pending);
+        state->pending_out = state->pending_buf;
+    }
+#ifdef ZLIB_DEBUG
+    state->compressed_len += param->eobl;
+#endif
+}
+
+/* Hardware deflate entry point (wired in as DEFLATE_HOOK).  Returns 0 when
+ * the software path must handle the call (including writing a trailing empty
+ * block on Z_FINISH), or 1 when the hardware handled it; in the latter case
+ * *result receives the block_state for deflate()'s post-processing. */
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *dfltcc_state = &state->arch;
+    struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
+    uInt masked_avail_in;   /* Input temporarily hidden from the hardware */
+    dfltcc_cc cc;
+    int need_empty_block;
+    int soft_bcc;
+    int no_flush;
+
+    if (!PREFIX(dfltcc_can_deflate)(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        return 0;
+    }
+
+again:
+    masked_avail_in = 0;
+    soft_bcc = 0;
+    no_flush = flush == Z_NO_FLUSH;
+
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
+     */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+        }
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
+        return 1;
+    }
+
+    /* There is an open non-BFINAL block, we are not going to close it just
+     * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
+     * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
+     * DHT in order to adapt to a possibly changed input data distribution.
+     */
+    if (param->bcf && no_flush &&
+            strm->total_in > dfltcc_state->block_threshold &&
+            strm->avail_in >= dfltcc_state->dht_threshold) {
+        if (param->cf) {
+            /* We need to flush the DFLTCC buffer before writing the
+             * End-of-block Symbol. Mask the input data and proceed as usual.
+             */
+            masked_avail_in += strm->avail_in;
+            strm->avail_in = 0;
+            no_flush = 0;
+        } else {
+            /* DFLTCC buffer is empty, so we can manually write the
+             * End-of-block Symbol right away.
+             */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        }
+    }
+
+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
+    /* The caller gave us too much data. Pass only one block worth of
+     * uncompressed data to DFLTCC and mask the rest, so that on the next
+     * iteration we start a new block.
+     */
+    if (no_flush && strm->avail_in > dfltcc_state->block_size) {
+        masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
+        strm->avail_in = dfltcc_state->block_size;
+    }
+
+    /* When we have an open non-BFINAL deflate block and caller indicates that
+     * the stream is ending, we need to close an open deflate block and open a
+     * BFINAL one.
+     */
+    need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
+
+    /* Translate stream to parameter block */
+    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
+    if (!no_flush)
+        /* We need to close a block. Always do this in software - when there is
+         * no input data, the hardware will not honor BCC. */
+        soft_bcc = 1;
+    if (flush == Z_FINISH && !param->bcf)
+        /* We are about to open a BFINAL block, set Block Header Final bit
+         * until the stream ends.
+         */
+        param->bhf = 1;
+    /* DFLTCC-CMPR will write to next_out, so make sure that buffers with
+     * higher precedence are empty.
+     */
+    Assert(state->pending == 0, "There must be no pending bytes");
+    Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
+    param->sbb = (unsigned int)state->bi_valid;
+    if (param->sbb > 0)
+        *strm->next_out = (unsigned char)state->bi_buf;
+    /* Honor history and check value */
+    param->nt = 0;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(strm->adler);
+
+    /* When opening a block, choose a Huffman-Table Type */
+    if (!param->bcf) {
+        if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
+            param->htt = HTT_FIXED;
+        else {
+            param->htt = HTT_DYNAMIC;
+            dfltcc_gdht(strm);
+        }
+    }
+
+    /* Deflate */
+    do {
+        cc = dfltcc_cmpr(strm);
+        if (strm->avail_in < 4096 && masked_avail_in > 0)
+            /* We are about to call DFLTCC with a small input buffer, which is
+             * inefficient. Since there is masked data, there will be at least
+             * one more DFLTCC call, so skip the current one and make the next
+             * one handle more data.
+             */
+            break;
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
+    state->bi_valid = param->sbb;
+    if (state->bi_valid == 0)
+        state->bi_buf = 0; /* Avoid accessing next_out */
+    else
+        state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        strm->adler = ZSWAP32(param->cv);
+
+    /* Unmask the input data */
+    strm->avail_in += masked_avail_in;
+    masked_avail_in = 0;
+
+    /* If we encounter an error, it means there is a bug in DFLTCC call */
+    Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
+
+    /* Update Block-Continuation Flag. It will be used to check whether to call
+     * GDHT the next time.
+     */
+    if (cc == DFLTCC_CC_OK) {
+        if (soft_bcc) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        } else
+            param->bcf = 1;
+        if (flush == Z_FINISH) {
+            if (need_empty_block)
+                /* Make the current deflate() call also close the stream */
+                return 0;
+            else {
+                bi_windup(state);
+                *result = finish_done;
+            }
+        } else {
+            if (flush == Z_FULL_FLUSH)
+                param->hl = 0; /* Clear history */
+            *result = flush == Z_NO_FLUSH ? need_more : block_done;
+        }
+    } else {
+        param->bcf = 1;
+        *result = need_more;
+    }
+    if (strm->avail_in != 0 && strm->avail_out != 0)
+        goto again; /* deflate() must use all input or all output */
+    return 1;
+}
+
+/*
+ Switching between hardware and software compression.
+
+ DFLTCC does not support all zlib settings, e.g. generation of non-compressed
+ blocks or alternative window sizes. When such settings are applied on the
+ fly with deflateParams, we need to convert between hardware and software
+ window formats.
+*/
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    /* True once DFLTCC has consumed input, started a task (nt cleared), or
+     * accumulated history. */
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (strm->total_in > 0)
+        return 1;
+    return param->nt == 0 || param->hl > 0;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
+    /* Decide what to do when compression parameters change mid-stream. */
+    deflate_state *state = (deflate_state *)strm->state;
+    int old_mode = PREFIX(dfltcc_can_deflate)(strm);
+    int new_mode = dfltcc_can_deflate_with_params(strm, level, W_BITS(state), strategy, state->reproducible);
+
+    /* Same mode (hardware or software) before and after, or DFLTCC has not
+     * touched the stream yet: nothing to convert. */
+    if (new_mode == old_mode || !dfltcc_was_deflate_used(strm))
+        return Z_OK;
+
+    /* For now, do not convert between window formats - simply get rid of the old data instead */
+    *flush = Z_FULL_FLUSH;
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    /* deflate(Z_FULL_FLUSH) with exhausted avail_out may close the block
+     * without resetting the compression state: not done yet. */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* The software path is always done here.  The hardware path is done only
+     * when nothing is buffered (CF clear) and EOBS has been written (BCF
+     * clear). */
+    if (!PREFIX(dfltcc_can_deflate)(strm))
+        return 1;
+    return !param->cf && !param->bcf;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
+    /* Toggling the reproducible flag is only safe before DFLTCC has run. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    if (dfltcc_was_deflate_used(strm))
+        return 0;
+    return reproducible != state->reproducible;
+}
+
+/*
+ Preloading history.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    /* Preload the dictionary into the circular history buffer. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    append_history(&state->arch.common.param, state->window, dictionary, dict_length);
+    state->strstart = 1;                  /* Add FDICT to zlib header */
+    state->block_start = state->strstart; /* Make deflate_stored happy */
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
+    /* Extract the current history; either output argument may be NULL. */
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (dictionary != NULL)
+        get_history(param, state->window, dictionary);
+    if (dict_length != NULL)
+        *dict_length = param->hl;
+    return Z_OK;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_deflate.h b/neozip/arch/s390/dfltcc_deflate.h
new file mode 100644
index 0000000000..35e2fd3f62
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_deflate.h
@@ -0,0 +1,58 @@
+#ifndef DFLTCC_DEFLATE_H
+#define DFLTCC_DEFLATE_H
+
+#include "deflate.h"
+#include "dfltcc_common.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+
+/* Route dictionary set/get through the DFLTCC history buffer whenever the
+ * hardware path is active. */
+#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
+
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
+    do { \
+        int err; \
+\
+        err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
+        if (err == Z_STREAM_ERROR) \
+            return err; \
+    } while (0)
+
+#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
+
+/* Use the conservative DFLTCC bound whenever the hardware path may be taken. */
+#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
+    do { \
+        if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
+            (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
+    } while (0)
+
+#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
+
+/* DFLTCC maintains the check value itself (param->cv), so the software
+ * checksum pass is skipped on the hardware path. */
+#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
+
+/* The circular history buffer needs a window of at least HB_SIZE bytes. */
+#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_detail.h b/neozip/arch/s390/dfltcc_detail.h
new file mode 100644
index 0000000000..f790735ab4
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_detail.h
@@ -0,0 +1,274 @@
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include <stdio.h>
+
+#ifdef HAVE_SYS_SDT_H
+#include <sys/sdt.h>
+#endif
+
+/*
+ Tuning parameters.
+ */
+#ifndef DFLTCC_LEVEL_MASK
+#define DFLTCC_LEVEL_MASK 0x2
+#endif
+#ifndef DFLTCC_BLOCK_SIZE
+#define DFLTCC_BLOCK_SIZE 1048576
+#endif
+#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
+#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
+#endif
+#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
+#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
+#endif
+#ifndef DFLTCC_RIBM
+#define DFLTCC_RIBM 0
+#endif
+
+#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
+
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
+
+/* Test bit n of an MSB-first bit string (bit 0 is the most significant bit
+ * of bits[0]) - the convention used by the STFLE facility list and the
+ * DFLTCC-QAF function/format vectors. */
+static inline int is_bit_set(const char *bits, int n) {
+    return bits[n / 8] & (1 << (7 - (n % 8)));
+}
+
+/* Clear bit n of the same MSB-first bit string. */
+static inline void clear_bit(char *bits, int n) {
+    bits[n / 8] &= ~(1 << (7 - (n % 8)));
+}
+
+#define DFLTCC_FACILITY 151
+
+/* Query the STFLE facility list and report whether the DEFLATE-conversion
+ * facility (bit DFLTCC_FACILITY = 151) is installed. */
+static inline int is_dfltcc_enabled(void) {
+    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
+    Z_REGISTER uint8_t r0 __asm__("r0");
+
+    memset(facilities, 0, sizeof(facilities));
+    /* STFLE expects the number of doublewords minus one in r0. */
+    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
+}
+
+#define DFLTCC_FMT0 0
+
+#define CVT_CRC32 0
+#define CVT_ADLER32 1
+#define HTT_FIXED 0
+#define HTT_DYNAMIC 1
+
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
+
+/* Format an Operation-Ending-Supplemental Code as a message for strm->msg,
+ * or return NULL on success (OESC 0).  buf is dfltcc_state.msg (64 bytes);
+ * the fixed format plus an 8-digit hex value stays well below that, but keep
+ * the format string short since sprintf is unbounded. */
+static inline z_const char *oesc_msg(char *buf, int oesc) {
+    if (oesc == 0x00)
+        return NULL; /* Successful completion */
+    else {
+        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
+        return buf;
+    }
+}
+
+/*
+ C wrapper for the DEFLATE CONVERSION CALL instruction.
+ */
+typedef enum {
+ DFLTCC_CC_OK = 0,
+ DFLTCC_CC_OP1_TOO_SHORT = 1,
+ DFLTCC_CC_OP2_TOO_SHORT = 2,
+ DFLTCC_CC_OP2_CORRUPT = 2,
+ DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+#define HBT_CIRCULAR (1 << 7)
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+
+/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer.
+ * hl_low is nonzero only when the history wraps past the end of the buffer. */
+static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
+    *hl_high = MIN(param->hl, HB_SIZE - param->ho);
+    *hl_low = param->hl - *hl_high;
+}
+
+/* Notify instrumentation about an upcoming read/write access to the circular history buffer.
+ * NOTE: arithmetic on the void *hist is byte-granular (GNU extension). */
+static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    instrument_read_write(hist + param->ho, hl_high);
+    instrument_read_write(hist, hl_low);
+}
+
+/* Notify MSan about a completed write to the circular history buffer,
+ * covering both fragments of a wrapped history. */
+static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    __msan_unpoison(hist + param->ho, hl_high);
+    __msan_unpoison(hist, hl_low);
+}
+
+/* Execute the DEFLATE CONVERSION CALL instruction (.insn rrf,0xb9390000).
+ * fn selects the subfunction (optionally ORed with HBT_CIRCULAR); op1/len1
+ * describe the output operand, op2/len2 the input operand, hist the history
+ * buffer.  Any of the pointer arguments may be NULL when unused; consumed and
+ * produced positions are written back through them.  Returns the condition
+ * code extracted from the PSW. */
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+    unsigned char *orig_t2 = t2;
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    Z_REGISTER int r0 __asm__("r0");
+    Z_REGISTER void *r1 __asm__("r1");
+    Z_REGISTER unsigned char *r2 __asm__("r2");
+    Z_REGISTER size_t r3 __asm__("r3");
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4");
+    Z_REGISTER size_t r5 __asm__("r5");
+    int cc;
+
+    /* Insert pre-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        instrument_write(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
+        instrument_read(t4, t5);
+        break;
+    case DFLTCC_CMPR:
+    case DFLTCC_XPND:
+        instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        instrument_read(t4, t5);
+        instrument_write(t2, t3);
+        instrument_read_write_hist(param, hist);
+        break;
+    }
+
+    r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+    /* Insert post-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        msan_unpoison_hist(param, hist);
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        msan_unpoison_hist(param, hist);
+        break;
+    }
+
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    /* ipm stores the PSW condition code in bits 28-31 of cc. */
+    return (cc >> 28) & 3;
+}
+
+static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
+    /* Query available functions (the vector stays all-zero when DFLTCC is
+     * not installed, which disables every hardware path). */
+    if (!is_dfltcc_enabled()) {
+        memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+    } else {
+        dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
+        memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
+    }
+
+    /* Start from a clean parameter block: New Task set, configured RIBM. */
+    memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
+    dfltcc_state->param.nt = 1;
+    dfltcc_state->param.ribm = DFLTCC_RIBM;
+}
+
+/* Copy the common state (rounded up to its 8-byte alignment, since the
+ * DFLTCC structs are ALIGNED_(8)) plus the arch-specific extension that
+ * immediately follows it. */
+static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
+    memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
+}
+
+/* Append count bytes from buf to the circular history buffer, keeping at
+ * most the last HB_SIZE (32 KiB) bytes and updating param->ho/hl to match. */
+static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
+                                  const unsigned char *buf, uInt count) {
+    size_t offset;
+    size_t n;
+
+    /* Do not use more than 32K */
+    if (count > HB_SIZE) {
+        buf += count - HB_SIZE;
+        count = HB_SIZE;
+    }
+    offset = (param->ho + param->hl) % HB_SIZE;
+    if (offset + count <= HB_SIZE)
+        /* Circular history buffer does not wrap - copy one chunk */
+        memcpy(history + offset, buf, count);
+    else {
+        /* Circular history buffer wraps - copy two chunks */
+        n = HB_SIZE - offset;
+        memcpy(history + offset, buf, n);
+        memcpy(history, buf + n, count - n);
+    }
+    n = param->hl + count;
+    if (n <= HB_SIZE)
+        /* All history fits into buffer - no need to discard anything */
+        param->hl = n;
+    else {
+        /* History does not fit into buffer - discard extra bytes */
+        param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
+        param->hl = HB_SIZE;
+    }
+}
+
+static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
+                               unsigned char *buf) {
+    /* Linearize the circular history into buf: high fragment first, then the
+     * wrapped low fragment (empty when no wrap occurred). */
+    size_t high, low;
+
+    get_history_lengths(param, &high, &low);
+    memcpy(buf, history + param->ho, high);
+    memcpy(buf + high, history, low);
+}
diff --git a/neozip/arch/s390/dfltcc_inflate.c b/neozip/arch/s390/dfltcc_inflate.c
new file mode 100644
index 0000000000..f6bc423c22
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_inflate.c
@@ -0,0 +1,195 @@
+/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
+
+/*
+ Use the following commands to build zlib-ng with DFLTCC decompression support:
+
+ $ ./configure --with-dfltcc-inflate
+ or
+
+ $ cmake -DWITH_DFLTCC_INFLATE=1 .
+
+ and then
+
+ $ make
+*/
+
+#ifdef S390_DFLTCC_INFLATE
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "dfltcc_inflate.h"
+#include "dfltcc_detail.h"
+
+/* Re-initialize the DFLTCC portion of the inflate state (INFLATE_RESET_KEEP_HOOK). */
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    dfltcc_reset_state(&state->arch.common);
+}
+
+/* Return non-zero if the hardware supports DFLTCC decompression, i.e. the
+ * EXPAND function and parameter-block format 0 are both available. */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+
+    /* Unsupported hardware */
+    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
+}
+
+/* Issue one DFLTCC EXPAND call, using the stream's window as the circular
+ * history buffer, and mirror the consumed/produced byte counts back into
+ * the stream.  Returns the DFLTCC condition code. */
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+    /* dfltcc() updates these in place; use temporaries so the stream fields
+     * are only written once, after the call */
+    size_t avail_in = strm->avail_in;
+    size_t avail_out = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
+                param, &strm->next_out, &avail_out,
+                &strm->next_in, &avail_in, state->window);
+    strm->avail_in = avail_in;
+    strm->avail_out = avail_out;
+    return cc;
+}
+
+/* Run one round of hardware decompression from inside inflate()'s TYPEDO
+ * state.  Translates the zlib stream state into the DFLTCC parameter block,
+ * calls EXPAND until it stops making progress requests, then translates the
+ * results back.  The returned action tells the inflate() hook whether to
+ * keep looping, leave inflate(), or fall back to software. */
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    dfltcc_cc cc;
+
+    if (flush == Z_BLOCK || flush == Z_TREES) {
+        /* DFLTCC does not support stopping on block boundaries */
+        if (PREFIX(dfltcc_inflate_disable)(strm)) {
+            *ret = Z_STREAM_ERROR;
+            return DFLTCC_INFLATE_BREAK;
+        } else
+            return DFLTCC_INFLATE_SOFTWARE;
+    }
+
+    if (state->last) {
+        /* Final block already decompressed: drop any partially consumed
+         * byte and advance to checksum verification */
+        if (state->bits != 0) {
+            strm->next_in++;
+            strm->avail_in--;
+            state->bits = 0;
+        }
+        state->mode = CHECK;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    /* No input and no buffered-output continuation flag - nothing to do */
+    if (strm->avail_in == 0 && !param->cf)
+        return DFLTCC_INFLATE_BREAK;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    /* Translate stream to parameter block */
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
+    param->sbb = state->bits;
+    if (param->hl)
+        param->nt = 0; /* Honor history for the first block */
+    if (state->wrap & 4)
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check;
+
+    /* Inflate */
+    do {
+        cc = dfltcc_xpnd(strm);
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
+    state->last = cc == DFLTCC_CC_OK;
+    state->bits = param->sbb;
+    if (state->wrap & 4)
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
+    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+        /* Report an error if stream is corrupted */
+        state->mode = BAD;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+    state->mode = TYPEDO;
+    /* Break if operands are exhausted, otherwise continue looping */
+    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
+        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
+}
+
+/* Return non-zero if DFLTCC has already decompressed data on this stream:
+ * param.nt ("new task") is cleared by the first hardware operation. */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    return !state->arch.common.param.nt;
+}
+
+/*
+  Rotates a circular buffer in place, so that the element at pivot becomes
+  the first element; [start, end) is rearranged to [pivot, end) + [start, pivot).
+  Runs in O(end - start) swaps with O(1) extra space.
+  The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
+ */
+static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
+    unsigned char *p = pivot;
+    unsigned char tmp;
+
+    while (p != start) {
+        /* Swap the next unplaced element into position */
+        tmp = *start;
+        *start = *p;
+        *p = tmp;
+
+        start++;
+        p++;
+
+        if (p == end)
+            p = pivot;
+        else if (start == pivot)
+            pivot = p;
+    }
+}
+
+/* Switch the stream to software decompression.  Returns 0 on success and 1
+ * if DFLTCC has already produced output, in which case switching is
+ * impossible and the caller must report an error. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (!PREFIX(dfltcc_can_inflate)(strm))
+        return 0;
+    if (PREFIX(dfltcc_was_inflate_used)(strm))
+        /* DFLTCC has already decompressed some data. Since there is not
+         * enough information to resume decompression in software, the call
+         * must fail.
+         */
+        return 1;
+    /* DFLTCC was not used yet - decompress in software */
+    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+    /* Convert the window from the hardware to the software format */
+    rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
+    state->whave = state->wnext = MIN(param->hl, state->wsize);
+    return 0;
+}
+
+/*
+  Preloading history: feed the dictionary into the hardware history buffer
+  so the next EXPAND call can reference it (inflateSetDictionary hook).
+*/
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    append_history(param, state->window, dictionary, dict_length);
+    state->havedict = 1;
+    return Z_OK;
+}
+
+/* Copy the current hardware history into dictionary (if non-NULL) and report
+ * its length via dict_length (if non-NULL) - inflateGetDictionary hook. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt *dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (dictionary && state->window)
+        get_history(param, state->window, dictionary);
+    if (dict_length)
+        *dict_length = param->hl;
+    return Z_OK;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_inflate.h b/neozip/arch/s390/dfltcc_inflate.h
new file mode 100644
index 0000000000..3623f8ed7f
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_inflate.h
@@ -0,0 +1,67 @@
+#ifndef DFLTCC_INFLATE_H
+#define DFLTCC_INFLATE_H
+
+#include "dfltcc_common.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
+/* Outcome of one PREFIX(dfltcc_inflate) round, consumed by INFLATE_TYPEDO_HOOK */
+typedef enum {
+    DFLTCC_INFLATE_CONTINUE,  /* keep looping inside inflate() */
+    DFLTCC_INFLATE_BREAK,     /* leave inflate() */
+    DFLTCC_INFLATE_SOFTWARE,  /* fall back to the software implementation */
+} dfltcc_inflate_action;
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt* dict_length);
+
+/* Hooks below are expanded at fixed points inside the generic inflate code. */
+
+#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
+
+/* inflatePrime() needs bit-level control, which DFLTCC cannot provide */
+#define INFLATE_PRIME_HOOK(strm, bits, value) \
+    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
+
+/* Delegate a decompression round to the hardware from inflate()'s TYPEDO state */
+#define INFLATE_TYPEDO_HOOK(strm, flush) \
+    if (PREFIX(dfltcc_can_inflate)((strm))) { \
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+/* The hardware maintains the checksum and window itself */
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+/* inflateMark()'s return value is meaningless once the hardware has run */
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* The hardware history buffer needs at least HB_SIZE bytes */
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/neozip/arch/s390/s390_features.c b/neozip/arch/s390/s390_features.c
new file mode 100644
index 0000000000..dabb578a07
--- /dev/null
+++ b/neozip/arch/s390/s390_features.c
@@ -0,0 +1,18 @@
+#ifdef S390_FEATURES
+
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS (1 << 11)
+#endif
+
+/* Populate the feature struct from the kernel's AT_HWCAP auxiliary vector:
+ * has_vx is set when the z13 vector facility (VXRS) is available. */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
+}
+
+#endif
diff --git a/neozip/arch/s390/s390_features.h b/neozip/arch/s390/s390_features.h
new file mode 100644
index 0000000000..fb2ac14b26
--- /dev/null
+++ b/neozip/arch/s390/s390_features.h
@@ -0,0 +1,14 @@
+/* s390_features.h -- check for s390 features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+/* Runtime-detected s390x CPU capabilities */
+struct s390_cpu_features {
+    int has_vx;  /* non-zero if the vector facility (VXRS) is present */
+};
+
+/* Fill in features by querying the operating system (see s390_features.c) */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
diff --git a/neozip/arch/s390/s390_functions.h b/neozip/arch/s390/s390_functions.h
new file mode 100644
index 0000000000..30647051f4
--- /dev/null
+++ b/neozip/arch/s390/s390_functions.h
@@ -0,0 +1,33 @@
+/* s390_functions.h -- s390 implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FUNCTIONS_H_
+#define S390_FUNCTIONS_H_
+
+#include "s390_natives.h"
+
+#ifdef S390_CRC32_VX
+uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+
+#ifdef __clang__
+# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
+# error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
+ Either disable the zlib-ng CRC32-VX optimization, or switch to another compiler/compiler version.
+# endif
+#endif
+
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// S390 - CRC32 VX
+# ifdef S390_CRC32_VX_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_s390_vx
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_s390_vx
+# endif
+#endif
+
+#endif
diff --git a/neozip/arch/s390/s390_natives.h b/neozip/arch/s390/s390_natives.h
new file mode 100644
index 0000000000..5da913daf5
--- /dev/null
+++ b/neozip/arch/s390/s390_natives.h
@@ -0,0 +1,14 @@
+/* s390_natives.h -- s390 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_NATIVES_H_
+#define S390_NATIVES_H_
+
+/* The vector facility is guaranteed at compile time on z13+ (__ARCH__ >= 11)
+ * when the compiler enables vector support (__VX__) */
+#if defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+#  ifdef S390_CRC32_VX
+#    define S390_CRC32_VX_NATIVE
+#  endif
+#endif
+
+#endif /* S390_NATIVES_H_ */
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner b/neozip/arch/s390/self-hosted-builder/actions-runner
new file mode 100755
index 0000000000..aabc802547
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+#
+# Ephemeral runner startup script.
+#
+# Expects the following environment variables:
+#
+# - REPO=<owner>
+# - PAT_TOKEN=<github_pat_***>
+#
+
+set -e -u
+
+# Validate required environment variables
+if [ -z "${REPO:-}" ] || [ -z "${PAT_TOKEN:-}" ]; then
+    echo "Error: REPO and/or PAT_TOKEN environment variables not found"
+    exit 1
+fi
+
+# Check the cached registration token.
+TOKEN_FILE=registration-token.json
+if [ -f "$TOKEN_FILE" ]; then
+    set +e
+    EXPIRES=$(jq --raw-output .expires_at "$TOKEN_FILE" 2>/dev/null)
+    STATUS=$?
+    set -e
+else
+    STATUS=1
+    EXPIRES=""
+fi
+
+if [[ $STATUS -ne 0 || -z "$EXPIRES" || "$EXPIRES" == "null" || $(date +%s) -ge $(date -d "$EXPIRES" +%s) ]]; then
+    # Refresh the cached registration token.
+    curl \
+        -sS \
+        -X POST \
+        -H "Accept: application/vnd.github+json" \
+        -H "Authorization: Bearer $PAT_TOKEN" \
+        "https://api.github.com/repos/$REPO/actions/runners/registration-token" \
+        -o "$TOKEN_FILE"
+fi
+
+REG_TOKEN=$(jq --raw-output .token "$TOKEN_FILE")
+# Quote the expansion: with an unquoted empty token, `[` would abort with a
+# syntax error instead of reaching the intended error message.  Also treat an
+# empty token (e.g. malformed JSON) the same as jq's "null".
+if [ -z "$REG_TOKEN" ] || [ "$REG_TOKEN" = "null" ]; then
+    echo "Failed to get registration token"
+    exit 1
+fi
+
+# (Re-)register the runner.
+./config.sh remove --token "$REG_TOKEN" || true
+set -x
+./config.sh \
+    --url "https://github.com/$REPO" \
+    --token "$REG_TOKEN" \
+    --unattended \
+    --disableupdate \
+    --replace \
+    --labels z15 \
+    --ephemeral
+
+# Run one job.
+./run.sh
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
new file mode 100644
index 0000000000..7fded31785
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/bash
+# Rebuild the self-hosted actions-runner container image and restart its
+# service.  Build inputs are taken from the working directory when present,
+# otherwise downloaded from the upstream zlib-ng repository.
+set -ex
+
+TMPDIR="$(mktemp -d)"
+# Remove the scratch directory on every exit path; the original script leaked
+# it in the local-file mode and its manual cleanup broke on unexpected files.
+trap 'cd / && rm -rf "$TMPDIR"' EXIT
+
+if [ -f actions-runner.Dockerfile ]; then
+    # Use the local copies of the build inputs.
+    cp actions-runner.Dockerfile actions-runner entrypoint "$TMPDIR"
+    cd "$TMPDIR"
+else
+    # Fetch the build inputs from the upstream repository.
+    cd "$TMPDIR"
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/entrypoint
+fi
+
+# Stop service
+systemctl stop actions-runner || true
+
+# Delete old container
+podman container rm gaplib-actions-runner || true
+
+# Delete old image
+podman image rm localhost/zlib-ng/actions-runner || true
+
+# Prune all unused podman data
+podman system prune -f || true
+
+# Build new image
+podman build --squash -f actions-runner.Dockerfile --tag zlib-ng/actions-runner . 2>&1 | tee /var/log/actions-runner-build.log
+
+# Create new container
+podman create --replace --name=gaplib-actions-runner --env-file=/etc/actions-runner --init \
+    zlib-ng/actions-runner 2>&1 | tee -a /var/log/actions-runner-build.log
+
+# Start service
+systemctl start actions-runner || true
+
+# Cleanup
+podman image prune -af || true
+
+# Tempfiles are removed by the EXIT trap above.
+echo "Deleted tempfiles."
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile
new file mode 100644
index 0000000000..7210caaebe
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,47 @@
+# Self-Hosted IBM Z Github Actions Runner.
+
+FROM almalinux:10
+
+RUN dnf update -y -q && \
+    dnf install -y -q --enablerepo=crb wget git which sudo jq sed \
+        cmake make automake autoconf m4 libtool ninja-build \
+        python3-pip python3-devel python3-lxml \
+        gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
+        glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel
+
+RUN dnf install -y -q dotnet-sdk-8.0 && \
+    echo "Using SDK - `dotnet --version`"
+
+# Build the runner from source with the IBM s390x patch applied.
+# The sed pins global.json to the installed SDK: the replacement must be the
+# bare version "8.0.100" - the original "$8.0.100" wrote a literal "$" into
+# global.json, producing an invalid SDK version.
+RUN cd /tmp && \
+    git clone -q https://github.com/actions/runner && \
+    cd runner && \
+    git checkout $(git tag --sort=-v:refname | grep '^v[0-9]' | head -n1) && \
+    git log -n 1 && \
+    wget https://raw.githubusercontent.com/IBM/action-runner-image-pz/refs/heads/main/patches/runner-sdk8-s390x.patch -O runner-sdk8-s390x.patch && \
+    git apply --whitespace=nowarn runner-sdk8-s390x.patch && \
+    sed -i'' -e /version/s/8......\"$/8.0.100\"/ src/global.json
+
+RUN cd /tmp/runner/src && \
+    ./dev.sh layout && \
+    ./dev.sh package && \
+    rm -rf /root/.dotnet /root/.nuget
+
+RUN useradd -c "Action Runner" -m actions-runner && \
+    usermod -L actions-runner
+
+RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
+    chown -R actions-runner:actions-runner /home/actions-runner
+
+# Cleanup
+RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
+    dnf clean all
+
+USER actions-runner
+
+# Scripts.
+COPY --chmod=555 entrypoint /usr/bin/
+COPY --chmod=555 actions-runner /usr/bin/
+WORKDIR /home/actions-runner
+ENTRYPOINT ["/usr/bin/entrypoint"]
+CMD ["/usr/bin/actions-runner"]
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.service b/neozip/arch/s390/self-hosted-builder/actions-runner.service
new file mode 100644
index 0000000000..79560cde18
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner.service
@@ -0,0 +1,18 @@
+# systemd unit wrapping the podman container that hosts the ephemeral
+# GitHub Actions runner.  The container is created separately (see
+# actions-runner-rebuild.sh); this unit only starts/stops it.
+[Unit]
+Description=Podman container: Gaplib Github Actions Runner
+Wants=network-online.target
+After=network-online.target
+StartLimitIntervalSec=1
+RequiresMountsFor=/run/user/1001/containers
+
+[Service]
+Environment=PODMAN_SYSTEMD_UNIT=%n
+Restart=always
+TimeoutStopSec=61
+ExecStart=/usr/bin/podman start gaplib-actions-runner
+ExecStop=/usr/bin/podman stop -t 30 gaplib-actions-runner
+ExecStopPost=/usr/bin/podman stop -t 10 gaplib-actions-runner
+Type=forking
+
+[Install]
+WantedBy=default.target
diff --git a/neozip/arch/s390/self-hosted-builder/entrypoint b/neozip/arch/s390/self-hosted-builder/entrypoint
new file mode 100755
index 0000000000..eb8772becf
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/entrypoint
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+#
+# Container entrypoint that waits for all spawned processes.
+#
+
+set -e -u
+
+# Create a FIFO and start reading from its read end.
+tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
+trap 'rm -r "$tempdir"' EXIT
+done="$tempdir/pipe"
+mkfifo "$done"
+cat "$done" & waiter=$!
+
+# Start the workload. Its descendants will inherit the FIFO's write end
+# (open as fd 9). With no arguments, fall back to an interactive shell.
+status=0
+if [ "$#" -eq 0 ]; then
+    bash 9>"$done" || status=$?
+else
+    "$@" 9>"$done" || status=$?
+fi
+
+# When the workload and all of its descendants exit, the FIFO's write end will
+# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
+# in order to handle SelfUpdater, which the workload may start in background
+# before exiting.
+wait "$waiter"
+
+# Propagate the workload's exit status as the container's exit status.
+exit "$status"
diff --git a/neozip/arch/x86/Makefile.in b/neozip/arch/x86/Makefile.in
new file mode 100644
index 0000000000..f756844a9f
--- /dev/null
+++ b/neozip/arch/x86/Makefile.in
@@ -0,0 +1,176 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# These are filled in by configure.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+# Per-ISA compiler flag sets; each optimized object is built only with the
+# extensions it actually requires so the dispatcher can gate them at runtime.
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
+AVX2FLAG=-mavx2 -mbmi2
+SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
+SSE41FLAG=-msse4.1
+SSE42FLAG=-msse4.2
+PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# .o objects go into the static library, .lo objects are the PIC variants
+# for the shared library.
+all: \
+	x86_features.o x86_features.lo \
+	adler32_avx2.o adler32_avx2.lo \
+	adler32_avx512.o adler32_avx512.lo \
+	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_avx512.o chunkset_avx512.lo \
+	chunkset_sse2.o chunkset_sse2.lo \
+	chunkset_ssse3.o chunkset_ssse3.lo \
+	compare256_avx2.o compare256_avx2.lo \
+	compare256_avx512.o compare256_avx512.lo \
+	compare256_sse2.o compare256_sse2.lo \
+	crc32_chorba_sse2.o crc32_chorba_sse2.lo \
+	crc32_chorba_sse41.o crc32_chorba_sse41.lo \
+	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+	crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \
+	crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \
+	slide_hash_avx2.o slide_hash_avx2.lo \
+	slide_hash_sse2.o slide_hash_sse2.lo
+
+x86_features.o:
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo:
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+chunkset_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_ssse3.o:
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+chunkset_ssse3.lo:
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+compare256_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+crc32_chorba_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse41.o:
+	$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_chorba_sse41.lo:
+	$(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_pclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_pclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_vpclmulqdq_avx2.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx2.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx512.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+crc32_vpclmulqdq_avx512.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+slide_hash_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+slide_hash_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+# Housekeeping targets.
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/x86/adler32_avx2.c b/neozip/arch/x86/adler32_avx2.c
new file mode 100644
index 0000000000..d1811b254d
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2.c
@@ -0,0 +1,172 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include <immintrin.h>
+#include "adler32_p.h"
+#include "adler32_avx2_p.h"
+#include "x86_intrins.h"
+
+extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+
+/* Shared AVX2 Adler-32 kernel.  Computes the checksum of src[0..len), and when
+ * COPY is non-zero also copies the data to dst as it goes.  COPY is a
+ * compile-time constant so each wrapper gets a branch-free specialization.
+ * Small inputs are delegated to the scalar tail / SSE implementations. */
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit halves: s2 (high), s1 (low) */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+    } else if (len < 32) {
+        if (COPY) {
+            return adler32_copy_sse42(adler, dst, src, len);
+        } else {
+            return adler32_ssse3(adler, src, len);
+        }
+    }
+
+    __m256i vs1, vs2, vs2_0;
+
+    /* Weights 64..33 and 32..1 for the two halves of a 64-byte step */
+    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
+    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot3v = _mm256_set1_epi16(1);
+    const __m256i zero = _mm256_setzero_si256();
+
+    /* Each outer iteration processes at most NMAX bytes, the largest count
+     * for which the 32-bit lanes cannot overflow before the modulo */
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        vs2_0 = vs3;
+
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+        len -= k;
+
+        /* Main unrolled loop: 64 bytes per iteration */
+        while (k >= 64) {
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
+            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
+                dst += 64;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        /* Merge the two partial s2 accumulators; vs3 carries the deferred
+         * "previous s1 times step size" terms (step 64 => shift by 6) */
+        vs2 = _mm256_add_epi32(vs2_0, vs2);
+        vs3 = _mm256_slli_epi32(vs3, 6);
+        vs2 = _mm256_add_epi32(vs3, vs2);
+        vs3 = _mm256_setzero_si256();
+
+        /* Remainder loop: 32 bytes per iteration */
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                dst += 32;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Defer the multiplication with 32 to outside of the loop */
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+
+        /* The compiler is generating the following sequence for this integer modulus
+         * when done the scalar way, in GPRs:
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov $0x80078071,%edi // move magic constant into 32 bit register %edi
+         ...
+         vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+         mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
+         imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+         shr $0x2f,%rsi // shift right by 47
+         imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, do a relatively cheap sequence for horizontal sums.
+         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+         performed on the maximum possible inputs before overflow
+         */
+
+
+        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+         * what the compiler is doing to avoid integer divisions. */
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Handle any remaining bytes (< 32) via the small-input paths above */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+/* Checksum-only entry point: COPY=0 compiles out the store paths. */
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+/* Checksum-and-copy entry point: COPY=1 enables the store paths. */
+Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx2_p.h b/neozip/arch/x86/adler32_avx2_p.h
new file mode 100644
index 0000000000..f0f8a4a887
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint32_t hsum256(__m256i x) {
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+ _mm256_castsi256_si128(x));
+ __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros */
+ const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+ __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+ __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+ __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512.c b/neozip/arch/x86/adler32_avx512.c
new file mode 100644
index 0000000000..8a8e165bb9
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512.c
@@ -0,0 +1,102 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 64) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+ if (COPY && len) {
+ __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+ __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+ _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+ }
+
+ return adler32_avx2(adler, src, len);
+ }
+
+ __m512i vbuf, vs1_0, vs3;
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ const __m512i dot3v = _mm512_set1_epi16(1);
+ const __m512i zero = _mm512_setzero_si512();
+
+ while (len >= 64) {
+ __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ vs1_0 = vs1;
+ vs3 = _mm512_setzero_si512();
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+ len -= k;
+
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf = _mm512_loadu_si512(src);
+
+ if (COPY) {
+ _mm512_storeu_si512(dst, vbuf);
+ dst += 64;
+ }
+
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+ __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm512_add_epi32(vs1_sad, vs1);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm512_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_p.h b/neozip/arch/x86/adler32_avx512_p.h
new file mode 100644
index 0000000000..742269053c
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,57 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */
+#if defined(_MSC_VER) && !defined(_MM_K0_REG8)
+# undef _mm512_extracti64x4_epi64
+# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1)
+# undef _mm512_set1_epi16
+# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1)
+# undef _mm512_maddubs_epi16
+# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2)
+#endif
+
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+ __m256i a = _mm512_extracti64x4_epi64(x, 1);
+ __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+ __m256i a_plus_b = _mm256_add_epi32(a, b);
+ __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+ __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+ __m128i c_plus_d = _mm_add_epi32(c, d);
+
+ __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+ __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+ return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros. Marking this const so the compiler stands
+ * a better chance of keeping this resident in a register through entire
+ * loop execution. We certainly have enough zmm registers (32) */
+ const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 1, 1, 1, 1, 1, 1, 1);
+
+ __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+ /* From here, it's a simple 256 bit wide reduction sum */
+ __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+ /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+ * pretty slow, much slower than the longer instruction sequence below */
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+ _mm256_castsi256_si128(non_zero_avx));
+ __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_vnni.c b/neozip/arch/x86/adler32_avx512_vnni.c
new file mode 100644
index 0000000000..8bebffbf88
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,205 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 32)
+ return adler32_ssse3(adler, src, len);
+
+ if (len < 64)
+ return adler32_avx2(adler, src, len);
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+ const __m512i zero = _mm512_setzero_si512();
+ __m512i vs1, vs2;
+
+ while (len >= 64) {
+ vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+ len -= k;
+ __m512i vs1_0 = vs1;
+ __m512i vs3 = _mm512_setzero_si512();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m512i vs2_1 = _mm512_setzero_si512();
+ __m512i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 128) {
+ vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 128) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf0 = _mm512_loadu_si512((__m512i*)src);
+ vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+ src += 128;
+ k -= 128;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm512_add_epi32(vs3, vs1);
+ vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+ vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+/* Use 256-bit vectors when copying because 512-bit variant is slower. */
+Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel_copy:
+ if (len < 32) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+ __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+ __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+ _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+ return adler32_ssse3(adler, src, len);
+ }
+
+ const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i vs1, vs2;
+
+ while (len >= 32) {
+ vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+ len -= k;
+
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = _mm256_setzero_si256();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m256i vs2_1 = _mm256_setzero_si256();
+ __m256i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 64) {
+ vbuf1 = _mm256_loadu_si256((__m256i*)src);
+ _mm256_storeu_si256((__m256i*)dst, vbuf1);
+ dst += 32;
+
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+            vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ vbuf0 = _mm256_loadu_si256((__m256i*)src);
+ vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+ _mm256_storeu_si256((__m256i*)dst, vbuf0);
+ _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+ dst += 64;
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm256_add_epi32(vs3, vs1);
+ vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm256_slli_epi32(vs3, 5);
+ vs2 = _mm256_add_epi32(vs2, vs3);
+ vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 32). */
+ if (len) {
+ goto rem_peel_copy;
+ }
+
+ return adler;
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_sse42.c b/neozip/arch/x86/adler32_sse42.c
new file mode 100644
index 0000000000..c2301213f0
--- /dev/null
+++ b/neozip/arch/x86/adler32_sse42.c
@@ -0,0 +1,117 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE42
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1);
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = _mm_setzero_si128();
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+
+ while (len >= 16) {
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 16);
+ len -= k;
+
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, there's fewer than 16 elements remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3.c b/neozip/arch/x86/adler32_ssse3.c
new file mode 100644
index 0000000000..702db50251
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3.c
@@ -0,0 +1,149 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+ /* If our buffer is unaligned (likely), make the determination whether
+ * or not there's enough of a buffer to consume to make the scalar, aligning
+ * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+ * load. This is a pretty simple test, just test if len < 32 */
+ size_t n = NMAX;
+ size_t k = 0;
+
+ if (len < 32) {
+ /* Let's eat the cost of this one unaligned load so that
+ * we don't completely skip over the vectorization. Doing
+ * 16 bytes at a time unaligned is better than 16 + <= 15
+ * sums */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
+ }
+
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ if (align_diff) {
+ adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
+
+ while (len >= 16) {
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ k = ALIGN_DOWN(MIN(len, n), 16);
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+ buf += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+ n = NMAX;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0);
+}
+
+/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
+Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_ssse3(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+}
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3_p.h b/neozip/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 0000000000..d7ec3fe0d5
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_srli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx2.c b/neozip/arch/x86/chunkset_avx2.c
new file mode 100644
index 0000000000..3e69a7bf66
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx2.c
@@ -0,0 +1,129 @@
+/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* See explanation in chunkset_avx512.c */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+ * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+ * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_avx2
+#define CHUNKCOPY chunkcopy_avx2
+#define CHUNKUNROLL chunkunroll_avx2
+#define CHUNKMEMSET chunkmemset_avx2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx512.c b/neozip/arch/x86/chunkset_avx512.c
new file mode 100644
index 0000000000..60450c653b
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx512.c
@@ -0,0 +1,186 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(size_t len) {
+ return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len);
+}
+
+static inline mask_t gen_mask(size_t len) {
+ return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where
+ * the load is dumped to the stack with an aligned move for this memory-register
+ * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to
+ * stack doesn't exist if compiled with optimizations. For the sake of working
+ * properly in a debugger, let's take the 2 cycle penalty */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+
+ chunk_t chunk;
+ size_t rem = len % sizeof(chunk_t);
+
+ if (len < sizeof(chunk_t)) {
+ mask_t rem_mask = gen_mask(rem);
+ chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
+ _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ return out + rem;
+ }
+
+ loadchunk(from, &chunk);
+ rem = (rem == 0) ? sizeof(chunk_t) : rem;
+ storechunk(out, &chunk);
+ out += rem;
+ from += rem;
+ len -= rem;
+
+ while (len > 0) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += sizeof(chunk_t);
+ from += sizeof(chunk_t);
+ len -= sizeof(chunk_t);
+ }
+
+ return out;
+}
+
+/* MSVC compiler decompression bug when optimizing for size */
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", off)
+#endif
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ *chunk_rem = lut_rem.remval;
+
+ /* See the AVX2 implementation for more detailed comments. This is that + some masked
+ * loads to avoid an out of bounds read on the heap */
+
+ if (dist < 16) {
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ halfmask_t load_mask = gen_half_mask(dist);
+ __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ halfmask_t load_mask = gen_half_mask(dist - 16);
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+ __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", on)
+#endif
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ halfmask_t load_mask = gen_half_mask(dist);
+ ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+ halfchunk_t chunk;
+
+ size_t rem = len % sizeof(halfchunk_t);
+ if (rem == 0) {
+ rem = sizeof(halfchunk_t);
+ }
+
+ halfmask_t rem_mask = gen_half_mask(rem);
+ chunk = _mm_maskz_loadu_epi8(rem_mask, from);
+ _mm_mask_storeu_epi8(out, rem_mask, chunk);
+
+ return out + rem;
+}
+
+#define CHUNKSIZE chunksize_avx512
+#define CHUNKUNROLL chunkunroll_avx512
+#define CHUNKMEMSET chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_sse2.c b/neozip/arch/x86/chunkset_sse2.c
new file mode 100644
index 0000000000..633ab6e64f
--- /dev/null
+++ b/neozip/arch/x86/chunkset_sse2.c
@@ -0,0 +1,50 @@
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE chunksize_sse2
+#define CHUNKCOPY chunkcopy_sse2
+#define CHUNKUNROLL chunkunroll_sse2
+#define CHUNKMEMSET chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_sse2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_ssse3.c b/neozip/arch/x86/chunkset_ssse3.c
new file mode 100644
index 0000000000..0bef7de811
--- /dev/null
+++ b/neozip/arch/x86/chunkset_ssse3.c
@@ -0,0 +1,72 @@
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ meant to subvert the memory sanitizer, but to unpoison some
+ * bytes we deliberately load uninitialized and then swizzle over in a vector
+ * register anyway. If our assumption about which bytes are actually used is
+ * wrong, the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_ssse3
+#define CHUNKMEMSET chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
+#define CHUNKCOPY chunkcopy_ssse3
+#define CHUNKUNROLL chunkunroll_ssse3
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_ssse3
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c
new file mode 100644
index 0000000000..5e2b1716cf
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx2.c
@@ -0,0 +1,61 @@
+/* compare256_avx2.c -- AVX2 version of compare256
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX2
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
+
+ src0 += 32, src1 += 32, len += 32;
+
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+ mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
+
+ src0 += 32, src1 += 32, len += 32;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx512.c b/neozip/arch/x86/compare256_avx512.c
new file mode 100644
index 0000000000..f3105505cb
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx512.c
@@ -0,0 +1,87 @@
+/* compare256_avx512.c -- AVX512 version of compare256
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * Based on AVX2 implementation by Mika T. Lindqvist
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX512
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
+ __m512i zmm_src0_4, zmm_src1_4;
+ __m512i zmm_src0_3, zmm_src1_3;
+ __m512i zmm_src0_2, zmm_src1_2;
+ __m512i zmm_src0_1, zmm_src1_1;
+ __m128i xmm_src0_0, xmm_src1_0;
+ uint64_t mask_1, mask_2, mask_3, mask_4;
+ uint32_t mask_0;
+
+ // First do a 16-byte round before increasing to 64 bytes; this reduces the
+ // penalty for short matches, which are usually the most common ones.
+ // This requires us to overlap on the last round, giving a small penalty
+ // on matches of 192+ bytes (still faster than AVX2, though).
+
+ // 16 bytes
+ xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
+ if (mask_0 != 0x0000FFFF)
+ return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
+
+ // 64 bytes
+ zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
+ zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
+ mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
+ if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+ return 16 + zng_ctz64(~mask_1);
+
+ // 64 bytes
+ zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
+ zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
+ mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
+ if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+ return 80 + zng_ctz64(~mask_2);
+
+ // 64 bytes
+ zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
+ zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
+ mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
+ if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+ return 144 + zng_ctz64(~mask_3);
+
+ // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
+ zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
+ zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
+ mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
+ if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+ return 192 + zng_ctz64(~mask_4);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx512_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_sse2.c b/neozip/arch/x86/compare256_sse2.c
new file mode 100644
index 0000000000..cfaff82cfa
--- /dev/null
+++ b/neozip/arch/x86/compare256_sse2.c
@@ -0,0 +1,86 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_SSE2
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned; then, for all subsequent ones, we have at least
+ * one aligned load. Sadly, aligning both loads is probably unrealistic */
+ xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ for (int i = 0; i < 15; ++i) {
+ xmm_src0 = _mm_load_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ if (align_offset) {
+ xmm_src0 = _mm_loadu_si128((__m128i*)last0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)last1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse2.c b/neozip/arch/x86/crc32_chorba_sse2.c
new file mode 100644
index 0000000000..66191e046a
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse2.c
@@ -0,0 +1,872 @@
+#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
+#define READ_NEXT(in, off, a, b) do { \
+ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+ b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+ } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+ a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+ b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+ c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+ d = _mm_srli_epi64(invec, 20); \
+ } while (0);
+
+Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+ /* The calling function ensured that this is aligned correctly */
+ const uint64_t* input = (const uint64_t*)buf;
+ ALIGNED_(16) uint64_t final[9] = {0};
+ uint64_t next1 = ~crc;
+ crc = 0;
+ uint64_t next2 = 0;
+ uint64_t next3 = 0;
+ uint64_t next4 = 0;
+ uint64_t next5 = 0;
+
+ __m128i next12 = _mm_cvtsi64_si128(next1);
+ __m128i next34 = _mm_setzero_si128();
+ __m128i next56 = _mm_setzero_si128();
+ __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;
+
+ size_t i = 0;
+
+ /* Oddly, using a for loop instead of a while loop cuts ~10% off the execution time */
+ for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+ __m128i in1in2, in3in4;
+
+ /*
+ uint64_t chorba1 = input[i / sizeof(uint64_t)];
+ uint64_t chorba2 = input[i / sizeof(uint64_t) + 1];
+ uint64_t chorba3 = input[i / sizeof(uint64_t) + 2];
+ uint64_t chorba4 = input[i / sizeof(uint64_t) + 3];
+ uint64_t chorba5 = input[i / sizeof(uint64_t) + 4];
+ uint64_t chorba6 = input[i / sizeof(uint64_t) + 5];
+ uint64_t chorba7 = input[i / sizeof(uint64_t) + 6];
+ uint64_t chorba8 = input[i / sizeof(uint64_t) + 7];
+ */
+
+ const uint64_t *input_ptr = input + (i / sizeof(uint64_t));
+ const __m128i *input_ptr_128 = (__m128i*)input_ptr;
+ __m128i chorba12 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba34 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba56 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba78 = _mm_load_si128(input_ptr_128++);
+
+ chorba12 = _mm_xor_si128(chorba12, next12);
+ chorba34 = _mm_xor_si128(chorba34, next34);
+ chorba56 = _mm_xor_si128(chorba56, next56);
+ chorba78 = _mm_xor_si128(chorba78, chorba12);
+ __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
+ __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
+ _mm_castsi128_pd(chorba34), 1));
+ /*
+ chorba1 ^= next1;
+ chorba2 ^= next2;
+ chorba3 ^= next3;
+ chorba4 ^= next4;
+ chorba5 ^= next5;
+ chorba7 ^= chorba1;
+ chorba8 ^= chorba2;
+ */
+ i += 8 * 8;
+
+ /* 0-3 */
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ READ_NEXT(input, i, in1in2, in3in4);
+ __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba34xor);
+ /*
+ in1 ^= chorba3;
+ in2 ^= chorba4 ^ chorba1;
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+
+ */
+
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
+ __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));
+
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
+ in3in4 = _mm_xor_si128(in3in4, chorba12);
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
+ in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+
+ /*out1 = a3 ^ b2 ^ c1;
+ out2 = b3 ^ c2 ^ d1 ^ a4;*/
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ //out3 = b4 ^ c3 ^ d2;
+ //out4 = c4 ^ d3;
+
+ //out5 = d4;
+
+ /*
+ next1 = out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 4-7 */
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ READ_NEXT(input, i, in1in2, in3in4);
+
+ in1in2 = _mm_xor_si128(in1in2, next12);
+ in1in2 = _mm_xor_si128(in1in2, chorba78);
+ in1in2 = _mm_xor_si128(in1in2, chorba45);
+ in1in2 = _mm_xor_si128(in1in2, chorba34);
+
+ /*
+ in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
+ in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
+ */
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+
+ in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
+ in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6;
+ */
+ in3in4 = _mm_xor_si128(in3in4, next34);
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ in3in4 = _mm_xor_si128(in3in4, chorba56);
+ __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ ///*
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, cd1);
+
+ next12 = _mm_xor_si128(next12, a4_);
+ next12 = _mm_xor_si128(next12, next56);
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ next34 = _mm_xor_si128(b4c4, cd3);
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ //*/
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = b3 ^ c2 ^ d1 ^ a4;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 8-11 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
+ in2 ^= next2 ^ chorba8 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+
+ __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
+ __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ in1in2 = _mm_xor_si128(in1in2, chorba78);
+ in1in2 = _mm_xor_si128(in1in2, next12_chorba12);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];*/
+ in3in4 = _mm_xor_si128(next34, in3in4);
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, chorba34);
+ in3in4 = _mm_xor_si128(in3in4, a2_);
+
+ /*
+ in3 ^= next3 ^ a1 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba4;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(a4_, ab3);
+ next12 = _mm_xor_si128(next12, cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 12-15 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ */
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, next12);
+ __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
+ __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorb1_);
+
+
+ /*
+ in1 ^= next1 ^ chorba5 ^ chorba1;
+ in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+ */
+
+ in3in4 = _mm_xor_si128(next34, in3in4);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ ///*
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ //*/
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 16-19 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+ in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+ */
+ ///*
+ READ_NEXT(input, i, in1in2, in3in4);
+ __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+ in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
+ in1in2 = _mm_xor_si128(chorba1_, in1in2);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ //*/
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ */
+ ///*
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+ in3in4 = _mm_xor_si128(in3in4, next34);
+ //*/
+ /*
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+ */
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 20-23 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+ in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+ in3in4 = _mm_xor_si128(in3in4, chorba12);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ i += 32;
+
+ /* 24-27 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+ in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+ i += 32;
+
+ /* 28-31 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
+ in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
+ */
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+ */
+
+ /*
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ }
+
+ for (; (i + 40 + 32) < len; i += 32) {
+ __m128i in1in2, in3in4;
+
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4);
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, next12);
+
+ /*
+ in1 ^=next1;
+ in2 ^=next2;
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1;
+ in4 ^= next4 ^ a2 ^ b1;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+ in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ next12 = _mm_xor_si128(next12, next56);
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ }
+
+ next1 = _mm_cvtsi128_si64(next12);
+ next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12));
+ next3 = _mm_cvtsi128_si64(next34);
+ next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34));
+ next5 = _mm_cvtsi128_si64(next56);
+
+ /* Skip the call to memcpy */
+ size_t copy_len = len - i;
+ __m128i *final128 = (__m128i*)final;
+ __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t));
+ while (copy_len >= 64) {
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ copy_len -= 64;
+ }
+
+ while (copy_len >= 16) {
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ copy_len -= 16;
+ }
+
+ uint8_t *src_bytes = (uint8_t*)input128;
+ uint8_t *dst_bytes = (uint8_t*)final128;
+ while (copy_len--) {
+ *dst_bytes++ = *src_bytes++;
+ }
+
+ final[0] ^= next1;
+ final[1] ^= next2;
+ final[2] ^= next3;
+ final[3] ^= next4;
+ final[4] ^= next5;
+
+ /* We perform the same loop that braid_internal is doing but we'll skip
+ * the function call for this tiny tail */
+ uint8_t *final_bytes = (uint8_t*)final;
+ size_t rem = len - i;
+
+ while (rem--) {
+ crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
+ }
+
+ return ~crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+ uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+ return crc32_braid(crc, buf, len);
+
+ if (align_diff) {
+ crc = crc32_braid(crc, buf, align_diff);
+ len -= align_diff;
+ buf += align_diff;
+ }
+#if !defined(WITHOUT_CHORBA)
+ if (len > CHORBA_LARGE_THRESHOLD)
+ return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+ return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+Z_INTERNAL uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_chorba_sse2(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse41.c b/neozip/arch/x86/crc32_chorba_sse41.c
new file mode 100644
index 0000000000..6ef9612440
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse41.c
@@ -0,0 +1,332 @@
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
+#define READ_NEXT(in, off, a, b) do { \
+ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+ b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+ } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+ a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+ b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+ c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+ d = _mm_srli_epi64(invec, 20); \
+ } while (0);
+
+#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
+ out0 = _mm_slli_si128(in0, shift); \
+ out1 = _mm_alignr_epi8(in1, in0, shift); \
+ out2 = _mm_alignr_epi8(in2, in1, shift); \
+ out3 = _mm_alignr_epi8(in3, in2, shift); \
+ out4 = _mm_srli_si128(in3, shift); \
+ } while (0)
+
+#define STORE4(out0, out1, out2, out3, out) do { \
+ _mm_store_si128(out++, out0); \
+ _mm_store_si128(out++, out1); \
+ _mm_store_si128(out++, out2); \
+ _mm_store_si128(out++, out3); \
+ } while (0)
+
+#define READ4(out0, out1, out2, out3, in) do { \
+ out0 = _mm_load_si128(in++); \
+ out1 = _mm_load_si128(in++); \
+ out2 = _mm_load_si128(in++); \
+ out3 = _mm_load_si128(in++); \
+ } while (0)
+
+/* This is intentionally shifted one down to compensate for the deferred store from
+ * the last iteration */
+#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
+ out0 = _mm_xor_si128(in[1], xor0); \
+ out1 = _mm_xor_si128(in[2], xor1); \
+ out2 = _mm_xor_si128(in[3], xor2); \
+ out3 = _mm_xor_si128(in[4], xor3); \
+ } while (0)
+
+Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+ /* The calling function ensured that this is aligned correctly */
+ const uint64_t* input = (const uint64_t*)buf;
+ ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+ __m128i *bitbuffer_v = (__m128i*)bitbuffer;
+ const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+ __m128i z = _mm_setzero_si128();
+
+ __m128i *bitbuf128 = &bitbuffer_v[64];
+ __m128i *bitbuf144 = &bitbuffer_v[72];
+ __m128i *bitbuf182 = &bitbuffer_v[91];
+ __m128i *bitbuf210 = &bitbuffer_v[105];
+ __m128i *bitbuf300 = &bitbuffer_v[150];
+ __m128i *bitbuf0 = bitbuf128;
+ __m128i *inptr = (__m128i*)input;
+
+ /* We only need to zero out the bytes between the 128'th value and the 144th
+ * that are actually read */
+ __m128i *z_cursor = bitbuf128;
+ for (size_t i = 0; i < 2; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We only need to zero out the bytes between the 144'th value and the 182nd that
+ * are actually read */
+ z_cursor = bitbuf144 + 8;
+ for (size_t i = 0; i < 11; ++i) {
+ _mm_store_si128(z_cursor++, z);
+ }
+
+ /* We only need to zero out the bytes between the 182nd value and the 210th that
+ * are actually read. */
+ z_cursor = bitbuf182;
+ for (size_t i = 0; i < 4; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We need to mix this in */
+ __m128i init_crc = _mm_cvtsi64_si128(~crc);
+ crc = 0;
+
+ size_t i = 0;
+
+ /* Previous iteration runs carried over */
+ __m128i buf144 = z;
+ __m128i buf182 = z;
+ __m128i buf210 = z;
+
+ for (; i + 300*8+64 < len && i < 22 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ READ4(in12, in34, in56, in78, inptr);
+
+ if (i == 0) {
+ in12 = _mm_xor_si128(in12, init_crc);
+ }
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e = _mm_xor_si128(buf182, in_1);
+ STORE4(e, in23, in45, in67, bitbuf182);
+ buf182 = in8_;
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 32 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 84 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ if (i < 128 * 8) {
+ READ4(in12, in34, in56, in78, inptr);
+ } else {
+ in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ }
+
+ // [0, 145, 183, 211]
+
+        /* On pre-Penryn CPUs the unpack should be faster */
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i n, o, p;
+ __m128i m = _mm_xor_si128(buf210, in_1);
+
+        /* Counterintuitively, even though this condition is known to be false
+         * here, removing the branch makes GCC produce significantly slower
+         * code -- presumably loop bodies get merged without it */
+ if (i < 84 * 8) {
+ n = in23;
+ o = in45;
+ p = in67;
+ buf210 = in8_;
+ } else {
+ READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
+ }
+
+ STORE4(m, n, o, p, bitbuf210);
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ /* Second half of stores bubbled out */
+ _mm_store_si128(bitbuf144, buf144);
+ _mm_store_si128(bitbuf182, buf182);
+ _mm_store_si128(bitbuf210, buf210);
+
+ /* We also have to zero out the tail */
+ size_t left_to_z = len - (300*8 + i);
+ __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
+ while (left_to_z >= 64) {
+ STORE4(z, z, z, z, bitbuf_tail);
+ left_to_z -= 64;
+ }
+
+ while (left_to_z >= 16) {
+ _mm_store_si128(bitbuf_tail++, z);
+ left_to_z -= 16;
+ }
+
+ uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
+ while (left_to_z--) {
+ *tail_bytes++ = 0;
+ }
+
+ ALIGNED_(16) uint64_t final[9] = {0};
+ __m128i next12, next34, next56;
+ next12 = z;
+ next34 = z;
+ next56 = z;
+
+ for (; (i + 72 < len); i += 32) {
+ __m128i in1in2, in3in4;
+ __m128i in1in2_, in3in4_;
+ __m128i ab1, ab2, ab3, ab4;
+ __m128i cd1, cd2, cd3, cd4;
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
+
+ in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
+ in3in4 = _mm_xor_si128(in3in4, in3in4_);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ __m128i a2_ = _mm_slli_si128(ab2, 8);
+ __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+ in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
+ __m128i a4_ = _mm_slli_si128(ab4, 8);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_srli_si128(cd2, 8);
+ __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
+ next12 = _mm_xor_si128(next12, next56);
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_srli_si128(cd4, 8);
+ }
+
+ memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+ __m128i *final128 = (__m128i*)final;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
+
+ uint8_t *final_bytes = (uint8_t*)final;
+
+ for (size_t j = 0; j < (len-i); j++) {
+ crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
+ }
+ return ~crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+ uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+ return crc32_braid(crc, buf, len);
+
+ if (align_diff) {
+ crc = crc32_braid(crc, buf, align_diff);
+ len -= align_diff;
+ buf += align_diff;
+ }
+#if !defined(WITHOUT_CHORBA)
+ if (len > CHORBA_LARGE_THRESHOLD)
+ return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+ if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+ return crc32_chorba_32768_nondestructive_sse41(crc, buf, len);
+ return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_chorba_sse41(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq.c b/neozip/arch/x86/crc32_pclmulqdq.c
new file mode 100644
index 0000000000..c8be1b43ba
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq.c
@@ -0,0 +1,31 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq_tpl.h b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
new file mode 100644
index 0000000000..e4ea546afd
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
@@ -0,0 +1,708 @@
+/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding
+ * approach with the PCLMULQDQ and VPCLMULQDQ instructions.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support)
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+#include <immintrin.h>
+#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
+
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+#include "x86_intrins.h"
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+# if defined(_MSC_VER) && _MSC_VER < 1920
+ /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm)
+# else
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm)
+# endif
+# ifdef __AVX512VL__
+# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96)
+# endif
+#endif
+/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c)
+#endif
+
+#ifndef z128_xor3_epi64
+# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+#endif
+
+static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc1;
+ *xmm_crc1 = *xmm_crc2;
+ *xmm_crc2 = *xmm_crc3;
+ *xmm_crc3 = _mm_xor_si128(x_low, x_high);
+}
+
+static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc2;
+ *xmm_crc1 = *xmm_crc3;
+ *xmm_crc2 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc3 = _mm_xor_si128(x_low1, x_high1);
+}
+
+static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc3;
+ *xmm_crc1 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc2 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc3 = _mm_xor_si128(x_low2, x_high2);
+}
+
+static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+ __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+ __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+
+ *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+ *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+ const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
+ __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
+ __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);
+
+ *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+ *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+/* 512-bit fold function requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
+ const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) {
+ __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
+ __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10);
+ __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01);
+ __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10);
+ __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01);
+ __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10);
+ __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01);
+ __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10);
+
+ *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0);
+ *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1);
+ *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2);
+ *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
+}
+#endif
+/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
+ const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) {
+ __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
+ __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
+ __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
+ __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
+ __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
+ __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
+ __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
+ __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
+
+ *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
+ *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
+ *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
+ *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
+}
+#endif
+
+Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ size_t copy_len = len;
+ if (len >= 16) {
+ /* Calculate 16-byte alignment offset */
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
+
+ /* If total length is less than (alignment bytes + 16), use the faster small method.
+ * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
+ copy_len = len < align_diff + 16 ? len : align_diff;
+ }
+
+ if (copy_len > 0) {
+ crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY);
+ src += copy_len;
+ len -= copy_len;
+ if (COPY) {
+ dst += copy_len;
+ }
+ }
+
+ if (len == 0)
+ return crc;
+
+ const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+
+ __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ __m128i xmm_crc1 = _mm_setzero_si128();
+ __m128i xmm_crc2 = _mm_setzero_si128();
+ __m128i xmm_crc3 = _mm_setzero_si128();
+
+ if (crc != 0) {
+ // Process the first 16 bytes and handle initial CRC
+ len -= 16;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ src += 16;
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ dst += 16;
+ }
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
+ }
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+ if (len >= 256) {
+ len -= 256;
+
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i z_low0, z_high0;
+ const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+ zmm_crc0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+ src += 256;
+ if (COPY) {
+ _mm512_storeu_si512((__m512i *)dst, zmm_crc0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+ dst += 256;
+ }
+
+ // Fold existing xmm state into first 64 bytes
+ zmm_t0 = _mm512_castsi128_si512(xmm_crc0);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0);
+
+ while (len >= 256) {
+ len -= 256;
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+ src += 256;
+
+ fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16);
+ if (COPY) {
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+ dst += 256;
+ }
+ }
+
+ // zmm_crc[0,1,2,3] -> zmm_crc0
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ xmm_crc0 = z512_extracti64x2(zmm_crc0, 0);
+ xmm_crc1 = z512_extracti64x2(zmm_crc0, 1);
+ xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
+ xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
+ }
+/* 256-bit VPCLMULQDQ path */
+#elif defined(X86_VPCLMULQDQ)
+ if (len >= 128) {
+ len -= 128;
+
+ __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
+ __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
+ __m256i y_low0, y_high0;
+ const __m256i ymm_fold4 = _mm256_set_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m256i ymm_fold8 = _mm256_set_epi32(
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
+
+ ymm_crc0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_crc0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3);
+ dst += 128;
+ }
+
+ // Fold existing xmm state into first 32 bytes
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc0);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
+
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc2);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
+
+ while (len >= 128) {
+ len -= 128;
+ ymm_t0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+
+ fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_t0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3);
+ dst += 128;
+ }
+ }
+
+ // Extract 8 x 128-bit lanes from 4 x 256-bit registers
+ __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
+ __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
+ __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
+ __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
+ __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
+ __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
+ __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
+ __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
+
+ // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
+ __m128i x_low, x_high;
+ x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
+ xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
+
+ x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
+ xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
+
+ x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
+ xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
+
+ x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
+ xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
+ }
+#else
+ /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
+ * We interleave the PCLMUL-base folds with 8x scaled generator
+ * polynomial copies; we read 8x QWORDS and then XOR them into
+ * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
+ * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
+ * as "generator_64_bits_unrolled_8" */
+#ifndef __AVX512VL__
+ if (!COPY) {
+#endif
+ while (len >= 512 + 64 + 16*8) {
+ __m128i chorba8 = _mm_load_si128((__m128i *)src);
+ __m128i chorba7 = _mm_load_si128((__m128i *)src + 1);
+ __m128i chorba6 = _mm_load_si128((__m128i *)src + 2);
+ __m128i chorba5 = _mm_load_si128((__m128i *)src + 3);
+ __m128i chorba4 = _mm_load_si128((__m128i *)src + 4);
+ __m128i chorba3 = _mm_load_si128((__m128i *)src + 5);
+ __m128i chorba2 = _mm_load_si128((__m128i *)src + 6);
+ __m128i chorba1 = _mm_load_si128((__m128i *)src + 7);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, chorba8);
+ _mm_storeu_si128((__m128i *)dst + 1, chorba7);
+ _mm_storeu_si128((__m128i *)dst + 2, chorba6);
+ _mm_storeu_si128((__m128i *)dst + 3, chorba5);
+ _mm_storeu_si128((__m128i *)dst + 4, chorba4);
+ _mm_storeu_si128((__m128i *)dst + 5, chorba3);
+ _mm_storeu_si128((__m128i *)dst + 6, chorba2);
+ _mm_storeu_si128((__m128i *)dst + 7, chorba1);
+ dst += 16*8;
+ }
+
+ chorba2 = _mm_xor_si128(chorba2, chorba8);
+ chorba1 = _mm_xor_si128(chorba1, chorba7);
+ src += 16*8;
+ len -= 16*8;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+
+ fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 4);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 5);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 6);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 7);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 8);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 9);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 10);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 11);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 12);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 13);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 14);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 15);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 16);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 17);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 18);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 19);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 20);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 21);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 22);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 23);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 24);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 25);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 26);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 27);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 28);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 29);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 30);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 31);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3);
+
+ len -= 512;
+ src += 512;
+ }
+#ifndef __AVX512VL__
+ }
+#endif
+
+#endif /* X86_VPCLMULQDQ */
+
+ while (len >= 64) {
+ len -= 64;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+ src += 64;
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+ }
+
+ /*
+ * len = num bytes left - 64
+ */
+ if (len >= 48) {
+ len -= 48;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ src += 48;
+
+ fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ dst += 48;
+ }
+
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+ } else if (len >= 32) {
+ len -= 32;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ src += 32;
+
+ fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ dst += 32;
+ }
+
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+ } else if (len >= 16) {
+ len -= 16;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ src += 16;
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ dst += 16;
+ }
+
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ }
+
+ const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
+ const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641);
+
+ /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */
+ __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10);
+ xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0);
+
+ __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10);
+ xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1);
+
+ __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10);
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2);
+
+ /* Fold remaining bytes into the 128-bit state */
+ if (len) {
+ const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+ const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+ /* Create masks to shift bytes for partial input */
+ __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16));
+ __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
+
+ /* Shift out bytes from crc3 to make space for new data */
+ __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl);
+ xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr);
+
+ /* Insert the partial input into crc3 */
+#if defined(__AVX512BW__) && defined(__AVX512VL__)
+ __mmask16 k = (1 << len) - 1;
+ __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src);
+ if (COPY) {
+ _mm_mask_storeu_epi8(dst, k, xmm_crc_part);
+ }
+#else
+ __m128i xmm_crc_part = _mm_setzero_si128();
+ memcpy(&xmm_crc_part, src, len);
+ if (COPY) {
+ memcpy(dst, src, len);
+ }
+#endif
+ __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned);
+
+ /* Fold the bytes that were shifted out back into crc3 */
+ __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
+ __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
+ }
+
+ /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
+ __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
+ __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);
+
+ x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
+ x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);
+
+ __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
+ __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);
+
+ crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));
+
+ return ~crc;
+}
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx2.c b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
new file mode 100644
index 0000000000..1cdef13b09
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX2
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Non-copying CRC32: reuse the shared template with dst == NULL and COPY == 0. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Fused CRC32 + memcpy: reuse the shared template with COPY == 1. */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx512.c b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
new file mode 100644
index 0000000000..a95a448f49
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX512
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Non-copying CRC32: reuse the shared template with dst == NULL and COPY == 0. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Fused CRC32 + memcpy: reuse the shared template with COPY == 1. */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/slide_hash_avx2.c b/neozip/arch/x86/slide_hash_avx2.c
new file mode 100644
index 0000000000..241ea305e3
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_avx2.c
@@ -0,0 +1,48 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+
+/* Sweep one hash table backwards, 32 uint16_t entries (two 256-bit vectors)
+ * per iteration, subtracting the window size from every stored position with
+ * unsigned saturation so stale entries clamp to 0.
+ * `entries` is assumed to be a nonzero multiple of 32. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    /* Start at the last 32-entry group and walk towards the base. */
+    Pos *cur = table + entries - 32;
+
+    do {
+        __m256i lo = _mm256_load_si256((__m256i *)cur);
+        __m256i hi = _mm256_load_si256((__m256i *)(cur + 16));
+        _mm256_store_si256((__m256i *)cur, _mm256_subs_epu16(lo, wsize));
+        _mm256_store_si256((__m256i *)(cur + 16), _mm256_subs_epu16(hi, wsize));
+
+        cur -= 32;
+        entries -= 32;
+    } while (entries > 0);
+}
+
+/* Slide both deflate hash tables after the window advances by w_size:
+ * every stored position is reduced by w_size (saturating at 0).
+ * head has HASH_SIZE entries, prev has w_size entries. */
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast the window size to all 16 lanes for the saturating subtract. */
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/slide_hash_sse2.c b/neozip/arch/x86/slide_hash_sse2.c
new file mode 100644
index 0000000000..4aa8df5ee8
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,68 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+/* Sweep two hash tables backwards, 16 uint16_t entries (two 128-bit vectors)
+ * per iteration, subtracting the window size from every stored position with
+ * unsigned saturation so stale entries clamp to 0.
+ * Both entry counts are assumed to be nonzero multiples of 16. */
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+    uint32_t entries1, const __m128i wsize) {
+    Pos *tables[2] = { table0, table1 };
+    uint32_t counts[2] = { entries0, entries1 };
+
+    for (int chain = 0; chain < 2; ++chain) {
+        /* Start at the last 16-entry group and walk towards the base. */
+        Pos *cur = tables[chain] + counts[chain] - 16;
+        uint32_t remaining = counts[chain];
+
+        /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+         * Our alloc function is aligned to 64 byte boundaries */
+        do {
+            __m128i lo = _mm_load_si128((__m128i *)cur);
+            __m128i hi = _mm_load_si128((__m128i *)(cur + 8));
+            _mm_store_si128((__m128i *)cur, _mm_subs_epu16(lo, wsize));
+            _mm_store_si128((__m128i *)(cur + 8), _mm_subs_epu16(hi, wsize));
+
+            cur -= 16;
+            remaining -= 16;
+        } while (remaining > 0);
+    }
+}
+
+/* Slide both deflate hash tables after the window advances by w_size:
+ * every stored position is reduced by w_size (saturating at 0). */
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast the window size to all 8 lanes for the saturating subtract. */
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    /* The aligned loads/stores in slide_hash_chain require 16-byte alignment. */
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    /* head has HASH_SIZE entries, prev has w_size entries. */
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.c b/neozip/arch/x86/x86_features.c
new file mode 100644
index 0000000000..5eba18bf8a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.c
@@ -0,0 +1,128 @@
+/* x86_features.c - x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ * Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_FEATURES
+
+#include "zbuild.h"
+#include "x86_features.h"
+
+#if defined(HAVE_CPUID_MS)
+# include <intrin.h>
+#elif defined(HAVE_CPUID_GNU)
+// Newer versions of GCC and clang come with cpuid.h
+# include <cpuid.h>
+# ifdef X86_HAVE_XSAVE_INTRIN
+# if __GNUC__ == 8
+# include <xsaveintrin.h>
+# else
+# include <immintrin.h>
+# endif
+# endif
+#endif
+
+/* Query CPUID leaf `info` (subleaf 0) and write the four result registers.
+ * With neither MSVC nor GNU cpuid support, all outputs are zeroed, which
+ * disables every optional SIMD code path. */
+static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero outputs; <cpuid.h>'s __cpuid overwrites them. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Query CPUID leaf `info` with explicit subleaf `subinfo` (needed for leaf 7)
+ * and write the four result registers; zeroes them when unsupported. */
+static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero outputs; <cpuid.h>'s __cpuid_count overwrites them. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Read extended control register `xcr` (XCR0 when xcr == 0), which reports
+ * which register states the OS saves/restores on context switch.
+ * Returns 0 when XGETBV cannot be issued, disabling the AVX paths. */
+static inline uint64_t xgetbv(unsigned int xcr) {
+#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
+    return _xgetbv(xcr);
+#elif defined(__GNUC__)
+    uint32_t eax, edx;
+    /* Raw opcode bytes for xgetbv -- presumably to support assemblers that
+     * lack the mnemonic; TODO confirm. */
+    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
+    return (uint64_t)(edx) << 32 | eax;
+#else
+    /* When using this fallback, some of the faster code is disabled */
+    return 0;
+#endif
+}
+
+/* Populate *features from CPUID leaves 0, 1 and 7.
+ * NOTE(review): fields are only ever set, never cleared, and
+ * features->has_os_save_ymm is read below even when OSXSAVE is absent --
+ * this assumes the caller zero-initialized the struct; verify at call site. */
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    /* Leaf 0: eax returns the highest supported basic leaf. */
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    features->has_sse2 = edx & 0x4000000;   // edx bit 26
+    features->has_ssse3 = ecx & 0x200;      // ecx bit 9
+    features->has_sse41 = ecx & 0x80000;    // ecx bit 19
+    features->has_sse42 = ecx & 0x100000;   // ecx bit 20
+    features->has_pclmulqdq = ecx & 0x2;    // ecx bit 1
+
+    /* ecx bit 27 = OSXSAVE: XGETBV is available to query OS-enabled state. */
+    if (ecx & 0x08000000) {
+        uint64_t xfeature = xgetbv(0);
+
+        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);  // XCR0 SSE+AVX state
+        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);  // + opmask/ZMM state
+    }
+
+    if (maxbasic >= 7) {
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI2 bit
+        features->has_bmi2 = ebx & 0x100;   // ebx bit 8
+
+        // check AVX2 bit if the OS supports saving YMM registers
+        if (features->has_os_save_ymm) {
+            features->has_avx2 = ebx & 0x20;          // ebx bit 5
+            features->has_vpclmulqdq = ecx & 0x400;   // ecx bit 10
+        }
+
+        // check AVX512 bits if the OS supports saving ZMM registers
+        if (features->has_os_save_zmm) {
+            features->has_avx512f = ebx & 0x00010000;   // ebx bit 16
+            if (features->has_avx512f) {
+                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
+                // AVX512(DQ,BW,VL).
+                features->has_avx512dq = ebx & 0x00020000;   // ebx bit 17
+                features->has_avx512bw = ebx & 0x40000000;   // ebx bit 30
+                features->has_avx512vl = ebx & 0x80000000;   // ebx bit 31
+            }
+            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
+                && features->has_avx512vl && features->has_bmi2;
+            features->has_avx512vnni = ecx & 0x800;   // ecx bit 11
+        }
+    }
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.h b/neozip/arch/x86/x86_features.h
new file mode 100644
index 0000000000..2118b8e87a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.h
@@ -0,0 +1,30 @@
+/* x86_features.h -- check for CPU features
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
+
+/* CPU capability flags filled in by x86_check_features(); nonzero == present. */
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512f;
+    int has_avx512dq;
+    int has_avx512bw;
+    int has_avx512vl;
+    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled. (The check also requires BMI2.)
+    int has_avx512vnni;
+    int has_bmi2;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse41;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;   // OS saves/restores YMM state (XCR0 SSE+AVX bits set).
+    int has_os_save_zmm;   // OS saves/restores ZMM/opmask state as well.
+};
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
+
+#endif /* X86_FEATURES_H_ */
diff --git a/neozip/arch/x86/x86_functions.h b/neozip/arch/x86/x86_functions.h
new file mode 100644
index 0000000000..881c6efe23
--- /dev/null
+++ b/neozip/arch/x86/x86_functions.h
@@ -0,0 +1,196 @@
+/* x86_functions.h -- x86 implementations for arch-specific functions.
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FUNCTIONS_H_
+#define X86_FUNCTIONS_H_
+
+#include "x86_natives.h"
+
+/* So great news, your compiler is broken and causes stack smashing. Rather than
+ * notching out its compilation we'll just remove the assignment in the functable.
+ * Further context:
+ * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
+#if defined(_MSC_VER) && defined(ARCH_32BIT) && _MSC_VER >= 1920 && _MSC_VER <= 1929
+/* Bug fix: this previously defined NO_CHORBA_SSE, but every guard in this
+ * header tests WITHOUT_CHORBA_SSE, so the workaround was silently inert.
+ * The macro name must match the !defined(WITHOUT_CHORBA_SSE) checks below. */
+#define WITHOUT_CHORBA_SSE
+#endif
+
+#ifdef X86_SSE2
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
+
+# if !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+ uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+# endif
+#endif
+
+#ifdef X86_SSSE3
+uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left);
+void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_SSE42
+uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_AVX2
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
+#endif
+#ifdef X86_AVX512
+uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
+#endif
+#ifdef X86_AVX512VNNI
+uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_PCLMULQDQ_CRC
+uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX2
+uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// X86 - SSE2
+# ifdef X86_SSE2_NATIVE
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse2
+# endif
+# undef native_slide_hash
+# define native_slide_hash slide_hash_sse2
+# endif
+// X86 - SSSE3
+# ifdef X86_SSSE3_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_ssse3
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_ssse3
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_ssse3
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_ssse3
+# endif
+// X86 - SSE4.1
+# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse41
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse41
+# endif
+// X86 - SSE4.2
+# ifdef X86_SSE42_NATIVE
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_sse42
+# endif
+// X86 - PCLMUL
+# ifdef X86_PCLMULQDQ_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_pclmulqdq
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_pclmulqdq
+# endif
+// X86 - AVX2
+# ifdef X86_AVX2_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx2
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx2
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
+# undef native_slide_hash
+# define native_slide_hash slide_hash_avx2
+# endif
+// X86 - AVX512 (F,DQ,BW,VL)
+# ifdef X86_AVX512_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx512
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
+// X86 - AVX512 (VNNI)
+# ifdef X86_AVX512VNNI_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512_vnni
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512_vnni
+# endif
+# endif
+// X86 - VPCLMULQDQ
+# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx512
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx512
+# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx2
+# endif
+#endif
+
+#endif /* X86_FUNCTIONS_H_ */
diff --git a/neozip/arch/x86/x86_intrins.h b/neozip/arch/x86/x86_intrins.h
new file mode 100644
index 0000000000..1d1df5eb11
--- /dev/null
+++ b/neozip/arch/x86/x86_intrins.h
@@ -0,0 +1,126 @@
+#ifndef X86_INTRINS_H
+#define X86_INTRINS_H
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+/* Unfortunately GCC didn't support these things until version 10.
+ * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
+ */
+#ifdef __AVX2__
+#include <immintrin.h>
+
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
+ || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+ __m128i r;
+ __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+ return _mm256_castsi128_si256(r);
+}
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+ __m128i r;
+ __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+ return _mm512_castsi128_si512(r);
+}
+#endif // __AVX512F__
+#endif // gcc/AppleClang version test
+
+#endif // __AVX2__
+
+/* GCC <9 is missing some AVX512 intrinsics.
+ */
+#ifdef __AVX512F__
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
+#include <immintrin.h>
+
+#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
+ ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
+
+static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
+ char __q59, char __q58, char __q57, char __q56,
+ char __q55, char __q54, char __q53, char __q52,
+ char __q51, char __q50, char __q49, char __q48,
+ char __q47, char __q46, char __q45, char __q44,
+ char __q43, char __q42, char __q41, char __q40,
+ char __q39, char __q38, char __q37, char __q36,
+ char __q35, char __q34, char __q33, char __q32,
+ char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00) {
+ return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
+ PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
+ PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
+ PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
+ PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
+ PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
+ PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
+ PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
+}
+
+#undef PACK
+
+#endif // gcc version test
+#endif // __AVX512F__
+
+/* Missing zero-extension AVX and AVX512 intrinsics.
+ * Fixed in Microsoft Visual Studio 2017 version 15.7
+ * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#ifdef __AVX2__
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+ return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
+}
+#endif // __AVX2__
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+ return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
+}
+#endif // __AVX512F__
+#endif // defined(_MSC_VER) && _MSC_VER < 1914
+
+/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
+#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
+# undef _mm512_extracti32x4_epi32
+# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+/* For whatever reason this intrinsic is 64 bit only with MSVC?
+ * While we don't have 64 bit GPRs, it should at least be able to move it to stack
+ * or shuffle it over 2 registers */
+#ifdef ARCH_32BIT
+/* So, while we can't move directly to a GPR, hopefully this move to
+ * a stack resident variable doesn't equate to something awful */
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+ union { __m128i v; int64_t i; } u;
+ u.v = a;
+ return u.i;
+}
+
+static inline __m128i _mm_cvtsi64_si128(int64_t a) {
+ return _mm_set_epi64x(0, a);
+}
+#endif
+#endif
+
+#if defined(__GNUC__) && defined(ARCH_X86) && defined(ARCH_32BIT) && !defined(__clang__)
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+ union { __m128i v; int64_t i; } u;
+ u.v = a;
+ return u.i;
+}
+#define _mm_cvtsi64_si128(a) _mm_set_epi64x(0, a)
+#endif
+
+#endif // include guard X86_INTRINS_H
diff --git a/neozip/arch/x86/x86_natives.h b/neozip/arch/x86/x86_natives.h
new file mode 100644
index 0000000000..a39b7a51f0
--- /dev/null
+++ b/neozip/arch/x86/x86_natives.h
@@ -0,0 +1,57 @@
+/* x86_natives.h -- x86 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_NATIVES_H_
+#define X86_NATIVES_H_
+
+#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT))
+# ifdef X86_SSE2
+# define X86_SSE2_NATIVE
+# endif
+#endif
+#if defined(__SSSE3__)
+# ifdef X86_SSSE3
+# define X86_SSSE3_NATIVE
+# endif
+#endif
+#if defined(__SSE4_1__)
+# ifdef X86_SSE41
+# define X86_SSE41_NATIVE
+# endif
+#endif
+#if defined(__SSE4_2__)
+# ifdef X86_SSE42
+# define X86_SSE42_NATIVE
+# endif
+#endif
+#if defined(__PCLMUL__)
+# ifdef X86_PCLMULQDQ_CRC
+# define X86_PCLMULQDQ_NATIVE
+# endif
+#endif
+#if defined(__AVX2__)
+# ifdef X86_AVX2
+# define X86_AVX2_NATIVE
+# endif
+#endif
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+# ifdef X86_AVX512
+# define X86_AVX512_NATIVE
+# endif
+#endif
+#if defined(__AVX512VNNI__)
+# ifdef X86_AVX512VNNI
+# define X86_AVX512VNNI_NATIVE
+# endif
+#endif
+#if defined(__VPCLMULQDQ__)
+# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE)
+# define X86_VPCLMULQDQ_AVX2_NATIVE
+# endif
+# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE)
+# define X86_VPCLMULQDQ_AVX512_NATIVE
+# endif
+#endif
+
+#endif /* X86_NATIVES_H_ */