summaryrefslogtreecommitdiff
path: root/neozip/arch
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch')
-rw-r--r--neozip/arch/.gitignore2
-rw-r--r--neozip/arch/arm/Makefile.in86
-rw-r--r--neozip/arch/arm/acle_intrins.h90
-rw-r--r--neozip/arch/arm/adler32_neon.c346
-rw-r--r--neozip/arch/arm/arm_features.c334
-rw-r--r--neozip/arch/arm/arm_features.h19
-rw-r--r--neozip/arch/arm/arm_functions.h75
-rw-r--r--neozip/arch/arm/arm_natives.h31
-rw-r--r--neozip/arch/arm/chunkset_neon.c81
-rw-r--r--neozip/arch/arm/compare256_neon.c56
-rw-r--r--neozip/arch/arm/crc32_armv8.c81
-rw-r--r--neozip/arch/arm/crc32_armv8_p.h103
-rw-r--r--neozip/arch/arm/crc32_armv8_pmull_eor3.c366
-rw-r--r--neozip/arch/arm/neon_intrins.h79
-rw-r--r--neozip/arch/arm/slide_hash_armv6.c49
-rw-r--r--neozip/arch/arm/slide_hash_neon.c48
-rw-r--r--neozip/arch/generic/Makefile.in68
-rw-r--r--neozip/arch/generic/adler32_c.c55
-rw-r--r--neozip/arch/generic/chunk_128bit_perm_idx_lut.h26
-rw-r--r--neozip/arch/generic/chunk_256bit_perm_idx_lut.h47
-rw-r--r--neozip/arch/generic/chunk_permute_table.h53
-rw-r--r--neozip/arch/generic/chunkset_c.c40
-rw-r--r--neozip/arch/generic/compare256_c.c88
-rw-r--r--neozip/arch/generic/compare256_p.h0
-rw-r--r--neozip/arch/generic/crc32_braid_c.c213
-rw-r--r--neozip/arch/generic/crc32_chorba_c.c1275
-rw-r--r--neozip/arch/generic/generic_functions.h64
-rw-r--r--neozip/arch/generic/slide_hash_c.c52
-rw-r--r--neozip/arch/loongarch/Makefile.in99
-rw-r--r--neozip/arch/loongarch/adler32_lasx.c154
-rw-r--r--neozip/arch/loongarch/adler32_lsx.c147
-rw-r--r--neozip/arch/loongarch/chunkset_lasx.c126
-rw-r--r--neozip/arch/loongarch/chunkset_lsx.c74
-rw-r--r--neozip/arch/loongarch/compare256_lasx.c60
-rw-r--r--neozip/arch/loongarch/compare256_lsx.c88
-rw-r--r--neozip/arch/loongarch/crc32_la.c71
-rw-r--r--neozip/arch/loongarch/lasxintrin_ext.h61
-rw-r--r--neozip/arch/loongarch/loongarch_features.c31
-rw-r--r--neozip/arch/loongarch/loongarch_features.h19
-rw-r--r--neozip/arch/loongarch/loongarch_functions.h86
-rw-r--r--neozip/arch/loongarch/loongarch_natives.h25
-rw-r--r--neozip/arch/loongarch/lsxintrin_ext.h33
-rw-r--r--neozip/arch/loongarch/slide_hash_lasx.c49
-rw-r--r--neozip/arch/loongarch/slide_hash_lsx.c54
-rw-r--r--neozip/arch/power/Makefile.in93
-rw-r--r--neozip/arch/power/adler32_power8.c160
-rw-r--r--neozip/arch/power/adler32_vmx.c168
-rw-r--r--neozip/arch/power/chunkset_power8.c50
-rw-r--r--neozip/arch/power/compare256_power9.c68
-rw-r--r--neozip/arch/power/crc32_constants.h1123
-rw-r--r--neozip/arch/power/crc32_power8.c593
-rw-r--r--neozip/arch/power/power_features.c54
-rw-r--r--neozip/arch/power/power_features.h18
-rw-r--r--neozip/arch/power/power_functions.h74
-rw-r--r--neozip/arch/power/power_intrins.h61
-rw-r--r--neozip/arch/power/power_natives.h27
-rw-r--r--neozip/arch/power/slide_hash_power8.c12
-rw-r--r--neozip/arch/power/slide_hash_vmx.c10
-rw-r--r--neozip/arch/power/slide_ppc_tpl.h44
-rw-r--r--neozip/arch/riscv/Makefile.in72
-rw-r--r--neozip/arch/riscv/README.md45
-rw-r--r--neozip/arch/riscv/adler32_rvv.c119
-rw-r--r--neozip/arch/riscv/chunkset_rvv.c126
-rw-r--r--neozip/arch/riscv/compare256_rvv.c48
-rw-r--r--neozip/arch/riscv/crc32_zbc.c103
-rw-r--r--neozip/arch/riscv/riscv_features.c99
-rw-r--r--neozip/arch/riscv/riscv_features.h19
-rw-r--r--neozip/arch/riscv/riscv_functions.h60
-rw-r--r--neozip/arch/riscv/riscv_natives.h19
-rw-r--r--neozip/arch/riscv/slide_hash_rvv.c33
-rw-r--r--neozip/arch/s390/Makefile.in48
-rw-r--r--neozip/arch/s390/README.md265
-rw-r--r--neozip/arch/s390/crc32-vx.c232
-rw-r--r--neozip/arch/s390/dfltcc_common.h119
-rw-r--r--neozip/arch/s390/dfltcc_deflate.c390
-rw-r--r--neozip/arch/s390/dfltcc_deflate.h58
-rw-r--r--neozip/arch/s390/dfltcc_detail.h274
-rw-r--r--neozip/arch/s390/dfltcc_inflate.c195
-rw-r--r--neozip/arch/s390/dfltcc_inflate.h67
-rw-r--r--neozip/arch/s390/s390_features.c18
-rw-r--r--neozip/arch/s390/s390_features.h14
-rw-r--r--neozip/arch/s390/s390_functions.h33
-rw-r--r--neozip/arch/s390/s390_natives.h14
-rwxr-xr-xneozip/arch/s390/self-hosted-builder/actions-runner62
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh52
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile47
-rw-r--r--neozip/arch/s390/self-hosted-builder/actions-runner.service18
-rwxr-xr-xneozip/arch/s390/self-hosted-builder/entrypoint30
-rw-r--r--neozip/arch/x86/Makefile.in176
-rw-r--r--neozip/arch/x86/adler32_avx2.c172
-rw-r--r--neozip/arch/x86/adler32_avx2_p.h32
-rw-r--r--neozip/arch/x86/adler32_avx512.c102
-rw-r--r--neozip/arch/x86/adler32_avx512_p.h57
-rw-r--r--neozip/arch/x86/adler32_avx512_vnni.c205
-rw-r--r--neozip/arch/x86/adler32_sse42.c117
-rw-r--r--neozip/arch/x86/adler32_ssse3.c149
-rw-r--r--neozip/arch/x86/adler32_ssse3_p.h29
-rw-r--r--neozip/arch/x86/chunkset_avx2.c129
-rw-r--r--neozip/arch/x86/chunkset_avx512.c186
-rw-r--r--neozip/arch/x86/chunkset_sse2.c50
-rw-r--r--neozip/arch/x86/chunkset_ssse3.c72
-rw-r--r--neozip/arch/x86/compare256_avx2.c61
-rw-r--r--neozip/arch/x86/compare256_avx512.c87
-rw-r--r--neozip/arch/x86/compare256_sse2.c86
-rw-r--r--neozip/arch/x86/crc32_chorba_sse2.c872
-rw-r--r--neozip/arch/x86/crc32_chorba_sse41.c332
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq.c31
-rw-r--r--neozip/arch/x86/crc32_pclmulqdq_tpl.h708
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx2.c17
-rw-r--r--neozip/arch/x86/crc32_vpclmulqdq_avx512.c17
-rw-r--r--neozip/arch/x86/slide_hash_avx2.c48
-rw-r--r--neozip/arch/x86/slide_hash_sse2.c68
-rw-r--r--neozip/arch/x86/x86_features.c128
-rw-r--r--neozip/arch/x86/x86_features.h30
-rw-r--r--neozip/arch/x86/x86_functions.h196
-rw-r--r--neozip/arch/x86/x86_intrins.h126
-rw-r--r--neozip/arch/x86/x86_natives.h57
117 files changed, 14578 insertions, 0 deletions
diff --git a/neozip/arch/.gitignore b/neozip/arch/.gitignore
new file mode 100644
index 0000000000..2c3af0a08c
--- /dev/null
+++ b/neozip/arch/.gitignore
@@ -0,0 +1,2 @@
+# ignore Makefiles; they're all automatically generated
+Makefile
diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in
new file mode 100644
index 0000000000..d0bfe0e172
--- /dev/null
+++ b/neozip/arch/arm/Makefile.in
@@ -0,0 +1,86 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ARMV8FLAG=
+PMULLEOR3FLAG=
+NEONFLAG=
+ARMV6FLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ adler32_neon.o adler32_neon.lo \
+ arm_features.o arm_features.lo \
+ chunkset_neon.o chunkset_neon.lo \
+ compare256_neon.o compare256_neon.lo \
+ crc32_armv8.o crc32_armv8.lo \
+ crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \
+ slide_hash_neon.o slide_hash_neon.lo \
+ slide_hash_armv6.o slide_hash_armv6.lo \
+
+adler32_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_armv8.o:
+ $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8.lo:
+ $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8_pmull_eor3.o:
+ $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+crc32_armv8_pmull_eor3.lo:
+ $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+slide_hash_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_armv6.o:
+ $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+ $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h
new file mode 100644
index 0000000000..16f5e2c77c
--- /dev/null
+++ b/neozip/arch/arm/acle_intrins.h
@@ -0,0 +1,90 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+# include <arm_acle.h>
+#endif
+
+#ifdef ARM_CRC32
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+# define Z_TARGET_CRC Z_TARGET("+crc")
+#else
+# define Z_TARGET_CRC
+#endif
+#ifdef ARM_PMULL_EOR3
+# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3")
+#else
+# define Z_TARGET_PMULL_EOR3
+#endif
+
+#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER)
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ uint32_t __c;
+ __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#else
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32);
+}
+#endif
+#endif
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+ uint16x2_t __c;
+ __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c
new file mode 100644
index 0000000000..48532e6cd1
--- /dev/null
+++ b/neozip/arch/arm/adler32_neon.c
@@ -0,0 +1,346 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "adler32_p.h"
+
+static const uint16_t ALIGNED_(64) taps[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57,
+ 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1 };
+
+Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += 64;
+
+ /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+ * bit instruction, we'll have to make due summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0);
+ hsum.val[1] = vpaddlq_u8(d1);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0));
+ s2_1 = vaddw_high_u8(s2_1, d0);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d1));
+ s2_3 = vaddw_high_u8(s2_3, d1);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d2));
+ s2_5 = vaddw_high_u8(s2_5, d2);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d3));
+ s2_7 = vaddw_high_u8(s2_7, d3);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ vst1q_u8(dst, d0);
+ dst += 16;
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
+
+ /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+ * bit instruction, we'll have to make due summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+ hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+ s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+ s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ uint32_t pair[2];
+
+ /* Split Adler-32 into component sums, it can be supplied by
+ * the caller sites (e.g. in a PNG file).
+ */
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* If memory is not SIMD aligned, do scalar sums to an aligned
+ * offset, provided that doing so doesn't completely eliminate
+ * SIMD operation. Aligned loads are still faster on ARM, even
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we use fallback to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
+ size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+ size_t n = NMAX_ALIGNED32;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
+
+ if (COPY)
+ dst += align_diff;
+ src += align_diff;
+ len -= align_diff;
+ n = ALIGN_DOWN(n - align_diff, 32);
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ if (COPY)
+ NEON_accum32_copy(pair, dst, src, n >> 4);
+ else
+ NEON_accum32(pair, src, n >> 4);
+
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n >> 4) << 4;
+ src += k;
+ if (COPY)
+ dst += k;
+ len -= k;
+ n = NMAX_ALIGNED32;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return adler32_copy_impl(adler, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ adler = adler32_neon(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+#endif
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c
new file mode 100644
index 0000000000..8f179526ef
--- /dev/null
+++ b/neozip/arch/arm/arm_features.c
@@ -0,0 +1,334 @@
+#ifdef ARM_FEATURES
+
+#include "zbuild.h"
+#include "arm_features.h"
+
+#if defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+# ifdef ARM_ASM_HWCAP
+# include <asm/hwcap.h>
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# ifndef ID_AA64ISAR0_CRC32_VAL
+# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# include <machine/cpu.h>
+# include <sys/sysctl.h>
+# include <sys/types.h>
+#elif defined(__APPLE__)
+# if !defined(_DARWIN_C_SOURCE)
+# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+# endif
+# include <sys/sysctl.h>
+#elif defined(_WIN32)
+# include <windows.h>
+#endif
+
+static int arm_has_crc32(void) {
+ int has_crc32 = 0;
+#if defined(__ARM_FEATURE_CRC32)
+ /* Compile-time check */
+ has_crc32 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_crc32 = (hwcap & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ has_crc32 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#endif
+ return has_crc32;
+}
+
+static int arm_has_pmull(void) {
+ int has_pmull = 0;
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
+ /* Compile-time check */
+ has_pmull = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_AES) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* Check for AES feature as PMULL is part of crypto extension */
+ has_pmull = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ /* Windows checks for crypto/AES support */
+# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE
+ has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_pmull;
+}
+
+static int arm_has_eor3(void) {
+ int has_eor3 = 0;
+#if defined(__ARM_FEATURE_SHA3)
+ /* Compile-time check */
+ has_eor3 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */
+# ifdef HWCAP2_SHA3
+ has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP2_SHA3
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_eor3 = (hwcap & HWCAP_SHA3) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */
+# ifdef ID_AA64ISAR0_SHA3_VAL
+ has_eor3 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_SHA3_BASE;
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# ifdef ID_AA64ISAR0_SHA3
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL;
+ }
+# endif
+#elif defined(__APPLE__)
+ /* All Apple Silicon (M1+) has SHA3/EOR3 support */
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ /* Fallback to legacy name for older macOS versions */
+ if (!has_eor3) {
+ size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ }
+#elif defined(_WIN32)
+# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE
+ has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_eor3;
+}
+
+/* AArch64 has neon. */
+#ifdef ARCH_32BIT
+static inline int arm_has_neon(void) {
+ int has_neon = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+ /* Compile-time check */
+ has_neon = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_ARM_NEON
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+# elif defined(HWCAP_NEON)
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_NEON
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_neon = (hwcap & HWCAP_NEON) != 0;
+# endif
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+ has_neon = 1; /* Always supported */
+# endif
+#endif
+ return has_neon;
+}
+#endif
+
+/* AArch64 does not have ARMv6 SIMD. */
+#ifdef ARCH_32BIT
+static inline int arm_has_simd(void) {
+ int has_simd = 0;
+#if defined(__ARM_FEATURE_SIMD32)
+ /* Compile-time check for ARMv6 SIMD */
+ has_simd = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ const char *platform = (const char *)getauxval(AT_PLATFORM);
+ has_simd = platform
+ && (strncmp(platform, "v6l", 3) == 0
+ || strncmp(platform, "v7l", 3) == 0
+ || strncmp(platform, "v8l", 3) == 0);
+#endif
+ return has_simd;
+}
+#endif
+
+#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32)
+/* MIDR_EL1 bit field definitions */
+#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24)
+#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4)
+
+/* ARM CPU Implementer IDs */
+#define ARM_IMPLEMENTER_ARM 0x41
+#define ARM_IMPLEMENTER_QUALCOMM 0x51
+#define ARM_IMPLEMENTER_APPLE 0x61
+
+/* ARM CPU Part Numbers */
+
+/* Cortex-X series - Multiple PMULL lanes */
+#define ARM_PART_CORTEX_X1 0xd44
+#define ARM_PART_CORTEX_X1C 0xd4c
+#define ARM_PART_CORTEX_X2 0xd48
+#define ARM_PART_CORTEX_X3 0xd4e
+#define ARM_PART_CORTEX_X4 0xd82
+#define ARM_PART_CORTEX_X925 0xd85
+
+/* Neoverse V/N2 series - Multiple PMULL lanes */
+#define ARM_PART_NEOVERSE_N2 0xd49
+#define ARM_PART_NEOVERSE_V1 0xd40
+#define ARM_PART_NEOVERSE_V2 0xd4f
+#define ARM_PART_NEOVERSE_V3 0xd8e
+
+/* Snapdragon X Elite/Plus - Custom core */
+#define QUALCOMM_PART_ORYON 0x001
+
+static inline int arm_has_cpuid(void) {
+ int has_cpuid = 0;
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0;
+# elif defined(HWCAP2_CPUID)
+ has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_cpuid = (hwcap & HWCAP_CPUID) != 0;
+# endif
+#endif
+ return has_cpuid;
+}
+#endif
+
+/* Determine if CPU has fast PMULL (multiple execution units) */
+static inline int arm_cpu_has_fast_pmull(void) {
+ int has_fast_pmull = 0;
+#if defined(__APPLE__)
+ /* On macOS, all Apple Silicon has fast PMULL */
+ has_fast_pmull = 1;
+#elif defined(ARCH_64BIT) && !defined(_WIN32)
+ /* We need CPUID feature to read MIDR register */
+ if (!arm_has_cpuid())
+ return has_fast_pmull;
+
+ uint64_t midr;
+ __asm__ ("mrs %0, midr_el1" : "=r" (midr));
+
+ uint32_t implementer = MIDR_IMPLEMENTOR(midr);
+ uint32_t part = MIDR_PARTNUM(midr);
+
+ if (implementer == ARM_IMPLEMENTER_APPLE) {
+ /* All Apple Silicon (M1+) have fast PMULL */
+ has_fast_pmull = 1;
+ } else if (implementer == ARM_IMPLEMENTER_ARM) {
+ /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */
+ switch (part) {
+ case ARM_PART_CORTEX_X1:
+ case ARM_PART_CORTEX_X1C:
+ case ARM_PART_CORTEX_X2:
+ case ARM_PART_CORTEX_X3:
+ case ARM_PART_CORTEX_X4:
+ case ARM_PART_CORTEX_X925:
+ case ARM_PART_NEOVERSE_N2:
+ case ARM_PART_NEOVERSE_V1:
+ case ARM_PART_NEOVERSE_V2:
+ case ARM_PART_NEOVERSE_V3:
+ has_fast_pmull = 1;
+ }
+ } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) {
+ /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */
+ if (part == QUALCOMM_PART_ORYON)
+ has_fast_pmull = 1;
+ }
+#endif
+ return has_fast_pmull;
+}
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#ifdef ARCH_64BIT
+ features->has_simd = 0; /* never available */
+ features->has_neon = 1; /* always available */
+#else
+ features->has_simd = arm_has_simd();
+ features->has_neon = arm_has_neon();
+#endif
+ features->has_crc32 = arm_has_crc32();
+ features->has_pmull = arm_has_pmull();
+ features->has_eor3 = arm_has_eor3();
+ features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull();
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h
new file mode 100644
index 0000000000..2f17a9ddf0
--- /dev/null
+++ b/neozip/arch/arm/arm_features.h
@@ -0,0 +1,19 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_
+
+/* Runtime-detected ARM CPU capability flags (0 = absent, nonzero = present),
+ * filled in by arm_check_features(). */
+struct arm_cpu_features {
+ int has_simd; /* ARMv6 SIMD32 instructions (32-bit ARM only) */
+ int has_neon; /* NEON/Advanced SIMD vector unit */
+ int has_crc32; /* ARMv8 CRC32 instructions */
+ int has_pmull; /* polynomial multiply (PMULL) */
+ int has_eor3; /* SHA3 extension 3-way XOR (EOR3) */
+ int has_fast_pmull; /* PMULL on multiple execution units (see arm_features.c) */
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_FEATURES_H_ */
diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h
new file mode 100644
index 0000000000..bc77adb977
--- /dev/null
+++ b/neozip/arch/arm/arm_functions.h
@@ -0,0 +1,75 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#include "arm_natives.h"
+
+/* Forward declarations for the ARM-optimized implementations in this directory;
+ * each group is only compiled when the matching ARM_* build flag is set. */
+#ifdef ARM_NEON
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
+void slide_hash_neon(deflate_state *s);
+#endif
+
+#ifdef ARM_CRC32
+uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef ARM_PMULL_EOR3
+uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef ARM_SIMD
+void slide_hash_armv6(deflate_state *s);
+#endif
+
+/* With runtime CPU detection disabled, bind the generic native_* entry points
+ * directly to the best implementation guaranteed at compile time (the *_NATIVE
+ * macros come from arm_natives.h). Later groups override earlier ones, so the
+ * PMULL+EOR3 CRC wins over plain ARMv8 CRC32 when both are native. */
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// ARM - SIMD
+# ifdef ARM_SIMD_NATIVE
+# undef native_slide_hash
+# define native_slide_hash slide_hash_armv6
+# endif
+// ARM - NEON
+# ifdef ARM_NEON_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_neon
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_neon
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
+# undef native_slide_hash
+# define native_slide_hash slide_hash_neon
+# endif
+// ARM - CRC32
+# ifdef ARM_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8
+# endif
+// ARM - PMULL EOR3
+# ifdef ARM_PMULL_EOR3_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8_pmull_eor3
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8_pmull_eor3
+# endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h
new file mode 100644
index 0000000000..311e33e958
--- /dev/null
+++ b/neozip/arch/arm/arm_natives.h
@@ -0,0 +1,31 @@
+/* arm_natives.h -- ARM compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_NATIVES_H_
+#define ARM_NATIVES_H_
+
+/* Each *_NATIVE macro is defined only when BOTH the feature is enabled in
+ * this build (ARM_* flag) AND the compiler guarantees the instructions are
+ * available on the target (__ARM_FEATURE_* predefines), so no runtime
+ * detection is needed for it. */
+#if defined(__ARM_FEATURE_SIMD32)
+# ifdef ARM_SIMD
+# define ARM_SIMD_NATIVE
+# endif
+#endif
+/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT)
+# ifdef ARM_NEON
+# define ARM_NEON_NATIVE
+# endif
+#endif
+/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */
+/* NOTE(review): the 801 comparison relies on the ACLE convention where
+ * __ARM_ARCH encodes major*100+minor (e.g. 801 = v8.1) -- older compilers
+ * that define __ARM_ARCH as just 8 fall back to __ARM_FEATURE_CRC32. */
+#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801)
+# ifdef ARM_CRC32
+# define ARM_CRC32_NATIVE
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)
+# ifdef ARM_PMULL_EOR3
+# define ARM_PMULL_EOR3_NATIVE
+# endif
+#endif
+
+#endif /* ARM_NATIVES_H_ */
diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c
new file mode 100644
index 0000000000..a891f10fa5
--- /dev/null
+++ b/neozip/arch/arm/chunkset_neon.c
@@ -0,0 +1,81 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+#include "neon_intrins.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+/* One "chunk" is a single 128-bit NEON register. */
+typedef uint8x16_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+/* Broadcast a 2-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from)));
+}
+
+/* Broadcast a 4-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from)));
+}
+
+/* Broadcast an 8-byte pattern from 'from' across the whole chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from)));
+}
+
+#define CHUNKSIZE chunksize_neon
+#define CHUNKCOPY chunkcopy_neon
+#define CHUNKUNROLL chunkunroll_neon
+#define CHUNKMEMSET chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+/* Unaligned 16-byte load of a chunk. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = vld1q_u8(s);
+}
+
+/* Unaligned 16-byte store of a chunk. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ vst1q_u8(out, *chunk);
+}
+
+/* Build a chunk that repeats the dist-byte pattern at 'buf' via a permute LUT
+ * (dist in 3..15; smaller power-of-two dists use the chunkmemset_* helpers).
+ * *chunk_rem is set from the LUT's remval -- presumably the leftover pattern
+ * offset for the next chunk; see chunkset_tpl.h for how it is consumed. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 16 - dist);
+
+ /* This version of table is only available on aarch64 */
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+ uint8x16_t ret_vec = vld1q_u8(buf);
+
+ uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
+ return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+ /* 32-bit ARM lacks vqtbl1q: do the table lookup in two 64-bit halves. */
+ uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+ perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+ perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
+ a = vld1_u8(buf);
+ b = vld1_u8(buf + 8);
+ ret0 = vtbl1_u8(a, perm_vec0);
+ uint8x8x2_t ab;
+ ab.val[0] = a;
+ ab.val[1] = b;
+ ret1 = vtbl2_u8(ab, perm_vec1);
+ return vcombine_u8(ret0, ret1);
+#endif
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_neon
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c
new file mode 100644
index 0000000000..4ced9fc9ca
--- /dev/null
+++ b/neozip/arch/arm/compare256_neon.c
@@ -0,0 +1,56 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON)
+#include "neon_intrins.h"
+
+/* Return the number of leading bytes (0..256) that match between src0 and
+ * src1. Both buffers must have at least 256 readable bytes: the loop always
+ * compares 16 bytes per iteration. The first mismatching byte is located via
+ * the trailing-zero count of the XOR of each 64-bit lane (assumes
+ * little-endian lane/byte order -- TODO confirm for big-endian targets). */
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint8x16_t a, b, cmp;
+ uint64_t lane;
+
+ a = vld1q_u8(src0);
+ b = vld1q_u8(src1);
+
+ /* Nonzero bytes in 'cmp' mark positions where the inputs differ. */
+ cmp = veorq_u8(a, b);
+
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
+ len += 8;
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+ if (lane)
+ return len + zng_ctz64(lane) / 8;
+ len += 8;
+
+ src0 += 16, src1 += 16;
+ } while (len < 256);
+
+ return 256;
+}
+
+/* Exported non-inline wrapper used by the function dispatch tables. */
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c
new file mode 100644
index 0000000000..59f2b65009
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8.c
@@ -0,0 +1,81 @@
+/* crc32_armv8.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_CRC32
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Shared body for crc32_armv8 and crc32_copy_armv8: compute CRC-32 with the
+ * ARMv8 CRC instructions, optionally copying src to dst as it goes.
+ * COPY is a compile-time constant; forced inlining lets the compiler drop the
+ * dead copy branches in the non-copy instantiation. Takes/returns the
+ * conventional (complemented) CRC value. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len,
+ const int COPY) {
+ uint32_t c = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ if (COPY)
+ *dst = *src;
+ c = __crc32b(c, *src);
+ return ~c;
+ }
+
+ /* Align to 8-byte boundary for tail processing */
+ uintptr_t align_diff = ALIGN_DIFF(src, 8);
+ if (align_diff)
+ c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY);
+
+ /* Main loop: 64 bytes per iteration; src is 8-byte aligned here, so the
+ * uint64_t loads are aligned. */
+ while (len >= 64) {
+ uint64_t d0 = *(const uint64_t *)src;
+ uint64_t d1 = *(const uint64_t *)(src + 8);
+ uint64_t d2 = *(const uint64_t *)(src + 16);
+ uint64_t d3 = *(const uint64_t *)(src + 24);
+ uint64_t d4 = *(const uint64_t *)(src + 32);
+ uint64_t d5 = *(const uint64_t *)(src + 40);
+ uint64_t d6 = *(const uint64_t *)(src + 48);
+ uint64_t d7 = *(const uint64_t *)(src + 56);
+
+ if (COPY) {
+ memcpy(dst, &d0, 8);
+ memcpy(dst + 8, &d1, 8);
+ memcpy(dst + 16, &d2, 8);
+ memcpy(dst + 24, &d3, 8);
+ memcpy(dst + 32, &d4, 8);
+ memcpy(dst + 40, &d5, 8);
+ memcpy(dst + 48, &d6, 8);
+ memcpy(dst + 56, &d7, 8);
+ dst += 64;
+ }
+
+ c = __crc32d(c, d0);
+ c = __crc32d(c, d1);
+ c = __crc32d(c, d2);
+ c = __crc32d(c, d3);
+ c = __crc32d(c, d4);
+ c = __crc32d(c, d5);
+ c = __crc32d(c, d6);
+ c = __crc32d(c, d7);
+
+ src += 64;
+ len -= 64;
+ }
+
+ /* Tail handles the final <64 bytes and applies the ending complement. */
+ return crc32_armv8_tail(c, dst, src, len, COPY);
+}
+
+/* CRC-32 of buf[0..len) using ARMv8 CRC instructions. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ crc = crc32_armv8(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h
new file mode 100644
index 0000000000..e72c4c0ad1
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_p.h
@@ -0,0 +1,103 @@
+/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CRC32_ARMV8_P_H
+#define CRC32_ARMV8_P_H
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+
+/* Consume up to align_diff leading bytes so *buf reaches the caller's desired
+ * alignment, folding them into the CRC (and copying to *dst when COPY).
+ * Steps are taken smallest-first (1, 2, 4, 8 bytes per the bits of
+ * align_diff), so each wider load below is naturally aligned.
+ * Advances *buf/*dst and decrements *len in place; returns the updated
+ * working CRC (NOT complemented). */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf,
+ size_t *len, uintptr_t align_diff, const int COPY) {
+ if (*len && (align_diff & 1)) {
+ uint8_t val = **buf;
+ if (COPY) {
+ **dst = val;
+ *dst += 1;
+ }
+ crc = __crc32b(crc, val);
+ *buf += 1;
+ *len -= 1;
+ }
+
+ if (*len >= 2 && (align_diff & 2)) {
+ uint16_t val = *((uint16_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 2);
+ *dst += 2;
+ }
+ crc = __crc32h(crc, val);
+ *buf += 2;
+ *len -= 2;
+ }
+
+ if (*len >= 4 && (align_diff & 4)) {
+ uint32_t val = *((uint32_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 4);
+ *dst += 4;
+ }
+ crc = __crc32w(crc, val);
+ *buf += 4;
+ *len -= 4;
+ }
+
+ if (*len >= 8 && (align_diff & 8)) {
+ uint64_t val = *((uint64_t*)*buf);
+ if (COPY) {
+ memcpy(*dst, &val, 8);
+ *dst += 8;
+ }
+ crc = __crc32d(crc, val);
+ *buf += 8;
+ *len -= 8;
+ }
+
+ return crc;
+}
+
+/* Fold the remaining bytes (any length) into the CRC, 8/4/2/1 bytes at a
+ * time, copying to dst when COPY. Takes the working CRC and returns the
+ * FINAL complemented CRC-32 value -- callers must not complement again. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf,
+ size_t len, const int COPY) {
+ while (len >= 8) {
+ uint64_t val = *((uint64_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 8);
+ dst += 8;
+ }
+ crc = __crc32d(crc, val);
+ buf += 8;
+ len -= 8;
+ }
+
+ if (len & 4) {
+ uint32_t val = *((uint32_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 4);
+ dst += 4;
+ }
+ crc = __crc32w(crc, val);
+ buf += 4;
+ }
+
+ if (len & 2) {
+ uint16_t val = *((uint16_t*)buf);
+ if (COPY) {
+ memcpy(dst, &val, 2);
+ dst += 2;
+ }
+ crc = __crc32h(crc, val);
+ buf += 2;
+ }
+
+ if (len & 1) {
+ uint8_t val = *buf;
+ if (COPY)
+ *dst = val;
+ crc = __crc32b(crc, val);
+ }
+
+ return ~crc;
+}
+
+#endif /* CRC32_ARMV8_P_H */
diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
new file mode 100644
index 0000000000..e0d5bf043b
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
@@ -0,0 +1,366 @@
+/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension)
+ * Copyright (C) 2025 Peter Cawley
+ * https://github.com/corsix/fast-crc32
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions.
+ * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter.
+ */
+
+#ifdef ARM_PMULL_EOR3
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "acle_intrins.h"
+#include "neon_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+/* (MSVC lacks the scalar-poly64 overload, hence the lane-extraction variant.) */
+static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
+#ifdef _MSC_VER
+ return vreinterpretq_u64_p128(vmull_p64(
+ vget_low_p64(vreinterpret_p64_u64(a)),
+ vget_low_p64(vreinterpret_p64_u64(b))));
+#else
+ return vreinterpretq_u64_p128(vmull_p64(
+ vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0),
+ vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0)));
+#endif
+}
+
+/* Carryless multiply high 64 bits: a[1] * b[1] */
+static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
+ return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
+}
+
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
+#ifdef _MSC_VER
+ return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b)));
+#else
+ return vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b));
+#endif
+}
+
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+/* Used by crc_shift() to combine independently computed CRC lanes.
+ * Algorithm from corsix/fast-crc32 (see file header): decompose n into
+ * squarings (vmull_p8 self-multiply) plus CRC-instruction reductions. */
+static uint32_t xnmodp(uint64_t n) {
+ uint64_t stack = ~(uint64_t)1;
+ uint32_t acc, low;
+ for (; n > 191; n = (n >> 1) - 16) {
+ stack = (stack << 1) + (n & 1);
+ }
+ stack = ~stack;
+ acc = ((uint32_t)0x80000000) >> (n & 31);
+ for (n >>= 5; n; --n) {
+ /* __crc32w(acc, 0) advances acc by 32 zero bits mod P. */
+ acc = __crc32w(acc, 0);
+ }
+ while ((low = stack & 1), stack >>= 1) {
+ /* Square acc in GF(2)[x] via polynomial self-multiplication. */
+ poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
+ uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
+ acc = __crc32d(0, y << low);
+ }
+ return acc;
+}
+
+/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */
+/* nbytes >= 5 keeps the exponent nbytes*8 - 33 non-negative. */
+static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
+ Assert(nbytes >= 5, "crc_shift requires nbytes >= 5");
+ return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
+}
+
+/* Shared body for the PMULL+EOR3 CRC-32 (optionally copying src to dst).
+ * COPY is a compile-time constant removed by inlining. Phases:
+ * 1) byte-align src to 16 via crc32_armv8_align;
+ * 2) len >= 192: 3 scalar CRC lanes + 9 PMULL-folded vector lanes,
+ * 192 bytes/iteration, then tree-reduce and recombine via crc_shift;
+ * 3) len >= 80: 3-way parallel scalar CRC, 24 bytes/iteration;
+ * 4) crc32_armv8_tail for the remainder (applies the final complement). */
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+ size_t len, const int COPY) {
+ uint32_t crc0 = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ if (COPY)
+ *dst = *src;
+ crc0 = __crc32b(crc0, *src);
+ return ~crc0;
+ }
+
+ /* Align to 16-byte boundary for vector path */
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
+ if (align_diff)
+ crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
+
+ /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
+ if (len >= 192) {
+ size_t blk = len / 192; /* Number of 192-byte blocks */
+ size_t klen = blk * 16; /* Scalar stride per CRC lane */
+ const uint8_t *end = src + len;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + klen;
+ const uint8_t *src2 = src + klen * 2;
+ const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */
+ uint32_t crc1 = 0, crc2 = 0;
+ uint64x2_t vc0, vc1, vc2;
+ uint64_t vc;
+
+ /* Load first 9 vector chunks (144 bytes) */
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
+ uint64x2_t k;
+ /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
+ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
+
+ /* Per-region dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ uint8_t *dst_v = NULL;
+
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ dst_v = dst + klen * 3;
+ vst1q_u8(dst_v, vreinterpretq_u8_u64(x0));
+ vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1));
+ vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2));
+ vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3));
+ vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4));
+ vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5));
+ vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6));
+ vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7));
+ vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8));
+ dst_v += 144;
+ }
+ srcv += 144;
+
+ /* Fold 9 vectors + 3-way parallel scalar CRC */
+ if (blk > 1) {
+ /* Only form a limit pointer when we have at least 2 blocks. */
+ const uint8_t *limit = src0 + klen - 32;
+ while (src0 <= limit) {
+ /* Fold all 9 vector lanes using PMULL */
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+ y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
+
+ /* EOR3: combine hi*k, lo*k, and new data in one instruction */
+ {
+ uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128);
+ uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128);
+ uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128);
+ uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128);
+ uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128);
+ uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128);
+ uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128);
+ uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128);
+ uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128);
+ if (COPY) {
+ vst1q_u8(dst_v, vreinterpretq_u8_u64(d0));
+ vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1));
+ vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2));
+ vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3));
+ vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4));
+ vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5));
+ vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6));
+ vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7));
+ vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8));
+ dst_v += 144;
+ }
+ x0 = veor3q_u64(x0, y0, d0);
+ x1 = veor3q_u64(x1, y1, d1);
+ x2 = veor3q_u64(x2, y2, d2);
+ x3 = veor3q_u64(x3, y3, d3);
+ x4 = veor3q_u64(x4, y4, d4);
+ x5 = veor3q_u64(x5, y5, d5);
+ x6 = veor3q_u64(x6, y6, d6);
+ x7 = veor3q_u64(x7, y7, d7);
+ x8 = veor3q_u64(x8, y8, d8);
+ }
+
+ /* 3-way parallel scalar CRC (16 bytes each) */
+ {
+ uint64_t s0a = *(const uint64_t*)src0;
+ uint64_t s0b = *(const uint64_t*)(src0 + 8);
+ uint64_t s1a = *(const uint64_t*)src1;
+ uint64_t s1b = *(const uint64_t*)(src1 + 8);
+ uint64_t s2a = *(const uint64_t*)src2;
+ uint64_t s2b = *(const uint64_t*)(src2 + 8);
+ if (COPY) {
+ memcpy(dst0, &s0a, 8);
+ memcpy(dst0 + 8, &s0b, 8);
+ dst0 += 16;
+ memcpy(dst1, &s1a, 8);
+ memcpy(dst1 + 8, &s1b, 8);
+ dst1 += 16;
+ memcpy(dst2, &s2a, 8);
+ memcpy(dst2 + 8, &s2b, 8);
+ dst2 += 16;
+ }
+ crc0 = __crc32d(crc0, s0a);
+ crc0 = __crc32d(crc0, s0b);
+ crc1 = __crc32d(crc1, s1a);
+ crc1 = __crc32d(crc1, s1b);
+ crc2 = __crc32d(crc2, s2a);
+ crc2 = __crc32d(crc2, s2b);
+ }
+ src0 += 16;
+ src1 += 16;
+ src2 += 16;
+ srcv += 144;
+ }
+ }
+
+ /* Reduce 9 vectors to 1 using tree reduction */
+ /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
+ { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ x0 = veor3q_u64(x0, y0, x1);
+ x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
+
+ /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ x0 = veor3q_u64(x0, y0, x1);
+ x2 = veor3q_u64(x2, y2, x3);
+ x4 = veor3q_u64(x4, y4, x5);
+ x6 = veor3q_u64(x6, y6, x7);
+
+ /* Step 3: fold pairs (x0,x2), (x4,x6) */
+ { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ x0 = veor3q_u64(x0, y0, x2);
+ x4 = veor3q_u64(x4, y4, x6);
+
+ /* Step 4: final fold (x0, x4) -> x0 */
+ { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ x0 = veor3q_u64(x0, y0, x4);
+
+ /* Process final scalar chunk */
+ {
+ uint64_t s0a = *(const uint64_t*)src0;
+ uint64_t s0b = *(const uint64_t*)(src0 + 8);
+ uint64_t s1a = *(const uint64_t*)src1;
+ uint64_t s1b = *(const uint64_t*)(src1 + 8);
+ uint64_t s2a = *(const uint64_t*)src2;
+ uint64_t s2b = *(const uint64_t*)(src2 + 8);
+ if (COPY) {
+ memcpy(dst0, &s0a, 8);
+ memcpy(dst0 + 8, &s0b, 8);
+ memcpy(dst1, &s1a, 8);
+ memcpy(dst1 + 8, &s1b, 8);
+ memcpy(dst2, &s2a, 8);
+ memcpy(dst2 + 8, &s2b, 8);
+ }
+ crc0 = __crc32d(crc0, s0a);
+ crc0 = __crc32d(crc0, s0b);
+ crc1 = __crc32d(crc1, s1a);
+ crc1 = __crc32d(crc1, s1b);
+ crc2 = __crc32d(crc2, s2a);
+ crc2 = __crc32d(crc2, s2b);
+ }
+
+ /* Shift and combine 3 scalar CRCs */
+ /* Each lane's CRC is advanced by the number of bytes that follow its
+ * region in the stream, then all lanes are XORed together. */
+ vc0 = crc_shift(crc0, klen * 2 + blk * 144);
+ vc1 = crc_shift(crc1, klen + blk * 144);
+ vc2 = crc_shift(crc2, blk * 144);
+ vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
+
+ /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
+ crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
+ crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
+ if (COPY)
+ dst += blk * 192;
+ src = srcv;
+ len = end - srcv;
+ }
+
+ /* 3-way scalar CRC (24 bytes/iter) */
+ if (len >= 80) {
+ size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */
+ const uint8_t *buf0 = src;
+ const uint8_t *buf1 = src + klen;
+ const uint8_t *buf2 = src + klen * 2;
+ uint32_t crc1 = 0, crc2 = 0;
+ uint64x2_t vc0, vc1;
+ uint64_t vc;
+
+ /* Per-lane dst pointers */
+ uint8_t *dst0 = dst;
+ uint8_t *dst1 = NULL;
+ uint8_t *dst2 = NULL;
+ if (COPY) {
+ dst1 = dst + klen;
+ dst2 = dst + klen * 2;
+ }
+
+ /* 3-way parallel scalar CRC */
+ do {
+ uint64_t v0 = *(const uint64_t*)buf0;
+ uint64_t v1 = *(const uint64_t*)buf1;
+ uint64_t v2 = *(const uint64_t*)buf2;
+ if (COPY) {
+ memcpy(dst0, &v0, 8);
+ dst0 += 8;
+ memcpy(dst1, &v1, 8);
+ dst1 += 8;
+ memcpy(dst2, &v2, 8);
+ dst2 += 8;
+ }
+ crc0 = __crc32d(crc0, v0);
+ crc1 = __crc32d(crc1, v1);
+ crc2 = __crc32d(crc2, v2);
+ buf0 += 8;
+ buf1 += 8;
+ buf2 += 8;
+ len -= 24;
+ } while (len >= 32);
+
+ /* Combine the 3 CRCs */
+ vc0 = crc_shift(crc0, klen * 2 + 8);
+ vc1 = crc_shift(crc1, klen + 8);
+ vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
+
+ /* Process final 8 bytes with combined CRC */
+ crc0 = crc2;
+ {
+ uint64_t vf = *(const uint64_t*)buf2;
+ if (COPY)
+ memcpy(dst2, &vf, 8);
+ crc0 = __crc32d(crc0, vf ^ vc);
+ }
+ src = buf2 + 8;
+ len -= 8;
+ if (COPY)
+ dst = dst2 + 8;
+ }
+
+ /* Process remaining bytes */
+ return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+/* CRC-32 of buf[0..len) using the PMULL+EOR3 kernel. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ crc = crc32_armv8_pmull_eor3(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h
new file mode 100644
index 0000000000..449916e0b7
--- /dev/null
+++ b/neozip/arch/arm/neon_intrins.h
@@ -0,0 +1,79 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT)
+/* arm64_neon.h is MSVC specific */
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+/* Saturating-subtract b from each of the four vectors in a (used by the
+ * NEON slide_hash). do/while avoids dangling-statement macro pitfalls. */
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+ out.val[0] = vqsubq_u16(a.val[0], b); \
+ out.val[1] = vqsubq_u16(a.val[1], b); \
+ out.val[2] = vqsubq_u16(a.val[2], b); \
+ out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \
+ (!defined(__clang_major__) || __clang_major__ < 20)
+/* Clang versions before 20 have too strict of an
+ * alignment requirement (:256) for x4 NEON intrinsics */
+# undef ARM_NEON_HASLD4
+# undef vld1q_u16_x4
+# undef vld1q_u8_x4
+# undef vst1q_u16_x4
+# endif
+
+# ifndef ARM_NEON_HASLD4
+
+/* Fallbacks for the x4 load/store intrinsics on toolchains without LD4/ST4
+ * support: four plain 128-bit loads/stores. */
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+ uint16x8x4_t ret;
+ ret.val[0] = vld1q_u16(a);
+ ret.val[1] = vld1q_u16(a+8);
+ ret.val[2] = vld1q_u16(a+16);
+ ret.val[3] = vld1q_u16(a+24);
+ return ret;
+}
+
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+ uint8x16x4_t ret;
+ ret.val[0] = vld1q_u8(a);
+ ret.val[1] = vld1q_u8(a+16);
+ ret.val[2] = vld1q_u8(a+32);
+ ret.val[3] = vld1q_u8(a+48);
+ return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+ vst1q_u16(p, a.val[0]);
+ vst1q_u16(p + 8, a.val[1]);
+ vst1q_u16(p + 16, a.val[2]);
+ vst1q_u16(p + 24, a.val[3]);
+}
+# endif // HASLD4 check
+
+/* *_ex variants: loads/stores with an alignment hint in bits (e.g. 128).
+ * NOTE(review): guarded out on MSVC -- presumably MSVC ships its own *_ex
+ * intrinsics; confirm before extending this list. */
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c
new file mode 100644
index 0000000000..b241e6c5e6
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,49 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_SIMD
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* Subtract wsize from every 16-bit entry with unsigned saturation (UQSUB16),
+ * so entries older than the window clamp to 0 instead of wrapping.
+ * Processes 8 entries (four 2x16-bit words) per iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ Z_REGISTER uint16x2_t v;
+ uint16x2_t p0, p1, p2, p3;
+ Z_REGISTER size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+ /* Replicate wsize into both 16-bit halves of the 32-bit SIMD word. */
+ v = wsize | (wsize << 16);
+
+ n = size / (sizeof(uint16x2_t) * 4);
+ do {
+ p0 = *((const uint16x2_t *)(table));
+ p1 = *((const uint16x2_t *)(table+2));
+ p2 = *((const uint16x2_t *)(table+4));
+ p3 = *((const uint16x2_t *)(table+6));
+ p0 = __uqsub16(p0, v);
+ p1 = __uqsub16(p1, v);
+ p2 = __uqsub16(p2, v);
+ p3 = __uqsub16(p3, v);
+ *((uint16x2_t *)(table)) = p0;
+ *((uint16x2_t *)(table+2)) = p1;
+ *((uint16x2_t *)(table+4)) = p2;
+ *((uint16x2_t *)(table+6)) = p3;
+ table += 8;
+ } while (--n);
+}
+
+/* Rebase both deflate hash tables after the window slides by w_size. */
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c
new file mode 100644
index 0000000000..2f9e94a33d
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_neon.c
@@ -0,0 +1,48 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* Subtract wsize from every 16-bit entry with unsigned saturation
+ * (vqsubq_u16), so stale entries clamp to 0 instead of wrapping.
+ * Processes 64 entries (two x4 groups of 128-bit vectors) per iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ Z_REGISTER uint16x8_t v;
+ uint16x8x4_t p0, p1;
+ Z_REGISTER size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ /* NOTE(review): precedence makes this (size % 16) * 8 == 0, i.e. a 16-byte
+ * check; likely intended size % (sizeof(uint16x8_t) * 8) == 0 as written
+ * with parentheses in slide_hash_armv6.c -- the two tests accept different
+ * sizes, confirm which was meant. */
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+ v = vdupq_n_u16(wsize);
+
+ n = size / (sizeof(uint16x8_t) * 8);
+ do {
+ p0 = vld1q_u16_x4_ex(table, 256);
+ p1 = vld1q_u16_x4_ex(table+32, 256);
+ vqsubq_u16_x4_x1(p0, p0, v);
+ vqsubq_u16_x4_x1(p1, p1, v);
+ vst1q_u16_x4_ex(table, p0, 256);
+ vst1q_u16_x4_ex(table+32, p1, 256);
+ table += 64;
+ } while (--n);
+}
+
+/* Rebase both deflate hash tables after the window slides by w_size. */
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/generic/Makefile.in b/neozip/arch/generic/Makefile.in
new file mode 100644
index 0000000000..1d9cc4df5b
--- /dev/null
+++ b/neozip/arch/generic/Makefile.in
@@ -0,0 +1,68 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# Toolchain variables are filled in by the configure step.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# Build both regular objects (.o) and shared/PIC objects (.lo, compiled
+# with SFLAGS) for every generic (portable C) kernel.
+all: \
+	adler32_c.o adler32_c.lo \
+	chunkset_c.o chunkset_c.lo \
+	compare256_c.o compare256_c.lo \
+	crc32_braid_c.o crc32_braid_c.lo \
+	crc32_chorba_c.o crc32_chorba_c.lo \
+	slide_hash_c.o slide_hash_c.lo
+
+
+adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+
+crc32_chorba_c.lo: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+
+slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/generic/adler32_c.c b/neozip/arch/generic/adler32_c.c
new file mode 100644
index 0000000000..84c946f452
--- /dev/null
+++ b/neozip/arch/generic/adler32_c.c
@@ -0,0 +1,55 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* Scalar Adler-32 over len bytes of buf. Keeps the two component sums
+ * separately (low half in `adler`, high half in `sum2`), unrolls 8 or 16
+ * byte-sums per inner iteration, and defers the modulo to once per
+ * NMAX-byte block; the tail is finished by adler32_copy_tail. */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16; /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8; /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            ADLER_DO16(adler, sum2, buf); /* 16 sums unrolled */
+            buf += 16;
+#else
+            ADLER_DO8(adler, sum2, buf, 0); /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0);
+}
+
+/* Checksum-and-copy variant: computes the checksum with the functable's
+ * adler32 implementation, then copies src to dst. */
+Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = FUNCTABLE_CALL(adler32)(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
diff --git a/neozip/arch/generic/chunk_128bit_perm_idx_lut.h b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
new file mode 100644
index 0000000000..6e5098bf26
--- /dev/null
+++ b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
@@ -0,0 +1,26 @@
+/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_
+#define CHUNK_128BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
+/* Indexed by (dist - 3): each entry gives a byte offset into permute_table
+ * plus a remainder value consumed by the chunkmemset code. Entries marked
+ * "don't care" correspond to dists that are never looked up here
+ * (presumably handled without permutation -- see chunk_permute_table.h). */
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_256bit_perm_idx_lut.h b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
new file mode 100644
index 0000000000..796a7df120
--- /dev/null
+++ b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
@@ -0,0 +1,47 @@
+/* chunk_256bit_perm_idx_lut.h - shared AVX512/AVX2/LASX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CHUNK_256BIT_PERM_IDX_LUT_H_
+#define CHUNK_256BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6, 9},  /* 23 */
+    {11 * 32 + 16 * 7, 8},  /* 24 */
+    {11 * 32 + 16 * 8, 7},  /* 25 */
+    {11 * 32 + 16 * 9, 6},  /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+/* Remainder values for the half-register (128-bit) path; indexed by
+ * (dist - 3) like perm_idx_lut -- presumably consumed by the templated
+ * chunkmemset code (not visible here); confirm against chunkset_tpl.h. */
+static const uint16_t half_rem_vals[13] = {
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_permute_table.h b/neozip/arch/generic/chunk_permute_table.h
new file mode 100644
index 0000000000..bad66ccc77
--- /dev/null
+++ b/neozip/arch/generic/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
+/* Rows for dist 3..15 are 32 bytes (a full 256-bit shuffle mask); rows for
+ * dist 17..31 are 16 bytes and use absolute indices >= 16 so the consumer
+ * can derive a blend mask, as explained in the comment below. */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
+     * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
+     * this is what we're dealt.
+     */
+
+    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
+/* Pairs a byte offset into permute_table with a remainder value used by the
+ * chunkmemset implementations that include this header. */
+typedef struct lut_rem_pair_s {
+    uint16_t idx;    /* byte offset into permute_table for this dist */
+    uint16_t remval; /* remainder value -- semantics defined by the consumer */
+} lut_rem_pair;
+
+#endif
diff --git a/neozip/arch/generic/chunkset_c.c b/neozip/arch/generic/chunkset_c.c
new file mode 100644
index 0000000000..ff9b1cb5fb
--- /dev/null
+++ b/neozip/arch/generic/chunkset_c.c
@@ -0,0 +1,40 @@
+/* chunkset.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+/* Generic (portable C) chunk: a single 64-bit word. */
+typedef uint64_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Broadcast a 4-byte pattern into both 32-bit halves of the chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp = zng_memread_4(from);
+    *chunk = tmp | ((chunk_t)tmp << 32);
+}
+
+/* An 8-byte pattern fills the chunk exactly. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = zng_memread_8(from);
+}
+
+/* Load one chunk from memory (unaligned-safe via zng_memread_8). */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = zng_memread_8(s);
+}
+
+/* Store one chunk to memory (unaligned-safe via zng_memwrite_8). */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    zng_memwrite_8(out, *chunk);
+}
+
+/* Names under which the templates instantiate the generic C kernels. */
+#define CHUNKSIZE chunksize_c
+#define CHUNKCOPY chunkcopy_c
+#define CHUNKUNROLL chunkunroll_c
+#define CHUNKMEMSET chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_c
+
+#include "inffast_tpl.h"
diff --git a/neozip/arch/generic/compare256_c.c b/neozip/arch/generic/compare256_c.c
new file mode 100644
index 0000000000..6934a55565
--- /dev/null
+++ b/neozip/arch/generic/compare256_c.c
@@ -0,0 +1,88 @@
+/* compare256.c -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* 8-bit integer comparison for hardware without unaligned loads */
+/* Returns the length (0..256) of the common prefix of src0 and src1,
+ * comparing one byte at a time in groups of eight. */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (src0[0] != src1[0])
+            return len;
+        if (src0[1] != src1[1])
+            return len + 1;
+        if (src0[2] != src1[2])
+            return len + 2;
+        if (src0[3] != src1[3])
+            return len + 3;
+        if (src0[4] != src1[4])
+            return len + 4;
+        if (src0[5] != src1[5])
+            return len + 5;
+        if (src0[6] != src1[6])
+            return len + 6;
+        if (src0[7] != src1[7])
+            return len + 7;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* 64-bit integer comparison for hardware with unaligned loads */
+/* Returns the length (0..256) of the common prefix of src0 and src1.
+ * XOR of two 8-byte words is nonzero iff they differ; converting the
+ * difference to little-endian makes the first differing byte occupy the
+ * lowest-order bits, so ctz/8 yields its byte index within the word. */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint64_t sv = zng_memread_8(src0);
+        uint64_t mv = zng_memread_8(src1);
+        uint64_t diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+
+        /* Unrolled second iteration: 16 bytes checked per loop pass. */
+        sv = zng_memread_8(src0);
+        mv = zng_memread_8(src1);
+        diff = sv ^ mv;
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Select the widest comparison the target's OPTIMAL_CMP setting allows. */
+#if OPTIMAL_CMP == 8
+#  define COMPARE256 compare256_8_static
+#else
+#  define COMPARE256 compare256_64_static
+#endif
+
+/* Exported fallback entry points (only built when all fallbacks are wanted). */
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_64_static(src0, src1);
+}
+#endif
+
+/* Generic entry point dispatching to the selected static implementation. */
+Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
+    return COMPARE256(src0, src1);
+}
+
+// Generate longest_match_c
+#define LONGEST_MATCH longest_match_c
+#include "match_tpl.h"
+
+// Generate longest_match_slow_c
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_c
+#include "match_tpl.h"
diff --git a/neozip/arch/generic/compare256_p.h b/neozip/arch/generic/compare256_p.h
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/neozip/arch/generic/compare256_p.h
diff --git a/neozip/arch/generic/crc32_braid_c.c b/neozip/arch/generic/crc32_braid_c.c
new file mode 100644
index 0000000000..bda4a249bb
--- /dev/null
+++ b/neozip/arch/generic/crc32_braid_c.c
@@ -0,0 +1,213 @@
+/* crc32_braid.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+
+/*
+ A CRC of a message is computed on BRAID_N braids of words in the message, where
+ each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
+ three running sparse CRCs are calculated respectively on each braid, at these
+ indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
+ This is done starting at a word boundary, and continues until as many blocks of
+ BRAID_N * BRAID_W bytes as are available have been processed. The results are
+ combined into a single CRC at the end. For this code, BRAID_N must be in the
+ range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
+ if desired by adding more #if blocks, extending the patterns apparent in the code.
+ In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N
+ value is increased.
+
+ BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
+ on a given processor. The choices for BRAID_N and BRAID_W below were based on
+ testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
+ POWER9, and MIPS64 Octeon II processors.
+ The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
+ The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
+ They were all tested with either gcc or clang, all using the -O3 optimization
+ level. Your mileage may vary.
+*/
+
+/* ========================================================================= */
+#ifdef BRAID_W
+/*
+  Return the CRC of the BRAID_W bytes in the word_t data, taking the
+  least-significant byte of the word as the first byte of data, without any pre
+  or post conditioning. This is used to combine the CRCs of each braid.
+ */
+# if BYTE_ORDER == LITTLE_ENDIAN
+/* Little-endian: fold bytes out of the low end of the word. */
+static uint32_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data >> 8) ^ crc_table[data & 0xff];
+    return (uint32_t)data;
+}
+# elif BYTE_ORDER == BIG_ENDIAN
+/* Big-endian: fold bytes out of the high end, using the big-endian table. */
+static z_word_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data << 8) ^
+            crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
+    return data;
+}
+# endif /* BYTE_ORDER */
+#endif /* BRAID_W */
+
+/* ========================================================================= */
+/* Compute the CRC-32 of buf[0..len-1], continuing from crc.
+ * Aligns to a z_word_t boundary, then runs BRAID_N interleaved per-braid
+ * CRCs over blocks of BRAID_N*BRAID_W bytes, combines them with crc_word,
+ * and finishes any tail with crc32_copy_small. */
+Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* Pre-condition the CRC register (inverted, zlib-style). */
+    crc = ~crc;
+
+#ifdef BRAID_W
+    /* If provided enough bytes, do a braided CRC calculation. */
+    if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
+        size_t blks;
+        z_word_t const *words;
+        int k;
+
+        /* Compute the CRC up to a z_word_t boundary. */
+        size_t align_diff = (size_t)MIN(ALIGN_DIFF(buf, BRAID_W), len);
+        if (align_diff) {
+            crc = crc32_copy_small(crc, NULL, buf, align_diff, BRAID_W - 1, 0);
+            len -= align_diff;
+            buf += align_diff;
+        }
+
+        /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */
+        blks = len / (BRAID_N * BRAID_W);
+        len -= blks * BRAID_N * BRAID_W;
+        words = (z_word_t const *)buf;
+
+        z_word_t crc0, word0, comb;
+#if BRAID_N > 1
+        z_word_t crc1, word1;
+#if BRAID_N > 2
+        z_word_t crc2, word2;
+#if BRAID_N > 3
+        z_word_t crc3, word3;
+#if BRAID_N > 4
+        z_word_t crc4, word4;
+#if BRAID_N > 5
+        z_word_t crc5, word5;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Initialize the CRC for each braid. */
+        crc0 = Z_WORD_FROM_LE(crc);
+#if BRAID_N > 1
+        crc1 = 0;
+#if BRAID_N > 2
+        crc2 = 0;
+#if BRAID_N > 3
+        crc3 = 0;
+#if BRAID_N > 4
+        crc4 = 0;
+#if BRAID_N > 5
+        crc5 = 0;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
+        while (--blks) {
+            /* Load the word for each braid into registers. */
+            word0 = crc0 ^ words[0];
+#if BRAID_N > 1
+            word1 = crc1 ^ words[1];
+#if BRAID_N > 2
+            word2 = crc2 ^ words[2];
+#if BRAID_N > 3
+            word3 = crc3 ^ words[3];
+#if BRAID_N > 4
+            word4 = crc4 ^ words[4];
+#if BRAID_N > 5
+            word5 = crc5 ^ words[5];
+#endif
+#endif
+#endif
+#endif
+#endif
+            words += BRAID_N;
+
+            /* Compute and update the CRC for each word. The loop should get unrolled. */
+            crc0 = BRAID_TABLE[0][word0 & 0xff];
+#if BRAID_N > 1
+            crc1 = BRAID_TABLE[0][word1 & 0xff];
+#if BRAID_N > 2
+            crc2 = BRAID_TABLE[0][word2 & 0xff];
+#if BRAID_N > 3
+            crc3 = BRAID_TABLE[0][word3 & 0xff];
+#if BRAID_N > 4
+            crc4 = BRAID_TABLE[0][word4 & 0xff];
+#if BRAID_N > 5
+            crc5 = BRAID_TABLE[0][word5 & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            for (k = 1; k < BRAID_W; k++) {
+                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
+#if BRAID_N > 1
+                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
+#if BRAID_N > 2
+                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
+#if BRAID_N > 3
+                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
+#if BRAID_N > 4
+                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
+#if BRAID_N > 5
+                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            }
+        }
+
+        /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
+        comb = crc_word(crc0 ^ words[0]);
+#if BRAID_N > 1
+        comb = crc_word(crc1 ^ words[1] ^ comb);
+#if BRAID_N > 2
+        comb = crc_word(crc2 ^ words[2] ^ comb);
+#if BRAID_N > 3
+        comb = crc_word(crc3 ^ words[3] ^ comb);
+#if BRAID_N > 4
+        comb = crc_word(crc4 ^ words[4] ^ comb);
+#if BRAID_N > 5
+        comb = crc_word(crc5 ^ words[5] ^ comb);
+#endif
+#endif
+#endif
+#endif
+#endif
+        words += BRAID_N;
+        Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
+        crc = (uint32_t)Z_WORD_FROM_LE(comb);
+
+        /* Update the pointer to the remaining bytes to process. */
+        buf = (const unsigned char *)words;
+    }
+
+#endif /* BRAID_W */
+
+    /* Complete the computation of the CRC on any remaining bytes. */
+    /* NOTE(review): BRAID_N and BRAID_W are referenced here outside the
+     * #ifdef BRAID_W guard above -- confirm both are always defined (e.g. in
+     * crc32_braid_p.h) even when the braided path is compiled out. */
+    return ~crc32_copy_small(crc, NULL, buf, len, (BRAID_N * BRAID_W) - 1, 0);
+}
+
+/* Checksum-and-copy variant: computes the braided CRC over src, then
+ * copies src to dst. */
+Z_INTERNAL uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_braid(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
diff --git a/neozip/arch/generic/crc32_chorba_c.c b/neozip/arch/generic/crc32_chorba_c.c
new file mode 100644
index 0000000000..693972da11
--- /dev/null
+++ b/neozip/arch/generic/crc32_chorba_c.c
@@ -0,0 +1,1275 @@
+#include "zbuild.h"
+#include "zendian.h"
+#if defined(__EMSCRIPTEN__)
+# include "zutil_p.h"
+#endif
+#include "zmemory.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "generic_functions.h"
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Scratch buffer: 16K chorba words, expressed also in chorba words and in
+ * 64-bit quadwords for the different access paths below. */
+#define bitbuffer_size_bytes (16 * 1024 * sizeof(chorba_word_t))
+#define bitbuffer_size_zwords (bitbuffer_size_bytes / sizeof(chorba_word_t))
+#define bitbuffer_size_qwords (bitbuffer_size_bytes / sizeof(uint64_t))
+
+/* When the chorba word is not 64-bit, reading the word buffer through
+ * uint64_t* would violate strict aliasing; GCC's may_alias attribute makes
+ * such accesses well-defined. Plain uint64_t otherwise. */
+#if defined(HAVE_MAY_ALIAS) && CHORBA_W != 8
+    typedef uint64_t __attribute__ ((__may_alias__)) uint64a_t;
+#else
+    typedef uint64_t uint64a_t;
+#endif
+
+/**
+ * Implements the Chorba algorithm for CRC32 computation (https://arxiv.org/abs/2412.16398).
+ *
+ * This implementation processes data in three phases:
+ * 1. Initial pass: Zeros out bitbuffer
+ * 2. Intermediate pass: Processes half the values
+ * 3. Main pass: Processes remaining data
+ *
+ * @param crc Initial CRC value
+ * @param input Input data buffer
+ * @param len Length of input data
+ * @return Computed CRC32 value
+ *
+ * @note Requires minimum input size of 118960 + 512 bytes
+ * @note Uses 128KB temporary buffer
+ */
+Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+#if defined(__EMSCRIPTEN__)
+ chorba_word_t *bitbuffer = (chorba_word_t*)zng_alloc(bitbuffer_size_bytes);
+#else
+ ALIGNED_(16) chorba_word_t bitbuffer[bitbuffer_size_zwords];
+#endif
+ const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+ uint64a_t *bitbuffer_qwords = (uint64a_t*)bitbuffer;
+ /* The calling function ensured that this is aligned correctly */
+ const chorba_word_t* input = (const chorba_word_t*)buf;
+ const uint64a_t* input_qwords = (const uint64a_t*)buf;
+
+ size_t i = 0;
+
+ chorba_word_t next1 = CHORBA_WORD_FROM_LE(~crc);
+
+ chorba_word_t next2 = 0;
+ chorba_word_t next3 = 0;
+ chorba_word_t next4 = 0;
+ chorba_word_t next5 = 0;
+ chorba_word_t next6 = 0;
+ chorba_word_t next7 = 0;
+ chorba_word_t next8 = 0;
+ chorba_word_t next9 = 0;
+ chorba_word_t next10 = 0;
+ chorba_word_t next11 = 0;
+ chorba_word_t next12 = 0;
+ chorba_word_t next13 = 0;
+ chorba_word_t next14 = 0;
+ chorba_word_t next15 = 0;
+ chorba_word_t next16 = 0;
+ chorba_word_t next17 = 0;
+ chorba_word_t next18 = 0;
+ chorba_word_t next19 = 0;
+ chorba_word_t next20 = 0;
+ chorba_word_t next21 = 0;
+ chorba_word_t next22 = 0;
+ crc = 0;
+
+ // do a first pass to zero out bitbuffer
+ for (; i < (14848 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16;
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17;
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18;
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19;
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20;
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21;
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22;
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23;
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24;
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25;
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ // one intermediate pass where we pull half the values
+ for (; i < (14880 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ for (; (i + (14870 + 64) * sizeof(chorba_word_t)) < len; i += (32 * sizeof(chorba_word_t))) {
+ chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+ chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+ chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+ chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+ int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+ int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+ int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+ in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1 ^ bitbuffer[in_offset + 0];
+ in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2 ^ bitbuffer[in_offset + 1];
+ in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3 ^ bitbuffer[in_offset + 2];
+ in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4 ^ bitbuffer[in_offset + 3];
+ in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5 ^ bitbuffer[in_offset + 4];
+ in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6 ^ bitbuffer[in_offset + 5];
+ in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7 ^ bitbuffer[in_offset + 6];
+ in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1 ^ bitbuffer[in_offset + 7];
+ in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2 ^ bitbuffer[in_offset + 8];
+ in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3 ^ bitbuffer[in_offset + 9];
+ in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4 ^ bitbuffer[in_offset + 10];
+ in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5 ^ bitbuffer[in_offset + 11];
+ in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6 ^ bitbuffer[in_offset + 12];
+ in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7 ^ bitbuffer[in_offset + 13];
+ in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8 ^ bitbuffer[in_offset + 14];
+ in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9 ^ bitbuffer[in_offset + 15];
+ in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10 ^ bitbuffer[in_offset + 16];
+ in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11 ^ bitbuffer[in_offset + 17];
+ in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12 ^ bitbuffer[in_offset + 18];
+ in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13 ^ bitbuffer[in_offset + 19];
+ in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14 ^ bitbuffer[in_offset + 20];
+ in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15 ^ bitbuffer[in_offset + 21];
+ in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+ in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+ in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+ in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+ in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+ in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+ in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+ in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+ in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+ in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+ next1 = in11 ^ in22 ^ in26;
+ next2 = in12 ^ in23 ^ in27;
+ next3 = in13 ^ in24 ^ in28;
+ next4 = in14 ^ in25 ^ in29;
+ next5 = in15 ^ in26 ^ in30;
+ next6 = in16 ^ in27 ^ in31;
+ next7 = in17 ^ in28 ^ in32;
+ next8 = in18 ^ in29;
+ next9 = in19 ^ in30;
+ next10 = in20 ^ in31;
+ next11 = in21 ^ in32;
+ next12 = in22;
+ next13 = in23;
+ next14 = in24;
+ next15 = in25;
+ next16 = in26;
+ next17 = in27;
+ next18 = in28;
+ next19 = in29;
+ next20 = in30;
+ next21 = in31;
+ next22 = in32;
+
+ bitbuffer[out_offset1 + 22] = in1;
+ bitbuffer[out_offset1 + 23] = in2;
+ bitbuffer[out_offset1 + 24] = in3;
+ bitbuffer[out_offset1 + 25] = in4;
+ bitbuffer[out_offset1 + 26] = in5;
+ bitbuffer[out_offset1 + 27] = in6;
+ bitbuffer[out_offset1 + 28] = in7;
+ bitbuffer[out_offset1 + 29] = in8;
+ bitbuffer[out_offset1 + 30] = in9;
+ bitbuffer[out_offset1 + 31] = in10;
+ bitbuffer[out_offset2 + 0] = in11;
+ bitbuffer[out_offset2 + 1] = in12;
+ bitbuffer[out_offset2 + 2] = in13;
+ bitbuffer[out_offset2 + 3] = in14;
+ bitbuffer[out_offset2 + 4] = in15;
+ bitbuffer[out_offset2 + 5] = in16;
+ bitbuffer[out_offset2 + 6] = in17;
+ bitbuffer[out_offset2 + 7] = in18;
+ bitbuffer[out_offset2 + 8] = in19;
+ bitbuffer[out_offset2 + 9] = in20;
+ bitbuffer[out_offset2 + 10] = in21;
+ bitbuffer[out_offset2 + 11] = in22;
+ bitbuffer[out_offset2 + 12] = in23;
+ bitbuffer[out_offset2 + 13] = in24;
+ bitbuffer[out_offset2 + 14] = in25;
+ bitbuffer[out_offset2 + 15] = in26;
+ bitbuffer[out_offset2 + 16] = in27;
+ bitbuffer[out_offset2 + 17] = in28;
+ bitbuffer[out_offset2 + 18] = in29;
+ bitbuffer[out_offset2 + 19] = in30;
+ bitbuffer[out_offset2 + 20] = in31;
+ bitbuffer[out_offset2 + 21] = in32;
+ }
+
+ bitbuffer[(i / sizeof(chorba_word_t) + 0) % bitbuffer_size_zwords] ^= next1;
+ bitbuffer[(i / sizeof(chorba_word_t) + 1) % bitbuffer_size_zwords] ^= next2;
+ bitbuffer[(i / sizeof(chorba_word_t) + 2) % bitbuffer_size_zwords] ^= next3;
+ bitbuffer[(i / sizeof(chorba_word_t) + 3) % bitbuffer_size_zwords] ^= next4;
+ bitbuffer[(i / sizeof(chorba_word_t) + 4) % bitbuffer_size_zwords] ^= next5;
+ bitbuffer[(i / sizeof(chorba_word_t) + 5) % bitbuffer_size_zwords] ^= next6;
+ bitbuffer[(i / sizeof(chorba_word_t) + 6) % bitbuffer_size_zwords] ^= next7;
+ bitbuffer[(i / sizeof(chorba_word_t) + 7) % bitbuffer_size_zwords] ^= next8;
+ bitbuffer[(i / sizeof(chorba_word_t) + 8) % bitbuffer_size_zwords] ^= next9;
+ bitbuffer[(i / sizeof(chorba_word_t) + 9) % bitbuffer_size_zwords] ^= next10;
+ bitbuffer[(i / sizeof(chorba_word_t) + 10) % bitbuffer_size_zwords] ^= next11;
+ bitbuffer[(i / sizeof(chorba_word_t) + 11) % bitbuffer_size_zwords] ^= next12;
+ bitbuffer[(i / sizeof(chorba_word_t) + 12) % bitbuffer_size_zwords] ^= next13;
+ bitbuffer[(i / sizeof(chorba_word_t) + 13) % bitbuffer_size_zwords] ^= next14;
+ bitbuffer[(i / sizeof(chorba_word_t) + 14) % bitbuffer_size_zwords] ^= next15;
+ bitbuffer[(i / sizeof(chorba_word_t) + 15) % bitbuffer_size_zwords] ^= next16;
+ bitbuffer[(i / sizeof(chorba_word_t) + 16) % bitbuffer_size_zwords] ^= next17;
+ bitbuffer[(i / sizeof(chorba_word_t) + 17) % bitbuffer_size_zwords] ^= next18;
+ bitbuffer[(i / sizeof(chorba_word_t) + 18) % bitbuffer_size_zwords] ^= next19;
+ bitbuffer[(i / sizeof(chorba_word_t) + 19) % bitbuffer_size_zwords] ^= next20;
+ bitbuffer[(i / sizeof(chorba_word_t) + 20) % bitbuffer_size_zwords] ^= next21;
+ bitbuffer[(i / sizeof(chorba_word_t) + 21) % bitbuffer_size_zwords] ^= next22;
+
+ for (int j = 14870; j < 14870 + 64; j++) {
+ bitbuffer[(j + (i / sizeof(chorba_word_t))) % bitbuffer_size_zwords] = 0;
+ }
+
+ uint64_t next1_64 = 0;
+ uint64_t next2_64 = 0;
+ uint64_t next3_64 = 0;
+ uint64_t next4_64 = 0;
+ uint64_t next5_64 = 0;
+ uint64_t final[9] = {0};
+
+ for (; (i + 72 < len); i += 32) {
+ uint64_t in1;
+ uint64_t in2;
+ uint64_t in3;
+ uint64_t in4;
+ uint64_t a1, a2, a3, a4;
+ uint64_t b1, b2, b3, b4;
+ uint64_t c1, c2, c3, c4;
+ uint64_t d1, d2, d3, d4;
+
+ uint64_t out1;
+ uint64_t out2;
+ uint64_t out3;
+ uint64_t out4;
+ uint64_t out5;
+
+ in1 = input_qwords[i / sizeof(uint64_t)] ^ bitbuffer_qwords[(i / sizeof(uint64_t)) % bitbuffer_size_qwords];
+ in2 = input_qwords[i / sizeof(uint64_t) + 1] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 1) % bitbuffer_size_qwords];
+ in1 = Z_U64_FROM_LE(in1) ^ next1_64;
+ in2 = Z_U64_FROM_LE(in2) ^ next2_64;
+
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+
+ in3 = input_qwords[i / sizeof(uint64_t) + 2] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 2) % bitbuffer_size_qwords];
+ in4 = input_qwords[i / sizeof(uint64_t) + 3] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 3) % bitbuffer_size_qwords];
+ in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
+ in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1_64 = next5_64 ^ out1;
+ next2_64 = out2;
+ next3_64 = out3;
+ next4_64 = out4;
+ next5_64 = out5;
+
+ }
+
+ memcpy(final, input_qwords + (i / sizeof(uint64_t)), len-i);
+ final[0] ^= Z_U64_TO_LE(next1_64);
+ final[1] ^= Z_U64_TO_LE(next2_64);
+ final[2] ^= Z_U64_TO_LE(next3_64);
+ final[3] ^= Z_U64_TO_LE(next4_64);
+ final[4] ^= Z_U64_TO_LE(next5_64);
+
+ uint8_t *final_bytes = (uint8_t*)final;
+
+ for (size_t j = 0; j < (len-i); j++) {
+ crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i) % bitbuffer_size_bytes]) & 0xff] ^ (crc >> 8);
+ }
+
+#if defined(__EMSCRIPTEN__)
+ zng_free(bitbuffer);
+#endif
+ return ~crc;
+}
+
+# if CHORBA_W == 8
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Mid-size variant: scatters the message into a 32 KiB on-stack bit buffer at
+ * the generator-polynomial tap offsets, folds the remaining aligned qwords with
+ * the 4-qword shift/XOR kernel, then finishes the (<= 72 byte) tail with the
+ * byte-at-a-time CRC lookup table.  The caller's buffer is never written.
+ * NOTE(review): callers are expected to route only lengths in the medium range
+ * here (see crc32_chorba) so the scatter indices stay inside bitbuffer. */
+Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t* input = (const uint64_t*)buf;
+    uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+    const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+    memset(bitbuffer, 0, 32768);
+    /* Seed the polynomial state with the pre-conditioned incoming CRC. */
+    bitbuffer[0] = Z_U64_TO_LE(~crc);
+
+    crc = 0;
+
+    size_t i = 0;
+
+    /* Scatter phase: fold each 8-qword (64-byte) group with whatever earlier
+     * iterations deposited at its position, then propagate it forward at the
+     * tap distances (qword offsets +145, +183, +211 are XORed; +300 is a plain
+     * store since nothing has been written that far ahead yet).  The loop
+     * bound keeps the farthest store (i/8 + 7 + 300) inside bitbuffer. */
+    for(; i + 300*8+64 < len; i += 64) {
+        uint64_t in1, in2, in3, in4;
+        uint64_t in5, in6, in7, in8;
+        size_t in_offset = (i/8);
+
+        in1 = input[i / sizeof(uint64_t) + 0] ^ bitbuffer[in_offset + 0];
+        in2 = input[i / sizeof(uint64_t) + 1] ^ bitbuffer[in_offset + 1];
+        in3 = input[i / sizeof(uint64_t) + 2] ^ bitbuffer[in_offset + 2];
+        in4 = input[i / sizeof(uint64_t) + 3] ^ bitbuffer[in_offset + 3];
+        in5 = input[i / sizeof(uint64_t) + 4] ^ bitbuffer[in_offset + 4];
+        in6 = input[i / sizeof(uint64_t) + 5] ^ bitbuffer[in_offset + 5];
+        in7 = input[i / sizeof(uint64_t) + 6] ^ bitbuffer[in_offset + 6];
+        in8 = input[i / sizeof(uint64_t) + 7] ^ bitbuffer[in_offset + 7];
+
+        // [0, 145, 183, 211]
+
+        bitbuffer[(i/8 + 0 + 145)] ^= in1;
+        bitbuffer[(i/8 + 1 + 145)] ^= in2;
+        bitbuffer[(i/8 + 2 + 145)] ^= in3;
+        bitbuffer[(i/8 + 3 + 145)] ^= in4;
+        bitbuffer[(i/8 + 4 + 145)] ^= in5;
+        bitbuffer[(i/8 + 5 + 145)] ^= in6;
+        bitbuffer[(i/8 + 6 + 145)] ^= in7;
+        bitbuffer[(i/8 + 7 + 145)] ^= in8;
+
+        bitbuffer[(i/8 + 0 + 183)] ^= in1;
+        bitbuffer[(i/8 + 1 + 183)] ^= in2;
+        bitbuffer[(i/8 + 2 + 183)] ^= in3;
+        bitbuffer[(i/8 + 3 + 183)] ^= in4;
+        bitbuffer[(i/8 + 4 + 183)] ^= in5;
+        bitbuffer[(i/8 + 5 + 183)] ^= in6;
+        bitbuffer[(i/8 + 6 + 183)] ^= in7;
+        bitbuffer[(i/8 + 7 + 183)] ^= in8;
+
+        bitbuffer[(i/8 + 0 + 211)] ^= in1;
+        bitbuffer[(i/8 + 1 + 211)] ^= in2;
+        bitbuffer[(i/8 + 2 + 211)] ^= in3;
+        bitbuffer[(i/8 + 3 + 211)] ^= in4;
+        bitbuffer[(i/8 + 4 + 211)] ^= in5;
+        bitbuffer[(i/8 + 5 + 211)] ^= in6;
+        bitbuffer[(i/8 + 6 + 211)] ^= in7;
+        bitbuffer[(i/8 + 7 + 211)] ^= in8;
+
+        /* Plain store: offset +300 is beyond every earlier write. */
+        bitbuffer[(i/8 + 0 + 300)] = in1;
+        bitbuffer[(i/8 + 1 + 300)] = in2;
+        bitbuffer[(i/8 + 2 + 300)] = in3;
+        bitbuffer[(i/8 + 3 + 300)] = in4;
+        bitbuffer[(i/8 + 4 + 300)] = in5;
+        bitbuffer[(i/8 + 5 + 300)] = in6;
+        bitbuffer[(i/8 + 6 + 300)] = in7;
+        bitbuffer[(i/8 + 7 + 300)] = in8;
+    }
+
+    uint64_t next1_64 = 0;
+    uint64_t next2_64 = 0;
+    uint64_t next3_64 = 0;
+    uint64_t next4_64 = 0;
+    uint64_t next5_64 = 0;
+    uint64_t final[9] = {0};
+
+    /* Fold phase: reduce the remaining aligned data (input XOR pending
+     * bitbuffer state) 4 qwords at a time.  The shift pairs below are the
+     * qword-granular representation of multiplying by the CRC-32 polynomial
+     * (see the Chorba paper); carries spill into next1_64..next5_64. */
+    for (; (i + 72 < len); i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = input[i / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t))];
+        in2 = input[(i + 8) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 1)];
+        in1 = Z_U64_FROM_LE(in1) ^ next1_64;
+        in2 = Z_U64_FROM_LE(in2) ^ next2_64;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        /* in3/in4 already absorb the low spill (a1, a2^b1) from in1/in2. */
+        in3 = input[(i + 16) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 2)];
+        in4 = input[(i + 24) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 3)];
+        in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
+        in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1_64 = next5_64 ^ out1;
+        next2_64 = out2;
+        next3_64 = out3;
+        next4_64 = out4;
+        next5_64 = out5;
+
+    }
+
+    /* Tail: the fold loop exits with len - i <= 72, which fits final[9]. */
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1_64);
+    final[1] ^= Z_U64_TO_LE(next2_64);
+    final[2] ^= Z_U64_TO_LE(next3_64);
+    final[3] ^= Z_U64_TO_LE(next4_64);
+    final[4] ^= Z_U64_TO_LE(next5_64);
+
+    uint8_t *final_bytes = (uint8_t*)final;
+
+    /* Byte-wise table finish, still folding in the pending bitbuffer bytes. */
+    for (size_t j = 0; j < (len-i); j++) {
+        crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
+    }
+
+    return ~crc;
+}
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+/* Small-buffer variant (64-bit words, no side bitbuffer): an unrolled
+ * 8-stanza pipeline where a 64-byte "chorba" block is pre-loaded and its
+ * contribution is XORed into the following 256 bytes with a per-stanza mask
+ * (the stanza comments 0-3 .. 28-31 index the 4-qword groups).  Each stanza
+ * uses the same shift/XOR kernel; carries travel in next1..next5.  The
+ * caller's buffer is never written; the tail is handed to crc32_braid. */
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t* input = (const uint64_t*)buf;
+    uint64_t final[9] = {0};
+    /* Pre-conditioned CRC enters the pipeline as the first carry word. */
+    uint64_t next1 = ~crc;
+    crc = 0;
+    uint64_t next2 = 0;
+    uint64_t next3 = 0;
+    uint64_t next4 = 0;
+    uint64_t next5 = 0;
+
+    size_t i = 0;
+
+    /* This is weird, doing for vs while drops 10% off the exec time */
+    /* Each full iteration consumes 64 (chorba preload) + 8*32 bytes; the
+     * bound reserves enough input for that plus the fold loop's look-ahead. */
+    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        /* Load the 8-qword chorba block, folding in pending carries;
+         * chorba7/chorba8 also absorb chorba1/chorba2 (tap at distance 6). */
+        uint64_t chorba1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        uint64_t chorba2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+        uint64_t chorba3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3;
+        uint64_t chorba4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4;
+        uint64_t chorba5 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 4]) ^ next5;
+        uint64_t chorba6 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 5]);
+        uint64_t chorba7 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 6]) ^ chorba1;
+        uint64_t chorba8 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 7]) ^ chorba2;
+
+        i += 8 * 8;
+
+        /* 0-3 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ chorba4 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ a1 ^ chorba5 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ a2 ^ b1 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        /* First stanza: all prior carries were consumed by the chorba load,
+         * so next1 takes out1 directly (no next5 fold here). */
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 4-7 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba4 ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba4;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba6 ^ chorba5;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba7 ^ chorba6;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 8-11 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 12-15 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 16-19 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 20-23 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 24-27 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 28-31 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba6 ^ chorba5;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba7 ^ chorba6;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    /* Plain 4-qword fold (no chorba block) until < 72 bytes remain. */
+    for (; (i + 40 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    /* Tail (<= 72 bytes, fits final[9]): fold in the carries and let the
+     * braid implementation finish the remainder. */
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1);
+    final[1] ^= Z_U64_TO_LE(next2);
+    final[2] ^= Z_U64_TO_LE(next3);
+    final[3] ^= Z_U64_TO_LE(next4);
+    final[4] ^= Z_U64_TO_LE(next5);
+
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+
+#else // CHORBA_W == 8
+
+/* 32-bit word variant of the small Chorba kernel (used when CHORBA_W != 8):
+ * folds 10 uint32 words (40 bytes) per iteration using per-word shift pairs;
+ * carries travel in next1..next10, and the tail is finished by crc32_braid.
+ * The caller's buffer is never written. */
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint32_t* input = (const uint32_t*)buf;
+    uint32_t final[20] = {0};
+
+    /* Pre-conditioned CRC enters the pipeline as the first carry word. */
+    uint32_t next1 = ~crc;
+    crc = 0;
+    uint32_t next2 = 0;
+    uint32_t next3 = 0;
+    uint32_t next4 = 0;
+    uint32_t next5 = 0;
+    uint32_t next6 = 0;
+    uint32_t next7 = 0;
+    uint32_t next8 = 0;
+    uint32_t next9 = 0;
+    uint32_t next10 = 0;
+
+    size_t i = 0;
+    /* 40 bytes consumed per iteration; keep 40 bytes of look-ahead so the
+     * tail always fits final[20] (80 bytes). */
+    for (; i + 80 < len; i += 40) {
+        uint32_t in1;
+        uint32_t in2;
+        uint32_t in3;
+        uint32_t in4;
+        uint32_t in5;
+        uint32_t in6;
+        uint32_t in7;
+        uint32_t in8;
+        uint32_t in9;
+        uint32_t in10;
+
+        /* x1..x7 are the shifted partial products of each input word (the
+         * 32-bit split of the polynomial multiply; note x5 is unused/absent). */
+        uint32_t a1, a2, a3, a4, a6, a7;
+        uint32_t b1, b2, b3, b4, b6, b7;
+        uint32_t c1, c2, c3, c4, c6, c7;
+        uint32_t d1, d2, d3, d4, d6, d7;
+        uint32_t e1, e2, e3, e4, e6, e7;
+        uint32_t f1, f2, f3, f4, f6, f7;
+        uint32_t g1, g2, g3, g4, g6, g7;
+        uint32_t h1, h2, h3, h4, h6, h7;
+        uint32_t i1, i2, i3, i4, i6, i7;
+        uint32_t j1, j2, j3, j4, j6, j7;
+
+        uint32_t out1;
+        uint32_t out2;
+        uint32_t out3;
+        uint32_t out4;
+        uint32_t out5;
+        uint32_t out6;
+        uint32_t out7;
+        uint32_t out8;
+        uint32_t out9;
+        uint32_t out10;
+
+        in1 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 0]) ^ next1;
+        in2 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 1]) ^ next2;
+        in3 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 2]) ^ next3;
+        in4 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 3]) ^ next4;
+
+        a1 = (in1 << 17);
+        a2 = (in1 >> 15) ^ (in1 << 23);
+        a3 = (in1 >> 9) ^ (in1 << 19);
+        a4 = (in1 >> 13);
+        a6 = (in1 << 12);
+        a7 = (in1 >> 20);
+
+        b1 = (in2 << 17);
+        b2 = (in2 >> 15) ^ (in2 << 23);
+        b3 = (in2 >> 9) ^ (in2 << 19);
+        b4 = (in2 >> 13);
+        b6 = (in2 << 12);
+        b7 = (in2 >> 20);
+
+        c1 = (in3 << 17);
+        c2 = (in3 >> 15) ^ (in3 << 23);
+        c3 = (in3 >> 9) ^ (in3 << 19);
+        c4 = (in3 >> 13);
+        c6 = (in3 << 12);
+        c7 = (in3 >> 20);
+
+        d1 = (in4 << 17);
+        d2 = (in4 >> 15) ^ (in4 << 23);
+        d3 = (in4 >> 9) ^ (in4 << 19);
+        d4 = (in4 >> 13);
+        d6 = (in4 << 12);
+        d7 = (in4 >> 20);
+
+        /* Words 5-8 absorb the low spill of words 1-4 as they load. */
+        in5 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 4]) ^ next5 ^ a1;
+        in6 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 5]) ^ next6 ^ a2 ^ b1;
+        in7 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 6]) ^ next7 ^ a3 ^ b2 ^ c1;
+        in8 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 7]) ^ next8 ^ a4 ^ b3 ^ c2 ^ d1;
+
+        e1 = (in5 << 17);
+        e2 = (in5 >> 15) ^ (in5 << 23);
+        e3 = (in5 >> 9) ^ (in5 << 19);
+        e4 = (in5 >> 13);
+        e6 = (in5 << 12);
+        e7 = (in5 >> 20);
+
+        f1 = (in6 << 17);
+        f2 = (in6 >> 15) ^ (in6 << 23);
+        f3 = (in6 >> 9) ^ (in6 << 19);
+        f4 = (in6 >> 13);
+        f6 = (in6 << 12);
+        f7 = (in6 >> 20);
+
+        g1 = (in7 << 17);
+        g2 = (in7 >> 15) ^ (in7 << 23);
+        g3 = (in7 >> 9) ^ (in7 << 19);
+        g4 = (in7 >> 13);
+        g6 = (in7 << 12);
+        g7 = (in7 >> 20);
+
+        h1 = (in8 << 17);
+        h2 = (in8 >> 15) ^ (in8 << 23);
+        h3 = (in8 >> 9) ^ (in8 << 19);
+        h4 = (in8 >> 13);
+        h6 = (in8 << 12);
+        h7 = (in8 >> 20);
+
+        in9 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 8]) ^ next9 ^ b4 ^ c3 ^ d2 ^ e1;
+        in10 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 9]) ^ next10 ^ a6 ^ c4 ^ d3 ^ e2 ^ f1;
+
+        i1 = (in9 << 17);
+        i2 = (in9 >> 15) ^ (in9 << 23);
+        i3 = (in9 >> 9) ^ (in9 << 19);
+        i4 = (in9 >> 13);
+        i6 = (in9 << 12);
+        i7 = (in9 >> 20);
+
+        j1 = (in10 << 17);
+        j2 = (in10 >> 15) ^ (in10 << 23);
+        j3 = (in10 >> 9) ^ (in10 << 19);
+        j4 = (in10 >> 13);
+        j6 = (in10 << 12);
+        j7 = (in10 >> 20);
+
+        /* Diagonal accumulation of the remaining partial products; out1..out10
+         * become next iteration's carries. */
+        out1 = a7 ^ b6 ^ d4 ^ e3 ^ f2 ^ g1;
+        out2 = b7 ^ c6 ^ e4 ^ f3 ^ g2 ^ h1;
+        out3 = c7 ^ d6 ^ f4 ^ g3 ^ h2 ^ i1;
+        out4 = d7 ^ e6 ^ g4 ^ h3 ^ i2 ^ j1;
+        out5 = e7 ^ f6 ^ h4 ^ i3 ^ j2;
+        out6 = f7 ^ g6 ^ i4 ^ j3;
+        out7 = g7 ^ h6 ^ j4;
+        out8 = h7 ^ i6;
+        out9 = i7 ^ j6;
+        out10 = j7;
+
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        next6 = out6;
+        next7 = out7;
+        next8 = out8;
+        next9 = out9;
+        next10 = out10;
+
+    }
+
+    /* Tail (<= 80 bytes, fits final[20]): fold in the carries, then let the
+     * braid implementation finish. */
+    memcpy(final, input+(i/sizeof(uint32_t)), len-i);
+    final[0] ^= Z_U32_TO_LE(next1);
+    final[1] ^= Z_U32_TO_LE(next2);
+    final[2] ^= Z_U32_TO_LE(next3);
+    final[3] ^= Z_U32_TO_LE(next4);
+    final[4] ^= Z_U32_TO_LE(next5);
+    final[5] ^= Z_U32_TO_LE(next6);
+    final[6] ^= Z_U32_TO_LE(next7);
+    final[7] ^= Z_U32_TO_LE(next8);
+    final[8] ^= Z_U32_TO_LE(next9);
+    final[9] ^= Z_U32_TO_LE(next10);
+
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+#endif // CHORBA_W == 8
+
+/* Top-level Chorba dispatcher: aligns the buffer to 8 bytes with crc32_braid,
+ * then routes by length to the small / 32 KiB / 118960-qword kernels.  All
+ * kernels are non-destructive and require the 8-byte alignment established
+ * here.  Very short inputs go straight to crc32_braid. */
+Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) {
+    uintptr_t align_diff = ALIGN_DIFF(buf, 8);
+    /* Too small for Chorba to pay off once alignment is accounted for. */
+    if (len <= align_diff + CHORBA_SMALL_THRESHOLD)
+        return crc32_braid(crc, buf, len);
+
+    /* Consume the misaligned prefix bytewise so the kernels see an
+     * 8-byte-aligned pointer. */
+    if (align_diff) {
+        crc = crc32_braid(crc, buf, align_diff);
+        len -= align_diff;
+        buf += align_diff;
+    }
+    if (len > CHORBA_LARGE_THRESHOLD)
+        return crc32_chorba_118960_nondestructive(crc, buf, len);
+#if CHORBA_W == 8
+    /* 64-bit word build: medium lengths use the 32 KiB bitbuffer kernel. */
+    if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+        return crc32_chorba_32768_nondestructive(crc, buf, len);
+    return crc32_chorba_small_nondestructive(crc, buf, len);
+#else
+    return crc32_chorba_small_nondestructive_32bit(crc, buf, len);
+#endif
+}
+
+/* Combined CRC + copy entry point: checksums src first (crc32_chorba never
+ * writes the buffer), then copies src to dst.  memcpy semantics apply:
+ * dst and src must not overlap. */
+uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_chorba(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
diff --git a/neozip/arch/generic/generic_functions.h b/neozip/arch/generic/generic_functions.h
new file mode 100644
index 0000000000..c150a2f010
--- /dev/null
+++ b/neozip/arch/generic/generic_functions.h
@@ -0,0 +1,64 @@
+/* generic_functions.h -- generic C implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+#include "deflate.h"
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*adler32_copy_func)(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);
+typedef uint32_t (*crc32_copy_func)(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+typedef void (*slide_hash_func)(deflate_state *s);
+
+
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left);
+
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+
+#ifndef WITHOUT_CHORBA
+ uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t longest_match_c(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_c(deflate_state *const s, uint32_t cur_match);
+
+void slide_hash_c(deflate_state *s);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Generic code
+# define native_adler32 adler32_c
+# define native_adler32_copy adler32_copy_c
+# define native_chunkmemset_safe chunkmemset_safe_c
+#ifndef WITHOUT_CHORBA
+# define native_crc32 crc32_chorba
+# define native_crc32_copy crc32_copy_chorba
+#else
+# define native_crc32 crc32_braid
+# define native_crc32_copy crc32_copy_braid
+#endif
+# define native_inflate_fast inflate_fast_c
+# define native_slide_hash slide_hash_c
+# define native_longest_match longest_match_c
+# define native_longest_match_slow longest_match_slow_c
+# define native_compare256 compare256_c
+#endif
+
+#endif
diff --git a/neozip/arch/generic/slide_hash_c.c b/neozip/arch/generic/slide_hash_c.c
new file mode 100644
index 0000000000..8345b9e36b
--- /dev/null
+++ b/neozip/arch/generic/slide_hash_c.c
@@ -0,0 +1,52 @@
+/* slide_hash.c -- slide hash table C implementation
+ *
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+/* ===========================================================================
+ * Slide the hash table when sliding the window down (could be avoided with 32
+ * bit values at the expense of memory usage). We slide even when level == 0 to
+ * keep the hash table consistent if we switch back to level > 0 later.
+ */
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+#ifdef NOT_TWEAK_COMPILER
+ table += entries;
+ do {
+ unsigned m;
+ m = *--table;
+ *table = (Pos)(m >= wsize ? m-wsize : 0);
+ /* If entries is not on any hash chain, prev[entries] is garbage but
+ * its value will never be used.
+ */
+ } while (--entries);
+#else
+ {
+ /* At the time of this change, gcc (4.8.*) isn't able to vectorize
+ * this hot loop using saturated-subtraction on x86-64 architecture.
+ * To avoid this defect, we can change the loop such that
+ * o. the pointer advances forward, and
+ * o. demote the variable 'm' to be local to the loop, and
+ * choose type "Pos" (instead of 'unsigned int') for the
+ * variable to avoid unnecessary zero-extension.
+ */
+ unsigned int i;
+ Pos *q = table;
+ for (i = 0; i < entries; i++) {
+ Pos m = *q;
+ Pos t = (Pos)wsize;
+ *q++ = (Pos)(m >= t ? m-t: 0);
+ }
+ }
+#endif /* NOT_TWEAK_COMPILER */
+}
+
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_c_chain(s->prev, wsize, wsize);
+}
diff --git a/neozip/arch/loongarch/Makefile.in b/neozip/arch/loongarch/Makefile.in
new file mode 100644
index 0000000000..86baed1553
--- /dev/null
+++ b/neozip/arch/loongarch/Makefile.in
@@ -0,0 +1,99 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+LSXFLAG=-mlsx
+LASXFLAG=-mlasx
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ loongarch_features.o loongarch_features.lo \
+ crc32_la.o crc32_la.lo \
+ adler32_lasx.o adler32_lasx.lo \
+ adler32_lsx.o adler32_lsx.lo \
+ chunkset_lasx.o chunkset_lasx.lo \
+ chunkset_lsx.o chunkset_lsx.lo \
+ compare256_lasx.o compare256_lasx.lo \
+ compare256_lsx.o compare256_lsx.lo \
+ slide_hash_lasx.o slide_hash_lasx.lo \
+ slide_hash_lsx.o slide_hash_lsx.lo
+
+loongarch_features.o: $(SRCDIR)/loongarch_features.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
+
+loongarch_features.lo: $(SRCDIR)/loongarch_features.c
+ $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
+
+crc32_la.o: $(SRCDIR)/crc32_la.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+
+crc32_la.lo: $(SRCDIR)/crc32_la.c
+ $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+
+adler32_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c
+
+adler32_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c
+
+adler32_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c
+
+adler32_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c
+
+chunkset_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
+chunkset_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
+chunkset_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
+chunkset_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
+compare256_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+compare256_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+slide_hash_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+slide_hash_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/loongarch/adler32_lasx.c b/neozip/arch/loongarch/adler32_lasx.c
new file mode 100644
index 0000000000..a7268e73ff
--- /dev/null
+++ b/neozip/arch/loongarch/adler32_lasx.c
@@ -0,0 +1,154 @@
+/* adler32_lasx.c -- compute the Adler-32 checksum of a data stream, based on Intel AVX2 implementation
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+
+/* 32 bit horizontal sum */
+static inline uint32_t hsum256(__m256i x) {
+ __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+ __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+ __m256i sum3 = __lasx_xvadd_w(sum2, __lasx_xvbsrl_v(sum2, 4));
+ return (uint32_t)__lasx_xvpickve2gr_wu(sum3, 0);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+ __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8));
+ __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2));
+ return (uint32_t)__lasx_xvpickve2gr_wu(sum2, 0);
+}
+
+extern uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16) {
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+ } else if (len < 32) {
+ if (COPY) {
+ return adler32_copy_lsx(adler, dst, src, len);
+ } else {
+ return adler32_lsx(adler, src, len);
+ }
+ }
+
+ __m256i vs1, vs2, vs2_0;
+
+ const __m256i dot2v = (__m256i)((v32i8){ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+ 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33 });
+ const __m256i dot2v_0 = (__m256i)((v32i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 });
+ const __m256i dot3v = __lasx_xvreplgr2vr_h(1);
+ const __m256i zero = __lasx_xvldi(0);
+
+ while (len >= 32) {
+ vs1 = __lasx_xvinsgr2vr_w(zero, adler0, 0);
+ vs2 = __lasx_xvinsgr2vr_w(zero, adler1, 0);
+
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = __lasx_xvldi(0);
+ vs2_0 = vs3;
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+ len -= k;
+
+ while (k >= 64) {
+ __m256i vbuf = __lasx_xvld(src, 0);
+ __m256i vbuf_0 = __lasx_xvld(src, 32);
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = lasx_sad_bu(vbuf, zero);
+ __m256i vs1_sad2 = lasx_sad_bu(vbuf_0, zero);
+
+ if (COPY) {
+ __lasx_xvst(vbuf, dst, 0);
+ __lasx_xvst(vbuf_0, dst, 32);
+ dst += 64;
+ }
+
+ vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+ vs3 = __lasx_xvadd_w(vs3, vs1_0);
+ __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+ __m256i v_short_sum2_0 = lasx_maddubs_w_h(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+ __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+ __m256i vsum2_0 = lasx_madd_w_h(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+ vs1 = __lasx_xvadd_w(vs1_sad2, vs1);
+ vs2 = __lasx_xvadd_w(vsum2, vs2);
+ vs2_0 = __lasx_xvadd_w(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = __lasx_xvadd_w(vs2_0, vs2);
+ vs3 = __lasx_xvslli_w(vs3, 6);
+ vs2 = __lasx_xvadd_w(vs3, vs2);
+ vs3 = __lasx_xvldi(0);
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ __m256i vbuf = __lasx_xvld(src, 0);
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = lasx_sad_bu(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+ if (COPY) {
+ __lasx_xvst(vbuf, dst, 0);
+ dst += 32;
+ }
+
+ vs1 = __lasx_xvadd_w(vs1, vs1_sad);
+ vs3 = __lasx_xvadd_w(vs3, vs1_0);
+ __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+ __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+ vs2 = __lasx_xvadd_w(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ /* Defer the multiplication with 32 to outside of the loop */
+ vs3 = __lasx_xvslli_w(vs3, 5);
+ vs2 = __lasx_xvadd_w(vs2, vs3);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/adler32_lsx.c b/neozip/arch/loongarch/adler32_lsx.c
new file mode 100644
index 0000000000..389f74c683
--- /dev/null
+++ b/neozip/arch/loongarch/adler32_lsx.c
@@ -0,0 +1,147 @@
+/* adler32_lsx.c -- compute the Adler-32 checksum of a data stream, based on Intel SSE4.2 implementation
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = __lsx_vbsrl_v(x, 8);
+ __m128i sum = __lsx_vadd_w(x, second_int);
+ return __lsx_vpickve2gr_w(sum, 0);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = __lsx_vilvh_d(x, x);
+ __m128i sum2 = __lsx_vadd_w(x, sum1);
+ __m128i sum3 = __lsx_vshuf4i_w(sum2, 0x01);
+ __m128i sum4 = __lsx_vadd_w(sum2, sum3);
+ return __lsx_vpickve2gr_w(sum4, 0);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16)
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = __lsx_vldi(0);
+ const __m128i dot2v = (__m128i)((v16i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 });
+ const __m128i dot2v_0 = (__m128i)((v16i8){ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 });
+ const __m128i dot3v = __lsx_vreplgr2vr_h(1);
+ size_t k;
+
+ while (len >= 16) {
+
+ k = ALIGN_DOWN(MIN(len, NMAX), 16);
+ len -= k;
+
+ vs1 = __lsx_vinsgr2vr_w(zero, adler0, 0);
+ vs2 = __lsx_vinsgr2vr_w(zero, adler1, 0);
+
+ vs3 = __lsx_vldi(0);
+ vs2_0 = __lsx_vldi(0);
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = __lsx_vld(src, 0);
+ vbuf_0 = __lsx_vld(src, 16);
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = lsx_sad_bu(vbuf, zero);
+ v_sad_sum2 = lsx_sad_bu(vbuf_0, zero);
+
+ if (COPY) {
+ __lsx_vst(vbuf, dst, 0);
+ __lsx_vst(vbuf_0, dst, 16);
+ dst += 32;
+ }
+
+ v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v), __lsx_vmulwod_h_bu_b(vbuf, dot2v));
+ v_short_sum2_0 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf_0, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf_0, dot2v_0));
+
+ vs1 = __lsx_vadd_w(v_sad_sum1, vs1);
+ vs3 = __lsx_vadd_w(vs1_0, vs3);
+
+ vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v);
+ vsum2_0 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2_0, dot3v), v_short_sum2_0, dot3v);
+ vs1 = __lsx_vadd_w(v_sad_sum2, vs1);
+ vs2 = __lsx_vadd_w(vsum2, vs2);
+ vs2_0 = __lsx_vadd_w(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = __lsx_vadd_w(vs2_0, vs2);
+ vs3 = __lsx_vslli_w(vs3, 5);
+ vs2 = __lsx_vadd_w(vs3, vs2);
+ vs3 = __lsx_vldi(0);
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = __lsx_vld(src, 0);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = lsx_sad_bu(vbuf, zero);
+ v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf, dot2v_0));
+
+ vs1 = __lsx_vadd_w(v_sad_sum1, vs1);
+ vs3 = __lsx_vadd_w(vs1_0, vs3);
+ vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v);
+ vs2 = __lsx_vadd_w(vsum2, vs2);
+ vs1_0 = vs1;
+
+ if (COPY) {
+ __lsx_vst(vbuf, dst, 0);
+ dst += 16;
+ }
+ }
+
+ vs3 = __lsx_vslli_w(vs3, 4);
+ vs2 = __lsx_vadd_w(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, there's fewer than 16 elements remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
+
+Z_INTERNAL uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/chunkset_lasx.c b/neozip/arch/loongarch/chunkset_lasx.c
new file mode 100644
index 0000000000..905704172d
--- /dev/null
+++ b/neozip/arch/loongarch/chunkset_lasx.c
@@ -0,0 +1,126 @@
+/* chunkset_lasx.c -- LASX inline functions to copy small data chunks, based on Intel AVX2 implementation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+#include "lsxintrin_ext.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lasx_xvreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ *chunk = lasx_broadcast_128(__lsx_vld(from, 0));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = __lasx_xvld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ __lasx_xvst(*chunk, out, 0);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+ * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+ * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
+ __m128i ret_vec0 = __lsx_vld(buf, 0);
+ ret_vec = __lasx_concat_128(ret_vec0, ret_vec0);
+ ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
+ } else {
+ __m128i ret_vec0 = __lsx_vld(buf, 0);
+ __m128i ret_vec1 = __lsx_vld(buf, 16);
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = __lsx_vld(permute_table + lut_rem.idx, 0);
+ __m128i xlane_permutes = __lsx_vslt_b(perm_vec1, __lsx_vreplgr2vr_b(16));
+ __m128i xlane_res = lsx_shuffle_b(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = __lasx_concat_128(ret_vec0, latter_half);
+ }
+
+ return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ __lsx_vst(*chunk, out, 0);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return lasx_zext_128(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = __lsx_vld(buf, 0);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+ ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lasx
+#define CHUNKCOPY chunkcopy_lasx
+#define CHUNKUNROLL chunkunroll_lasx
+#define CHUNKMEMSET chunkmemset_lasx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lasx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lasx
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/chunkset_lsx.c b/neozip/arch/loongarch/chunkset_lsx.c
new file mode 100644
index 0000000000..23dabfba51
--- /dev/null
+++ b/neozip/arch/loongarch/chunkset_lsx.c
@@ -0,0 +1,74 @@
+/* chunkset_lsx.c -- LSX inline functions to copy small data chunks, based on Intel SSSE3 implementation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ __lsx_vst(*chunk, out, 0);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = __lsx_vld(buf, 0);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+ ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lsx
+#define CHUNKMEMSET chunkmemset_lsx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lsx
+#define CHUNKCOPY chunkcopy_lsx
+#define CHUNKUNROLL chunkunroll_lsx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lsx
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/compare256_lasx.c b/neozip/arch/loongarch/compare256_lasx.c
new file mode 100644
index 0000000000..d61d6e57b3
--- /dev/null
+++ b/neozip/arch/loongarch/compare256_lasx.c
@@ -0,0 +1,60 @@
+/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef LOONGARCH_LASX
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
+
+ src0 += 32, src1 += 32, len += 32;
+
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
+ mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
+
+ src0 += 32, src1 += 32, len += 32;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lasx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/compare256_lsx.c b/neozip/arch/loongarch/compare256_lsx.c
new file mode 100644
index 0000000000..4afd261e76
--- /dev/null
+++ b/neozip/arch/loongarch/compare256_lsx.c
@@ -0,0 +1,88 @@
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef LOONGARCH_LSX
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned, then for all subsequent ones we have at least
+ * one aligned load. Sadly aligning both loads is probably unrealistic */
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ for (int i = 0; i < 15; i++) {
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ if (align_offset) {
+ xmm_src0 = __lsx_vld(last0, 0);
+ xmm_src1 = __lsx_vld(last1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/loongarch/crc32_la.c b/neozip/arch/loongarch/crc32_la.c
new file mode 100644
index 0000000000..f1bd314e65
--- /dev/null
+++ b/neozip/arch/loongarch/crc32_la.c
@@ -0,0 +1,71 @@
+/* crc32_la.c - LoongArch version of crc32
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_CRC
+
+#include "zbuild.h"
+
+#include <larchintrin.h>
+
+Z_INTERNAL uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len) {
+ uint32_t c = ~crc;
+
+ if (UNLIKELY(len == 1)) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c);
+ c = ~c;
+ return c;
+ }
+
+ uintptr_t align_diff = ALIGN_DIFF(buf, 8);
+ if (align_diff) {
+ if (len && (align_diff & 1)) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf++), (int)c);
+ len--;
+ }
+
+ if (len >= 2 && (align_diff & 2)) {
+ c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c);
+ buf += 2;
+ len -= 2;
+ }
+
+ if (len >= 4 && (align_diff & 4)) {
+ c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c);
+ len -= 4;
+ buf += 4;
+ }
+
+ }
+
+ while (len >= 8) {
+ c = (uint32_t)__crc_w_d_w((long int)*((uint64_t*)buf), (int)c);
+ len -= 8;
+ buf += 8;
+ }
+
+ if (len & 4) {
+ c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c);
+ buf += 4;
+ }
+
+ if (len & 2) {
+ c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c);
+ buf += 2;
+ }
+
+ if (len & 1) {
+ c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c);
+ }
+
+ c = ~c;
+ return c;
+}
+
+Z_INTERNAL uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_loongarch64(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/loongarch/lasxintrin_ext.h b/neozip/arch/loongarch/lasxintrin_ext.h
new file mode 100644
index 0000000000..b1e72cff86
--- /dev/null
+++ b/neozip/arch/loongarch/lasxintrin_ext.h
@@ -0,0 +1,61 @@
+/* lasxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LASXINTRIN_EXT_H
+#define LASXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+
+static inline __m256i lasx_zext_128(__m128i src) {
+#ifdef __loongarch_asx_sx_conv
+ return __lasx_insert_128_lo(__lasx_xvldi(0), src);
+#else
+ __m256i dest = __lasx_xvldi(0);
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x30\n" : "=f"(dest) : "0"(dest), "f"(src));
+ return dest;
+#endif
+}
+
+#ifndef __loongarch_asx_sx_conv
+static inline __m256i __lasx_concat_128(__m128i lo, __m128i hi) {
+ __m256i dest;
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x02\n" : "=f"(dest) : "0"(lo), "f"(hi));
+ return dest;
+}
+#endif
+
+static inline __m256i lasx_broadcast_128(__m128i in) {
+ return __lasx_concat_128(in, in);
+}
+
+static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {
+ __m256i tmp = __lasx_xvabsd_bu(a, b);
+ tmp = __lasx_xvhaddw_hu_bu(tmp, tmp);
+ tmp = __lasx_xvhaddw_wu_hu(tmp, tmp);
+ return __lasx_xvhaddw_du_wu(tmp, tmp);
+}
+
+static inline __m256i lasx_maddubs_w_h(__m256i a, __m256i b) {
+ return __lasx_xvsadd_h(__lasx_xvmulwod_h_bu_b(a, b), __lasx_xvmulwev_h_bu_b(a, b));
+}
+
+static inline __m256i lasx_madd_w_h(__m256i a, __m256i b) {
+ return __lasx_xvmaddwod_w_h(__lasx_xvmulwev_w_h(a, b), a, b);
+}
+
+static inline int lasx_movemask_b(__m256i v) {
+ v = __lasx_xvmskltz_b(v);
+ return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
+}
+
+/* See: lsx_shuffle_b */
+static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+ __m256i msb_mask = __lasx_xvslti_b(b, 0);
+ __m256i dst = __lasx_xvshuf_b(a, a, __lasx_xvandi_b(b, 0xF));
+ return __lasx_xvand_v(dst, __lasx_xvnor_v(msb_mask, msb_mask));
+}
+
+#endif // include guard LASXINTRIN_EXT_H
diff --git a/neozip/arch/loongarch/loongarch_features.c b/neozip/arch/loongarch/loongarch_features.c
new file mode 100644
index 0000000000..bedf8499f7
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_features.c
@@ -0,0 +1,31 @@
+/* loongarch_features.c -- check for LoongArch features.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_FEATURES
+
+#include "zbuild.h"
+#include "loongarch_features.h"
+
+#include <larchintrin.h>
+
+/*
+ * https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html
+ *
+ * Word number Bit number Annotation Implication
+ * 0x1 25 CRC 1 indicates support for CRC instruction
+ * 0x1 6 LSX 1 indicates support for 128-bit vector extension
+ * 0x1 7 LASX 1 indicates support for 256-bit vector expansion
+ */
+
+/* Read CPUCFG word 0x1 and record whether the CRC32 instructions (bit 25),
+ * the 128-bit LSX extension (bit 6) and the 256-bit LASX extension (bit 7)
+ * are available.  Fields are nonzero (not necessarily 1) when supported. */
+void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features) {
+    unsigned int w1 = __cpucfg(0x1);
+    features->has_crc = w1 & 0x2000000;  /* bit 25: CRC */
+    features->has_lsx = w1 & 0x40;       /* bit 6: LSX */
+    features->has_lasx = w1 & 0x80;      /* bit 7: LASX */
+}
+
+#endif
diff --git a/neozip/arch/loongarch/loongarch_features.h b/neozip/arch/loongarch/loongarch_features.h
new file mode 100644
index 0000000000..27c90b14b3
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_features.h
@@ -0,0 +1,19 @@
+/* loongarch_features.h -- check for LoongArch features.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_FEATURES_H_
+#define LOONGARCH_FEATURES_H_
+
+/* Runtime CPU feature flags filled in by loongarch_check_features().
+ * Each field is nonzero when the corresponding extension is available. */
+struct loongarch_cpu_features {
+    int has_crc;   /* CRC32 instructions */
+    int has_lsx;   /* 128-bit LSX vector extension */
+    int has_lasx;  /* 256-bit LASX vector extension */
+};
+
+void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features);
+
+#endif /* LOONGARCH_FEATURES_H_ */
diff --git a/neozip/arch/loongarch/loongarch_functions.h b/neozip/arch/loongarch/loongarch_functions.h
new file mode 100644
index 0000000000..0ec8bd66d7
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_functions.h
@@ -0,0 +1,86 @@
+/* loongarch_functions.h -- LoongArch implementations for arch-specific functions.
+ *
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_FUNCTIONS_H_
+#define LOONGARCH_FUNCTIONS_H_
+
+#include "loongarch_natives.h"
+
+#ifdef LOONGARCH_CRC
+uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef LOONGARCH_LSX
+uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
+uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lsx(deflate_state *s);
+#endif
+
+#ifdef LOONGARCH_LASX
+uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
+uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match);
+void slide_hash_lasx(deflate_state *s);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// LOONGARCH - CRC32
+# ifdef LOONGARCH_CRC_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_loongarch64
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_loongarch64
+# endif
+# ifdef LOONGARCH_LSX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_lsx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_lsx
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_compare256
+# define native_compare256 compare256_lsx
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_lsx
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
+# endif
+# ifdef LOONGARCH_LASX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_lasx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_lasx
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_lasx
+# undef native_compare256
+# define native_compare256 compare256_lasx
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_lasx
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
+# endif
+#endif
+
+#endif /* LOONGARCH_FUNCTIONS_H_ */
diff --git a/neozip/arch/loongarch/loongarch_natives.h b/neozip/arch/loongarch/loongarch_natives.h
new file mode 100644
index 0000000000..35f6d3c7bd
--- /dev/null
+++ b/neozip/arch/loongarch/loongarch_natives.h
@@ -0,0 +1,25 @@
+/* loongarch_natives.h -- LoongArch compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef LOONGARCH_NATIVES_H_
+#define LOONGARCH_NATIVES_H_
+
+#if defined(__loongarch__)
+// All known CPUs have crc instructions
+# ifdef LOONGARCH_CRC
+# define LOONGARCH_CRC_NATIVE
+# endif
+#endif
+#if defined(__loongarch_sx)
+# ifdef LOONGARCH_LSX
+# define LOONGARCH_LSX_NATIVE
+# endif
+#endif
+#if defined(__loongarch_asx)
+# ifdef LOONGARCH_LASX
+# define LOONGARCH_LASX_NATIVE
+# endif
+#endif
+
+#endif /* LOONGARCH_NATIVES_H_ */
diff --git a/neozip/arch/loongarch/lsxintrin_ext.h b/neozip/arch/loongarch/lsxintrin_ext.h
new file mode 100644
index 0000000000..0a0503b9f9
--- /dev/null
+++ b/neozip/arch/loongarch/lsxintrin_ext.h
@@ -0,0 +1,33 @@
+/* lsxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LSXINTRIN_EXT_H
+#define LSXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+
+
+/* Sum of absolute differences of unsigned bytes: |a-b| per byte, then the
+ * widening horizontal adds fold the byte differences into each 64-bit
+ * lane (x86 psadbw-style result). */
+static inline __m128i lsx_sad_bu(__m128i a, __m128i b) {
+    __m128i tmp = __lsx_vabsd_bu(a, b);
+    tmp = __lsx_vhaddw_hu_bu(tmp, tmp);
+    tmp = __lsx_vhaddw_wu_hu(tmp, tmp);
+    return __lsx_vhaddw_du_wu(tmp, tmp);
+}
+
+/* Gather the sign bit of each of the 16 bytes into the low 16 bits of an
+ * int (vmskltz_b leaves the mask in element 0). */
+static inline int lsx_movemask_b(__m128i v) {
+    return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
+}
+
+/* Byte shuffle with x86 pshufb semantics: result[i] = a[b[i] & 0xF], or
+ * zero when b[i] has its most significant bit set. */
+static inline __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    /* most significant bit is set - negative 8-bit integer */
+    __m128i msb_mask = __lsx_vslti_b(b, 0);
+
+    /* shuffle, clear msb in indices vector b */
+    __m128i dst = __lsx_vshuf_b(a, a, __lsx_vandi_b(b, 0xF));
+
+    /* invert and apply mask - clear dst-element if b-msb is set */
+    return __lsx_vand_v(dst, __lsx_vnor_v(msb_mask, msb_mask));
+}
+
+#endif // include guard LSXINTRIN_EXT_H
diff --git a/neozip/arch/loongarch/slide_hash_lasx.c b/neozip/arch/loongarch/slide_hash_lasx.c
new file mode 100644
index 0000000000..f464779090
--- /dev/null
+++ b/neozip/arch/loongarch/slide_hash_lasx.c
@@ -0,0 +1,49 @@
+/*
+ * LASX optimized hash slide, based on Intel AVX2 implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LASX
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lasxintrin.h>
+
+/* Subtract wsize from every 16-bit entry of table with unsigned
+ * saturation (entries smaller than wsize become 0), walking backwards
+ * from the end, 32 entries (two 32-byte vectors) per iteration.
+ * NOTE(review): entries must be a nonzero multiple of 32 or the unsigned
+ * counter wraps -- holds for HASH_SIZE and w_size in practice; confirm. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 32;
+
+    do {
+        __m256i value1, value2, result1, result2;
+
+        value1 = __lasx_xvld(table, 0);
+        value2 = __lasx_xvld(table, 32);  /* byte offset: second 16 entries */
+        result1 = __lasx_xvssub_hu(value1, wsize);
+        result2 = __lasx_xvssub_hu(value2, wsize);
+        __lasx_xvst(result1, table, 0);
+        __lasx_xvst(result2, table, 32);
+
+        table -= 32;
+        entries -= 32;
+    } while (entries > 0);
+}
+
+/* Slide the hash head and prev chains after the window advanced by
+ * w_size bytes: every stored position is reduced by w_size, saturating
+ * at 0, so positions that fell out of the window become 0. */
+Z_INTERNAL void slide_hash_lasx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size to every 16-bit lane. */
+    const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/loongarch/slide_hash_lsx.c b/neozip/arch/loongarch/slide_hash_lsx.c
new file mode 100644
index 0000000000..f4c94ea70d
--- /dev/null
+++ b/neozip/arch/loongarch/slide_hash_lsx.c
@@ -0,0 +1,54 @@
+/*
+ * LSX optimized hash slide, based on Intel SSE implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef LOONGARCH_LSX
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lsxintrin.h>
+#include <assert.h>
+
+/* Subtract wsize from every 16-bit entry of table with unsigned
+ * saturation, walking backwards from the end, 16 entries (two 16-byte
+ * vectors) per iteration.
+ * NOTE(review): entries must be a nonzero multiple of 16 or the unsigned
+ * counter wraps -- holds for HASH_SIZE and w_size in practice; confirm. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m128i wsize) {
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        __m128i value0, value1, result0, result1;
+
+        value0 = __lsx_vld(table, 0);
+        value1 = __lsx_vld(table, 16);  /* byte offset: second 8 entries */
+        result0 = __lsx_vssub_hu(value0, wsize);
+        result1 = __lsx_vssub_hu(value1, wsize);
+        __lsx_vst(result0, table, 0);
+        __lsx_vst(result1, table, 16);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+/* Slide the hash head and prev chains after the window advanced by
+ * w_size bytes: every stored position is reduced by w_size, saturating
+ * at 0, so positions that fell out of the window become 0. */
+Z_INTERNAL void slide_hash_lsx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast w_size to every 16-bit lane. */
+    const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize);
+
+    /* Both tables are expected to be 16-byte aligned by the allocator. */
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, HASH_SIZE, xmm_wsize);
+    slide_hash_chain(s->prev, wsize, xmm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/power/Makefile.in b/neozip/arch/power/Makefile.in
new file mode 100644
index 0000000000..e2bec5e510
--- /dev/null
+++ b/neozip/arch/power/Makefile.in
@@ -0,0 +1,93 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
+PPCFLAGS=-maltivec
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: power_features.o \
+ power_features.lo \
+ adler32_power8.o \
+ adler32_power8.lo \
+ adler32_vmx.o \
+ adler32_vmx.lo \
+ chunkset_power8.o \
+ chunkset_power8.lo \
+ compare256_power9.o \
+ compare256_power9.lo \
+ crc32_power8.o \
+ crc32_power8.lo \
+ slide_hash_power8.o \
+ slide_hash_power8.lo \
+ slide_hash_vmx.o \
+ slide_hash_vmx.lo
+
+power_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+power_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+adler32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_vmx.o:
+ $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+ $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+chunkset_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+compare256_power9.o:
+ $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+ $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+crc32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+slide_hash_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+# Use $(PPCFLAGS) like every other rule in this Makefile; the ${...} form
+# is equivalent in make but was inconsistent with the rest of the file.
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/power/adler32_power8.c b/neozip/arch/power/adler32_power8.c
new file mode 100644
index 0000000000..39b3cf399c
--- /dev/null
+++ b/neozip/arch/power/adler32_power8.c
@@ -0,0 +1,160 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 processed 1 byte at a time, then on the first iteration s1
+ * is s1_0 (_n means iteration n), the initial value of adler - at start
+ * _0 is 1 unless the initial adler value is different than 1. So
+ * s1_1 = s1_0 + c[0] after the first calculation. For the next iteration
+ * s1_2 = s1_1 + c[1] and so on. Hence, for iteration N,
+ * s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
+ * N-1*c[1] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N-bit at time we can do this at once.
+ *
+ * Since VSX supports 16-byte vector operations, we can process
+ * 16 bytes at a time; using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+/* Vector across sum unsigned int (saturate). */
+/* After the two rotate-and-add steps every 32-bit lane of the result
+ * holds the sum of all four lanes of __a; callers read lane 0.
+ * NOTE(review): vec_add is modular, not saturating -- the "(saturate)"
+ * in the comment above appears historical. */
+static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+    __b = vec_sld(__a, __a, 8);
+    __b = vec_add(__b, __a);
+    __a = vec_sld(__b, __b, 4);
+    __a = vec_add(__a, __b);
+
+    return __a;
+}
+
+/* Core VSX Adler-32: processes 16 bytes per vector step and reduces
+ * modulo BASE once per NMAX-byte block, following the formulas in the
+ * file header.  Short inputs (len < 64) and tails are delegated to the
+ * scalar helper adler32_copy_tail. */
+Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0);
+
+    /* This is faster than VSX code for len < 64. */
+    if (len < 64)
+        return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0);
+
+    /* Use POWER VSX instructions for len >= 64. */
+    const vector unsigned int v_zeros = { 0 };
+    /* Taps (16-i+1) for the weighted s2 sum over one 16-byte vector. */
+    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+                                        6, 5, 4, 3, 2, 1};
+    const vector unsigned char vsh = vec_splat_u8(4);
+    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+    vector unsigned int vs1 = { 0 };
+    vector unsigned int vs2 = { 0 };
+    /* Accumulates s1 before each 16-byte step; shifted left by 4 at the
+     * end of a block it supplies the 16*s1 term of the s2 formula. */
+    vector unsigned int vs1_save = { 0 };
+    vector unsigned int vsum1, vsum2;
+    vector unsigned char vbuf;
+    int n;
+
+    /* Seed lane 0 with the incoming scalar sums. */
+    vs1[0] = s1;
+    vs2[0] = s2;
+
+    /* Do length bigger than NMAX in blocks of NMAX size. */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;
+        do {
+            vbuf = vec_xl(0, (unsigned char *) buf);
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+            /* sum(i=1 to 16) buf[i]*(16-i+1). */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1. */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums. */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        } while (--n);
+        /* Once each block of NMAX size. */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+
+        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
+        vs1[0] = vs1[0] % BASE;
+        /* vs2[0] = s2_i + 16*s1_save +
+           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
+        vs2[0] = vs2[0] % BASE;
+
+        /* Keep only lane 0; the reduced sums restart the next block. */
+        vs1 = vec_and(vs1, vmask);
+        vs2 = vec_and(vs2, vmask);
+        vs1_save = v_zeros;
+    }
+
+    /* len is less than NMAX one modulo is needed. */
+    if (len >= 16) {
+        while (len >= 16) {
+            len -= 16;
+
+            vbuf = vec_xl(0, (unsigned char *) buf);
+
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+            /* sum(i=1 to 16) buf[i]*(16-i+1). */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1. */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums. */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        }
+        /* Since the size will be always less than NMAX we do this once. */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+    }
+    /* Copy result back to s1, s2 (mod 65521). */
+    s1 = vs1[0] % BASE;
+    s2 = vs2[0] % BASE;
+
+    /* Process tail (len < 16). */
+    return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0);
+}
+
+/* Public entry point: checksum only, no copy. */
+Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
+    return adler32_impl(adler, buf, len);
+}
+
+/* VSX/VMX stores can have higher latency than optimized memcpy on POWER8+ */
+/* Checksum-and-copy: compute the adler32 of buf, then copy buf to dst. */
+Z_INTERNAL uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len) {
+    adler = adler32_impl(adler, buf, len);
+    memcpy(dst, buf, len);
+    return adler;
+}
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/adler32_vmx.c b/neozip/arch/power/adler32_vmx.c
new file mode 100644
index 0000000000..5171bab35b
--- /dev/null
+++ b/neozip/arch/power/adler32_vmx.c
@@ -0,0 +1,168 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+#define vmx_zero() (vec_splat_u32(0))
+
+/* Accumulate the Adler-32 pair s[0]=s1, s[1]=s2 over len 16-byte blocks
+ * of buf (len counts vectors, not bytes -- see the n / 16 in the caller).
+ * buf is expected to be 16-byte aligned, since vec_ld is used.  Several
+ * independent accumulator chains are interleaved to hide vector ALU
+ * latency; they are merged at the end. */
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    /* Different taps for the separable components of sums */
+    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+    vector unsigned int adacc, s2acc;
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+#if BYTE_ORDER == LITTLE_ENDIAN
+    s2acc = vec_sro(pair_vec, shift_vec);
+#else
+    s2acc = vec_slo(pair_vec, shift_vec);
+#endif
+
+    vector unsigned int zero = vmx_zero();
+    vector unsigned int s3acc = zero;
+    vector unsigned int s3acc_0 = zero;
+    vector unsigned int adacc_prev = adacc;
+    vector unsigned int adacc_prev_0 = zero;
+
+    vector unsigned int s2acc_0 = zero;
+    vector unsigned int s2acc_1 = zero;
+    vector unsigned int s2acc_2 = zero;
+
+    /* Maintain a running sum of a second half, this might help us break yet another
+     * data dependency bubble in the sum */
+    vector unsigned int adacc_0 = zero;
+
+    /* Main loop eats 4 vectors (64 bytes) per iteration. */
+    int num_iter = len / 4;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+        vector unsigned char d2 = vec_ld(32, buf);
+        vector unsigned char d3 = vec_ld(48, buf);
+
+        /* The core operation of the loop, basically
+         * what is being unrolled below */
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_prev);
+        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+        s2acc = vec_msum(t0, d0, s2acc);
+
+        /* interleave dependent sums in here */
+        adacc_0 = vec_sum4s(d1, adacc_0);
+        s2acc_0 = vec_msum(t1, d1, s2acc_0);
+        adacc = vec_sum4s(d2, adacc);
+        s2acc_1 = vec_msum(t2, d2, s2acc_1);
+        s2acc_2 = vec_msum(t3, d3, s2acc_2);
+        adacc_0 = vec_sum4s(d3, adacc_0);
+
+        adacc_prev = adacc;
+        adacc_prev_0 = adacc_0;
+        buf += 64;
+    }
+
+    adacc = vec_add(adacc, adacc_0);
+    s3acc = vec_add(s3acc, s3acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+    /* Handle the remaining 1-3 vectors one at a time. */
+    if (rem) {
+        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+        while (rem--) {
+            vector unsigned char d0 = vec_ld(0, buf);
+            adacc = vec_sum4s(d0, adacc);
+            s3acc = vec_add(s3acc, adacc_prev);
+            s2acc = vec_msum(t3, d0, s2acc);
+            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+            buf += 16;
+        }
+    }
+
+
+    /* Sum up independent second sums */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s2acc_2 = vec_add(s2acc_1, s2acc_2);
+    s2acc = vec_add(s2acc, s2acc_2);
+
+    s2acc = vec_add(s2acc, s3acc);
+
+    /* Fold the four 32-bit lanes of each sum, then write s1 and s2 back. */
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
+}
+
+/* Compute the Adler-32 checksum of buf with VMX/AltiVec: short inputs go
+ * straight to the scalar tail helper; otherwise buf is advanced to a
+ * 16-byte boundary and fed to vmx_accum32 in slices of at most NMAX
+ * bytes, so both sums are reduced mod BASE before they can overflow. */
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+    /* Split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+    /* s1/s2 pair in a 16-byte-aligned buffer so vec_ld/vec_ste work. */
+    uint32_t pair[4] ALIGNED_(16);
+    pair[0] = adler;
+    pair[1] = sum2;
+    pair[2] = 0;
+    pair[3] = 0;
+
+    // Align buffer
+    size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+    size_t n = NMAX;
+    if (align_diff) {
+        adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+        buf += align_diff;
+        len -= align_diff;
+        /* the alignment bytes count against the first NMAX slice */
+        n -= align_diff;
+    }
+
+    while (len >= 16) {
+        n = MIN(len, n);
+
+        /* vmx_accum32 takes its length in 16-byte blocks */
+        vmx_accum32(pair, buf, n / 16);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        size_t k = (n / 16) * 16;
+        buf += k;
+        len -= k;
+        n = NMAX;
+    }
+
+    /* Process tail (len < 16). */
+    return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
+}
+
+/* VMX stores can have higher latency than optimized memcpy */
+/* Checksum-and-copy: compute the adler32 of src, then copy src to dst. */
+Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = adler32_vmx(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
+#endif
diff --git a/neozip/arch/power/chunkset_power8.c b/neozip/arch/power/chunkset_power8.c
new file mode 100644
index 0000000000..f9855e677e
--- /dev/null
+++ b/neozip/arch/power/chunkset_power8.c
@@ -0,0 +1,50 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <altivec.h>
+
+typedef vector unsigned char chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Fill the 16-byte chunk with the 2-byte pattern read from `from`. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
+}
+
+/* Fill the 16-byte chunk with the 4-byte pattern read from `from`. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
+}
+
+/* Fill the 16-byte chunk with the 8-byte pattern read from `from`. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
+}
+
+/* Load 16 bytes from s; vec_xl permits unaligned addresses. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+/* Store 16 bytes to out; vec_xst permits unaligned addresses. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_power8
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/compare256_power9.c b/neozip/arch/power/compare256_power9.c
new file mode 100644
index 0000000000..99c3b0b6d1
--- /dev/null
+++ b/neozip/arch/power/compare256_power9.c
@@ -0,0 +1,68 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "zendian.h"
+
+#include <altivec.h>
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
+#if BYTE_ORDER == LITTLE_ENDIAN
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#else
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+#else
+# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
+#endif
+
+/* Return the number of leading bytes (0..256) that match between src0
+ * and src1, comparing 16 bytes per iteration.  Both buffers must have at
+ * least 256 readable bytes, since a full vector is loaded before the
+ * mismatch position is known. */
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        vsrc0 = *((vector unsigned char *)src0);
+        vsrc1 = *((vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Since the index of matching bytes will contain only zeroes
+         * on vc (since we used cmpne), counting the number of consecutive
+         * bytes where LSB == 0 is the same as counting the length of the match. */
+        zng_vec_vctzlsbb(vc, cmplen);
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+/* Exported wrapper around the static implementation. */
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_power9_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/crc32_constants.h b/neozip/arch/power/crc32_constants.h
new file mode 100644
index 0000000000..8c8f2153b6
--- /dev/null
+++ b/neozip/arch/power/crc32_constants.h
@@ -0,0 +1,1123 @@
+/* Constants table used by crc32_power8.c
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * This file was automatically generated, DO NOT EDIT IT MANUALLY.
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zendian.h"
+#include "zbuild.h"
+
+/* Reduce 262144 kbits to 1024 bits */
+static const __vector unsigned long long vcrc_const[255] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x0000000099ea94a8, 0x00000001651797d2 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x00000000945a8420, 0x0000000021e0d56c },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x0000000030762706, 0x000000000f95ecaa },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001a52fc582, 0x00000001ebd224ac },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x00000001a4a7167a, 0x000000000ccb97ca },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x000000000c18249a, 0x00000001006ec8a8 },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x00000000a924ae7c, 0x000000014f58f196 },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001e12ccc12, 0x00000001a7192ca6 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x00000000a0b9d4ac, 0x000000019a64bab2 },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000095e8ddfe, 0x0000000014f4ed2e },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x00000000233fddc4, 0x000000011092b6a2 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000001b4529b62, 0x00000000c8a1629c },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x00000001a7fa0e64, 0x000000017bf32e8e },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001b5334592, 0x00000001f8cc6582 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000011f8ee1b4, 0x000000008631ddf0 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000006252e632, 0x000000007e5a76d0 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x00000000ab973e84, 0x000000002b09b31c },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x000000007734f5ec, 0x00000001b2df1f84 },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x000000007c547798, 0x00000001d6f56afc },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x000000007ec40210, 0x00000001b9b5e70c },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x00000001ab1695a8, 0x0000000034b626d2 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x0000000090494bba, 0x000000014c53479a },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001123fb816, 0x00000001a6d179a4 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x00000001e188c74c, 0x000000015abd16b4 },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000001c2d3451c, 0x00000000018f9852 },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x00000000f55cf1ca, 0x000000001fb3084a },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000001a0531540, 0x00000000c53dfb04 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x0000000132cd7ebc, 0x00000000e10c9ad6 },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000073ab7f36, 0x0000000025aa994a },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x0000000041aed1c2, 0x00000000fa3a74c4 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000136c53800, 0x0000000033eb3f40 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x0000000126835a30, 0x000000017193f296 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x000000006241b502, 0x0000000043f6c86a },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x00000000d5196ad4, 0x000000016b513ec6 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x000000009cfa769a, 0x00000000c8f25b4e },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000000920e5df4, 0x00000001a45048ec },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x0000000169dc310e, 0x000000000c441004 },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x0000000009fc331c, 0x000000000e17cad6 },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x000000010d94a81e, 0x00000001253ae964 },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x0000000027a20ab2, 0x00000001d7c88ebc },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x0000000114f87504, 0x00000001e7ca913a },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x000000004b076d96, 0x0000000033ed078a },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000da4d1e74, 0x00000000e1839c78 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x000000001b81f672, 0x00000001322b267e },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x000000009367c988, 0x00000000638231b6 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001717214ca, 0x00000001ee7f16f4 },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x000000009f47d820, 0x0000000117d9924a },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x000000010d9a47d2, 0x00000000e1a9e0c4 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000000a696c58c, 0x00000001403731dc },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x000000002aa28ec6, 0x00000001a5ea9682 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x00000001fe18fd9a, 0x0000000101c5c578 },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x000000019d4fc1ae, 0x00000000dddf6494 },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000001ba0e3dea, 0x00000000f1c3db28 },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x0000000074b59a5e, 0x000000013112fb9c },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000f2b5ea98, 0x00000000b680b906 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x0000000187132676, 0x000000001a282932 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x000000010a8c6ad4, 0x0000000089406e7e },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001e21dfe70, 0x00000001def6be8c },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x00000001da0050e4, 0x0000000075258728 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x00000000772172ae, 0x000000019536090a },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000e47724aa, 0x00000000f2455bfc },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000003cd63ac4, 0x000000018c40baf4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x00000001bf47d352, 0x000000004cd390d4 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x000000018dc1d708, 0x00000001e4ece95a },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000002d4620a4, 0x000000001a3ee918 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x0000000058fd1740, 0x000000007c652fb8 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x00000000dadd9bfc, 0x000000011c67842c },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000001ea2140be, 0x00000000254f759c },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000009de128ba, 0x000000007ece94ca },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x000000013ac3aa8e, 0x0000000038f258c2 },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x0000000099980562, 0x00000001cdf17b00 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x00000001c1579c86, 0x000000011f882c16 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000068dbbf94, 0x0000000100093fc8 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x000000004509fb04, 0x00000001cd684f16 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x00000001202f6398, 0x000000004bc6a70a },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000013aea243e, 0x000000004fc7e8e4 },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x00000001b4052ae6, 0x0000000130103f1c },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x00000001cd2a0ae8, 0x0000000111b0024c },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x00000001fe4aa8b4, 0x000000010b3079da },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x00000001d1559a42, 0x000000010192bcc2 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x00000001f3e05ecc, 0x0000000074838d50 },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x0000000104ddd2cc, 0x000000001b20f520 },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x000000015393153c, 0x0000000050c3590a },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x0000000057e942c6, 0x00000000b41cac8e },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000012c633850, 0x000000000c72cc78 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x00000000ebcaae4c, 0x0000000030cdb032 },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013ee532a6, 0x000000013e09fc32 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x00000001bf0cbc7e, 0x000000001ed624d2 },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000d50b7a5a, 0x00000000781aee1a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x0000000002fca6e8, 0x00000001c4d8348c },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x000000007af40044, 0x0000000057a40336 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000016178744, 0x0000000085544940 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000014c177458, 0x000000019cd21e80 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000011b6ddf04, 0x000000013eb95bc0 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001f3e29ccc, 0x00000001dfc9fdfc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x0000000135ae7562, 0x00000000cd028bc2 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000190ef812c, 0x0000000090db8c44 },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x0000000067a2c786, 0x000000010010a4ce },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x0000000048b9496c, 0x00000001c8f4c72c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000015a422de6, 0x000000001c26170c },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000001ef0e3640, 0x00000000e3fccf68 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000001006d2d26, 0x00000000d513ed24 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000001170d56d6, 0x00000000141beada },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x00000000a5fb613c, 0x000000011071aea0 },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x0000000040bbf7fc, 0x000000012e19080a },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x000000016ac3a5b2, 0x0000000100ecf826 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x00000000abf16230, 0x0000000069b09412 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x00000001ebe23fac, 0x0000000122297bac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x000000008b6a0894, 0x00000000e9e4b068 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x00000001288ea478, 0x000000004b38651a },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x000000016619c442, 0x00000001468360e2 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x0000000086230038, 0x00000000121c2408 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x000000017746a756, 0x00000000da7e7d08 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x0000000191b8f8f8, 0x00000001058d7652 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000008e167708, 0x000000014a098a90 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000148b22d54, 0x0000000020dbe72e },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x0000000044ba2c3c, 0x000000011e7323e8 },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000b54d2b52, 0x00000000d5d4bf94 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000005a4fd8a, 0x0000000199d8746c },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x0000000139f9fc46, 0x00000000ce9ca8a0 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x000000015a1fa824, 0x00000000136edece },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000000a61ae4c, 0x000000019b92a068 },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000145e9113e, 0x0000000071d62206 },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x000000006a348448, 0x00000000dfc50158 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x000000004d80a08c, 0x00000001517626bc },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x000000014b6837a0, 0x0000000148d1e4fa },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x000000016896a7fc, 0x0000000094d8266e },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x000000014f187140, 0x00000000606c5e34 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019581b9da, 0x000000019766beaa },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001091bc984, 0x00000001d80c506c },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001067223c, 0x000000001e73837c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x00000001ab16ea02, 0x0000000064d587de },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x000000013c4598a8, 0x00000000f4a507b0 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x00000000b3735430, 0x0000000040e342fc },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001bb3fc0c0, 0x00000001d5ad9c3a },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x00000001570ae19c, 0x0000000094a691a4 },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001ea910712, 0x00000001271ecdfa },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x0000000167127128, 0x000000009e54475a },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x0000000019e790a2, 0x00000000c9c099ee },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000003788f710, 0x000000009a2f736c },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000001682a160e, 0x00000000bb9f4996 },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x000000007f0ebd2e, 0x00000001db688050 },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x000000002b032080, 0x00000000e9b10af4 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x00000000cfd1664a, 0x000000012d4545e4 },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x00000000aa1181c2, 0x000000000361139c },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000000ddd08002, 0x00000001a5a1a3a8 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x00000000e8dd0446, 0x000000006844e0b0 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000001bbd94a00, 0x00000000c3762f28 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000000ab6cd180, 0x00000001d26287a2 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x0000000031803ce2, 0x00000001f6f0bba8 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x0000000024f40b0c, 0x000000002ffabd62 },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000001ba1d9834, 0x00000000fb4516b8 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x0000000104de61aa, 0x000000018cfa961c },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x0000000113e40d46, 0x000000019e588d52 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001415598a0, 0x00000001180f0bbc },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000bf6c8c90, 0x00000000e1d9177a },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x00000001788b0504, 0x0000000105abc27c },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x0000000038385d02, 0x00000000972e4a58 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x00000001b6c83844, 0x0000000183499a5e },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x0000000051061a8a, 0x00000001c96a8cca },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x000000017351388a, 0x00000001a1a5b60c },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x0000000132928f92, 0x00000000e4b6ac9c },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000000e6b4f48a, 0x00000001807e7f5a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x0000000039d15e90, 0x000000017a7e3bc8 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000312d6074, 0x00000000d73975da },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017bbb2cc4, 0x000000017375d038 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x000000016ded3e18, 0x00000000193680bc },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000f1638b16, 0x00000000999b06f6 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001d38b9ecc, 0x00000001f685d2b8 },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x000000018b8d09dc, 0x00000001f4ecbed2 },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000e7bc27d2, 0x00000000ba16f1a0 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x00000000275e1e96, 0x0000000115aceac4 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000000e2e3031e, 0x00000001aeff6292 },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x00000001041c84d8, 0x000000009640124c },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x00000000706ce672, 0x0000000114f41f02 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000015d5070da, 0x000000009c5f3586 },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x0000000038f9493a, 0x00000001878275fa },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000a3348a76, 0x00000000ddc42ce8 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x00000001ad0aab92, 0x0000000181d2c73a },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x000000019e85f712, 0x0000000141c9320a },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000005a871e76, 0x000000015235719a },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x000000017249c662, 0x00000000be27d804 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000003a084712, 0x000000006242d45a },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x00000000ed438478, 0x000000009a53638e },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000000abac34cc, 0x00000001001ecfb6 },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000005f35ef3e, 0x000000016d7c2d64 },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x0000000047d6608c, 0x00000001d0ce46c0 },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x000000002d01470e, 0x0000000124c907b4 },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000158bbc7b0, 0x0000000018a555ca },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x00000000c0a23e8e, 0x000000006b0980bc },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x00000001ebd85c88, 0x000000008bbba964 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x000000019ee20bb2, 0x00000001070a5a1e },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x00000001acabf2d6, 0x000000002204322a },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000001b7963d56, 0x00000000a27524d0 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x000000017bffa1fe, 0x0000000020b1e4ba },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x000000001f15333e, 0x0000000032cc27fc },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x000000018593129e, 0x0000000044dd22b8 },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x000000019cb32602, 0x00000000dffc9e0a },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x0000000142b05cc8, 0x00000001b7a0ed14 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000001be49e7a4, 0x00000000c7842488 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x0000000108f69d6c, 0x00000001c02a4fee },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000006c0971f0, 0x000000003c273778 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x000000005b16467a, 0x00000001d63f8894 },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x00000001551a628e, 0x000000006be557d6 },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000019e42ea92, 0x000000006a7806ea },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000012fa83ff2, 0x000000016155aa0c },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x000000011ca9cde0, 0x00000000908650ac },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000c8e5cd74, 0x00000000aa5a8084 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000096c27f0c, 0x0000000191bb500a },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x000000002baed926, 0x0000000064e9bed0 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000017c8de8d2, 0x000000009444f302 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x00000000d43d6068, 0x000000019db07d3c },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000000cb2c4b26, 0x00000001359e3e6e },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x0000000145b8da26, 0x00000001e4f10dd2 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x000000018fff4b08, 0x0000000124f5735e },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000150b58ed0, 0x0000000124760a4c },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x00000001549f39bc, 0x000000000f1fc186 },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000ef4d2f42, 0x00000000150e4cc4 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x00000001b1468572, 0x000000002a6204e8 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x000000013d7403b2, 0x00000000beb1d432 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x00000001a4681842, 0x0000000135f3f1f0 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000167714492, 0x0000000074fe2232 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x00000001e599099a, 0x000000001ac6e2ba },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x00000000fe128194, 0x0000000013fca91e },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000077e8b990, 0x0000000183f4931e },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000001a267f63a, 0x00000000b6d9b4e4 },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000001945c245a, 0x00000000b5188656 },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000149002e76, 0x0000000027a81a84 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x00000001bb8310a4, 0x0000000125699258 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x000000019ec60bcc, 0x00000001b23de796 },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x000000012d8590ae, 0x00000000fe4365dc },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x0000000065b00684, 0x00000000c68f497a },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x000000015e5aeadc, 0x00000000fbf521ee },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x00000000b77ff2b0, 0x000000015eac3378 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000188da2ff6, 0x0000000134914b90 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000063da929a, 0x0000000016335cfe },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x00000001389caa80, 0x000000010372d10c },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000013db599d2, 0x000000015097b908 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x0000000122505a86, 0x00000001227a7572 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000016bd72746, 0x000000009a8f75c0 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000001c3faf1d4, 0x00000000682c77a2 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000001111c826c, 0x00000000231f091c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x00000000153e9fb2, 0x000000007d4439f2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000002b1f7b60, 0x000000017e221efc },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x00000000b1dba570, 0x0000000167457c38 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000001f6397b76, 0x00000000bdf081c4 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x0000000156335214, 0x000000016286d6b0 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000001d70e3986, 0x00000000c84f001c },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x000000003701a774, 0x0000000064efe7c0 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x00000000ac81ef72, 0x000000000ac2d904 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x0000000133212464, 0x00000000fd226d14 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x00000000e4e45610, 0x000000011cfd42e0 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000000c1bd370, 0x000000016e5a5678 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001a7b9e7a6, 0x00000001d888fe22 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x000000007d657a10, 0x00000001af77fcd4 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x00000001651797d2, 0x0000000099ea94a8 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x0000000021e0d56c, 0x00000000945a8420 },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x000000000f95ecaa, 0x0000000030762706 },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001ebd224ac, 0x00000001a52fc582 },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x000000000ccb97ca, 0x00000001a4a7167a },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x00000001006ec8a8, 0x000000000c18249a },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x000000014f58f196, 0x00000000a924ae7c },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001a7192ca6, 0x00000001e12ccc12 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x000000019a64bab2, 0x00000000a0b9d4ac },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000014f4ed2e, 0x0000000095e8ddfe },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x000000011092b6a2, 0x00000000233fddc4 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000000c8a1629c, 0x00000001b4529b62 },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x000000017bf32e8e, 0x00000001a7fa0e64 },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001f8cc6582, 0x00000001b5334592 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000008631ddf0, 0x000000011f8ee1b4 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000007e5a76d0, 0x000000006252e632 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x000000002b09b31c, 0x00000000ab973e84 },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x00000001b2df1f84, 0x000000007734f5ec },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x00000001d6f56afc, 0x000000007c547798 },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x00000001b9b5e70c, 0x000000007ec40210 },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x0000000034b626d2, 0x00000001ab1695a8 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x000000014c53479a, 0x0000000090494bba },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001a6d179a4, 0x00000001123fb816 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x000000015abd16b4, 0x00000001e188c74c },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000000018f9852, 0x00000001c2d3451c },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x000000001fb3084a, 0x00000000f55cf1ca },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000000c53dfb04, 0x00000001a0531540 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x00000000e10c9ad6, 0x0000000132cd7ebc },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000025aa994a, 0x0000000073ab7f36 },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x00000000fa3a74c4, 0x0000000041aed1c2 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000033eb3f40, 0x0000000136c53800 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x000000017193f296, 0x0000000126835a30 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x0000000043f6c86a, 0x000000006241b502 },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x000000016b513ec6, 0x00000000d5196ad4 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x00000000c8f25b4e, 0x000000009cfa769a },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000001a45048ec, 0x00000000920e5df4 },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x000000000c441004, 0x0000000169dc310e },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x000000000e17cad6, 0x0000000009fc331c },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x00000001253ae964, 0x000000010d94a81e },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x00000001d7c88ebc, 0x0000000027a20ab2 },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x00000001e7ca913a, 0x0000000114f87504 },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x0000000033ed078a, 0x000000004b076d96 },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000e1839c78, 0x00000000da4d1e74 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x00000001322b267e, 0x000000001b81f672 },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x00000000638231b6, 0x000000009367c988 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001ee7f16f4, 0x00000001717214ca },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x0000000117d9924a, 0x000000009f47d820 },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x00000000e1a9e0c4, 0x000000010d9a47d2 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000001403731dc, 0x00000000a696c58c },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x00000001a5ea9682, 0x000000002aa28ec6 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x0000000101c5c578, 0x00000001fe18fd9a },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x00000000dddf6494, 0x000000019d4fc1ae },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000000f1c3db28, 0x00000001ba0e3dea },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x000000013112fb9c, 0x0000000074b59a5e },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000b680b906, 0x00000000f2b5ea98 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x000000001a282932, 0x0000000187132676 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x0000000089406e7e, 0x000000010a8c6ad4 },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001def6be8c, 0x00000001e21dfe70 },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x0000000075258728, 0x00000001da0050e4 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x000000019536090a, 0x00000000772172ae },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000f2455bfc, 0x00000000e47724aa },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000018c40baf4, 0x000000003cd63ac4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x000000004cd390d4, 0x00000001bf47d352 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x00000001e4ece95a, 0x000000018dc1d708 },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000001a3ee918, 0x000000002d4620a4 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x000000007c652fb8, 0x0000000058fd1740 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x000000011c67842c, 0x00000000dadd9bfc },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000000254f759c, 0x00000001ea2140be },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000007ece94ca, 0x000000009de128ba },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x0000000038f258c2, 0x000000013ac3aa8e },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x00000001cdf17b00, 0x0000000099980562 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x000000011f882c16, 0x00000001c1579c86 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000100093fc8, 0x0000000068dbbf94 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x00000001cd684f16, 0x000000004509fb04 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x000000004bc6a70a, 0x00000001202f6398 },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000004fc7e8e4, 0x000000013aea243e },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x0000000130103f1c, 0x00000001b4052ae6 },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x0000000111b0024c, 0x00000001cd2a0ae8 },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x000000010b3079da, 0x00000001fe4aa8b4 },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x000000010192bcc2, 0x00000001d1559a42 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x0000000074838d50, 0x00000001f3e05ecc },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x000000001b20f520, 0x0000000104ddd2cc },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x0000000050c3590a, 0x000000015393153c },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x00000000b41cac8e, 0x0000000057e942c6 },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000000c72cc78, 0x000000012c633850 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x0000000030cdb032, 0x00000000ebcaae4c },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013e09fc32, 0x000000013ee532a6 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x000000001ed624d2, 0x00000001bf0cbc7e },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000781aee1a, 0x00000000d50b7a5a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x00000001c4d8348c, 0x0000000002fca6e8 },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x0000000057a40336, 0x000000007af40044 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000085544940, 0x0000000016178744 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000019cd21e80, 0x000000014c177458 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000013eb95bc0, 0x000000011b6ddf04 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001dfc9fdfc, 0x00000001f3e29ccc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x00000000cd028bc2, 0x0000000135ae7562 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000090db8c44, 0x0000000190ef812c },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x000000010010a4ce, 0x0000000067a2c786 },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x00000001c8f4c72c, 0x0000000048b9496c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000001c26170c, 0x000000015a422de6 },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000000e3fccf68, 0x00000001ef0e3640 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000000d513ed24, 0x00000001006d2d26 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000000141beada, 0x00000001170d56d6 },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x000000011071aea0, 0x00000000a5fb613c },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x000000012e19080a, 0x0000000040bbf7fc },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x0000000100ecf826, 0x000000016ac3a5b2 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x0000000069b09412, 0x00000000abf16230 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x0000000122297bac, 0x00000001ebe23fac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x00000000e9e4b068, 0x000000008b6a0894 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x000000004b38651a, 0x00000001288ea478 },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x00000001468360e2, 0x000000016619c442 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x00000000121c2408, 0x0000000086230038 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x00000000da7e7d08, 0x000000017746a756 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x00000001058d7652, 0x0000000191b8f8f8 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000014a098a90, 0x000000008e167708 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000020dbe72e, 0x0000000148b22d54 },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x000000011e7323e8, 0x0000000044ba2c3c },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000d5d4bf94, 0x00000000b54d2b52 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000199d8746c, 0x0000000005a4fd8a },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x00000000ce9ca8a0, 0x0000000139f9fc46 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x00000000136edece, 0x000000015a1fa824 },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000019b92a068, 0x000000000a61ae4c },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000071d62206, 0x0000000145e9113e },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x00000000dfc50158, 0x000000006a348448 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x00000001517626bc, 0x000000004d80a08c },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x0000000148d1e4fa, 0x000000014b6837a0 },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x0000000094d8266e, 0x000000016896a7fc },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x00000000606c5e34, 0x000000014f187140 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019766beaa, 0x000000019581b9da },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001d80c506c, 0x00000001091bc984 },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001e73837c, 0x000000001067223c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x0000000064d587de, 0x00000001ab16ea02 },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x00000000f4a507b0, 0x000000013c4598a8 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x0000000040e342fc, 0x00000000b3735430 },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001d5ad9c3a, 0x00000001bb3fc0c0 },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x0000000094a691a4, 0x00000001570ae19c },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001271ecdfa, 0x00000001ea910712 },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x000000009e54475a, 0x0000000167127128 },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x00000000c9c099ee, 0x0000000019e790a2 },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000009a2f736c, 0x000000003788f710 },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000000bb9f4996, 0x00000001682a160e },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x00000001db688050, 0x000000007f0ebd2e },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x00000000e9b10af4, 0x000000002b032080 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x000000012d4545e4, 0x00000000cfd1664a },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x000000000361139c, 0x00000000aa1181c2 },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000001a5a1a3a8, 0x00000000ddd08002 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x000000006844e0b0, 0x00000000e8dd0446 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000000c3762f28, 0x00000001bbd94a00 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000001d26287a2, 0x00000000ab6cd180 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x00000001f6f0bba8, 0x0000000031803ce2 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x000000002ffabd62, 0x0000000024f40b0c },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000000fb4516b8, 0x00000001ba1d9834 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x000000018cfa961c, 0x0000000104de61aa },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x000000019e588d52, 0x0000000113e40d46 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001180f0bbc, 0x00000001415598a0 },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000e1d9177a, 0x00000000bf6c8c90 },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x0000000105abc27c, 0x00000001788b0504 },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x00000000972e4a58, 0x0000000038385d02 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x0000000183499a5e, 0x00000001b6c83844 },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x00000001c96a8cca, 0x0000000051061a8a },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x00000001a1a5b60c, 0x000000017351388a },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x00000000e4b6ac9c, 0x0000000132928f92 },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000001807e7f5a, 0x00000000e6b4f48a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x000000017a7e3bc8, 0x0000000039d15e90 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000d73975da, 0x00000000312d6074 },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017375d038, 0x000000017bbb2cc4 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x00000000193680bc, 0x000000016ded3e18 },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000999b06f6, 0x00000000f1638b16 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001f685d2b8, 0x00000001d38b9ecc },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x00000001f4ecbed2, 0x000000018b8d09dc },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000ba16f1a0, 0x00000000e7bc27d2 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x0000000115aceac4, 0x00000000275e1e96 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000001aeff6292, 0x00000000e2e3031e },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x000000009640124c, 0x00000001041c84d8 },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x0000000114f41f02, 0x00000000706ce672 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000009c5f3586, 0x000000015d5070da },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x00000001878275fa, 0x0000000038f9493a },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000ddc42ce8, 0x00000000a3348a76 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x0000000181d2c73a, 0x00000001ad0aab92 },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x0000000141c9320a, 0x000000019e85f712 },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000015235719a, 0x000000005a871e76 },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x00000000be27d804, 0x000000017249c662 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000006242d45a, 0x000000003a084712 },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x000000009a53638e, 0x00000000ed438478 },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000001001ecfb6, 0x00000000abac34cc },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000016d7c2d64, 0x000000005f35ef3e },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x00000001d0ce46c0, 0x0000000047d6608c },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x0000000124c907b4, 0x000000002d01470e },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000018a555ca, 0x0000000158bbc7b0 },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x000000006b0980bc, 0x00000000c0a23e8e },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x000000008bbba964, 0x00000001ebd85c88 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x00000001070a5a1e, 0x000000019ee20bb2 },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x000000002204322a, 0x00000001acabf2d6 },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000000a27524d0, 0x00000001b7963d56 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x0000000020b1e4ba, 0x000000017bffa1fe },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x0000000032cc27fc, 0x000000001f15333e },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x0000000044dd22b8, 0x000000018593129e },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x00000000dffc9e0a, 0x000000019cb32602 },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x00000001b7a0ed14, 0x0000000142b05cc8 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000000c7842488, 0x00000001be49e7a4 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x00000001c02a4fee, 0x0000000108f69d6c },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000003c273778, 0x000000006c0971f0 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x00000001d63f8894, 0x000000005b16467a },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x000000006be557d6, 0x00000001551a628e },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000006a7806ea, 0x000000019e42ea92 },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000016155aa0c, 0x000000012fa83ff2 },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x00000000908650ac, 0x000000011ca9cde0 },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000aa5a8084, 0x00000000c8e5cd74 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000191bb500a, 0x0000000096c27f0c },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x0000000064e9bed0, 0x000000002baed926 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000009444f302, 0x000000017c8de8d2 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x000000019db07d3c, 0x00000000d43d6068 },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000001359e3e6e, 0x00000000cb2c4b26 },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x00000001e4f10dd2, 0x0000000145b8da26 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x0000000124f5735e, 0x000000018fff4b08 },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000124760a4c, 0x0000000150b58ed0 },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x000000000f1fc186, 0x00000001549f39bc },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000150e4cc4, 0x00000000ef4d2f42 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x000000002a6204e8, 0x00000001b1468572 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x00000000beb1d432, 0x000000013d7403b2 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x0000000135f3f1f0, 0x00000001a4681842 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000074fe2232, 0x0000000167714492 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x000000001ac6e2ba, 0x00000001e599099a },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x0000000013fca91e, 0x00000000fe128194 },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000183f4931e, 0x0000000077e8b990 },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000000b6d9b4e4, 0x00000001a267f63a },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000000b5188656, 0x00000001945c245a },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000027a81a84, 0x0000000149002e76 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x0000000125699258, 0x00000001bb8310a4 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x00000001b23de796, 0x000000019ec60bcc },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x00000000fe4365dc, 0x000000012d8590ae },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x00000000c68f497a, 0x0000000065b00684 },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x00000000fbf521ee, 0x000000015e5aeadc },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x000000015eac3378, 0x00000000b77ff2b0 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000134914b90, 0x0000000188da2ff6 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000016335cfe, 0x0000000063da929a },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x000000010372d10c, 0x00000001389caa80 },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000015097b908, 0x000000013db599d2 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x00000001227a7572, 0x0000000122505a86 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000009a8f75c0, 0x000000016bd72746 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000000682c77a2, 0x00000001c3faf1d4 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000000231f091c, 0x00000001111c826c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x000000007d4439f2, 0x00000000153e9fb2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000017e221efc, 0x000000002b1f7b60 },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x0000000167457c38, 0x00000000b1dba570 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000000bdf081c4, 0x00000001f6397b76 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x000000016286d6b0, 0x0000000156335214 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000000c84f001c, 0x00000001d70e3986 },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x0000000064efe7c0, 0x000000003701a774 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x000000000ac2d904, 0x00000000ac81ef72 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x00000000fd226d14, 0x0000000133212464 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x000000011cfd42e0, 0x00000000e4e45610 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000016e5a5678, 0x000000000c1bd370 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001d888fe22, 0x00000001a7b9e7a6 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x00000001af77fcd4, 0x000000007d657a10 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+
+static const __vector unsigned long long vcrc_short_const[16] ALIGNED_(16) = { /* loaded at byte offset (256 - len); one 16-byte vpmsumw constant per 16-byte chunk */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0x99168a18ec447f11, 0xed837b2613e8221e },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xe23e954e8fd2cd3c, 0xc8acdd8147b9ce5a },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0x92f8befe6b1d2b53, 0xd9ad6d87d4277e25 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xf38a3556291ea462, 0xc10ec5e033fbca3b },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0x974ac56262b6ca4b, 0xc0b55b0e82e02e2f },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x855712b3784d2a56, 0x71aa1df0e172334d },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xa5abe9f80eaee722, 0xfee3053e3969324d },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0x1fa0943ddb54814c, 0xf44779b93eb2bd08 },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xa53ff440d7bbfe6a, 0xf5449b3f00cc3374 },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0xebe7e3566325605c, 0x6f8346e1d777606e },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xc65a272ce5b592b8, 0xe3ab4f2ac0b95347 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0x5705a9ca4721589f, 0xaa2215ea329ecc11 },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0xe3720acb88d14467, 0x1ed8f66ed95efd26 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0xba1aca0315141c31, 0x78ed02d5a700e96a },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xad2a31b3ed627dae, 0xba8ccbe832b39da3 },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0x6655004fa06a2517, 0xedb88320b1e6b092 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0xed837b2613e8221e, 0x99168a18ec447f11 },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xc8acdd8147b9ce5a, 0xe23e954e8fd2cd3c },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0xd9ad6d87d4277e25, 0x92f8befe6b1d2b53 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xc10ec5e033fbca3b, 0xf38a3556291ea462 },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0xc0b55b0e82e02e2f, 0x974ac56262b6ca4b },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x71aa1df0e172334d, 0x855712b3784d2a56 },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xfee3053e3969324d, 0xa5abe9f80eaee722 },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0xf44779b93eb2bd08, 0x1fa0943ddb54814c },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xf5449b3f00cc3374, 0xa53ff440d7bbfe6a },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0x6f8346e1d777606e, 0xebe7e3566325605c },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xe3ab4f2ac0b95347, 0xc65a272ce5b592b8 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0xaa2215ea329ecc11, 0x5705a9ca4721589f },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0x1ed8f66ed95efd26, 0xe3720acb88d14467 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0x78ed02d5a700e96a, 0xba1aca0315141c31 },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xba8ccbe832b39da3, 0xad2a31b3ed627dae },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0xedb88320b1e6b092, 0x6655004fa06a2517 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Barrett constants */
+/* 33 bit reflected Barrett constant m - (4^32)/n */
+
+static const __vector unsigned long long v_Barrett_const[2] ALIGNED_(16) = {
+ /* x^64 div p(x) */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ { 0x00000001f7011641, 0x0000000000000000 }, /* m: floor(x^64 / p(x)), the 33-bit Barrett multiplier */
+ { 0x00000001db710641, 0x0000000000000000 } /* n: the reflected polynomial p(x) used in the second multiply */
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ { 0x0000000000000000, 0x00000001f7011641 }, /* m: floor(x^64 / p(x)), the 33-bit Barrett multiplier */
+ { 0x0000000000000000, 0x00000001db710641 } /* n: the reflected polynomial p(x) used in the second multiply */
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
diff --git a/neozip/arch/power/crc32_power8.c b/neozip/arch/power/crc32_power8.c
new file mode 100644
index 0000000000..a7a2fb7435
--- /dev/null
+++ b/neozip/arch/power/crc32_power8.c
@@ -0,0 +1,593 @@
+/* crc32 for POWER8 using VSX instructions
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Author: Rogerio Alves <rogealve@br.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ */
+
+#ifdef POWER8_VSX_CRC32
+
+#include "zbuild.h"
+#include "zendian.h"
+
+#include "crc32_constants.h"
+#include "crc32_braid_tbl.h"
+
+#include "power_intrins.h"
+
+#define MAX_SIZE 32768 /* checksum in blocks of at most 32 kB (see header comment) */
+#define VMX_ALIGN 16 /* vector register width in bytes; vec_ld wants 16-byte alignment */
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+/* Scalar fallback: fold each byte of p[0..len) into crc via the 256-entry lookup table. */
+static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
+ for (unsigned long i = 0; i < len; i++)
+ crc = crc_table[(crc ^ p[i]) & 0xff] ^ (crc >> 8);
+ return crc;
+}
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
+
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) { /* CRC-32 of p[0.._len) using the VSX vpmsum bulk path */
+ unsigned int prealign;
+ unsigned int tail;
+
+ unsigned long len = (unsigned long) _len;
+
+ crc ^= 0xffffffff; /* CRC-32 convention: start from inverted state */
+
+ if (len < VMX_ALIGN + VMX_ALIGN_MASK) { /* too short to guarantee one aligned 16-byte chunk: all scalar */
+ crc = crc32_align(crc, p, len);
+ goto out;
+ }
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) { /* consume bytes scalar until p is 16-byte aligned */
+ prealign = (unsigned int)ALIGN_DIFF(p, VMX_ALIGN);
+ crc = crc32_align(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ crc = __crc32_vpmsum(crc, p, ALIGN_DOWN(len, VMX_ALIGN)); /* vectorized bulk: aligned multiple of 16 bytes */
+
+ tail = len & VMX_ALIGN_MASK; /* 0..15 trailing bytes left over from the vector pass */
+ if (tail) {
+ p += ALIGN_DOWN(len, VMX_ALIGN);
+ crc = crc32_align(crc, p, tail);
+ }
+
+out:
+ crc ^= 0xffffffff; /* final inversion, matching the pre-conditioning above */
+
+ return crc;
+}
+
+/* Checksum src[0..len) and copy it to dst; returns the updated CRC. */
+Z_INTERNAL uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ const uint32_t result = crc32_power8(crc, src, len);
+ memcpy(dst, src, len);
+ return result;
+}
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that forwarding is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define BYTESWAP_DATA /* loads must be byte-reversed on big-endian hosts */
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
+#else
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL }; /* Byte reverse permute constant BE. */
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc) /* expands to nothing: no byte swap needed */
+#endif
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
+
+ const __vector unsigned long long vzero = {0,0};
+ const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+ const __vector unsigned long long vmask_32bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+
+ const __vector unsigned long long vmask_64bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+ __vector unsigned long long vcrc;
+
+ __vector unsigned long long vconst1, vconst2;
+
+ /* vdata0-vdata7 will contain our data (p). */
+ __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+ /* v0-v7 will contain our checksums */
+ __vector unsigned long long v0 = {0,0};
+ __vector unsigned long long v1 = {0,0};
+ __vector unsigned long long v2 = {0,0};
+ __vector unsigned long long v3 = {0,0};
+ __vector unsigned long long v4 = {0,0};
+ __vector unsigned long long v5 = {0,0};
+ __vector unsigned long long v6 = {0,0};
+ __vector unsigned long long v7 = {0,0};
+
+
+ /* Vector auxiliary variables. */
+ __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+ unsigned int offset; /* Constant table offset. */
+
+ unsigned long i; /* Counter. */
+ unsigned long chunks;
+
+ unsigned long block_size;
+ int next_block = 0;
+
+ /* Align by 128 bits. The last 128 bit block will be processed at end. */
+ unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+
+ vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
+
+ /* Short version. */
+ if (len < 256) {
+ /* Calculate where in the constant table we need to start. */
+ offset = 256 - len;
+
+ vconst1 = vec_ld(offset, vcrc_short_const);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+ /* xor initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+
+ for (i = 16; i < len; i += 16) {
+ vconst1 = vec_ld(offset + i, vcrc_short_const);
+ vdata0 = vec_ld(i, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+ }
+ } else {
+
+ /* Load initial values. */
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ /* xor in initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ p = (char *)p + 128;
+
+ do {
+ /* Checksum in blocks of MAX_SIZE. */
+ block_size = length;
+ if (block_size > MAX_SIZE) {
+ block_size = MAX_SIZE;
+ }
+
+ length = length - block_size;
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ offset = (MAX_SIZE/8) - (block_size/8);
+ /* We reduce our final 128 bytes in a separate step */
+ chunks = (block_size/128)-1;
+
+ vconst1 = vec_ld(offset, vcrc_const);
+
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+
+ if (chunks > 1) {
+ offset += 16;
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+
+ /*
+ * main loop. Each iteration calculates the CRC for a 128-byte
+ * block.
+ */
+ for (i = 0; i < chunks-2; i++) {
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+ GROUP_ENDING_NOP;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst2);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst2);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
+ vdata2, (__vector unsigned long long)vconst2);
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst2);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+ }
+
+ /* First cool down */
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ }/* else */
+
+ /* Second cool down. */
+ v0 = vec_xor(v0, va0);
+ v1 = vec_xor(v1, va1);
+ v2 = vec_xor(v2, va2);
+ v3 = vec_xor(v3, va3);
+ v4 = vec_xor(v4, va4);
+ v5 = vec_xor(v5, va5);
+ v6 = vec_xor(v6, va6);
+ v7 = vec_xor(v7, va7);
+
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
+ (__vector unsigned char)vzero, 4);
+ v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
+ (__vector unsigned char)vzero, 4);
+ v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
+ (__vector unsigned char)vzero, 4);
+ v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
+ (__vector unsigned char)vzero, 4);
+ v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
+ (__vector unsigned char)vzero, 4);
+ v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
+ (__vector unsigned char)vzero, 4);
+ v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
+ (__vector unsigned char)vzero, 4);
+
+ /* xor with the last 1024 bits. */
+ va0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(va0, va0, va0, vperm_const);
+
+ va1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(va1, va1, va1, vperm_const);
+
+ va2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(va2, va2, va2, vperm_const);
+
+ va3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(va3, va3, va3, vperm_const);
+
+ va4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(va4, va4, va4, vperm_const);
+
+ va5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(va5, va5, va5, vperm_const);
+
+ va6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(va6, va6, va6, vperm_const);
+
+ va7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(va7, va7, va7, vperm_const);
+
+ p = (char *)p + 128;
+
+ vdata0 = vec_xor(v0, va0);
+ vdata1 = vec_xor(v1, va1);
+ vdata2 = vec_xor(v2, va2);
+ vdata3 = vec_xor(v3, va3);
+ vdata4 = vec_xor(v4, va4);
+ vdata5 = vec_xor(v5, va5);
+ vdata6 = vec_xor(v6, va6);
+ vdata7 = vec_xor(v7, va7);
+
+ /* Check if we have more blocks to process */
+ next_block = 0;
+ if (length != 0) {
+ next_block = 1;
+
+ /* zero v0-v7 */
+ v0 = vec_xor(v0, v0);
+ v1 = vec_xor(v1, v1);
+ v2 = vec_xor(v2, v2);
+ v3 = vec_xor(v3, v3);
+ v4 = vec_xor(v4, v4);
+ v5 = vec_xor(v5, v5);
+ v6 = vec_xor(v6, v6);
+ v7 = vec_xor(v7, v7);
+ }
+ length = length + 128;
+
+ } while (next_block);
+
+ /* Calculate how many bytes we have left. */
+ length = (len & 127);
+
+ /* Calculate where in (short) constant table we need to start. */
+ offset = 128 - length;
+
+ v0 = vec_ld(offset, vcrc_short_const);
+ v1 = vec_ld(offset + 16, vcrc_short_const);
+ v2 = vec_ld(offset + 32, vcrc_short_const);
+ v3 = vec_ld(offset + 48, vcrc_short_const);
+ v4 = vec_ld(offset + 64, vcrc_short_const);
+ v5 = vec_ld(offset + 80, vcrc_short_const);
+ v6 = vec_ld(offset + 96, vcrc_short_const);
+ v7 = vec_ld(offset + 112, vcrc_short_const);
+
+ offset += 128;
+
+ v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+ v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+ v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+ v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+ v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+ v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+ v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+ v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+ /* Now reduce the tail (0-112 bytes). */
+ for (i = 0; i < length; i+=16) {
+ vdata0 = vec_ld(i,(__vector unsigned long long*)p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ va0 = vec_ld(offset + i,vcrc_short_const);
+ va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+ v0 = vec_xor(v0, va0);
+ }
+
+ /* xor all parallel chunks together. */
+ v0 = vec_xor(v0, v1);
+ v2 = vec_xor(v2, v3);
+ v4 = vec_xor(v4, v5);
+ v6 = vec_xor(v6, v7);
+
+ v0 = vec_xor(v0, v2);
+ v4 = vec_xor(v4, v6);
+
+ v0 = vec_xor(v0, v4);
+ }
+
+ /* Barrett Reduction */
+ vconst1 = vec_ld(0, v_Barrett_const);
+ vconst2 = vec_ld(16, v_Barrett_const);
+
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)v0, 8);
+ v0 = vec_xor(v1,v0);
+
+ /* shift left one bit */
+ __vector unsigned char vsht_splat = vec_splat_u8 (1);
+ v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+
+ v0 = vec_and(v0, vmask_64bit);
+
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+
+ /* bottom 32 bits of a */
+ v1 = vec_and(v0, vmask_32bit);
+
+ /* ma */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst1);
+
+ /* bottom 32bits of ma */
+ v1 = vec_and(v1, vmask_32bit);
+ /* qn */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst2);
+ /* a - qn, subtraction is xor in GF(2) */
+ v0 = vec_xor (v0, v1);
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+
+ /* shift result into top 64 bits of */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return v0[0];
+#else
+ return v0[1];
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.c b/neozip/arch/power/power_features.c
new file mode 100644
index 0000000000..148f30a974
--- /dev/null
+++ b/neozip/arch/power/power_features.c
@@ -0,0 +1,54 @@
+/* power_features.c - POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(PPC_FEATURES) || defined(POWER_FEATURES)
+
+#include "zbuild.h"
+#include "power_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#endif
+#ifdef POWER_NEED_AUXVEC_H
+# include <linux/auxvec.h>
+#endif
+#ifdef __FreeBSD__
+# include <machine/cpu.h>
+#endif
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
+#ifdef PPC_FEATURES
+ unsigned long hwcap;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+ hwcap = getauxval(AT_HWCAP);
+#endif
+
+ if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ features->has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
+ unsigned long hwcap2;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
+ hwcap2 = getauxval(AT_HWCAP2);
+#endif
+
+#ifdef POWER8_VSX
+ if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ features->has_arch_2_07 = 1;
+#endif
+#ifdef POWER9
+ if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ features->has_arch_3_00 = 1;
+#endif
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.h b/neozip/arch/power/power_features.h
new file mode 100644
index 0000000000..1ff51de5dd
--- /dev/null
+++ b/neozip/arch/power/power_features.h
@@ -0,0 +1,18 @@
+/* power_features.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FEATURES_H_
+#define POWER_FEATURES_H_
+
+struct power_cpu_features {
+ int has_altivec;
+ int has_arch_2_07;
+ int has_arch_3_00;
+};
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features);
+
+#endif /* POWER_FEATURES_H_ */
diff --git a/neozip/arch/power/power_functions.h b/neozip/arch/power/power_functions.h
new file mode 100644
index 0000000000..ccc7754a4c
--- /dev/null
+++ b/neozip/arch/power/power_functions.h
@@ -0,0 +1,74 @@
+/* power_functions.h -- POWER implementations for arch-specific functions.
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FUNCTIONS_H_
+#define POWER_FUNCTIONS_H_
+
+#include "power_natives.h"
+
+#ifdef PPC_VMX
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_vmx(deflate_state *s);
+#endif
+
+#ifdef POWER8_VSX
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_power8(deflate_state *s);
+void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef POWER9
+uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Power - VMX
+# ifdef PPC_VMX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_vmx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_vmx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_vmx
+# endif
+// Power8 - VSX
+# ifdef POWER8_VSX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_power8
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_power8
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_power8
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_power8
+# undef native_slide_hash
+# define native_slide_hash slide_hash_power8
+# endif
+# ifdef POWER8_VSX_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_power8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_power8
+# endif
+// Power9
+# ifdef POWER9_NATIVE
+# undef native_compare256
+# define native_compare256 compare256_power9
+# undef native_longest_match
+# define native_longest_match longest_match_power9
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_power9
+# endif
+#endif
+
+#endif /* POWER_FUNCTIONS_H_ */
diff --git a/neozip/arch/power/power_intrins.h b/neozip/arch/power/power_intrins.h
new file mode 100644
index 0000000000..3efcfb9722
--- /dev/null
+++ b/neozip/arch/power/power_intrins.h
@@ -0,0 +1,61 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ * Daniel Black <daniel@linux.vnet.ibm.com>
+ * Rogerio Alves <rogealve@br.ibm.com>
+ * Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_INTRINS_H
+#define POWER_INTRINS_H
+
+#include <altivec.h>
+
+#if defined (__clang__)
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
+#endif
+#ifndef __builtin_crypto_vpmsumd
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
+#endif
+
+#ifdef __VSX__
+static inline __vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b) {
+ return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+#endif
+
+#endif
+
+/* There's no version of this that operates over unsigned and if casted, it does
+ * sign extension. Let's write an endian independent version and hope the compiler
+ * eliminates creating another zero idiom for the zero value if one exists locally */
+static inline vector unsigned short vec_unpackl(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergel(zero, a);
+#else
+ return (vector unsigned short)vec_mergel(a, zero);
+#endif
+}
+
+static inline vector unsigned short vec_unpackh(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergeh(zero, a);
+#else
+ return (vector unsigned short)vec_mergeh(a, zero);
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_natives.h b/neozip/arch/power/power_natives.h
new file mode 100644
index 0000000000..59ec8a8aed
--- /dev/null
+++ b/neozip/arch/power/power_natives.h
@@ -0,0 +1,27 @@
+/* power_natives.h -- POWER compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_NATIVES_H_
+#define POWER_NATIVES_H_
+
+#if defined(__ALTIVEC__)
+# ifdef PPC_VMX
+# define PPC_VMX_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR8) && defined(__VSX__)
+# ifdef POWER8_VSX
+# define POWER8_VSX_NATIVE
+# endif
+# ifdef POWER8_VSX_CRC32
+# define POWER8_VSX_CRC32_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR9)
+# ifdef POWER9
+# define POWER9_NATIVE
+# endif
+#endif
+
+#endif /* POWER_NATIVES_H_ */
diff --git a/neozip/arch/power/slide_hash_power8.c b/neozip/arch/power/slide_hash_power8.c
new file mode 100644
index 0000000000..d01e0acd56
--- /dev/null
+++ b/neozip/arch/power/slide_hash_power8.c
@@ -0,0 +1,12 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#define SLIDE_PPC slide_hash_power8
+#include "slide_ppc_tpl.h"
+
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/slide_hash_vmx.c b/neozip/arch/power/slide_hash_vmx.c
new file mode 100644
index 0000000000..5a87ef7d9a
--- /dev/null
+++ b/neozip/arch/power/slide_hash_vmx.c
@@ -0,0 +1,10 @@
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX
+
+#define SLIDE_PPC slide_hash_vmx
+#include "slide_ppc_tpl.h"
+
+#endif /* PPC_VMX */
diff --git a/neozip/arch/power/slide_ppc_tpl.h b/neozip/arch/power/slide_ppc_tpl.h
new file mode 100644
index 0000000000..24629b4039
--- /dev/null
+++ b/neozip/arch/power/slide_ppc_tpl.h
@@ -0,0 +1,44 @@
+/* Optimized slide_hash for PowerPC processors
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ const vector unsigned short vmx_wsize = vec_splats(wsize);
+ Pos *p = table;
+
+ do {
+ /* Do the pointer arithmetic early to hopefully overlap the vector unit */
+ Pos *q = p;
+ p += 32;
+ vector unsigned short value0, value1, value2, value3;
+ vector unsigned short result0, result1, result2, result3;
+
+ value0 = vec_ld(0, q);
+ value1 = vec_ld(16, q);
+ value2 = vec_ld(32, q);
+ value3 = vec_ld(48, q);
+ result0 = vec_subs(value0, vmx_wsize);
+ result1 = vec_subs(value1, vmx_wsize);
+ result2 = vec_subs(value2, vmx_wsize);
+ result3 = vec_subs(value3, vmx_wsize);
+ vec_st(result0, 0, q);
+ vec_st(result1, 16, q);
+ vec_st(result2, 32, q);
+ vec_st(result3, 48, q);
+
+ entries -= 32;
+ } while (entries);
+}
+
+void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
diff --git a/neozip/arch/riscv/Makefile.in b/neozip/arch/riscv/Makefile.in
new file mode 100644
index 0000000000..43176eee6e
--- /dev/null
+++ b/neozip/arch/riscv/Makefile.in
@@ -0,0 +1,72 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# Copyright (C) 2025 Yin Tong <yintong.ustc@bytedance.com>, ByteDance
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+RVVFLAG=
+RVVZBCFLAG=
+ZBCFLAG=
+
+all: \
+ riscv_features.o riscv_features.lo \
+ adler32_rvv.o adler32_rvv.lo \
+ chunkset_rvv.o chunkset_rvv.lo \
+ compare256_rvv.o compare256_rvv.lo \
+ slide_hash_rvv.o slide_hash_rvv.lo \
+ crc32_zbc.o crc32_zbc.lo
+
+riscv_features.o: $(SRCDIR)/riscv_features.c
+ $(CC) $(CFLAGS) $(RVVZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+riscv_features.lo: $(SRCDIR)/riscv_features.c
+ $(CC) $(SFLAGS) $(RVVZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+adler32_rvv.o: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+adler32_rvv.lo: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+chunkset_rvv.o: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+chunkset_rvv.lo: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+compare256_rvv.o: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+compare256_rvv.lo: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+slide_hash_rvv.o: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+slide_hash_rvv.lo: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+crc32_zbc.o: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(CFLAGS) $(ZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+crc32_zbc.lo: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(SFLAGS) $(ZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/riscv/README.md b/neozip/arch/riscv/README.md
new file mode 100644
index 0000000000..013095c373
--- /dev/null
+++ b/neozip/arch/riscv/README.md
@@ -0,0 +1,45 @@
+# Building RISC-V Target with Cmake #
+
+> **Warning**
+> Runtime rvv detection (using `hwcap`) requires linux kernel 6.5 or newer.
+>
+> When running on older kernels, we fall back to compile-time detection, potentially this can cause crashes if rvv is enabled at compile but not supported by the target cpu.
+> Therefore if older kernel support is needed, rvv should be disabled if the target cpu does not support it.
+## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
+
+If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
+
+```bash
+./prepare_riscv_toolchain_qemu.sh
+```
+
+After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
+`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
+
+You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
+
+## Cross-Compile for RISC-V Target ##
+
+```bash
+cmake -G Ninja -B ./build-riscv \
+ -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
+ -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
+ -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ -D QEMU_PATH={QEMU_PATH} \
+ .
+
+cmake --build ./build-riscv
+```
+
+Disable the option if there is no RVV support:
+```
+-D WITH_RVV=OFF
+```
+
+## Run Unittests on User Mode QEMU ##
+
+```bash
+cd ./build-riscv && ctest --verbose
+```
diff --git a/neozip/arch/riscv/adler32_rvv.c b/neozip/arch/riscv/adler32_rvv.c
new file mode 100644
index 0000000000..e446189302
--- /dev/null
+++ b/neozip/arch/riscv/adler32_rvv.c
@@ -0,0 +1,119 @@
+/* adler32_rvv.c - RVV version of adler32
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <riscv_vector.h>
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ size_t left = len;
+ size_t vl = __riscv_vsetvlmax_e8m1();
+ vl = MIN(vl, 256);
+ vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint16m2_t v_buf16_accu;
+
+ /*
+ * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+ * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+ * accumulators to boost performance.
+ *
+ * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
+ * vl > 256 (255 * 256 <= UINT16_MAX).
+ *
+ * We accumulate 8-bit data into a 16-bit accumulator and then
+ * move the data into the 32-bit accumulator at the last iteration.
+ */
+ size_t block_size = (256 / vl) * vl;
+ size_t nmax_limit = (NMAX / block_size);
+ size_t cnt = 0;
+ while (left >= block_size) {
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t subprob = block_size;
+ while (subprob > 0) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ subprob -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+ left -= block_size;
+ /* do modulo once each block of NMAX size */
+ if (++cnt >= nmax_limit) {
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
+ cnt = 0;
+ }
+ }
+ /* the left len <= 256 now, we can use 16-bit accum safely */
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t res = left;
+ while (left >= vl) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ left -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+ vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+ vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+ vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+ v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+ vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+ uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;
+
+ sum2 += (sum2_sum + adler * ((len - left) % BASE));
+
+ vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+ uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE;
+
+ adler += adler_sum;
+
+ sum2 %= BASE;
+ adler %= BASE;
+
+ /* Process tail (left < 256). */
+ return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+ return adler32_copy_impl(adler, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/chunkset_rvv.c b/neozip/arch/riscv/chunkset_rvv.c
new file mode 100644
index 0000000000..cd8ed3cfd2
--- /dev/null
+++ b/neozip/arch/riscv/chunkset_rvv.c
@@ -0,0 +1,126 @@
+/* chunkset_rvv.c - RVV version of chunkset
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+
+#include <riscv_vector.h>
+
+/*
+ * RISC-V glibc may enable an RVV-optimized memcpy at runtime via IFUNC,
+ * so we prefer using a large chunk size and copying as much memory as possible.
+ */
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Broadcast one uint<elen>_t value read from `from` across the entire `chunk`
+ * using RVV stores at LMUL=4; `elen` is the element width in bits (16/32/64).
+ * Loops on vsetvl so it also works when the chunk exceeds one vector. */
+#define CHUNK_MEMSET_RVV_IMPL(from, chunk, elen) \
+do { \
+    size_t vl, len = sizeof(*chunk) / sizeof(uint##elen##_t); \
+    uint##elen##_t val = *(uint##elen##_t*)from; \
+    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
+    do { \
+        vl = __riscv_vsetvl_e##elen##m4(len); \
+        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
+        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
+        len -= vl; chunk_p += vl; \
+    } while (len > 0); \
+} while (0)
+
+/* We don't have a 32-byte datatype for RISC-V arch. */
+typedef struct chunk_s {
+ uint64_t data[4];
+} chunk_t;
+
+/* Fill `chunk` with repeated copies of the 2-byte value at `from`. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 16);
+}
+
+/* Fill `chunk` with repeated copies of the 4-byte value at `from`. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 32);
+}
+
+/* Fill `chunk` with repeated copies of the 8-byte value at `from`. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(from, chunk, 64);
+}
+
+/* Load one chunk_t (4 x uint64_t = 32 bytes) from possibly-unaligned memory `s`. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    memcpy(chunk->data, (uint8_t *)s, sizeof(*chunk));
+}
+
+/* Store one chunk_t (32 bytes) to possibly-unaligned memory `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    memcpy(out, chunk->data, sizeof(*chunk));
+}
+
+#define CHUNKSIZE chunksize_rvv
+#define CHUNKCOPY chunkcopy_rvv
+#define CHUNKUNROLL chunkunroll_rvv
+#define CHUNKMEMSET chunkmemset_rvv
+#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
+
+#define HAVE_CHUNKCOPY
+
+/*
+ * Assuming that the length is non-zero, and that `from` lags `out` by at least
+ * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
+ *
+ * We load/store a single chunk once in the `CHUNKCOPY`.
+ * However, RISC-V glibc may enable an RVV-optimized memcpy at runtime via IFUNC,
+ * so we prefer to copy as much memory as possible at once to take advantage of RVV.
+ *
+ * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot,
+ * but we still copy as much memory as possible for some conditions.
+ *
+ * case 1: out - from >= len (no overlap)
+ * We can use memcpy to copy `len` size once
+ * because the memory layout would be the same.
+ *
+ * case 2: overlap
+ * We copy N chunks using memcpy at once, aiming to achieve our goal:
+ * to copy as much memory as possible.
+ *
+ * After using a single memcpy to copy N chunks, we have to use series of
+ * loadchunk and storechunk to ensure the result is correct.
+ */
+/* Copy `len` (> 0) bytes from `from` to `out` and return out + len.
+ * See the case analysis in the comment block above: a non-overlapping copy
+ * is done with a single memcpy; an overlapping copy is done in memcpy calls
+ * whose source and destination regions never overlap. */
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    size_t dist = out - from;
+
+    /* Case 1: no overlap - copy everything in one shot.
+     * (Previously this also advanced `from`, a dead store; removed.) */
+    if (out < from || dist >= len) {
+        memcpy(out, from, len);
+        return out + len;
+    }
+
+    /* Case 2: overlap. Emit one full chunk, then advance by `align` so the
+     * remaining length becomes a multiple of sizeof(chunk_t). */
+    size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+    memcpy(out, from, sizeof(chunk_t));
+    out += align;
+    from += align;
+    len -= align;
+
+    /* Copy `dist` bytes rounded down to whole chunks per call: since
+     * out - from == dist and vl <= dist, each memcpy is non-overlapping. */
+    size_t vl = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
+    while (len > dist) {
+        memcpy(out, from, vl);
+        out += vl;
+        from += vl;
+        len -= vl;
+    }
+
+    /* Remaining tail, if any. */
+    if (len > 0) {
+        memcpy(out, from, len);
+        out += len;
+    }
+    return out;
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_rvv
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/riscv/compare256_rvv.c b/neozip/arch/riscv/compare256_rvv.c
new file mode 100644
index 0000000000..edb18a3766
--- /dev/null
+++ b/neozip/arch/riscv/compare256_rvv.c
@@ -0,0 +1,48 @@
+/* compare256_rvv.c - RVV version of compare256
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+/* Return the number of leading bytes (0..256) that are equal between src0
+ * and src1. Each iteration compares one vector's worth of bytes (e8, LMUL=4);
+ * vfirst returns the index of the first mismatching lane, or -1 if none. */
+static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    size_t vl;
+    long found_diff;
+    do {
+        vl = __riscv_vsetvl_e8m4(256 - len);
+        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
+        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
+        if (found_diff >= 0)
+            return len + (uint32_t)found_diff;
+        src0 += vl, src1 += vl, len += vl;
+    } while (len < 256);
+
+    /* All 256 bytes matched. */
+    return 256;
+}
+
+/* Externally visible entry point; forwards to the static implementation,
+ * which is also inlined into the match templates below via COMPARE256. */
+Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_rvv_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/crc32_zbc.c b/neozip/arch/riscv/crc32_zbc.c
new file mode 100644
index 0000000000..cf52279b80
--- /dev/null
+++ b/neozip/arch/riscv/crc32_zbc.c
@@ -0,0 +1,103 @@
+/* crc32_zbc.c - RISCV Zbc version of crc32
+ * Copyright (C) 2025 ByteDance. All rights reserved.
+ * Contributed by Yin Tong <yintong.ustc@bytedance.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_CRC32_ZBC
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul
+#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul
+
+#define CONSTANT_R3 0x1751997d0ULL
+#define CONSTANT_R4 0x0ccaa009eULL
+#define CONSTANT_R5 0x163cd6124ULL
+#define MASK32 0xFFFFFFFF
+#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL
+#define CONSTANT_RU 0x1F7011641ULL
+
+/* Zbc carry-less multiply: low 64 bits of the 128-bit product of a and b. */
+static inline uint64_t clmul(uint64_t a, uint64_t b) {
+    uint64_t res;
+    __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+    return res;
+}
+
+/* Zbc carry-less multiply high: upper 64 bits of the 128-bit product. */
+static inline uint64_t clmulh(uint64_t a, uint64_t b) {
+    uint64_t res;
+    __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+    return res;
+}
+
+/* Fold `len` bytes at `buf` into `crc` using carry-less multiplication,
+ * then reduce the 128-bit remainder to 32 bits via Barrett reduction.
+ * The caller passes a pre-inverted crc and inverts the result again
+ * (see crc32_riscv64_zbc). Reads buf64[0..1] unconditionally, so the
+ * caller must guarantee len >= 16; it also strips len to a multiple of
+ * 16 beforehand. */
+Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) {
+    const uint64_t *buf64 = (const uint64_t *)buf;
+    uint64_t low = buf64[0] ^ crc;
+    uint64_t high = buf64[1];
+
+    if (len < 16)
+        goto finish_fold;
+    len -= 16;
+    buf64 += 2;
+
+    // process each 16-byte block
+    while (len >= 16) {
+        uint64_t t2 = clmul(CONSTANT_R4, high);
+        uint64_t t3 = clmulh(CONSTANT_R4, high);
+
+        uint64_t t0_new = clmul(CONSTANT_R3, low);
+        uint64_t t1_new = clmulh(CONSTANT_R3, low);
+
+        // Combine the results and XOR with new data
+        low = t0_new ^ t2;
+        high = t1_new ^ t3;
+        low ^= buf64[0];
+        high ^= buf64[1];
+
+        buf64 += 2;
+        len -= 16;
+    }
+
+finish_fold:
+    // Fold the 128-bit result into 64 bits
+    uint64_t fold_t3 = clmulh(low, CONSTANT_R4);
+    uint64_t fold_t2 = clmul(low, CONSTANT_R4);
+    low = high ^ fold_t2;
+    high = fold_t3;
+
+    // Combine the low and high parts and perform polynomial reduction
+    uint64_t combined = (low >> 32) | ((high & MASK32) << 32);
+    uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined;
+
+    // Barrett reduction step
+    uint64_t barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32;
+    barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL);
+    uint64_t final = barrett ^ reduced_low;
+
+    // Return the high 32 bits as the final CRC
+    return (uint32_t)(final >> 32);
+}
+
+/* Compute the CRC-32 of buf[0..len) using the Zbc carry-less multiply
+ * extension. Buffers shorter than CLMUL_MIN_LEN fall back entirely to the
+ * braid implementation. Otherwise the first (len % 16) bytes are consumed
+ * by crc32_braid so that the CLMUL kernel only sees a length that is a
+ * multiple of the 16-byte chunk size. */
+Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) {
+    if (len < CLMUL_MIN_LEN) {
+        return crc32_braid(crc, buf, len);
+    }
+
+    uint64_t unaligned_length = len % CLMUL_CHUNK_LEN;
+    if (unaligned_length) {
+        crc = crc32_braid(crc, buf, unaligned_length);
+        buf += unaligned_length;
+        len -= unaligned_length;
+    }
+
+    /* CRC-32 is computed over the inverted value; invert around the kernel. */
+    crc = crc32_clmul_impl(~crc, buf, len);
+    return ~crc;
+}
+
+/* Compute the CRC-32 of src[0..len) and copy the data to dst.
+ * The checksum is computed first, then the bytes are copied with memcpy. */
+Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc = crc32_riscv64_zbc(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+}
+#endif
diff --git a/neozip/arch/riscv/riscv_features.c b/neozip/arch/riscv/riscv_features.c
new file mode 100644
index 0000000000..b23f10a699
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.c
@@ -0,0 +1,99 @@
+#ifdef RISCV_FEATURES
+
+#define _DEFAULT_SOURCE 1 /* For syscall() */
+
+#include "zbuild.h"
+#include "riscv_features.h"
+
+#include <sys/utsname.h>
+
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+#endif
+
+#if defined(__linux__) && defined(HAVE_ASM_HWPROBE_H)
+# include <asm/hwprobe.h>
+# include <sys/syscall.h> /* For __NR_riscv_hwprobe */
+# include <unistd.h> /* For syscall() */
+#endif
+
+#define ISA_V_HWCAP (1 << ('v' - 'a'))
+#define ISA_ZBC_HWCAP (1 << 29)
+
+/* Detect RVV/Zbc via the riscv_hwprobe(2) syscall.
+ * Returns 1 when the probe succeeded and *features was filled in,
+ * 0 when the syscall or the hwprobe keys are unavailable (either at
+ * build time, via the #if guard, or at runtime, via the syscall result). */
+static int riscv_check_features_runtime_hwprobe(struct riscv_cpu_features *features) {
+#if defined(__NR_riscv_hwprobe) && defined(RISCV_HWPROBE_KEY_IMA_EXT_0)
+    struct riscv_hwprobe probes[] = {
+        {RISCV_HWPROBE_KEY_IMA_EXT_0, 0},
+    };
+    int ret;
+    unsigned i;
+
+    ret = syscall(__NR_riscv_hwprobe, probes, sizeof(probes) / sizeof(probes[0]), 0, NULL, 0);
+
+    if (ret != 0) {
+        /* Kernel does not support hwprobe */
+        return 0;
+    }
+
+    /* The extension bit macros may not exist in older headers, so each
+     * lookup is guarded individually. */
+    for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
+        switch (probes[i].key) {
+        case RISCV_HWPROBE_KEY_IMA_EXT_0:
+# ifdef RISCV_HWPROBE_IMA_V
+            features->has_rvv = !!(probes[i].value & RISCV_HWPROBE_IMA_V);
+# endif
+# ifdef RISCV_HWPROBE_EXT_ZBC
+            features->has_zbc = !!(probes[i].value & RISCV_HWPROBE_EXT_ZBC);
+# endif
+            break;
+        }
+    }
+
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* Fallback detection via getauxval(AT_HWCAP). The 'V' single-letter
+ * extension has a well-defined hwcap bit (ISA_V_HWCAP).
+ * NOTE(review): ISA_ZBC_HWCAP is bit 29, which is not a documented
+ * single-letter extension bit in the Linux riscv hwcap ABI -- confirm
+ * against the target kernel. Returns 1 if hwcap was consulted, 0 if
+ * unavailable on this platform. */
+static int riscv_check_features_runtime_hwcap(struct riscv_cpu_features *features) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+
+    features->has_rvv = hw_cap & ISA_V_HWCAP;
+    features->has_zbc = hw_cap & ISA_ZBC_HWCAP;
+
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* Populate *features, preferring the precise hwprobe interface and
+ * falling back to AT_HWCAP when hwprobe is unavailable. */
+static void riscv_check_features_runtime(struct riscv_cpu_features *features) {
+    if (riscv_check_features_runtime_hwprobe(features))
+        return;
+
+    riscv_check_features_runtime_hwcap(features);
+}
+
+/* Public feature-detection entry point: run the kernel-assisted probes,
+ * then sanity-check the RVV result by executing vsetvli on this CPU. */
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
+    riscv_check_features_runtime(features);
+#ifdef RISCV_RVV
+    if (features->has_rvv) {
+        size_t e8m1_vec_len;
+        intptr_t vtype_reg_val;
+        // Check that a vuint8m1_t vector is at least 16 bytes and that tail
+        // agnostic and mask agnostic mode are supported
+        //
+        __asm__ volatile(
+            "vsetvli %0, zero, e8, m1, ta, ma\n\t"
+            "csrr %1, vtype"
+            : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));
+
+        // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
+        // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
+        // bytes
+        features->has_rvv = (vtype_reg_val >= 0 && e8m1_vec_len >= 16);
+    }
+#endif
+}
+
+#endif
diff --git a/neozip/arch/riscv/riscv_features.h b/neozip/arch/riscv/riscv_features.h
new file mode 100644
index 0000000000..42855a1b6b
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.h
@@ -0,0 +1,19 @@
+/* riscv_features.h -- check for riscv features.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FEATURES_H_
+#define RISCV_FEATURES_H_
+
+/* Runtime-detected RISC-V CPU capabilities (filled by riscv_check_features). */
+struct riscv_cpu_features {
+    int has_rvv; /* nonzero if the Vector extension is usable */
+    int has_zbc; /* nonzero if the Zbc carry-less multiply extension is present */
+};
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
+
+#endif /* RISCV_FEATURES_H_ */
diff --git a/neozip/arch/riscv/riscv_functions.h b/neozip/arch/riscv/riscv_functions.h
new file mode 100644
index 0000000000..89120ffabf
--- /dev/null
+++ b/neozip/arch/riscv/riscv_functions.h
@@ -0,0 +1,60 @@
+/* riscv_functions.h -- RISCV implementations for arch-specific functions.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FUNCTIONS_H_
+#define RISCV_FUNCTIONS_H_
+
+#include "riscv_natives.h"
+
+#ifdef RISCV_RVV
+uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t longest_match_rvv(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_rvv(deflate_state *const s, uint32_t cur_match);
+void slide_hash_rvv(deflate_state *s);
+void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef RISCV_CRC32_ZBC
+uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// RISCV - RVV
+# ifdef RISCV_RVV_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_rvv
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_rvv
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_rvv
+# undef native_compare256
+# define native_compare256 compare256_rvv
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_rvv
+# undef native_longest_match
+# define native_longest_match longest_match_rvv
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_rvv
+# undef native_slide_hash
+# define native_slide_hash slide_hash_rvv
+# endif
+// RISCV - CRC32
+# ifdef RISCV_ZBC_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_riscv64_zbc
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_riscv64_zbc
+# endif
+#endif
+
+#endif /* RISCV_FUNCTIONS_H_ */
diff --git a/neozip/arch/riscv/riscv_natives.h b/neozip/arch/riscv/riscv_natives.h
new file mode 100644
index 0000000000..38d7aba648
--- /dev/null
+++ b/neozip/arch/riscv/riscv_natives.h
@@ -0,0 +1,19 @@
+/* riscv_natives.h -- RISCV compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_NATIVES_H_
+#define RISCV_NATIVES_H_
+
+#if defined(__riscv_v) && defined(__linux__)
+# ifdef RISCV_RVV
+# define RISCV_RVV_NATIVE
+# endif
+#endif
+#if defined(__riscv_zbc)
+# ifdef RISCV_CRC32_ZBC
+# define RISCV_ZBC_NATIVE
+# endif
+#endif
+
+#endif /* RISCV_NATIVES_H_ */
diff --git a/neozip/arch/riscv/slide_hash_rvv.c b/neozip/arch/riscv/slide_hash_rvv.c
new file mode 100644
index 0000000000..e794c38204
--- /dev/null
+++ b/neozip/arch/riscv/slide_hash_rvv.c
@@ -0,0 +1,33 @@
+/* slide_hash_rvv.c - RVV version of slide_hash
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+/* Subtract `wsize` from every entry of `table` using a saturating
+ * unsigned subtract (vssubu), so entries below wsize clamp to zero
+ * instead of wrapping around. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    size_t vl;
+    while (entries > 0) {
+        vl = __riscv_vsetvl_e16m4(entries);
+        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
+        vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
+        __riscv_vse16_v_u16m4(table, v_diff, vl);
+        table += vl, entries -= vl;
+    }
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides
+ * by w_size bytes. */
+Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/s390/Makefile.in b/neozip/arch/s390/Makefile.in
new file mode 100644
index 0000000000..e994157df2
--- /dev/null
+++ b/neozip/arch/s390/Makefile.in
@@ -0,0 +1,48 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# The variables below are left empty here; presumably they are filled in
+# by the configure script (this is a Makefile.in template).
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+VGFMAFLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+s390_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+s390_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c
+
+dfltcc_deflate.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_deflate.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c
+
+dfltcc_inflate.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+dfltcc_inflate.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c
+
+crc32-vx.o:
+ $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+crc32-vx.lo:
+ $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/s390/README.md b/neozip/arch/s390/README.md
new file mode 100644
index 0000000000..c56ffd7654
--- /dev/null
+++ b/neozip/arch/s390/README.md
@@ -0,0 +1,265 @@
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:
+
+ $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
+ $ make
+
+or
+
+ $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
+ $ make
+
+When built like this, zlib-ng will compress using hardware on level 1
+and using software on all other levels. Decompression will always happen
+in hardware. In order to enable hardware compression for levels 1-6
+(i.e. to make it used by default) one could add
+`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
+
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
+`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
+particular stream.
+
+DFLTCC does not support every single zlib-ng feature, in particular:
+
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`
+
+When used, these functions will either switch to software, or, in case
+this is not possible, gracefully fail.
+
+# Code structure
+
+All SystemZ-specific code lives in `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros
+
+DFLTCC takes as arguments a parameter block, an input buffer, an output
+buffer, and a window. Parameter blocks are stored alongside zlib states;
+buffers are forwarded from the caller; and window - which must be
+4k-aligned and is always 64k large, is managed using the `PAD_WINDOW()`,
+`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW` and `DEFLATE_ADJUST_WINDOW_SIZE()`
+and `INFLATE_ADJUST_WINDOW_SIZE()` hooks.
+
+Software and hardware window formats do not match, therefore,
+`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()`
+and `inflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`,
+`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros.
+
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.
+
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.
+
+The algorithm implemented in hardware has different compression ratio
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.
+
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.
+
+In addition to compression, DFLTCC computes CRC-32 and Adler-32
+checksums, therefore, whenever it's used, software checksumming is
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
+macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`.
+* Translating between software and hardware data formats, e.g.
+ `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+ `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+ is not the case, then it returns `0`, making `deflate()` use some
+ other function in order to compress in software. Otherwise it returns
+ `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+ when explicitly instructed to do so by the software. Furthermore,
+ whether to use fixed or dynamic Huffman tables must also be determined
+ by the software. Since looking at data in order to gather statistics
+ would negate performance benefits, the following approach is used: the
+ first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+ block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+ dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+ instructs DFLTCC to write EOBS, however, certain conditions need to be
+ met: input data length must be non-zero or Continuation Flag must be
+ set. To put this in simpler terms, DFLTCC will silently refuse to
+ write EOBS if this is the only thing that it is asked to do. Since the
+ code has to be able to emit EOBS in software anyway, in order to avoid
+ tricky corner cases Block Closing Control is never used. Whether to
+ write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+ must perform various additional actions when a block or a stream ends.
+ `dfltcc_deflate()` informs `deflate()` about this using
+ `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+ and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+ and Sub-Byte Boundary. Certain fields cannot be translated and must
+ persist untouched in the parameter block between calls, for example,
+ Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+ quite intertwined and pervasive. The general idea here is that the
+ code must not do anything in software - whether explicitly by e.g.
+ calling `send_eobs()`, or implicitly - by returning to `deflate()`
+ with certain return and `*result` values, when Continuation Flag is
+ set.
+* Ending streams. When a new block is started and flush mode is
+ `Z_FINISH`, Block Header Final parameter block bit is used to mark
+ this block as final. However, sometimes an empty final block is
+ needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+ refuse to do this. The general idea of DFLTCC implementation is to
+ rely as much as possible on the existing code. Here in order to do
+ this, the code pretends that it does not support DFLTCC, which makes
+ `deflate()` call a software compression function, which writes an
+ empty final block. Whether this is required is controlled by
+ `need_empty_block` variable.
+* Error handling. This is simply converting
+ Operation-Ending-Supplemental Code to string. Errors can only happen
+ due to things like memory corruption, and therefore they don't affect
+ the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+ Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+ block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+ the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+ `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+ and vice versa. For example, `whave` and History Length or `wnext` and
+ History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+ and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+ Operation-Ending-Supplemental Code to string conversion. Unlike
+ deflate, errors may happen due to bad inputs, therefore they are
+ propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given complexity of DFLTCC machine instruction, it is not clear whether
+QEMU TCG will ever support it. At the time of writing, one has to have
+access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
+DFLTCC is a non-privileged instruction, neither special VM/LPAR
+configuration nor root are required.
+
+zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC
+testing. There is no official IBM Z GitHub Actions runner, so we build
+one inspired by `anup-kodlekere/gaplib`.
+Future updates to actions-runner might need an updated patch. The .NET
+version-number patch is kept in its own file to avoid having to change
+the main patch constantly.
+
+## Configuring the builder.
+
+### Install prerequisites.
+```
+sudo dnf install podman
+```
+
+### Create a config file, needs github personal access token.
+Access token needs permissions; Repo Admin RW, Org Self-hosted runners RW.
+For details, consult
+https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository
+
+#### Create file /etc/actions-runner:
+```
+REPO=<owner>/<name>
+PAT_TOKEN=<github_pat_***>
+```
+
+#### Set permissions on /etc/actions-runner:
+```
+chmod 600 /etc/actions-runner
+```
+
+### Add actions-runner service.
+```
+sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+```
+
+### Autostart actions-runner.
+```
+$ sudo systemctl enable --now actions-runner
+```
+
+### Add auto-rebuild cronjob
+```
+sudo cp self-hosted-builder/actions-runner-rebuild.sh /etc/cron.weekly/
+chmod +x /etc/cron.weekly/actions-runner-rebuild.sh
+```
+
+## Building / Rebuilding the container
+```
+sudo /etc/cron.weekly/actions-runner-rebuild.sh
+```
diff --git a/neozip/arch/s390/crc32-vx.c b/neozip/arch/s390/crc32-vx.c
new file mode 100644
index 0000000000..ba00f9a370
--- /dev/null
+++ b/neozip/arch/s390/crc32-vx.c
@@ -0,0 +1,232 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#ifdef S390_CRC32_VX
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
+ /*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1
+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1
+ * R5 = [(x64 mod P'(x) << 32)]' << 1
+ * R6 = [(x32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barret reduction constant, u', is defined as
+ * the bit reversal of floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+ const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
+ const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
+ const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
+ const uv2di r5 = {0, 0x163CD6124}; /* R5 */
+ const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
+ const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ uv2di v0 = {0, 0};
+ v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+ uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+ uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+ uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+ v1 ^= v0;
+ buf += 64;
+ len -= 64;
+
+ while (len >= 64) {
+ /* Load the next 64-byte data chunk */
+ uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+ uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+ uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+ uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate result
+ * is then folded (accumulated) with the next data chunk in PART1 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+ v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+ v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+ v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+ buf += 64;
+ len -= 64;
+ }
+
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+ * and R4 and accumulating the next 128-bit chunk until a single 128-bit
+ * value remains.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+ while (len >= 16) {
+ /* Load next data chunk */
+ v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+ /* Fold next data chunk */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ v0 = vec_srb(r4r3, (uv2di)v9);
+ v0[0] = 1;
+
+ /*
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded
+ */
+ v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this by a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ v9 = vec_insert((unsigned char)0x20, v9, 7);
+ v2 = vec_srb(v1, (uv2di)v9);
+ v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
+ v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+ /*
+ * Apply a Barret reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ v2 = vec_unpackl((uv4si)v1);
+ v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ v2 = vec_unpackl((uv4si)v2);
+ v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+ return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
+    /* Too small for the vector kernel plus worst-case alignment overhead:
+     * use the software braid implementation for everything. */
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+        return crc32_braid(crc, buf, len);
+
+    /* Handle a misaligned head in software to reach a 16-byte boundary. */
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {
+        size_t head = (size_t)ALIGN_DIFF(buf, VX_ALIGNMENT);
+        crc = crc32_braid(crc, buf, head);
+        buf += head;
+        len -= head;
+    }
+
+    /* Vector kernel over the aligned middle; it operates on the inverted
+     * CRC form, hence the pre/post complement. */
+    size_t vx_len = ALIGN_DOWN(len, VX_ALIGNMENT);
+    size_t tail = len & VX_ALIGN_MASK;
+    crc = ~crc32_le_vgfm_16(~crc, buf, vx_len);
+
+    /* Finish any tail bytes in software. */
+    return tail ? crc32_braid(crc, buf + vx_len, tail) : crc;
+}
+
+Z_INTERNAL uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    /* Checksum first, then duplicate src into dst; keeping this order means
+     * the CRC input is read before the copy touches memory. */
+    uint32_t result = crc32_s390_vx(crc, src, len);
+    memcpy(dst, src, len);
+    return result;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_common.h b/neozip/arch/s390/dfltcc_common.h
new file mode 100644
index 0000000000..a6527ab5df
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_common.h
@@ -0,0 +1,119 @@
+#ifndef DFLTCC_COMMON_H
+#define DFLTCC_COMMON_H
+
+#include "zutil.h"
+
+/*
+   Parameter Block for Query Available Functions.
+ */
+struct dfltcc_qaf_param {
+    char fns[16]; /* Available-functions bit vector (tested with is_bit_set) */
+    char reserved1[8];
+    char fmts[2]; /* Available parameter-block formats bit vector */
+    char reserved2[6];
+} ALIGNED_(8);
+
+/*
+   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
+
+   The field layout is dictated by the DFLTCC instruction; the size/offset
+   static_asserts in dfltcc_detail.h verify it.  Do not reorder or repack.
+ */
+struct dfltcc_param_v0 {
+    uint16_t pbvn; /* Parameter-Block-Version Number */
+    uint8_t mvn; /* Model-Version Number */
+    uint8_t ribm; /* Reserved for IBM use */
+    uint32_t reserved32 : 31;
+    uint32_t cf : 1; /* Continuation Flag */
+    uint8_t reserved64[8];
+    uint32_t nt : 1; /* New Task */
+    uint32_t reserved129 : 1;
+    uint32_t cvt : 1; /* Check Value Type */
+    uint32_t reserved131 : 1;
+    uint32_t htt : 1; /* Huffman-Table Type */
+    uint32_t bcf : 1; /* Block-Continuation Flag */
+    uint32_t bcc : 1; /* Block Closing Control */
+    uint32_t bhf : 1; /* Block Header Final */
+    uint32_t reserved136 : 1;
+    uint32_t reserved137 : 1;
+    uint32_t dhtgc : 1; /* DHT Generation Control */
+    uint32_t reserved139 : 5;
+    uint32_t reserved144 : 5;
+    uint32_t sbb : 3; /* Sub-Byte Boundary */
+    uint8_t oesc; /* Operation-Ending-Supplemental Code */
+    uint32_t reserved160 : 12;
+    uint32_t ifs : 4; /* Incomplete-Function Status */
+    uint16_t ifl; /* Incomplete-Function Length */
+    uint8_t reserved192[8];
+    uint8_t reserved256[8];
+    uint8_t reserved320[4];
+    uint16_t hl; /* History Length */
+    uint32_t reserved368 : 1;
+    uint16_t ho : 15; /* History Offset */
+    uint32_t cv; /* Check Value */
+    uint32_t eobs : 15; /* End-of-block Symbol */
+    uint32_t reserved431: 1;
+    uint8_t eobl : 4; /* End-of-block Length */
+    uint32_t reserved436 : 12;
+    uint32_t reserved448 : 4;
+    uint16_t cdhtl : 12; /* Compressed-Dynamic-Huffman Table
+ Length */
+    uint8_t reserved464[6];
+    uint8_t cdht[288]; /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8]; /* Reserved for IBM use */
+    uint8_t csb[1152]; /* Continuation-State Buffer */
+} ALIGNED_(8);
+
+/*
+   Extension of inflate_state and deflate_state.
+ */
+struct dfltcc_state {
+    struct dfltcc_param_v0 param; /* Parameter block. */
+    struct dfltcc_qaf_param af; /* Available functions. */
+    char msg[64]; /* Buffer for strm->msg */
+};
+
+typedef struct {
+    struct dfltcc_state common;
+    uint16_t level_mask; /* Levels on which to use DFLTCC */
+    uint32_t block_size; /* New block each X bytes */
+    size_t block_threshold; /* New block after total_in > X */
+    uint32_t dht_threshold; /* New block only if avail_in >= X */
+} arch_deflate_state;
+
+typedef struct {
+    struct dfltcc_state common;
+} arch_inflate_state;
+
+/*
+   History buffer size.
+ */
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+/*
+   Sizes of deflate block parts.
+ */
+#define DFLTCC_BLOCK_HEADER_BITS 3
+#define DFLTCC_HLITS_COUNT_BITS 5
+#define DFLTCC_HDISTS_COUNT_BITS 5
+#define DFLTCC_HCLENS_COUNT_BITS 4
+#define DFLTCC_MAX_HCLENS 19
+#define DFLTCC_HCLEN_BITS 3
+#define DFLTCC_MAX_HLITS 286
+#define DFLTCC_MAX_HDISTS 30
+#define DFLTCC_MAX_HLIT_HDIST_BITS 7
+#define DFLTCC_MAX_SYMBOL_BITS 16
+#define DFLTCC_MAX_EOBS_BITS 15
+#define DFLTCC_MAX_PADDING_BITS 7
+
+/* Worst-case compressed size, in bytes, of source_len input bytes: header and
+ * table overhead plus up to DFLTCC_MAX_SYMBOL_BITS per input byte. */
+#define DEFLATE_BOUND_COMPLEN(source_len) \
+    ((DFLTCC_BLOCK_HEADER_BITS + \
+      DFLTCC_HLITS_COUNT_BITS + \
+      DFLTCC_HDISTS_COUNT_BITS + \
+      DFLTCC_HCLENS_COUNT_BITS + \
+      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
+      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
+      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
+      DFLTCC_MAX_EOBS_BITS + \
+      DFLTCC_MAX_PADDING_BITS) >> 3)
diff --git a/neozip/arch/s390/dfltcc_deflate.c b/neozip/arch/s390/dfltcc_deflate.c
new file mode 100644
index 0000000000..5cbd700c64
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_deflate.c
@@ -0,0 +1,390 @@
+/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL compression support. */
+
+/*
+ Use the following commands to build zlib-ng with DFLTCC compression support:
+
+ $ ./configure --with-dfltcc-deflate
+ or
+
+ $ cmake -DWITH_DFLTCC_DEFLATE=1 .
+
+ and then
+
+ $ make
+*/
+
+#ifdef S390_DFLTCC_DEFLATE
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "trees_emit.h"
+#include "dfltcc_deflate.h"
+#include "dfltcc_detail.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
+    /* Reset the DFLTCC portion of the deflate state and restore the default
+     * tuning knobs. */
+    arch_deflate_state *arch = &((deflate_state *)strm->state)->arch;
+
+    dfltcc_reset_state(&arch->common);
+
+    /* Default tuning parameters */
+    arch->level_mask = DFLTCC_LEVEL_MASK;
+    arch->block_size = DFLTCC_BLOCK_SIZE;
+    arch->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
+    arch->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
+}
+
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                                 int reproducible) {
+    /* Report whether DFLTCC-CMPR can be used for the given settings. */
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *arch = &state->arch;
+
+    /* Settings the hardware cannot honor */
+    if (reproducible
+        || (arch->level_mask & (1 << level)) == 0
+        || window_bits != HB_BITS
+        || (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY))
+        return 0;
+
+    /* Required hardware functions and parameter-block format */
+    return is_bit_set(arch->common.af.fns, DFLTCC_GDHT)
+        && is_bit_set(arch->common.af.fns, DFLTCC_CMPR)
+        && is_bit_set(arch->common.af.fmts, DFLTCC_FMT0);
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
+    /* Check the stream's current settings against the hardware capabilities. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    return dfltcc_can_deflate_with_params(strm, state->level, W_BITS(state),
+                                          state->strategy, state->reproducible);
+}
+
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
+    /* Generate a dynamic Huffman table from the upcoming input.  avail_in is
+     * passed as a local copy so the stream's byte count is left untouched. */
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t sample_len = strm->avail_in;
+
+    dfltcc(DFLTCC_GDHT, &state->arch.common.param, NULL, NULL, &strm->next_in, &sample_len, NULL);
+}
+
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
+    /* Run DFLTCC-CMPR over the stream buffers, then fold the consumed and
+     * produced byte counts back into the stream totals. */
+    deflate_state *state = (deflate_state *)strm->state;
+    size_t in_left = strm->avail_in;
+    size_t out_left = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR,
+                &state->arch.common.param, &strm->next_out, &out_left,
+                &strm->next_in, &in_left, state->window);
+    strm->total_in += (strm->avail_in - in_left);
+    strm->total_out += (strm->avail_out - out_left);
+    strm->avail_in = in_left;
+    strm->avail_out = out_left;
+    return cc;
+}
+
+/* Emit the End-of-block Symbol through the software bit buffer and flush it
+ * to next_out.  param->eobs keeps the code in the high-order bits of its
+ * 15-bit field (hence the >> (15 - eobl)), and bi_reverse converts it to the
+ * LSB-first order that send_bits expects. */
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
+    deflate_state *state = (deflate_state *)strm->state;
+
+    send_bits(state, bi_reverse((uint16_t)(param->eobs >> (15 - param->eobl)), param->eobl),
+              param->eobl, state->bi_buf, state->bi_valid);
+
+    flush_pending_inline(strm);
+    if (state->pending != 0) {
+        /* The remaining data is located in pending_out[0:pending]. If someone
+         * calls put_byte() - this might happen in deflate() - the byte will be
+         * placed into pending_buf[pending], which is incorrect. Move the
+         * remaining data to the beginning of pending_buf so that put_byte() is
+         * usable again.
+         */
+        memmove(state->pending_buf, state->pending_out, state->pending);
+        state->pending_out = state->pending_buf;
+    }
+#ifdef ZLIB_DEBUG
+    state->compressed_len += param->eobl;
+#endif
+}
+
+/* Hardware deflate entry point (wired in as DEFLATE_HOOK).  Returns 0 when
+ * the software path must handle the call (including writing a trailing empty
+ * block on Z_FINISH), or 1 when the hardware handled it; in the latter case
+ * *result receives the block_state for deflate()'s post-processing. */
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) {
+    deflate_state *state = (deflate_state *)strm->state;
+    arch_deflate_state *dfltcc_state = &state->arch;
+    struct dfltcc_param_v0 *param = &dfltcc_state->common.param;
+    uInt masked_avail_in;   /* Input temporarily hidden from the hardware */
+    dfltcc_cc cc;
+    int need_empty_block;
+    int soft_bcc;
+    int no_flush;
+
+    if (!PREFIX(dfltcc_can_deflate)(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        return 0;
+    }
+
+again:
+    masked_avail_in = 0;
+    soft_bcc = 0;
+    no_flush = flush == Z_NO_FLUSH;
+
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
+     */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+        }
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
+        return 1;
+    }
+
+    /* There is an open non-BFINAL block, we are not going to close it just
+     * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see
+     * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new
+     * DHT in order to adapt to a possibly changed input data distribution.
+     */
+    if (param->bcf && no_flush &&
+            strm->total_in > dfltcc_state->block_threshold &&
+            strm->avail_in >= dfltcc_state->dht_threshold) {
+        if (param->cf) {
+            /* We need to flush the DFLTCC buffer before writing the
+             * End-of-block Symbol. Mask the input data and proceed as usual.
+             */
+            masked_avail_in += strm->avail_in;
+            strm->avail_in = 0;
+            no_flush = 0;
+        } else {
+            /* DFLTCC buffer is empty, so we can manually write the
+             * End-of-block Symbol right away.
+             */
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        }
+    }
+
+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
+    /* The caller gave us too much data. Pass only one block worth of
+     * uncompressed data to DFLTCC and mask the rest, so that on the next
+     * iteration we start a new block.
+     */
+    if (no_flush && strm->avail_in > dfltcc_state->block_size) {
+        masked_avail_in += (strm->avail_in - dfltcc_state->block_size);
+        strm->avail_in = dfltcc_state->block_size;
+    }
+
+    /* When we have an open non-BFINAL deflate block and caller indicates that
+     * the stream is ending, we need to close an open deflate block and open a
+     * BFINAL one.
+     */
+    need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf;
+
+    /* Translate stream to parameter block */
+    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
+    if (!no_flush)
+        /* We need to close a block. Always do this in software - when there is
+         * no input data, the hardware will not honor BCC. */
+        soft_bcc = 1;
+    if (flush == Z_FINISH && !param->bcf)
+        /* We are about to open a BFINAL block, set Block Header Final bit
+         * until the stream ends.
+         */
+        param->bhf = 1;
+    /* DFLTCC-CMPR will write to next_out, so make sure that buffers with
+     * higher precedence are empty.
+     */
+    Assert(state->pending == 0, "There must be no pending bytes");
+    Assert(state->bi_valid < 8, "There must be less than 8 pending bits");
+    param->sbb = (unsigned int)state->bi_valid;
+    if (param->sbb > 0)
+        *strm->next_out = (unsigned char)state->bi_buf;
+    /* Honor history and check value */
+    param->nt = 0;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(strm->adler);
+
+    /* When opening a block, choose a Huffman-Table Type */
+    if (!param->bcf) {
+        if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0))
+            param->htt = HTT_FIXED;
+        else {
+            param->htt = HTT_DYNAMIC;
+            dfltcc_gdht(strm);
+        }
+    }
+
+    /* Deflate */
+    do {
+        cc = dfltcc_cmpr(strm);
+        if (strm->avail_in < 4096 && masked_avail_in > 0)
+            /* We are about to call DFLTCC with a small input buffer, which is
+             * inefficient. Since there is masked data, there will be at least
+             * one more DFLTCC call, so skip the current one and make the next
+             * one handle more data.
+             */
+            break;
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc);
+    state->bi_valid = param->sbb;
+    if (state->bi_valid == 0)
+        state->bi_buf = 0; /* Avoid accessing next_out */
+    else
+        state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        strm->adler = ZSWAP32(param->cv);
+
+    /* Unmask the input data */
+    strm->avail_in += masked_avail_in;
+    masked_avail_in = 0;
+
+    /* If we encounter an error, it means there is a bug in DFLTCC call */
+    Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG");
+
+    /* Update Block-Continuation Flag. It will be used to check whether to call
+     * GDHT the next time.
+     */
+    if (cc == DFLTCC_CC_OK) {
+        if (soft_bcc) {
+            send_eobs(strm, param);
+            param->bcf = 0;
+            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
+        } else
+            param->bcf = 1;
+        if (flush == Z_FINISH) {
+            if (need_empty_block)
+                /* Make the current deflate() call also close the stream */
+                return 0;
+            else {
+                bi_windup(state);
+                *result = finish_done;
+            }
+        } else {
+            if (flush == Z_FULL_FLUSH)
+                param->hl = 0; /* Clear history */
+            *result = flush == Z_NO_FLUSH ? need_more : block_done;
+        }
+    } else {
+        param->bcf = 1;
+        *result = need_more;
+    }
+    if (strm->avail_in != 0 && strm->avail_out != 0)
+        goto again; /* deflate() must use all input or all output */
+    return 1;
+}
+
+/*
+ Switching between hardware and software compression.
+
+ DFLTCC does not support all zlib settings, e.g. generation of non-compressed
+ blocks or alternative window sizes. When such settings are applied on the
+ fly with deflateParams, we need to convert between hardware and software
+ window formats.
+*/
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    /* True once DFLTCC has consumed input, started a task (nt cleared), or
+     * accumulated history. */
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (strm->total_in > 0)
+        return 1;
+    return param->nt == 0 || param->hl > 0;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
+    /* Decide what to do when compression parameters change mid-stream. */
+    deflate_state *state = (deflate_state *)strm->state;
+    int old_mode = PREFIX(dfltcc_can_deflate)(strm);
+    int new_mode = dfltcc_can_deflate_with_params(strm, level, W_BITS(state), strategy, state->reproducible);
+
+    /* Same mode (hardware or software) before and after, or DFLTCC has not
+     * touched the stream yet: nothing to convert. */
+    if (new_mode == old_mode || !dfltcc_was_deflate_used(strm))
+        return Z_OK;
+
+    /* For now, do not convert between window formats - simply get rid of the old data instead */
+    *flush = Z_FULL_FLUSH;
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) {
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    /* deflate(Z_FULL_FLUSH) with exhausted avail_out may close the block
+     * without resetting the compression state: not done yet. */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* The software path is always done here.  The hardware path is done only
+     * when nothing is buffered (CF clear) and EOBS has been written (BCF
+     * clear). */
+    if (!PREFIX(dfltcc_can_deflate)(strm))
+        return 1;
+    return !param->cf && !param->bcf;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) {
+    /* Toggling the reproducible flag is only safe before DFLTCC has run. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    if (dfltcc_was_deflate_used(strm))
+        return 0;
+    return reproducible != state->reproducible;
+}
+
+/*
+ Preloading history.
+*/
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    /* Preload the dictionary into the circular history buffer. */
+    deflate_state *state = (deflate_state *)strm->state;
+
+    append_history(&state->arch.common.param, state->window, dictionary, dict_length);
+    state->strstart = 1;                  /* Add FDICT to zlib header */
+    state->block_start = state->strstart; /* Make deflate_stored happy */
+    return Z_OK;
+}
+
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
+    /* Extract the current history; either output argument may be NULL. */
+    deflate_state *state = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (dictionary != NULL)
+        get_history(param, state->window, dictionary);
+    if (dict_length != NULL)
+        *dict_length = param->hl;
+    return Z_OK;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_deflate.h b/neozip/arch/s390/dfltcc_deflate.h
new file mode 100644
index 0000000000..35e2fd3f62
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_deflate.h
@@ -0,0 +1,58 @@
+#ifndef DFLTCC_DEFLATE_H
+#define DFLTCC_DEFLATE_H
+
+#include "deflate.h"
+#include "dfltcc_common.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp));
+int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result);
+int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush);
+int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible);
+int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+
+/* Route dictionary set/get through the DFLTCC history buffer whenever the
+ * hardware path is active. */
+#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_deflate)((strm))) \
+            return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state)
+
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
+    do { \
+        int err; \
+\
+        err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \
+        if (err == Z_STREAM_ERROR) \
+            return err; \
+    } while (0)
+
+#define DEFLATE_DONE PREFIX(dfltcc_deflate_done)
+
+/* Use the conservative DFLTCC bound whenever the hardware path may be taken. */
+#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
+    do { \
+        if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \
+            (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
+    } while (0)
+
+#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_HOOK PREFIX(dfltcc_deflate)
+
+/* DFLTCC maintains the check value itself (param->cv), so the software
+ * checksum pass is skipped on the hardware path. */
+#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm)))
+
+#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible)
+
+/* The circular history buffer needs a window of at least HB_SIZE bytes. */
+#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_detail.h b/neozip/arch/s390/dfltcc_detail.h
new file mode 100644
index 0000000000..f790735ab4
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_detail.h
@@ -0,0 +1,274 @@
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include <stdio.h>
+
+#ifdef HAVE_SYS_SDT_H
+#include <sys/sdt.h>
+#endif
+
+/*
+ Tuning parameters.
+ */
+#ifndef DFLTCC_LEVEL_MASK
+#define DFLTCC_LEVEL_MASK 0x2
+#endif
+#ifndef DFLTCC_BLOCK_SIZE
+#define DFLTCC_BLOCK_SIZE 1048576
+#endif
+#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE
+#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096
+#endif
+#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE
+#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096
+#endif
+#ifndef DFLTCC_RIBM
+#define DFLTCC_RIBM 0
+#endif
+
+#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
+
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
+
+/* Test bit n of an MSB-first bit string (bit 0 is the most significant bit
+ * of bits[0]) - the convention used by the STFLE facility list and the
+ * DFLTCC-QAF function/format vectors. */
+static inline int is_bit_set(const char *bits, int n) {
+    return bits[n / 8] & (1 << (7 - (n % 8)));
+}
+
+/* Clear bit n of the same MSB-first bit string. */
+static inline void clear_bit(char *bits, int n) {
+    bits[n / 8] &= ~(1 << (7 - (n % 8)));
+}
+
+#define DFLTCC_FACILITY 151
+
+/* Query the STFLE facility list and report whether the DEFLATE-conversion
+ * facility (bit DFLTCC_FACILITY = 151) is installed. */
+static inline int is_dfltcc_enabled(void) {
+    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
+    Z_REGISTER uint8_t r0 __asm__("r0");
+
+    memset(facilities, 0, sizeof(facilities));
+    /* STFLE expects the number of doublewords minus one in r0. */
+    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
+}
+
+#define DFLTCC_FMT0 0
+
+#define CVT_CRC32 0
+#define CVT_ADLER32 1
+#define HTT_FIXED 0
+#define HTT_DYNAMIC 1
+
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
+
+/* Format an Operation-Ending-Supplemental Code as a message for strm->msg,
+ * or return NULL on success (OESC 0).  buf is dfltcc_state.msg (64 bytes);
+ * the fixed format plus an 8-digit hex value stays well below that, but keep
+ * the format string short since sprintf is unbounded. */
+static inline z_const char *oesc_msg(char *buf, int oesc) {
+    if (oesc == 0x00)
+        return NULL; /* Successful completion */
+    else {
+        sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc);
+        return buf;
+    }
+}
+
+/*
+ C wrapper for the DEFLATE CONVERSION CALL instruction.
+ */
+typedef enum {
+ DFLTCC_CC_OK = 0,
+ DFLTCC_CC_OP1_TOO_SHORT = 1,
+ DFLTCC_CC_OP2_TOO_SHORT = 2,
+ DFLTCC_CC_OP2_CORRUPT = 2,
+ DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+#define HBT_CIRCULAR (1 << 7)
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+
+/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer.
+ * hl_low is nonzero only when the history wraps past the end of the buffer. */
+static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) {
+    *hl_high = MIN(param->hl, HB_SIZE - param->ho);
+    *hl_low = param->hl - *hl_high;
+}
+
+/* Notify instrumentation about an upcoming read/write access to the circular history buffer.
+ * NOTE: arithmetic on the void *hist is byte-granular (GNU extension). */
+static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    instrument_read_write(hist + param->ho, hl_high);
+    instrument_read_write(hist, hl_low);
+}
+
+/* Notify MSan about a completed write to the circular history buffer,
+ * covering both fragments of a wrapped history. */
+static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) {
+    size_t hl_high, hl_low;
+
+    get_history_lengths(param, &hl_high, &hl_low);
+    __msan_unpoison(hist + param->ho, hl_high);
+    __msan_unpoison(hist, hl_low);
+}
+
+/* Execute the DEFLATE CONVERSION CALL instruction (.insn rrf,0xb9390000).
+ * fn selects the subfunction (optionally ORed with HBT_CIRCULAR); op1/len1
+ * describe the output operand, op2/len2 the input operand, hist the history
+ * buffer.  Any of the pointer arguments may be NULL when unused; consumed and
+ * produced positions are written back through them.  Returns the condition
+ * code extracted from the PSW. */
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+    unsigned char *orig_t2 = t2;
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    Z_REGISTER int r0 __asm__("r0");
+    Z_REGISTER void *r1 __asm__("r1");
+    Z_REGISTER unsigned char *r2 __asm__("r2");
+    Z_REGISTER size_t r3 __asm__("r3");
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4");
+    Z_REGISTER size_t r5 __asm__("r5");
+    int cc;
+
+    /* Insert pre-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        instrument_write(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0);
+        instrument_read(t4, t5);
+        break;
+    case DFLTCC_CMPR:
+    case DFLTCC_XPND:
+        instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        instrument_read(t4, t5);
+        instrument_write(t2, t3);
+        instrument_read_write_hist(param, hist);
+        break;
+    }
+
+    r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5;
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+    /* Insert post-instrumentation for DFLTCC. */
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        msan_unpoison_hist(param, hist);
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        msan_unpoison_hist(param, hist);
+        break;
+    }
+
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    /* ipm stores the PSW condition code in bits 28-31 of cc. */
+    return (cc >> 28) & 3;
+}
+
+static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) {
+    /* Query available functions (the vector stays all-zero when DFLTCC is
+     * not installed, which disables every hardware path). */
+    if (!is_dfltcc_enabled()) {
+        memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+    } else {
+        dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL);
+        memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af));
+    }
+
+    /* Start from a clean parameter block: New Task set, configured RIBM. */
+    memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param));
+    dfltcc_state->param.nt = 1;
+    dfltcc_state->param.ribm = DFLTCC_RIBM;
+}
+
+/* Copy the common state (rounded up to its 8-byte alignment, since the
+ * DFLTCC structs are ALIGNED_(8)) plus the arch-specific extension that
+ * immediately follows it. */
+static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) {
+    memcpy(dst, src, ALIGN_UP(size, 8) + extension_size);
+}
+
+/* Append count bytes from buf to the circular history buffer, keeping at
+ * most the last HB_SIZE (32 KiB) bytes and updating param->ho/hl to match. */
+static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history,
+                                  const unsigned char *buf, uInt count) {
+    size_t offset;
+    size_t n;
+
+    /* Do not use more than 32K */
+    if (count > HB_SIZE) {
+        buf += count - HB_SIZE;
+        count = HB_SIZE;
+    }
+    offset = (param->ho + param->hl) % HB_SIZE;
+    if (offset + count <= HB_SIZE)
+        /* Circular history buffer does not wrap - copy one chunk */
+        memcpy(history + offset, buf, count);
+    else {
+        /* Circular history buffer wraps - copy two chunks */
+        n = HB_SIZE - offset;
+        memcpy(history + offset, buf, n);
+        memcpy(history, buf + n, count - n);
+    }
+    n = param->hl + count;
+    if (n <= HB_SIZE)
+        /* All history fits into buffer - no need to discard anything */
+        param->hl = n;
+    else {
+        /* History does not fit into buffer - discard extra bytes */
+        param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE;
+        param->hl = HB_SIZE;
+    }
+}
+
+static inline void get_history(struct dfltcc_param_v0 *param, const unsigned char *history,
+                               unsigned char *buf) {
+    /* Linearize the circular history into buf: high fragment first, then the
+     * wrapped low fragment (empty when no wrap occurred). */
+    size_t high, low;
+
+    get_history_lengths(param, &high, &low);
+    memcpy(buf, history + param->ho, high);
+    memcpy(buf + high, history, low);
+}
diff --git a/neozip/arch/s390/dfltcc_inflate.c b/neozip/arch/s390/dfltcc_inflate.c
new file mode 100644
index 0000000000..f6bc423c22
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_inflate.c
@@ -0,0 +1,195 @@
+/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */
+
+/*
+ Use the following commands to build zlib-ng with DFLTCC decompression support:
+
+ $ ./configure --with-dfltcc-inflate
+ or
+
+ $ cmake -DWITH_DFLTCC_INFLATE=1 .
+
+ and then
+
+ $ make
+*/
+
+#ifdef S390_DFLTCC_INFLATE
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "dfltcc_inflate.h"
+#include "dfltcc_detail.h"
+
+/* Re-initialize the DFLTCC portion of the inflate state (INFLATE_RESET_KEEP_HOOK). */
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    dfltcc_reset_state(&state->arch.common);
+}
+
+/* Return non-zero if the hardware supports DFLTCC decompression, i.e. the
+ * EXPAND function and parameter-block format 0 are both available. */
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+
+    /* Unsupported hardware */
+    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
+}
+
+/* Issue one DFLTCC EXPAND call, using the stream's window as the circular
+ * history buffer, and mirror the consumed/produced byte counts back into
+ * the stream.  Returns the DFLTCC condition code. */
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+    /* dfltcc() updates these in place; use temporaries so the stream fields
+     * are only written once, after the call */
+    size_t avail_in = strm->avail_in;
+    size_t avail_out = strm->avail_out;
+    dfltcc_cc cc;
+
+    cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR,
+                param, &strm->next_out, &avail_out,
+                &strm->next_in, &avail_in, state->window);
+    strm->avail_in = avail_in;
+    strm->avail_out = avail_out;
+    return cc;
+}
+
+/* Run one round of hardware decompression from inside inflate()'s TYPEDO
+ * state.  Translates the zlib stream state into the DFLTCC parameter block,
+ * calls EXPAND until it stops making progress requests, then translates the
+ * results back.  The returned action tells the inflate() hook whether to
+ * keep looping, leave inflate(), or fall back to software. */
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+    dfltcc_cc cc;
+
+    if (flush == Z_BLOCK || flush == Z_TREES) {
+        /* DFLTCC does not support stopping on block boundaries */
+        if (PREFIX(dfltcc_inflate_disable)(strm)) {
+            *ret = Z_STREAM_ERROR;
+            return DFLTCC_INFLATE_BREAK;
+        } else
+            return DFLTCC_INFLATE_SOFTWARE;
+    }
+
+    if (state->last) {
+        /* Final block already decompressed: drop any partially consumed
+         * byte and advance to checksum verification */
+        if (state->bits != 0) {
+            strm->next_in++;
+            strm->avail_in--;
+            state->bits = 0;
+        }
+        state->mode = CHECK;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+
+    /* No input and no buffered-output continuation flag - nothing to do */
+    if (strm->avail_in == 0 && !param->cf)
+        return DFLTCC_INFLATE_BREAK;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    /* Translate stream to parameter block */
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
+    param->sbb = state->bits;
+    if (param->hl)
+        param->nt = 0; /* Honor history for the first block */
+    if (state->wrap & 4)
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check;
+
+    /* Inflate */
+    do {
+        cc = dfltcc_xpnd(strm);
+    } while (cc == DFLTCC_CC_AGAIN);
+
+    /* Translate parameter block to stream */
+    strm->msg = oesc_msg(dfltcc_state->msg, param->oesc);
+    state->last = cc == DFLTCC_CC_OK;
+    state->bits = param->sbb;
+    if (state->wrap & 4)
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
+    if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
+        /* Report an error if stream is corrupted */
+        state->mode = BAD;
+        return DFLTCC_INFLATE_CONTINUE;
+    }
+    state->mode = TYPEDO;
+    /* Break if operands are exhausted, otherwise continue looping */
+    return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ?
+        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
+}
+
+/* Return non-zero if DFLTCC has already decompressed data on this stream:
+ * param.nt ("new task") is cleared by the first hardware operation. */
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+
+    return !state->arch.common.param.nt;
+}
+
+/*
+  Rotates a circular buffer in place, so that the element at pivot becomes
+  the first element; [start, end) is rearranged to [pivot, end) + [start, pivot).
+  Runs in O(end - start) swaps with O(1) extra space.
+  The implementation is based on https://cplusplus.com/reference/algorithm/rotate/
+ */
+static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) {
+    unsigned char *p = pivot;
+    unsigned char tmp;
+
+    while (p != start) {
+        /* Swap the next unplaced element into position */
+        tmp = *start;
+        *start = *p;
+        *p = tmp;
+
+        start++;
+        p++;
+
+        if (p == end)
+            p = pivot;
+        else if (start == pivot)
+            pivot = p;
+    }
+}
+
+/* Switch the stream to software decompression.  Returns 0 on success and 1
+ * if DFLTCC has already produced output, in which case switching is
+ * impossible and the caller must report an error. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_state *dfltcc_state = &state->arch.common;
+    struct dfltcc_param_v0 *param = &dfltcc_state->param;
+
+    if (!PREFIX(dfltcc_can_inflate)(strm))
+        return 0;
+    if (PREFIX(dfltcc_was_inflate_used)(strm))
+        /* DFLTCC has already decompressed some data. Since there is not
+         * enough information to resume decompression in software, the call
+         * must fail.
+         */
+        return 1;
+    /* DFLTCC was not used yet - decompress in software */
+    memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af));
+    /* Convert the window from the hardware to the software format */
+    rotate(state->window, state->window + param->ho, state->window + HB_SIZE);
+    state->whave = state->wnext = MIN(param->hl, state->wsize);
+    return 0;
+}
+
+/*
+  Preloading history: feed the dictionary into the hardware history buffer
+  so the next EXPAND call can reference it (inflateSetDictionary hook).
+*/
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0)
+        state->wsize = 1U << state->wbits;
+
+    append_history(param, state->window, dictionary, dict_length);
+    state->havedict = 1;
+    return Z_OK;
+}
+
+/* Copy the current hardware history into dictionary (if non-NULL) and report
+ * its length via dict_length (if non-NULL) - inflateGetDictionary hook. */
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt *dict_length) {
+    struct inflate_state *state = (struct inflate_state *)strm->state;
+    struct dfltcc_param_v0 *param = &state->arch.common.param;
+
+    if (dictionary && state->window)
+        get_history(param, state->window, dictionary);
+    if (dict_length)
+        *dict_length = param->hl;
+    return Z_OK;
+}
+
+#endif
diff --git a/neozip/arch/s390/dfltcc_inflate.h b/neozip/arch/s390/dfltcc_inflate.h
new file mode 100644
index 0000000000..3623f8ed7f
--- /dev/null
+++ b/neozip/arch/s390/dfltcc_inflate.h
@@ -0,0 +1,67 @@
+#ifndef DFLTCC_INFLATE_H
+#define DFLTCC_INFLATE_H
+
+#include "dfltcc_common.h"
+
+void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm);
+/* Outcome of one PREFIX(dfltcc_inflate) round, consumed by INFLATE_TYPEDO_HOOK */
+typedef enum {
+    DFLTCC_INFLATE_CONTINUE,  /* keep looping inside inflate() */
+    DFLTCC_INFLATE_BREAK,     /* leave inflate() */
+    DFLTCC_INFLATE_SOFTWARE,  /* fall back to the software implementation */
+} dfltcc_inflate_action;
+dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret);
+int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm);
+int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm,
+                                                     const unsigned char *dictionary, uInt dict_length);
+int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm,
+                                                     unsigned char *dictionary, uInt* dict_length);
+
+/* Hooks below are expanded at fixed points inside the generic inflate code. */
+
+#define INFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_inflate_state)
+
+/* inflatePrime() needs bit-level control, which DFLTCC cannot provide */
+#define INFLATE_PRIME_HOOK(strm, bits, value) \
+    do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0)
+
+/* Delegate a decompression round to the hardware from inflate()'s TYPEDO state */
+#define INFLATE_TYPEDO_HOOK(strm, flush) \
+    if (PREFIX(dfltcc_can_inflate)((strm))) { \
+        dfltcc_inflate_action action; \
+\
+        RESTORE(); \
+        action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \
+        LOAD(); \
+        if (action == DFLTCC_INFLATE_CONTINUE) \
+            break; \
+        else if (action == DFLTCC_INFLATE_BREAK) \
+            goto inf_leave; \
+    }
+
+/* The hardware maintains the checksum and window itself */
+#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm)))
+
+/* inflateMark()'s return value is meaningless once the hardware has run */
+#define INFLATE_MARK_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \
+    } while (0)
+
+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
+#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \
+    do { \
+        if (PREFIX(dfltcc_can_inflate)((strm))) \
+            return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \
+    } while (0)
+
+/* The hardware history buffer needs at least HB_SIZE bytes */
+#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE)
+
+#endif
diff --git a/neozip/arch/s390/s390_features.c b/neozip/arch/s390/s390_features.c
new file mode 100644
index 0000000000..dabb578a07
--- /dev/null
+++ b/neozip/arch/s390/s390_features.c
@@ -0,0 +1,18 @@
+#ifdef S390_FEATURES
+
+#include "zbuild.h"
+#include "s390_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#endif
+
+#ifndef HWCAP_S390_VXRS
+#define HWCAP_S390_VXRS (1 << 11)
+#endif
+
+/* Populate the feature struct from the kernel's AT_HWCAP auxiliary vector:
+ * has_vx is set when the z13 vector facility (VXRS) is available. */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) {
+    features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS;
+}
+
+#endif
diff --git a/neozip/arch/s390/s390_features.h b/neozip/arch/s390/s390_features.h
new file mode 100644
index 0000000000..fb2ac14b26
--- /dev/null
+++ b/neozip/arch/s390/s390_features.h
@@ -0,0 +1,14 @@
+/* s390_features.h -- check for s390 features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FEATURES_H_
+#define S390_FEATURES_H_
+
+/* Runtime-detected s390x CPU capabilities */
+struct s390_cpu_features {
+    int has_vx;  /* non-zero if the vector facility (VXRS) is present */
+};
+
+/* Fill in features by querying the operating system (see s390_features.c) */
+void Z_INTERNAL s390_check_features(struct s390_cpu_features *features);
+
+#endif
diff --git a/neozip/arch/s390/s390_functions.h b/neozip/arch/s390/s390_functions.h
new file mode 100644
index 0000000000..30647051f4
--- /dev/null
+++ b/neozip/arch/s390/s390_functions.h
@@ -0,0 +1,33 @@
+/* s390_functions.h -- s390 implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_FUNCTIONS_H_
+#define S390_FUNCTIONS_H_
+
+#include "s390_natives.h"
+
+#ifdef S390_CRC32_VX
+uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+
+#ifdef __clang__
+# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
+# error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
+ Either disable the zlib-ng CRC32-VX optimization, or switch to another compiler/compiler version.
+# endif
+#endif
+
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// S390 - CRC32 VX
+# ifdef S390_CRC32_VX_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_s390_vx
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_s390_vx
+# endif
+#endif
+
+#endif
diff --git a/neozip/arch/s390/s390_natives.h b/neozip/arch/s390/s390_natives.h
new file mode 100644
index 0000000000..5da913daf5
--- /dev/null
+++ b/neozip/arch/s390/s390_natives.h
@@ -0,0 +1,14 @@
+/* s390_natives.h -- s390 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef S390_NATIVES_H_
+#define S390_NATIVES_H_
+
+/* The vector facility is guaranteed at compile time on z13+ (__ARCH__ >= 11)
+ * when the compiler enables vector support (__VX__) */
+#if defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__)
+#  ifdef S390_CRC32_VX
+#    define S390_CRC32_VX_NATIVE
+#  endif
+#endif
+
+#endif /* S390_NATIVES_H_ */
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner b/neozip/arch/s390/self-hosted-builder/actions-runner
new file mode 100755
index 0000000000..aabc802547
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+#
+# Ephemeral runner startup script.
+#
+# Expects the following environment variables:
+#
+# - REPO=<owner>
+# - PAT_TOKEN=<github_pat_***>
+#
+
+set -e -u
+
+# Validate required environment variables
+if [ -z "${REPO:-}" ] || [ -z "${PAT_TOKEN:-}" ]; then
+    echo "Error: REPO and/or PAT_TOKEN environment variables not found"
+    exit 1
+fi
+
+# Check the cached registration token.
+TOKEN_FILE=registration-token.json
+if [ -f "$TOKEN_FILE" ]; then
+    set +e
+    EXPIRES=$(jq --raw-output .expires_at "$TOKEN_FILE" 2>/dev/null)
+    STATUS=$?
+    set -e
+else
+    STATUS=1
+    EXPIRES=""
+fi
+
+if [[ $STATUS -ne 0 || -z "$EXPIRES" || "$EXPIRES" == "null" || $(date +%s) -ge $(date -d "$EXPIRES" +%s) ]]; then
+    # Refresh the cached registration token.
+    curl \
+        -sS \
+        -X POST \
+        -H "Accept: application/vnd.github+json" \
+        -H "Authorization: Bearer $PAT_TOKEN" \
+        "https://api.github.com/repos/$REPO/actions/runners/registration-token" \
+        -o "$TOKEN_FILE"
+fi
+
+REG_TOKEN=$(jq --raw-output .token "$TOKEN_FILE")
+# Quote the expansion: with an unquoted empty token, `[` would abort with a
+# syntax error instead of reaching the intended error message.  Also treat an
+# empty token (e.g. malformed JSON) the same as jq's "null".
+if [ -z "$REG_TOKEN" ] || [ "$REG_TOKEN" = "null" ]; then
+    echo "Failed to get registration token"
+    exit 1
+fi
+
+# (Re-)register the runner.
+./config.sh remove --token "$REG_TOKEN" || true
+set -x
+./config.sh \
+    --url "https://github.com/$REPO" \
+    --token "$REG_TOKEN" \
+    --unattended \
+    --disableupdate \
+    --replace \
+    --labels z15 \
+    --ephemeral
+
+# Run one job.
+./run.sh
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
new file mode 100644
index 0000000000..7fded31785
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/bash
+# Rebuild the self-hosted actions-runner container image and restart its
+# service.  Build inputs are taken from the working directory when present,
+# otherwise downloaded from the upstream zlib-ng repository.
+set -ex
+
+TMPDIR="$(mktemp -d)"
+# Remove the scratch directory on every exit path; the original script leaked
+# it in the local-file mode and its manual cleanup broke on unexpected files.
+trap 'cd / && rm -rf "$TMPDIR"' EXIT
+
+if [ -f actions-runner.Dockerfile ]; then
+    # Use the local copies of the build inputs.
+    cp actions-runner.Dockerfile actions-runner entrypoint "$TMPDIR"
+    cd "$TMPDIR"
+else
+    # Fetch the build inputs from the upstream repository.
+    cd "$TMPDIR"
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/entrypoint
+fi
+
+# Stop service
+systemctl stop actions-runner || true
+
+# Delete old container
+podman container rm gaplib-actions-runner || true
+
+# Delete old image
+podman image rm localhost/zlib-ng/actions-runner || true
+
+# Prune all unused podman data
+podman system prune -f || true
+
+# Build new image
+podman build --squash -f actions-runner.Dockerfile --tag zlib-ng/actions-runner . 2>&1 | tee /var/log/actions-runner-build.log
+
+# Create new container
+podman create --replace --name=gaplib-actions-runner --env-file=/etc/actions-runner --init \
+    zlib-ng/actions-runner 2>&1 | tee -a /var/log/actions-runner-build.log
+
+# Start service
+systemctl start actions-runner || true
+
+# Cleanup
+podman image prune -af || true
+
+# Tempfiles are removed by the EXIT trap above.
+echo "Deleted tempfiles."
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile
new file mode 100644
index 0000000000..7210caaebe
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -0,0 +1,47 @@
+# Self-Hosted IBM Z Github Actions Runner.
+
+FROM almalinux:10
+
+RUN dnf update -y -q && \
+    dnf install -y -q --enablerepo=crb wget git which sudo jq sed \
+        cmake make automake autoconf m4 libtool ninja-build \
+        python3-pip python3-devel python3-lxml \
+        gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \
+        glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel
+
+RUN dnf install -y -q dotnet-sdk-8.0 && \
+    echo "Using SDK - `dotnet --version`"
+
+# Build the runner from source with the IBM s390x patch applied.
+# The sed pins global.json to the installed SDK: the replacement must be the
+# bare version "8.0.100" - the original "$8.0.100" wrote a literal "$" into
+# global.json, producing an invalid SDK version.
+RUN cd /tmp && \
+    git clone -q https://github.com/actions/runner && \
+    cd runner && \
+    git checkout $(git tag --sort=-v:refname | grep '^v[0-9]' | head -n1) && \
+    git log -n 1 && \
+    wget https://raw.githubusercontent.com/IBM/action-runner-image-pz/refs/heads/main/patches/runner-sdk8-s390x.patch -O runner-sdk8-s390x.patch && \
+    git apply --whitespace=nowarn runner-sdk8-s390x.patch && \
+    sed -i'' -e /version/s/8......\"$/8.0.100\"/ src/global.json
+
+RUN cd /tmp/runner/src && \
+    ./dev.sh layout && \
+    ./dev.sh package && \
+    rm -rf /root/.dotnet /root/.nuget
+
+RUN useradd -c "Action Runner" -m actions-runner && \
+    usermod -L actions-runner
+
+RUN tar -xf /tmp/runner/_package/*.tar.gz -C /home/actions-runner && \
+    chown -R actions-runner:actions-runner /home/actions-runner
+
+# Cleanup
+RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \
+    dnf clean all
+
+USER actions-runner
+
+# Scripts.
+COPY --chmod=555 entrypoint /usr/bin/
+COPY --chmod=555 actions-runner /usr/bin/
+WORKDIR /home/actions-runner
+ENTRYPOINT ["/usr/bin/entrypoint"]
+CMD ["/usr/bin/actions-runner"]
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.service b/neozip/arch/s390/self-hosted-builder/actions-runner.service
new file mode 100644
index 0000000000..79560cde18
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner.service
@@ -0,0 +1,18 @@
+# systemd unit wrapping the podman container that hosts the ephemeral
+# GitHub Actions runner.  The container is created separately (see
+# actions-runner-rebuild.sh); this unit only starts/stops it.
+[Unit]
+Description=Podman container: Gaplib Github Actions Runner
+Wants=network-online.target
+After=network-online.target
+StartLimitIntervalSec=1
+RequiresMountsFor=/run/user/1001/containers
+
+[Service]
+Environment=PODMAN_SYSTEMD_UNIT=%n
+Restart=always
+TimeoutStopSec=61
+ExecStart=/usr/bin/podman start gaplib-actions-runner
+ExecStop=/usr/bin/podman stop -t 30 gaplib-actions-runner
+ExecStopPost=/usr/bin/podman stop -t 10 gaplib-actions-runner
+Type=forking
+
+[Install]
+WantedBy=default.target
diff --git a/neozip/arch/s390/self-hosted-builder/entrypoint b/neozip/arch/s390/self-hosted-builder/entrypoint
new file mode 100755
index 0000000000..eb8772becf
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/entrypoint
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+#
+# Container entrypoint that waits for all spawned processes.
+#
+
+set -e -u
+
+# Create a FIFO and start reading from its read end.
+tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
+trap 'rm -r "$tempdir"' EXIT
+done="$tempdir/pipe"
+mkfifo "$done"
+cat "$done" & waiter=$!
+
+# Start the workload. Its descendants will inherit the FIFO's write end
+# (open as fd 9). With no arguments, fall back to an interactive shell.
+status=0
+if [ "$#" -eq 0 ]; then
+    bash 9>"$done" || status=$?
+else
+    "$@" 9>"$done" || status=$?
+fi
+
+# When the workload and all of its descendants exit, the FIFO's write end will
+# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
+# in order to handle SelfUpdater, which the workload may start in background
+# before exiting.
+wait "$waiter"
+
+# Propagate the workload's exit status as the container's exit status.
+exit "$status"
diff --git a/neozip/arch/x86/Makefile.in b/neozip/arch/x86/Makefile.in
new file mode 100644
index 0000000000..f756844a9f
--- /dev/null
+++ b/neozip/arch/x86/Makefile.in
@@ -0,0 +1,176 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# These are filled in by configure.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+# Per-ISA compiler flag sets; each optimized object is built only with the
+# extensions it actually requires so the dispatcher can gate them at runtime.
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
+AVX2FLAG=-mavx2 -mbmi2
+SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
+SSE41FLAG=-msse4.1
+SSE42FLAG=-msse4.2
+PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# .o objects go into the static library, .lo objects are the PIC variants
+# for the shared library.
+all: \
+	x86_features.o x86_features.lo \
+	adler32_avx2.o adler32_avx2.lo \
+	adler32_avx512.o adler32_avx512.lo \
+	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_avx512.o chunkset_avx512.lo \
+	chunkset_sse2.o chunkset_sse2.lo \
+	chunkset_ssse3.o chunkset_ssse3.lo \
+	compare256_avx2.o compare256_avx2.lo \
+	compare256_avx512.o compare256_avx512.lo \
+	compare256_sse2.o compare256_sse2.lo \
+	crc32_chorba_sse2.o crc32_chorba_sse2.lo \
+	crc32_chorba_sse41.o crc32_chorba_sse41.lo \
+	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+	crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \
+	crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \
+	slide_hash_avx2.o slide_hash_avx2.lo \
+	slide_hash_sse2.o slide_hash_sse2.lo
+
+x86_features.o:
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo:
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+chunkset_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_ssse3.o:
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+chunkset_ssse3.lo:
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+compare256_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
+
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+crc32_chorba_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c
+
+crc32_chorba_sse41.o:
+	$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_chorba_sse41.lo:
+	$(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c
+
+crc32_pclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_pclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_vpclmulqdq_avx2.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx2.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx512.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+crc32_vpclmulqdq_avx512.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+slide_hash_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+slide_hash_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+# Housekeeping targets.
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/x86/adler32_avx2.c b/neozip/arch/x86/adler32_avx2.c
new file mode 100644
index 0000000000..d1811b254d
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2.c
@@ -0,0 +1,172 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include <immintrin.h>
+#include "adler32_p.h"
+#include "adler32_avx2_p.h"
+#include "x86_intrins.h"
+
+extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+
+/* Shared AVX2 Adler-32 kernel.  Computes the checksum of src[0..len), and when
+ * COPY is non-zero also copies the data to dst as it goes.  COPY is a
+ * compile-time constant so each wrapper gets a branch-free specialization.
+ * Small inputs are delegated to the scalar tail / SSE implementations. */
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    uint32_t adler0, adler1;
+    /* Split the packed checksum into its two 16-bit halves: s2 (high), s1 (low) */
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
+    } else if (len < 32) {
+        if (COPY) {
+            return adler32_copy_sse42(adler, dst, src, len);
+        } else {
+            return adler32_ssse3(adler, src, len);
+        }
+    }
+
+    __m256i vs1, vs2, vs2_0;
+
+    /* Weights 64..33 and 32..1 for the two halves of a 64-byte step */
+    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
+                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
+    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot3v = _mm256_set1_epi16(1);
+    const __m256i zero = _mm256_setzero_si256();
+
+    /* Each outer iteration processes at most NMAX bytes, the largest count
+     * for which the 32-bit lanes cannot overflow before the modulo */
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        vs2_0 = vs3;
+
+        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+        len -= k;
+
+        /* Main unrolled loop: 64 bytes per iteration */
+        while (k >= 64) {
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
+            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
+                dst += 64;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
+            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        /* Merge the two partial s2 accumulators; vs3 carries the deferred
+         * "previous s1 times step size" terms (step 64 => shift by 6) */
+        vs2 = _mm256_add_epi32(vs2_0, vs2);
+        vs3 = _mm256_slli_epi32(vs3, 6);
+        vs2 = _mm256_add_epi32(vs3, vs2);
+        vs3 = _mm256_setzero_si256();
+
+        /* Remainder loop: 32 bytes per iteration */
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                dst += 32;
+            }
+
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Defer the multiplication with 32 to outside of the loop */
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+
+        /* The compiler is generating the following sequence for this integer modulus
+         * when done the scalar way, in GPRs:
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov $0x80078071,%edi // move magic constant into 32 bit register %edi
+         ...
+         vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+         mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
+         imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+         shr $0x2f,%rsi // shift right by 47
+         imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, do a relatively cheap sequence for horizontal sums.
+         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+         performed on the maximum possible inputs before overflow
+         */
+
+
+        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+         * what the compiler is doing to avoid integer divisions. */
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Handle any remaining bytes (< 32) via the small-input paths above */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+/* Checksum-only entry point: COPY=0 compiles out the store paths. */
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+/* Checksum-and-copy entry point: COPY=1 enables the store paths. */
+Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx2_p.h b/neozip/arch/x86/adler32_avx2_p.h
new file mode 100644
index 0000000000..f0f8a4a887
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint32_t hsum256(__m256i x) {
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+ _mm256_castsi256_si128(x));
+ __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros */
+ const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+ __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+ __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+ __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512.c b/neozip/arch/x86/adler32_avx512.c
new file mode 100644
index 0000000000..8a8e165bb9
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512.c
@@ -0,0 +1,102 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 64) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+ if (COPY && len) {
+ __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+ __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+ _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+ }
+
+ return adler32_avx2(adler, src, len);
+ }
+
+ __m512i vbuf, vs1_0, vs3;
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ const __m512i dot3v = _mm512_set1_epi16(1);
+ const __m512i zero = _mm512_setzero_si512();
+
+ while (len >= 64) {
+ __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ vs1_0 = vs1;
+ vs3 = _mm512_setzero_si512();
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+ len -= k;
+
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf = _mm512_loadu_si512(src);
+
+ if (COPY) {
+ _mm512_storeu_si512(dst, vbuf);
+ dst += 64;
+ }
+
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+ __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm512_add_epi32(vs1_sad, vs1);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm512_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_p.h b/neozip/arch/x86/adler32_avx512_p.h
new file mode 100644
index 0000000000..742269053c
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,57 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */
+#if defined(_MSC_VER) && !defined(_MM_K0_REG8)
+# undef _mm512_extracti64x4_epi64
+# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1)
+# undef _mm512_set1_epi16
+# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1)
+# undef _mm512_maddubs_epi16
+# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2)
+#endif
+
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+ __m256i a = _mm512_extracti64x4_epi64(x, 1);
+ __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+ __m256i a_plus_b = _mm256_add_epi32(a, b);
+ __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+ __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+ __m128i c_plus_d = _mm_add_epi32(c, d);
+
+ __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+ __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+ return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros. Marking this const so the compiler stands
+ * a better chance of keeping this resident in a register through entire
+ * loop execution. We certainly have enough zmm registers (32) */
+ const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 1, 1, 1, 1, 1, 1, 1);
+
+ __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+ /* From here, it's a simple 256 bit wide reduction sum */
+ __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+ /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+ * pretty slow, much slower than the longer instruction sequence below */
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+ _mm256_castsi256_si128(non_zero_avx));
+ __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_avx512_vnni.c b/neozip/arch/x86/adler32_avx512_vnni.c
new file mode 100644
index 0000000000..8bebffbf88
--- /dev/null
+++ b/neozip/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,205 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "arch_functions.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 32)
+ return adler32_ssse3(adler, src, len);
+
+ if (len < 64)
+ return adler32_avx2(adler, src, len);
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+ const __m512i zero = _mm512_setzero_si512();
+ __m512i vs1, vs2;
+
+ while (len >= 64) {
+ vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 64);
+ len -= k;
+ __m512i vs1_0 = vs1;
+ __m512i vs3 = _mm512_setzero_si512();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m512i vs2_1 = _mm512_setzero_si512();
+ __m512i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 128) {
+ vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 128) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf0 = _mm512_loadu_si512((__m512i*)src);
+ vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+ src += 128;
+ k -= 128;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm512_add_epi32(vs3, vs1);
+ vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+ vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+/* Use 256-bit vectors when copying because 512-bit variant is slower. */
+Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel_copy:
+ if (len < 32) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+ __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+ __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+ _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+ return adler32_ssse3(adler, src, len);
+ }
+
+ const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i vs1, vs2;
+
+ while (len >= 32) {
+ vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
+ len -= k;
+
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = _mm256_setzero_si256();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m256i vs2_1 = _mm256_setzero_si256();
+ __m256i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 64) {
+ vbuf1 = _mm256_loadu_si256((__m256i*)src);
+ _mm256_storeu_si256((__m256i*)dst, vbuf1);
+ dst += 32;
+
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+            vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ vbuf0 = _mm256_loadu_si256((__m256i*)src);
+ vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+ _mm256_storeu_si256((__m256i*)dst, vbuf0);
+ _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+ dst += 64;
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm256_add_epi32(vs3, vs1);
+ vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm256_slli_epi32(vs3, 5);
+ vs2 = _mm256_add_epi32(vs2, vs3);
+ vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 32). */
+ if (len) {
+ goto rem_peel_copy;
+ }
+
+ return adler;
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_sse42.c b/neozip/arch/x86/adler32_sse42.c
new file mode 100644
index 0000000000..c2301213f0
--- /dev/null
+++ b/neozip/arch/x86/adler32_sse42.c
@@ -0,0 +1,117 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE42
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1);
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = _mm_setzero_si128();
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+
+ while (len >= 16) {
+ size_t k = ALIGN_DOWN(MIN(len, NMAX), 16);
+ len -= k;
+
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, there's fewer than 16 elements remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3.c b/neozip/arch/x86/adler32_ssse3.c
new file mode 100644
index 0000000000..702db50251
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3.c
@@ -0,0 +1,149 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+ /* If our buffer is unaligned (likely), make the determination whether
+ * or not there's enough of a buffer to consume to make the scalar, aligning
+ * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+ * load. This is a pretty simple test, just test if len < 32 */
+ size_t n = NMAX;
+ size_t k = 0;
+
+ if (len < 32) {
+ /* Let's eat the cost of this one unaligned load so that
+ * we don't completely skip over the vectorization. Doing
+ * 16 bytes at a time unaligned is better than 16 + <= 15
+ * sums */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
+ }
+
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ if (align_diff) {
+ adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
+
+ while (len >= 16) {
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ k = ALIGN_DOWN(MIN(len, n), 16);
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+ buf += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+ n = NMAX;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0);
+}
+
+/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
+Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_ssse3(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+}
+#endif
diff --git a/neozip/arch/x86/adler32_ssse3_p.h b/neozip/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 0000000000..d7ec3fe0d5
--- /dev/null
+++ b/neozip/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_srli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx2.c b/neozip/arch/x86/chunkset_avx2.c
new file mode 100644
index 0000000000..3e69a7bf66
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx2.c
@@ -0,0 +1,129 @@
+/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* See explanation in chunkset_avx512.c */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+ * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+ * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+ /* See note in chunkset_ssse3.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_avx2
+#define CHUNKCOPY chunkcopy_avx2
+#define CHUNKUNROLL chunkunroll_avx2
+#define CHUNKMEMSET chunkmemset_avx2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_avx512.c b/neozip/arch/x86/chunkset_avx512.c
new file mode 100644
index 0000000000..60450c653b
--- /dev/null
+++ b/neozip/arch/x86/chunkset_avx512.c
@@ -0,0 +1,186 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(size_t len) {
+ return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len);
+}
+
+static inline mask_t gen_mask(size_t len) {
+ return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm256_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+ /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where
+ * the load is dumped to the stack with an aligned move for this memory-register
+ * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to
+ * stack doesn't exist if compiled with optimizations. For the sake of working
+ * properly in a debugger, let's take the 2 cycle penalty */
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+ halfchunk_t half = _mm_loadu_si128((__m128i*)from);
+ *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
+#else
+ *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+
+ chunk_t chunk;
+ size_t rem = len % sizeof(chunk_t);
+
+ if (len < sizeof(chunk_t)) {
+ mask_t rem_mask = gen_mask(rem);
+ chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
+ _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ return out + rem;
+ }
+
+ loadchunk(from, &chunk);
+ rem = (rem == 0) ? sizeof(chunk_t) : rem;
+ storechunk(out, &chunk);
+ out += rem;
+ from += rem;
+ len -= rem;
+
+ while (len > 0) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += sizeof(chunk_t);
+ from += sizeof(chunk_t);
+ len -= sizeof(chunk_t);
+ }
+
+ return out;
+}
+
+/* MSVC compiler decompression bug when optimizing for size */
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", off)
+#endif
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ *chunk_rem = lut_rem.remval;
+
+ /* See the AVX2 implementation for more detailed comments. This is that + some masked
+ * loads to avoid an out of bounds read on the heap */
+
+ if (dist < 16) {
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ halfmask_t load_mask = gen_half_mask(dist);
+ __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else {
+ halfmask_t load_mask = gen_half_mask(dist - 16);
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+ __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+#if defined(_MSC_VER) && _MSC_VER < 1943
+# pragma optimize("", on)
+#endif
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ halfmask_t load_mask = gen_half_mask(dist);
+ ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+ halfchunk_t chunk;
+
+ size_t rem = len % sizeof(halfchunk_t);
+ if (rem == 0) {
+ rem = sizeof(halfchunk_t);
+ }
+
+ halfmask_t rem_mask = gen_half_mask(rem);
+ chunk = _mm_maskz_loadu_epi8(rem_mask, from);
+ _mm_mask_storeu_epi8(out, rem_mask, chunk);
+
+ return out + rem;
+}
+
+#define CHUNKSIZE chunksize_avx512
+#define CHUNKUNROLL chunkunroll_avx512
+#define CHUNKMEMSET chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_sse2.c b/neozip/arch/x86/chunkset_sse2.c
new file mode 100644
index 0000000000..633ab6e64f
--- /dev/null
+++ b/neozip/arch/x86/chunkset_sse2.c
@@ -0,0 +1,50 @@
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE chunksize_sse2
+#define CHUNKCOPY chunkcopy_sse2
+#define CHUNKUNROLL chunkunroll_sse2
+#define CHUNKMEMSET chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_sse2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/chunkset_ssse3.c b/neozip/arch/x86/chunkset_ssse3.c
new file mode 100644
index 0000000000..0bef7de811
--- /dev/null
+++ b/neozip/arch/x86/chunkset_ssse3.c
@@ -0,0 +1,72 @@
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSSE3
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+
+#include <immintrin.h>
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi16(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi32(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ meant to subvert the memory sanitizer, but to unpoison some
+ * bytes we deliberately load uninitialized and then swizzle over in a vector
+ * register anyway. If our assumption about which bytes are actually used is
+ * wrong, the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_ssse3
+#define CHUNKMEMSET chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
+#define CHUNKCOPY chunkcopy_ssse3
+#define CHUNKUNROLL chunkunroll_ssse3
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_ssse3
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c
new file mode 100644
index 0000000000..5e2b1716cf
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx2.c
@@ -0,0 +1,61 @@
+/* compare256_avx2.c -- AVX2 version of compare256
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX2
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */
+
+ src0 += 32, src1 += 32, len += 32;
+
+ ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+ ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+ ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+ mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+ if (mask != 0xFFFFFFFF)
+ return len + zng_ctz32(~mask);
+
+ src0 += 32, src1 += 32, len += 32;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_avx512.c b/neozip/arch/x86/compare256_avx512.c
new file mode 100644
index 0000000000..f3105505cb
--- /dev/null
+++ b/neozip/arch/x86/compare256_avx512.c
@@ -0,0 +1,87 @@
+/* compare256_avx512.c -- AVX512 version of compare256
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * Based on AVX2 implementation by Mika T. Lindqvist
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_AVX512
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
+ __m512i zmm_src0_4, zmm_src1_4;
+ __m512i zmm_src0_3, zmm_src1_3;
+ __m512i zmm_src0_2, zmm_src1_2;
+ __m512i zmm_src0_1, zmm_src1_1;
+ __m128i xmm_src0_0, xmm_src1_0;
+ uint64_t mask_1, mask_2, mask_3, mask_4;
+ uint32_t mask_0;
+
+ // First do a 16-byte round before increasing to 64 bytes; this reduces the
+ // penalty for short matches, which are usually the most common ones.
+ // This requires us to overlap on the last round, giving a small penalty
+ // on matches of 192+ bytes (still faster than AVX2, though).
+
+ // 16 bytes
+ xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
+ mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0);
+ if (mask_0 != 0x0000FFFF)
+ return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */
+
+ // 64 bytes
+ zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
+ zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
+ mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
+ if (mask_1 != 0xFFFFFFFFFFFFFFFF)
+ return 16 + zng_ctz64(~mask_1);
+
+ // 64 bytes
+ zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
+ zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
+ mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
+ if (mask_2 != 0xFFFFFFFFFFFFFFFF)
+ return 80 + zng_ctz64(~mask_2);
+
+ // 64 bytes
+ zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
+ zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
+ mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
+ if (mask_3 != 0xFFFFFFFFFFFFFFFF)
+ return 144 + zng_ctz64(~mask_3);
+
+ // 64 bytes (overlaps the previous 16 bytes for fast tail processing)
+ zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
+ zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
+ mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
+ if (mask_4 != 0xFFFFFFFFFFFFFFFF)
+ return 192 + zng_ctz64(~mask_4);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx512_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx512
+#define COMPARE256 compare256_avx512_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/compare256_sse2.c b/neozip/arch/x86/compare256_sse2.c
new file mode 100644
index 0000000000..cfaff82cfa
--- /dev/null
+++ b/neozip/arch/x86/compare256_sse2.c
@@ -0,0 +1,86 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#ifdef X86_SSE2
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned; then, for all subsequent ones, we have at least
+ * one aligned load. Sadly, aligning both loads is probably unrealistic */
+ xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return zng_ctz32(~mask);
+
+ const uint8_t *last0 = src0 + 240;
+ const uint8_t *last1 = src1 + 240;
+
+ int align_offset = ((uintptr_t)src0) & 15;
+ int align_adv = 16 - align_offset;
+ uint32_t len = align_adv;
+
+ src0 += align_adv;
+ src1 += align_adv;
+
+ for (int i = 0; i < 15; ++i) {
+ xmm_src0 = _mm_load_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF)
+ return len + zng_ctz32(~mask);
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
+ if (align_offset) {
+ xmm_src0 = _mm_loadu_si128((__m128i*)last0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)last1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ if (mask != 0xFFFF)
+ return 240 + zng_ctz32(~mask);
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse2.c b/neozip/arch/x86/crc32_chorba_sse2.c
new file mode 100644
index 0000000000..66191e046a
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse2.c
@@ -0,0 +1,872 @@
+#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
+#define READ_NEXT(in, off, a, b) do { \
+ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+ b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+ } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+ a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+ b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+ c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+ d = _mm_srli_epi64(invec, 20); \
+ } while (0);
+
+Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+ /* The calling function ensured that this is aligned correctly */
+ const uint64_t* input = (const uint64_t*)buf;
+ ALIGNED_(16) uint64_t final[9] = {0};
+ uint64_t next1 = ~crc;
+ crc = 0;
+ uint64_t next2 = 0;
+ uint64_t next3 = 0;
+ uint64_t next4 = 0;
+ uint64_t next5 = 0;
+
+ __m128i next12 = _mm_cvtsi64_si128(next1);
+ __m128i next34 = _mm_setzero_si128();
+ __m128i next56 = _mm_setzero_si128();
+ __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;
+
+ size_t i = 0;
+
+ /* Oddly, using a for loop instead of a while loop cuts ~10% off the execution time */
+ for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+ __m128i in1in2, in3in4;
+
+ /*
+ uint64_t chorba1 = input[i / sizeof(uint64_t)];
+ uint64_t chorba2 = input[i / sizeof(uint64_t) + 1];
+ uint64_t chorba3 = input[i / sizeof(uint64_t) + 2];
+ uint64_t chorba4 = input[i / sizeof(uint64_t) + 3];
+ uint64_t chorba5 = input[i / sizeof(uint64_t) + 4];
+ uint64_t chorba6 = input[i / sizeof(uint64_t) + 5];
+ uint64_t chorba7 = input[i / sizeof(uint64_t) + 6];
+ uint64_t chorba8 = input[i / sizeof(uint64_t) + 7];
+ */
+
+ const uint64_t *input_ptr = input + (i / sizeof(uint64_t));
+ const __m128i *input_ptr_128 = (__m128i*)input_ptr;
+ __m128i chorba12 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba34 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba56 = _mm_load_si128(input_ptr_128++);
+ __m128i chorba78 = _mm_load_si128(input_ptr_128++);
+
+ chorba12 = _mm_xor_si128(chorba12, next12);
+ chorba34 = _mm_xor_si128(chorba34, next34);
+ chorba56 = _mm_xor_si128(chorba56, next56);
+ chorba78 = _mm_xor_si128(chorba78, chorba12);
+ __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
+ __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
+ _mm_castsi128_pd(chorba34), 1));
+ /*
+ chorba1 ^= next1;
+ chorba2 ^= next2;
+ chorba3 ^= next3;
+ chorba4 ^= next4;
+ chorba5 ^= next5;
+ chorba7 ^= chorba1;
+ chorba8 ^= chorba2;
+ */
+ i += 8 * 8;
+
+ /* 0-3 */
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ READ_NEXT(input, i, in1in2, in3in4);
+ __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba34xor);
+ /*
+ in1 ^= chorba3;
+ in2 ^= chorba4 ^ chorba1;
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+
+ */
+
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
+ __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));
+
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
+ in3in4 = _mm_xor_si128(in3in4, chorba12);
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
+ in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+
+ /*out1 = a3 ^ b2 ^ c1;
+ out2 = b3 ^ c2 ^ d1 ^ a4;*/
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ //out3 = b4 ^ c3 ^ d2;
+ //out4 = c4 ^ d3;
+
+ //out5 = d4;
+
+ /*
+ next1 = out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 4-7 */
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ READ_NEXT(input, i, in1in2, in3in4);
+
+ in1in2 = _mm_xor_si128(in1in2, next12);
+ in1in2 = _mm_xor_si128(in1in2, chorba78);
+ in1in2 = _mm_xor_si128(in1in2, chorba45);
+ in1in2 = _mm_xor_si128(in1in2, chorba34);
+
+ /*
+ in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
+ in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
+ */
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+
+ in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
+ in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6;
+ */
+ in3in4 = _mm_xor_si128(in3in4, next34);
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ in3in4 = _mm_xor_si128(in3in4, chorba56);
+ __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ ///*
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, cd1);
+
+ next12 = _mm_xor_si128(next12, a4_);
+ next12 = _mm_xor_si128(next12, next56);
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ next34 = _mm_xor_si128(b4c4, cd3);
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ //*/
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = b3 ^ c2 ^ d1 ^ a4;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 8-11 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
+ in2 ^= next2 ^ chorba8 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+
+ __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
+ __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ in1in2 = _mm_xor_si128(in1in2, chorba78);
+ in1in2 = _mm_xor_si128(in1in2, next12_chorba12);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];*/
+ in3in4 = _mm_xor_si128(next34, in3in4);
+ in3in4 = _mm_xor_si128(in3in4, ab1);
+ __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, chorba34);
+ in3in4 = _mm_xor_si128(in3in4, a2_);
+
+ /*
+ in3 ^= next3 ^ a1 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba4;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(a4_, ab3);
+ next12 = _mm_xor_si128(next12, cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 12-15 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ */
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, next12);
+ __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
+ __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+ in1in2 = _mm_xor_si128(in1in2, chorb1_);
+
+
+ /*
+ in1 ^= next1 ^ chorba5 ^ chorba1;
+ in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+ */
+
+ in3in4 = _mm_xor_si128(next34, in3in4);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ ///*
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ //*/
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 16-19 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+ in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+ */
+ ///*
+ READ_NEXT(input, i, in1in2, in3in4);
+ __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+ in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
+ in1in2 = _mm_xor_si128(chorba1_, in1in2);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ //*/
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ */
+ ///*
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+ in3in4 = _mm_xor_si128(in3in4, next34);
+ //*/
+ /*
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+ */
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ i += 32;
+
+ /* 20-23 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+ in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
+ in3in4 = _mm_xor_si128(in3in4, chorba12);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ i += 32;
+
+ /* 24-27 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+ in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+ */
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
+ in1in2 = _mm_xor_si128(in1in2, chorba80);
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+ i += 32;
+
+ /* 28-31 */
+ /*
+ in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];
+ in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
+ in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
+ */
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
+ in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
+ in4 ^= next4 ^ a2 ^ b1 ^ chorba8;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+ a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
+ in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+ */
+
+ /*
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ a4_ = _mm_unpacklo_epi64(next56, ab4);
+ next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
+ b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ next12 = _mm_xor_si128(next12, b2c2);
+ next34 = _mm_xor_si128(b4c4, cd3);
+ next34 = _mm_xor_si128(next34, d2_);
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ }
+
+ for (; (i + 40 + 32) < len; i += 32) {
+ __m128i in1in2, in3in4;
+
+ /*in1 = input[i / sizeof(uint64_t)];
+ in2 = input[i / sizeof(uint64_t) + 1];*/
+ //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4);
+ READ_NEXT(input, i, in1in2, in3in4);
+ in1in2 = _mm_xor_si128(in1in2, next12);
+
+ /*
+ in1 ^=next1;
+ in2 ^=next2;
+ */
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+ /*
+ a1 = (in1 << 17) ^ (in1 << 55);
+ a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+ a3 = (in1 >> 45) ^ (in1 << 44);
+ a4 = (in1 >> 20);
+
+ b1 = (in2 << 17) ^ (in2 << 55);
+ b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+ b3 = (in2 >> 45) ^ (in2 << 44);
+ b4 = (in2 >> 20);
+ */
+
+ /*
+ in3 = input[i / sizeof(uint64_t) + 2];
+ in4 = input[i / sizeof(uint64_t) + 3];
+ in3 ^= next3 ^ a1;
+ in4 ^= next4 ^ a2 ^ b1;
+
+ c1 = (in3 << 17) ^ (in3 << 55);
+ c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+ c3 = (in3 >> 45) ^ (in3 << 44);
+ c4 = (in3 >> 20);
+
+ d1 = (in4 << 17) ^ (in4 << 55);
+ d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+ d3 = (in4 >> 45) ^ (in4 << 44);
+ d4 = (in4 >> 20);
+ */
+
+ __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+ __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+ in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ /*
+
+ out1 = a3 ^ b2 ^ c1;
+ out2 = a4 ^ b3 ^ c2 ^ d1;
+ out3 = b4 ^ c3 ^ d2;
+ out4 = c4 ^ d3;
+ out5 = d4;
+
+ next1 = next5 ^ out1;
+ next2 = out2;
+ next3 = out3;
+ next4 = out4;
+ next5 = out5;
+ */
+
+ __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
+ __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
+ __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
+ next12 = _mm_xor_si128(next12, next56);
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+ }
+
+ next1 = _mm_cvtsi128_si64(next12);
+ next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12));
+ next3 = _mm_cvtsi128_si64(next34);
+ next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34));
+ next5 = _mm_cvtsi128_si64(next56);
+
+ /* Skip the call to memcpy */
+ size_t copy_len = len - i;
+ __m128i *final128 = (__m128i*)final;
+ __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t));
+ while (copy_len >= 64) {
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ copy_len -= 64;
+ }
+
+ while (copy_len >= 16) {
+ _mm_store_si128(final128++, _mm_load_si128(input128++));
+ copy_len -= 16;
+ }
+
+ uint8_t *src_bytes = (uint8_t*)input128;
+ uint8_t *dst_bytes = (uint8_t*)final128;
+ while (copy_len--) {
+ *dst_bytes++ = *src_bytes++;
+ }
+
+ final[0] ^= next1;
+ final[1] ^= next2;
+ final[2] ^= next3;
+ final[3] ^= next4;
+ final[4] ^= next5;
+
+ /* We perform the same loop that braid_internal is doing but we'll skip
+ * the function call for this tiny tail */
+ uint8_t *final_bytes = (uint8_t*)final;
+ size_t rem = len - i;
+
+ while (rem--) {
+ crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
+ }
+
+ return ~crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
+ uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+ return crc32_braid(crc, buf, len);
+
+ if (align_diff) {
+ crc = crc32_braid(crc, buf, align_diff);
+ len -= align_diff;
+ buf += align_diff;
+ }
+#if !defined(WITHOUT_CHORBA)
+ if (len > CHORBA_LARGE_THRESHOLD)
+ return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+ return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+Z_INTERNAL uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_chorba_sse2(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_chorba_sse41.c b/neozip/arch/x86/crc32_chorba_sse41.c
new file mode 100644
index 0000000000..6ef9612440
--- /dev/null
+++ b/neozip/arch/x86/crc32_chorba_sse41.c
@@ -0,0 +1,332 @@
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+
+#include "zbuild.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "arch/x86/x86_intrins.h"
+#include "arch_functions.h"
+
+#define READ_NEXT(in, off, a, b) do { \
+ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
+ b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
+ } while (0);
+
+#define NEXT_ROUND(invec, a, b, c, d) do { \
+ a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
+ b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
+ c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
+ d = _mm_srli_epi64(invec, 20); \
+ } while (0);
+
+#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
+ out0 = _mm_slli_si128(in0, shift); \
+ out1 = _mm_alignr_epi8(in1, in0, shift); \
+ out2 = _mm_alignr_epi8(in2, in1, shift); \
+ out3 = _mm_alignr_epi8(in3, in2, shift); \
+ out4 = _mm_srli_si128(in3, shift); \
+ } while (0)
+
+#define STORE4(out0, out1, out2, out3, out) do { \
+ _mm_store_si128(out++, out0); \
+ _mm_store_si128(out++, out1); \
+ _mm_store_si128(out++, out2); \
+ _mm_store_si128(out++, out3); \
+ } while (0)
+
+#define READ4(out0, out1, out2, out3, in) do { \
+ out0 = _mm_load_si128(in++); \
+ out1 = _mm_load_si128(in++); \
+ out2 = _mm_load_si128(in++); \
+ out3 = _mm_load_si128(in++); \
+ } while (0)
+
+/* This is intentionally shifted one down to compensate for the deferred store from
+ * the last iteration */
+#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
+ out0 = _mm_xor_si128(in[1], xor0); \
+ out1 = _mm_xor_si128(in[2], xor1); \
+ out2 = _mm_xor_si128(in[3], xor2); \
+ out3 = _mm_xor_si128(in[4], xor3); \
+ } while (0)
+
+Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+ /* The calling function ensured that this is aligned correctly */
+ const uint64_t* input = (const uint64_t*)buf;
+ ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+ __m128i *bitbuffer_v = (__m128i*)bitbuffer;
+ const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+ __m128i z = _mm_setzero_si128();
+
+ __m128i *bitbuf128 = &bitbuffer_v[64];
+ __m128i *bitbuf144 = &bitbuffer_v[72];
+ __m128i *bitbuf182 = &bitbuffer_v[91];
+ __m128i *bitbuf210 = &bitbuffer_v[105];
+ __m128i *bitbuf300 = &bitbuffer_v[150];
+ __m128i *bitbuf0 = bitbuf128;
+ __m128i *inptr = (__m128i*)input;
+
+ /* We only need to zero out the bytes between the 128'th value and the 144th
+ * that are actually read */
+ __m128i *z_cursor = bitbuf128;
+ for (size_t i = 0; i < 2; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We only need to zero out the bytes between the 144'th value and the 182nd that
+ * are actually read */
+ z_cursor = bitbuf144 + 8;
+ for (size_t i = 0; i < 11; ++i) {
+ _mm_store_si128(z_cursor++, z);
+ }
+
+ /* We only need to zero out the bytes between the 182nd value and the 210th that
+ * are actually read. */
+ z_cursor = bitbuf182;
+ for (size_t i = 0; i < 4; ++i) {
+ STORE4(z, z, z, z, z_cursor);
+ }
+
+ /* We need to mix this in */
+ __m128i init_crc = _mm_cvtsi64_si128(~crc);
+ crc = 0;
+
+ size_t i = 0;
+
+ /* Previous iteration runs carried over */
+ __m128i buf144 = z;
+ __m128i buf182 = z;
+ __m128i buf210 = z;
+
+ for (; i + 300*8+64 < len && i < 22 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ READ4(in12, in34, in56, in78, inptr);
+
+ if (i == 0) {
+ in12 = _mm_xor_si128(in12, init_crc);
+ }
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e = _mm_xor_si128(buf182, in_1);
+ STORE4(e, in23, in45, in67, bitbuf182);
+ buf182 = in8_;
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 32 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a = _mm_xor_si128(buf144, in_1);
+
+ STORE4(a, in23, in45, in67, bitbuf144);
+ buf144 = in8_;
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len && i < 84 * 8; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+ READ4(in12, in34, in56, in78, inptr);
+
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i m = _mm_xor_si128(buf210, in_1);
+ STORE4(m, in23, in45, in67, bitbuf210);
+ buf210 = in8_;
+
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ for (; i + 300*8+64 < len; i += 64) {
+ __m128i in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_;
+
+ if (i < 128 * 8) {
+ READ4(in12, in34, in56, in78, inptr);
+ } else {
+ in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
+ }
+
+ // [0, 145, 183, 211]
+
+        /* On pre-Penryn CPUs the unpack should be faster */
+ REALIGN_CHORBA(in12, in34, in56, in78,
+ in_1, in23, in45, in67, in8_, 8);
+
+ __m128i a, b, c, d;
+ a = _mm_xor_si128(buf144, in_1);
+ READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
+ STORE4(a, b, c, d, bitbuf144);
+
+ __m128i e, f, g, h;
+ e = _mm_xor_si128(buf182, in_1);
+ READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
+ STORE4(e, f, g, h, bitbuf182);
+
+ __m128i n, o, p;
+ __m128i m = _mm_xor_si128(buf210, in_1);
+
+        /* Counterintuitively, even though this condition is known to be false
+         * here, removing the branch makes GCC produce significantly slower
+         * code -- presumably loop bodies get merged without it */
+ if (i < 84 * 8) {
+ n = in23;
+ o = in45;
+ p = in67;
+ buf210 = in8_;
+ } else {
+ READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
+ }
+
+ STORE4(m, n, o, p, bitbuf210);
+ STORE4(in12, in34, in56, in78, bitbuf300);
+ }
+
+ /* Second half of stores bubbled out */
+ _mm_store_si128(bitbuf144, buf144);
+ _mm_store_si128(bitbuf182, buf182);
+ _mm_store_si128(bitbuf210, buf210);
+
+ /* We also have to zero out the tail */
+ size_t left_to_z = len - (300*8 + i);
+ __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
+ while (left_to_z >= 64) {
+ STORE4(z, z, z, z, bitbuf_tail);
+ left_to_z -= 64;
+ }
+
+ while (left_to_z >= 16) {
+ _mm_store_si128(bitbuf_tail++, z);
+ left_to_z -= 16;
+ }
+
+ uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
+ while (left_to_z--) {
+ *tail_bytes++ = 0;
+ }
+
+ ALIGNED_(16) uint64_t final[9] = {0};
+ __m128i next12, next34, next56;
+ next12 = z;
+ next34 = z;
+ next56 = z;
+
+ for (; (i + 72 < len); i += 32) {
+ __m128i in1in2, in3in4;
+ __m128i in1in2_, in3in4_;
+ __m128i ab1, ab2, ab3, ab4;
+ __m128i cd1, cd2, cd3, cd4;
+
+ READ_NEXT(input, i, in1in2, in3in4);
+ READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
+
+ in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
+ in3in4 = _mm_xor_si128(in3in4, in3in4_);
+
+ NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
+
+ __m128i a2_ = _mm_slli_si128(ab2, 8);
+ __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
+ in3in4 = _mm_xor_si128(in3in4, ab1_next34);
+ in3in4 = _mm_xor_si128(a2_, in3in4);
+ NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
+
+ __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
+ __m128i a4_ = _mm_slli_si128(ab4, 8);
+ a4_ = _mm_xor_si128(b2c2, a4_);
+ next12 = _mm_xor_si128(ab3, a4_);
+ next12 = _mm_xor_si128(next12, cd1);
+
+ __m128i d2_ = _mm_srli_si128(cd2, 8);
+ __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
+ next12 = _mm_xor_si128(next12, next56);
+ next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
+ next56 = _mm_srli_si128(cd4, 8);
+ }
+
+ memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+ __m128i *final128 = (__m128i*)final;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
+ ++final128;
+ _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
+
+ uint8_t *final_bytes = (uint8_t*)final;
+
+ for (size_t j = 0; j < (len-i); j++) {
+ crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
+ }
+ return ~crc;
+}
+
+Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
+ uintptr_t align_diff = ALIGN_DIFF(buf, 16);
+ if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
+ return crc32_braid(crc, buf, len);
+
+ if (align_diff) {
+ crc = crc32_braid(crc, buf, align_diff);
+ len -= align_diff;
+ buf += align_diff;
+ }
+#if !defined(WITHOUT_CHORBA)
+ if (len > CHORBA_LARGE_THRESHOLD)
+ return crc32_chorba_118960_nondestructive(crc, buf, len);
+#endif
+ if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+ return crc32_chorba_32768_nondestructive_sse41(crc, buf, len);
+ return chorba_small_nondestructive_sse2(crc, buf, len);
+}
+
+Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_chorba_sse41(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq.c b/neozip/arch/x86/crc32_pclmulqdq.c
new file mode 100644
index 0000000000..c8be1b43ba
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq.c
@@ -0,0 +1,31 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_pclmulqdq_tpl.h b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
new file mode 100644
index 0000000000..e4ea546afd
--- /dev/null
+++ b/neozip/arch/x86/crc32_pclmulqdq_tpl.h
@@ -0,0 +1,708 @@
+/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding
+ * approach with the PCLMULQDQ and VPCLMULQDQ instructions.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support)
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+#include <immintrin.h>
+#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
+
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+#include "x86_intrins.h"
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+# if defined(_MSC_VER) && _MSC_VER < 1920
+ /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm)
+# else
+# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96)
+# define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm)
+# define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm)
+# endif
+# ifdef __AVX512VL__
+# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96)
+# endif
+#endif
+/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c)
+#endif
+
+#ifndef z128_xor3_epi64
+# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+#endif
+
+static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc1;
+ *xmm_crc1 = *xmm_crc2;
+ *xmm_crc2 = *xmm_crc3;
+ *xmm_crc3 = _mm_xor_si128(x_low, x_high);
+}
+
+static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc2;
+ *xmm_crc1 = *xmm_crc3;
+ *xmm_crc2 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc3 = _mm_xor_si128(x_low1, x_high1);
+}
+
+static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+
+ *xmm_crc0 = *xmm_crc3;
+ *xmm_crc1 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc2 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc3 = _mm_xor_si128(x_low2, x_high2);
+}
+
+static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+ __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+ __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+
+ *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+ *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+ const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
+ __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
+ __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
+ __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
+ __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
+ __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);
+
+ *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
+ *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
+ *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
+ *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
+}
+
+/* 512-bit fold function requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
+ const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) {
+ __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
+ __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10);
+ __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01);
+ __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10);
+ __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01);
+ __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10);
+ __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01);
+ __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10);
+
+ *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0);
+ *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1);
+ *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2);
+ *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
+}
+#endif
+/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
+ const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) {
+ __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
+ __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
+ __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
+ __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
+ __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
+ __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
+ __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
+ __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
+
+ *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
+ *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
+ *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
+ *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
+}
+#endif
+
+Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ size_t copy_len = len;
+ if (len >= 16) {
+ /* Calculate 16-byte alignment offset */
+ uintptr_t align_diff = ALIGN_DIFF(src, 16);
+
+ /* If total length is less than (alignment bytes + 16), use the faster small method.
+ * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
+ copy_len = len < align_diff + 16 ? len : align_diff;
+ }
+
+ if (copy_len > 0) {
+ crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY);
+ src += copy_len;
+ len -= copy_len;
+ if (COPY) {
+ dst += copy_len;
+ }
+ }
+
+ if (len == 0)
+ return crc;
+
+ const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+
+ __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ __m128i xmm_crc1 = _mm_setzero_si128();
+ __m128i xmm_crc2 = _mm_setzero_si128();
+ __m128i xmm_crc3 = _mm_setzero_si128();
+
+ if (crc != 0) {
+ // Process the first 16 bytes and handle initial CRC
+ len -= 16;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ src += 16;
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ dst += 16;
+ }
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
+ }
+
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
+ if (len >= 256) {
+ len -= 256;
+
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i z_low0, z_high0;
+ const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+ zmm_crc0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+ src += 256;
+ if (COPY) {
+ _mm512_storeu_si512((__m512i *)dst, zmm_crc0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+ dst += 256;
+ }
+
+ // Fold existing xmm state into first 64 bytes
+ zmm_t0 = _mm512_castsi128_si512(xmm_crc0);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2);
+ zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0);
+
+ while (len >= 256) {
+ len -= 256;
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+ src += 256;
+
+ fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16);
+ if (COPY) {
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+ dst += 256;
+ }
+ }
+
+ // zmm_crc[0,1,2,3] -> zmm_crc0
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2);
+
+ z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ xmm_crc0 = z512_extracti64x2(zmm_crc0, 0);
+ xmm_crc1 = z512_extracti64x2(zmm_crc0, 1);
+ xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
+ xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
+ }
+/* 256-bit VPCLMULQDQ path */
+#elif defined(X86_VPCLMULQDQ)
+ if (len >= 128) {
+ len -= 128;
+
+ __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
+ __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
+ __m256i y_low0, y_high0;
+ const __m256i ymm_fold4 = _mm256_set_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m256i ymm_fold8 = _mm256_set_epi32(
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
+
+ ymm_crc0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_crc0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3);
+ dst += 128;
+ }
+
+ // Fold existing xmm state into first 32 bytes
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc0);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
+
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc2);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
+
+ while (len >= 128) {
+ len -= 128;
+ ymm_t0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+
+ fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_t0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3);
+ dst += 128;
+ }
+ }
+
+ // Extract 8 x 128-bit lanes from 4 x 256-bit registers
+ __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
+ __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
+ __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
+ __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
+ __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
+ __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
+ __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
+ __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
+
+ // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
+ __m128i x_low, x_high;
+ x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
+ xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
+
+ x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
+ xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
+
+ x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
+ xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
+
+ x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
+ xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
+ }
+#else
+ /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
+ * We interleave the PCLMUL-base folds with 8x scaled generator
+ * polynomial copies; we read 8x QWORDS and then XOR them into
+ * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
+ * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
+ * as "generator_64_bits_unrolled_8" */
+#ifndef __AVX512VL__
+ if (!COPY) {
+#endif
+ while (len >= 512 + 64 + 16*8) {
+ __m128i chorba8 = _mm_load_si128((__m128i *)src);
+ __m128i chorba7 = _mm_load_si128((__m128i *)src + 1);
+ __m128i chorba6 = _mm_load_si128((__m128i *)src + 2);
+ __m128i chorba5 = _mm_load_si128((__m128i *)src + 3);
+ __m128i chorba4 = _mm_load_si128((__m128i *)src + 4);
+ __m128i chorba3 = _mm_load_si128((__m128i *)src + 5);
+ __m128i chorba2 = _mm_load_si128((__m128i *)src + 6);
+ __m128i chorba1 = _mm_load_si128((__m128i *)src + 7);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, chorba8);
+ _mm_storeu_si128((__m128i *)dst + 1, chorba7);
+ _mm_storeu_si128((__m128i *)dst + 2, chorba6);
+ _mm_storeu_si128((__m128i *)dst + 3, chorba5);
+ _mm_storeu_si128((__m128i *)dst + 4, chorba4);
+ _mm_storeu_si128((__m128i *)dst + 5, chorba3);
+ _mm_storeu_si128((__m128i *)dst + 6, chorba2);
+ _mm_storeu_si128((__m128i *)dst + 7, chorba1);
+ dst += 16*8;
+ }
+
+ chorba2 = _mm_xor_si128(chorba2, chorba8);
+ chorba1 = _mm_xor_si128(chorba1, chorba7);
+ src += 16*8;
+ len -= 16*8;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+
+ fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 4);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 5);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 6);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 7);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 8);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 9);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 10);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 11);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 12);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 13);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 14);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 15);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 16);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 17);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 18);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 19);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 20);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 21);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 22);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 23);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 24);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 25);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 26);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 27);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1);
+ xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3);
+
+ xmm_t0 = _mm_load_si128((__m128i *)src + 28);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 29);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 30);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 31);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0);
+ xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2);
+ xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3);
+
+ len -= 512;
+ src += 512;
+ }
+#ifndef __AVX512VL__
+ }
+#endif
+
+#endif /* X86_VPCLMULQDQ */
+
+ while (len >= 64) {
+ len -= 64;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+ src += 64;
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
+ }
+
+ xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+ }
+
+ /*
+ * len = num bytes left - 64
+ */
+ if (len >= 48) {
+ len -= 48;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ src += 48;
+
+ fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ dst += 48;
+ }
+
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+ } else if (len >= 32) {
+ len -= 32;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ src += 32;
+
+ fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ dst += 32;
+ }
+
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+ } else if (len >= 16) {
+ len -= 16;
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ src += 16;
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
+ if (COPY) {
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ dst += 16;
+ }
+
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ }
+
+ const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
+ const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641);
+
+ /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */
+ __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01);
+ __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10);
+ xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0);
+
+ __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01);
+ __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10);
+ xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1);
+
+ __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01);
+ __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10);
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2);
+
+ /* Fold remaining bytes into the 128-bit state */
+ if (len) {
+ const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+ const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+ /* Create masks to shift bytes for partial input */
+ __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16));
+ __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
+
+ /* Shift out bytes from crc3 to make space for new data */
+ __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl);
+ xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr);
+
+ /* Insert the partial input into crc3 */
+#if defined(__AVX512BW__) && defined(__AVX512VL__)
+ __mmask16 k = (1 << len) - 1;
+ __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src);
+ if (COPY) {
+ _mm_mask_storeu_epi8(dst, k, xmm_crc_part);
+ }
+#else
+ __m128i xmm_crc_part = _mm_setzero_si128();
+ memcpy(&xmm_crc_part, src, len);
+ if (COPY) {
+ memcpy(dst, src, len);
+ }
+#endif
+ __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned);
+
+ /* Fold the bytes that were shifted out back into crc3 */
+ __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
+ __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
+ xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
+ }
+
+ /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
+ __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
+ __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);
+
+ x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
+ x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);
+
+ __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
+ __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);
+
+ crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));
+
+ return ~crc;
+}
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx2.c b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
new file mode 100644
index 0000000000..1cdef13b09
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX2
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Non-copying CRC32: reuse the shared template with dst == NULL and COPY == 0. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Fused CRC32 + memcpy: reuse the shared template with COPY == 1. */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx512.c b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
new file mode 100644
index 0000000000..a95a448f49
--- /dev/null
+++ b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX512
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+/* Non-copying CRC32: reuse the shared template with dst == NULL and COPY == 0. */
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Fused CRC32 + memcpy: reuse the shared template with COPY == 1. */
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/neozip/arch/x86/slide_hash_avx2.c b/neozip/arch/x86/slide_hash_avx2.c
new file mode 100644
index 0000000000..241ea305e3
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_avx2.c
@@ -0,0 +1,48 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+
+/* Sweep one hash table backwards, 32 uint16_t entries (two 256-bit vectors)
+ * per iteration, subtracting the window size from every stored position with
+ * unsigned saturation so stale entries clamp to 0.
+ * `entries` is assumed to be a nonzero multiple of 32. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    /* Start at the last 32-entry group and walk towards the base. */
+    Pos *cur = table + entries - 32;
+
+    do {
+        __m256i lo = _mm256_load_si256((__m256i *)cur);
+        __m256i hi = _mm256_load_si256((__m256i *)(cur + 16));
+        _mm256_store_si256((__m256i *)cur, _mm256_subs_epu16(lo, wsize));
+        _mm256_store_si256((__m256i *)(cur + 16), _mm256_subs_epu16(hi, wsize));
+
+        cur -= 32;
+        entries -= 32;
+    } while (entries > 0);
+}
+
+/* Slide both deflate hash tables after the window advances by w_size:
+ * every stored position is reduced by w_size (saturating at 0).
+ * head has HASH_SIZE entries, prev has w_size entries. */
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast the window size to all 16 lanes for the saturating subtract. */
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/slide_hash_sse2.c b/neozip/arch/x86/slide_hash_sse2.c
new file mode 100644
index 0000000000..4aa8df5ee8
--- /dev/null
+++ b/neozip/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,68 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_SSE2
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+/* Sweep two hash tables backwards, 16 uint16_t entries (two 128-bit vectors)
+ * per iteration, subtracting the window size from every stored position with
+ * unsigned saturation so stale entries clamp to 0.
+ * Both entry counts are assumed to be nonzero multiples of 16. */
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+    uint32_t entries1, const __m128i wsize) {
+    Pos *tables[2] = { table0, table1 };
+    uint32_t counts[2] = { entries0, entries1 };
+
+    for (int chain = 0; chain < 2; ++chain) {
+        /* Start at the last 16-entry group and walk towards the base. */
+        Pos *cur = tables[chain] + counts[chain] - 16;
+        uint32_t remaining = counts[chain];
+
+        /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+         * Our alloc function is aligned to 64 byte boundaries */
+        do {
+            __m128i lo = _mm_load_si128((__m128i *)cur);
+            __m128i hi = _mm_load_si128((__m128i *)(cur + 8));
+            _mm_store_si128((__m128i *)cur, _mm_subs_epu16(lo, wsize));
+            _mm_store_si128((__m128i *)(cur + 8), _mm_subs_epu16(hi, wsize));
+
+            cur -= 16;
+            remaining -= 16;
+        } while (remaining > 0);
+    }
+}
+
+/* Slide both deflate hash tables after the window advances by w_size:
+ * every stored position is reduced by w_size (saturating at 0). */
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    /* Broadcast the window size to all 8 lanes for the saturating subtract. */
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    /* The aligned loads/stores in slide_hash_chain require 16-byte alignment. */
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    /* head has HASH_SIZE entries, prev has w_size entries. */
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.c b/neozip/arch/x86/x86_features.c
new file mode 100644
index 0000000000..5eba18bf8a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.c
@@ -0,0 +1,128 @@
+/* x86_features.c - x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ * Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_FEATURES
+
+#include "zbuild.h"
+#include "x86_features.h"
+
+#if defined(HAVE_CPUID_MS)
+# include <intrin.h>
+#elif defined(HAVE_CPUID_GNU)
+// Newer versions of GCC and clang come with cpuid.h
+# include <cpuid.h>
+# ifdef X86_HAVE_XSAVE_INTRIN
+# if __GNUC__ == 8
+# include <xsaveintrin.h>
+# else
+# include <immintrin.h>
+# endif
+# endif
+#endif
+
+/* Query CPUID leaf `info` (subleaf 0) and write the four result registers.
+ * With neither MSVC nor GNU cpuid support, all outputs are zeroed, which
+ * disables every optional SIMD code path. */
+static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero outputs; <cpuid.h>'s __cpuid overwrites them. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Query CPUID leaf `info` with explicit subleaf `subinfo` (needed for leaf 7)
+ * and write the four result registers; zeroes them when unsupported. */
+static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#if defined(HAVE_CPUID_MS)
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#elif defined(HAVE_CPUID_GNU)
+    /* Pre-zero outputs; <cpuid.h>'s __cpuid_count overwrites them. */
+    *eax = *ebx = *ecx = *edx = 0;
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#else
+    /* When using this fallback, the faster SSE/AVX code is disabled */
+    *eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+/* Read extended control register `xcr` (XCR0 when xcr == 0), which reports
+ * which register states the OS saves/restores on context switch.
+ * Returns 0 when XGETBV cannot be issued, disabling the AVX paths. */
+static inline uint64_t xgetbv(unsigned int xcr) {
+#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
+    return _xgetbv(xcr);
+#elif defined(__GNUC__)
+    uint32_t eax, edx;
+    /* Raw opcode bytes for xgetbv -- presumably to support assemblers that
+     * lack the mnemonic; TODO confirm. */
+    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
+    return (uint64_t)(edx) << 32 | eax;
+#else
+    /* When using this fallback, some of the faster code is disabled */
+    return 0;
+#endif
+}
+
+/* Populate *features from CPUID leaves 0, 1 and 7.
+ * NOTE(review): fields are only ever set, never cleared, and
+ * features->has_os_save_ymm is read below even when OSXSAVE is absent --
+ * this assumes the caller zero-initialized the struct; verify at call site. */
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    /* Leaf 0: eax returns the highest supported basic leaf. */
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    features->has_sse2 = edx & 0x4000000;   // edx bit 26
+    features->has_ssse3 = ecx & 0x200;      // ecx bit 9
+    features->has_sse41 = ecx & 0x80000;    // ecx bit 19
+    features->has_sse42 = ecx & 0x100000;   // ecx bit 20
+    features->has_pclmulqdq = ecx & 0x2;    // ecx bit 1
+
+    /* ecx bit 27 = OSXSAVE: XGETBV is available to query OS-enabled state. */
+    if (ecx & 0x08000000) {
+        uint64_t xfeature = xgetbv(0);
+
+        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);  // XCR0 SSE+AVX state
+        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);  // + opmask/ZMM state
+    }
+
+    if (maxbasic >= 7) {
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI2 bit
+        features->has_bmi2 = ebx & 0x100;   // ebx bit 8
+
+        // check AVX2 bit if the OS supports saving YMM registers
+        if (features->has_os_save_ymm) {
+            features->has_avx2 = ebx & 0x20;          // ebx bit 5
+            features->has_vpclmulqdq = ecx & 0x400;   // ecx bit 10
+        }
+
+        // check AVX512 bits if the OS supports saving ZMM registers
+        if (features->has_os_save_zmm) {
+            features->has_avx512f = ebx & 0x00010000;   // ebx bit 16
+            if (features->has_avx512f) {
+                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
+                // AVX512(DQ,BW,VL).
+                features->has_avx512dq = ebx & 0x00020000;   // ebx bit 17
+                features->has_avx512bw = ebx & 0x40000000;   // ebx bit 30
+                features->has_avx512vl = ebx & 0x80000000;   // ebx bit 31
+            }
+            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
+                && features->has_avx512vl && features->has_bmi2;
+            features->has_avx512vnni = ecx & 0x800;   // ecx bit 11
+        }
+    }
+}
+
+#endif
diff --git a/neozip/arch/x86/x86_features.h b/neozip/arch/x86/x86_features.h
new file mode 100644
index 0000000000..2118b8e87a
--- /dev/null
+++ b/neozip/arch/x86/x86_features.h
@@ -0,0 +1,30 @@
+/* x86_features.h -- check for CPU features
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
+
+/* CPU capability flags filled in by x86_check_features(); nonzero == present. */
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512f;
+    int has_avx512dq;
+    int has_avx512bw;
+    int has_avx512vl;
+    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled. (The check also requires BMI2.)
+    int has_avx512vnni;
+    int has_bmi2;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse41;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;   // OS saves/restores YMM state (XCR0 SSE+AVX bits set).
+    int has_os_save_zmm;   // OS saves/restores ZMM/opmask state as well.
+};
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
+
+#endif /* X86_FEATURES_H_ */
diff --git a/neozip/arch/x86/x86_functions.h b/neozip/arch/x86/x86_functions.h
new file mode 100644
index 0000000000..881c6efe23
--- /dev/null
+++ b/neozip/arch/x86/x86_functions.h
@@ -0,0 +1,196 @@
+/* x86_functions.h -- x86 implementations for arch-specific functions.
+ * Copyright (C) 2013 Intel Corporation Jim Kukunas
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_FUNCTIONS_H_
+#define X86_FUNCTIONS_H_
+
+#include "x86_natives.h"
+
+/* So great news, your compiler is broken and causes stack smashing. Rather than
+ * notching out its compilation we'll just remove the assignment in the functable.
+ * Further context:
+ * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
+#if defined(_MSC_VER) && defined(ARCH_32BIT) && _MSC_VER >= 1920 && _MSC_VER <= 1929
+/* Bug fix: this previously defined NO_CHORBA_SSE, but every guard in this
+ * header tests WITHOUT_CHORBA_SSE, so the workaround was silently inert.
+ * The macro name must match the !defined(WITHOUT_CHORBA_SSE) checks below. */
+#define WITHOUT_CHORBA_SSE
+#endif
+
+#ifdef X86_SSE2
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_sse2(deflate_state *s);
+
+# if !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+ uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len);
+# endif
+#endif
+
+#ifdef X86_SSSE3
+uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left);
+void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
+ uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len);
+ uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_SSE42
+uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_AVX2
+uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
+void slide_hash_avx2(deflate_state *s);
+#endif
+#ifdef X86_AVX512
+uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
+uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
+#endif
+#ifdef X86_AVX512VNNI
+uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef X86_PCLMULQDQ_CRC
+uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX2
+uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// X86 - SSE2
+# ifdef X86_SSE2_NATIVE
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_sse2
+# undef native_compare256
+# define native_compare256 compare256_sse2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_sse2
+# undef native_longest_match
+# define native_longest_match longest_match_sse2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse2
+# endif
+# undef native_slide_hash
+# define native_slide_hash slide_hash_sse2
+# endif
+// X86 - SSSE3
+# ifdef X86_SSSE3_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_ssse3
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_ssse3
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_ssse3
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_ssse3
+# endif
+// X86 - SSE4.1
+# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse41
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_chorba_sse41
+# endif
+// X86 - SSE4.2
+# ifdef X86_SSE42_NATIVE
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_sse42
+# endif
+// X86 - PCLMUL
+# ifdef X86_PCLMULQDQ_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_pclmulqdq
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_pclmulqdq
+# endif
+// X86 - AVX2
+# ifdef X86_AVX2_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx2
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx2
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx2
+# undef native_compare256
+# define native_compare256 compare256_avx2
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx2
+# undef native_longest_match
+# define native_longest_match longest_match_avx2
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx2
+# undef native_slide_hash
+# define native_slide_hash slide_hash_avx2
+# endif
+// X86 - AVX512 (F,DQ,BW,VL)
+# ifdef X86_AVX512_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_avx512
+# undef native_compare256
+# define native_compare256 compare256_avx512
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_avx512
+# undef native_longest_match
+# define native_longest_match longest_match_avx512
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_avx512
+// X86 - AVX512 (VNNI)
+# ifdef X86_AVX512VNNI_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_avx512_vnni
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_avx512_vnni
+# endif
+# endif
+// X86 - VPCLMULQDQ
+# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx512
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx512
+# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx2
+# endif
+#endif
+
+#endif /* X86_FUNCTIONS_H_ */
diff --git a/neozip/arch/x86/x86_intrins.h b/neozip/arch/x86/x86_intrins.h
new file mode 100644
index 0000000000..1d1df5eb11
--- /dev/null
+++ b/neozip/arch/x86/x86_intrins.h
@@ -0,0 +1,126 @@
+#ifndef X86_INTRINS_H
+#define X86_INTRINS_H
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+/* Unfortunately GCC didn't support these things until version 10.
+ * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
+ */
+#ifdef __AVX2__
+#include <immintrin.h>
+
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
+ || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+ __m128i r;
+ __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+ return _mm256_castsi128_si256(r);
+}
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+ __m128i r;
+ __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+ return _mm512_castsi128_si512(r);
+}
+#endif // __AVX512F__
+#endif // gcc/AppleClang version test
+
+#endif // __AVX2__
+
+/* GCC <9 is missing some AVX512 intrinsics.
+ */
+#ifdef __AVX512F__
+#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
+#include <immintrin.h>
+
+#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
+ ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
+
+static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
+ char __q59, char __q58, char __q57, char __q56,
+ char __q55, char __q54, char __q53, char __q52,
+ char __q51, char __q50, char __q49, char __q48,
+ char __q47, char __q46, char __q45, char __q44,
+ char __q43, char __q42, char __q41, char __q40,
+ char __q39, char __q38, char __q37, char __q36,
+ char __q35, char __q34, char __q33, char __q32,
+ char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00) {
+ return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
+ PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
+ PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
+ PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
+ PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
+ PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
+ PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
+ PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
+}
+
+#undef PACK
+
+#endif // gcc version test
+#endif // __AVX512F__
+
+/* Missing zero-extension AVX and AVX512 intrinsics.
+ * Fixed in Microsoft Visual Studio 2017 version 15.7
+ * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#ifdef __AVX2__
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+ return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
+}
+#endif // __AVX2__
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+ return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
+}
+#endif // __AVX512F__
+#endif // defined(_MSC_VER) && _MSC_VER < 1914
+
+/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
+#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
+# undef _mm512_extracti32x4_epi32
+# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+/* For whatever reason this intrinsic is 64 bit only with MSVC?
+ * While we don't have 64 bit GPRs, it should at least be able to move it to stack
+ * or shuffle it over 2 registers */
+#ifdef ARCH_32BIT
+/* So, while we can't move directly to a GPR, hopefully this move to
+ * a stack resident variable doesn't equate to something awful */
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+ union { __m128i v; int64_t i; } u;
+ u.v = a;
+ return u.i;
+}
+
+static inline __m128i _mm_cvtsi64_si128(int64_t a) {
+ return _mm_set_epi64x(0, a);
+}
+#endif
+#endif
+
+#if defined(__GNUC__) && defined(ARCH_X86) && defined(ARCH_32BIT) && !defined(__clang__)
+static inline int64_t _mm_cvtsi128_si64(__m128i a) {
+ union { __m128i v; int64_t i; } u;
+ u.v = a;
+ return u.i;
+}
+#define _mm_cvtsi64_si128(a) _mm_set_epi64x(0, a)
+#endif
+
+#endif // include guard X86_INTRINS_H
diff --git a/neozip/arch/x86/x86_natives.h b/neozip/arch/x86/x86_natives.h
new file mode 100644
index 0000000000..a39b7a51f0
--- /dev/null
+++ b/neozip/arch/x86/x86_natives.h
@@ -0,0 +1,57 @@
+/* x86_natives.h -- x86 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef X86_NATIVES_H_
+#define X86_NATIVES_H_
+
+#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT))
+# ifdef X86_SSE2
+# define X86_SSE2_NATIVE
+# endif
+#endif
+#if defined(__SSSE3__)
+# ifdef X86_SSSE3
+# define X86_SSSE3_NATIVE
+# endif
+#endif
+#if defined(__SSE4_1__)
+# ifdef X86_SSE41
+# define X86_SSE41_NATIVE
+# endif
+#endif
+#if defined(__SSE4_2__)
+# ifdef X86_SSE42
+# define X86_SSE42_NATIVE
+# endif
+#endif
+#if defined(__PCLMUL__)
+# ifdef X86_PCLMULQDQ_CRC
+# define X86_PCLMULQDQ_NATIVE
+# endif
+#endif
+#if defined(__AVX2__)
+# ifdef X86_AVX2
+# define X86_AVX2_NATIVE
+# endif
+#endif
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
+# ifdef X86_AVX512
+# define X86_AVX512_NATIVE
+# endif
+#endif
+#if defined(__AVX512VNNI__)
+# ifdef X86_AVX512VNNI
+# define X86_AVX512VNNI_NATIVE
+# endif
+#endif
+#if defined(__VPCLMULQDQ__)
+# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE)
+# define X86_VPCLMULQDQ_AVX2_NATIVE
+# endif
+# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE)
+# define X86_VPCLMULQDQ_AVX512_NATIVE
+# endif
+#endif
+
+#endif /* X86_NATIVES_H_ */