summary refs log tree commit diff
path: root/neozip/arch/arm
diff options
context:
space:
mode:
Diffstat (limited to 'neozip/arch/arm')
-rw-r--r--  neozip/arch/arm/Makefile.in                 86
-rw-r--r--  neozip/arch/arm/acle_intrins.h              90
-rw-r--r--  neozip/arch/arm/adler32_neon.c             346
-rw-r--r--  neozip/arch/arm/arm_features.c             334
-rw-r--r--  neozip/arch/arm/arm_features.h              19
-rw-r--r--  neozip/arch/arm/arm_functions.h             75
-rw-r--r--  neozip/arch/arm/arm_natives.h               31
-rw-r--r--  neozip/arch/arm/chunkset_neon.c             81
-rw-r--r--  neozip/arch/arm/compare256_neon.c           56
-rw-r--r--  neozip/arch/arm/crc32_armv8.c               81
-rw-r--r--  neozip/arch/arm/crc32_armv8_p.h            103
-rw-r--r--  neozip/arch/arm/crc32_armv8_pmull_eor3.c   366
-rw-r--r--  neozip/arch/arm/neon_intrins.h              79
-rw-r--r--  neozip/arch/arm/slide_hash_armv6.c          49
-rw-r--r--  neozip/arch/arm/slide_hash_neon.c           48
15 files changed, 1844 insertions, 0 deletions
diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in
new file mode 100644
index 0000000000..d0bfe0e172
--- /dev/null
+++ b/neozip/arch/arm/Makefile.in
@@ -0,0 +1,86 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ARMV8FLAG=
+PMULLEOR3FLAG=
+NEONFLAG=
+ARMV6FLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ adler32_neon.o adler32_neon.lo \
+ arm_features.o arm_features.lo \
+ chunkset_neon.o chunkset_neon.lo \
+ compare256_neon.o compare256_neon.lo \
+ crc32_armv8.o crc32_armv8.lo \
+ crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \
+ slide_hash_neon.o slide_hash_neon.lo \
+ slide_hash_armv6.o slide_hash_armv6.lo
+
+adler32_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_armv8.o:
+ $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8.lo:
+ $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8_pmull_eor3.o:
+ $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+crc32_armv8_pmull_eor3.lo:
+ $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+slide_hash_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_armv6.o:
+ $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+ $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h
new file mode 100644
index 0000000000..16f5e2c77c
--- /dev/null
+++ b/neozip/arch/arm/acle_intrins.h
@@ -0,0 +1,90 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+# include <arm_acle.h>
+#endif
+
+#ifdef ARM_CRC32
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+# define Z_TARGET_CRC Z_TARGET("+crc")
+#else
+# define Z_TARGET_CRC
+#endif
+#ifdef ARM_PMULL_EOR3
+# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3")
+#else
+# define Z_TARGET_PMULL_EOR3
+#endif
+
+#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER)
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ uint32_t __c;
+ __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#else
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32);
+}
+#endif
+#endif
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+ uint16x2_t __c;
+ __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c
new file mode 100644
index 0000000000..48532e6cd1
--- /dev/null
+++ b/neozip/arch/arm/adler32_neon.c
@@ -0,0 +1,346 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "adler32_p.h"
+
+static const uint16_t ALIGNED_(64) taps[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57,
+ 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1 };
+
+Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += 64;
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make do summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0);
+ hsum.val[1] = vpaddlq_u8(d1);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0));
+ s2_1 = vaddw_high_u8(s2_1, d0);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d1));
+ s2_3 = vaddw_high_u8(s2_3, d1);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d2));
+ s2_5 = vaddw_high_u8(s2_5, d2);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d3));
+ s2_7 = vaddw_high_u8(s2_7, d3);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ vst1q_u8(dst, d0);
+ dst += 16;
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make do summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+ hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+ s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+ s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ uint32_t pair[2];
+
+ /* Split Adler-32 into component sums, it can be supplied by
+ * the caller sites (e.g. in a PNG file).
+ */
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* If memory is not SIMD aligned, do scalar sums to an aligned
+ * offset, provided that doing so doesn't completely eliminate
+ * SIMD operation. Aligned loads are still faster on ARM, even
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we use fallback to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
+ size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+ size_t n = NMAX_ALIGNED32;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
+
+ if (COPY)
+ dst += align_diff;
+ src += align_diff;
+ len -= align_diff;
+ n = ALIGN_DOWN(n - align_diff, 32);
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ if (COPY)
+ NEON_accum32_copy(pair, dst, src, n >> 4);
+ else
+ NEON_accum32(pair, src, n >> 4);
+
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n >> 4) << 4;
+ src += k;
+ if (COPY)
+ dst += k;
+ len -= k;
+ n = NMAX_ALIGNED32;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return adler32_copy_impl(adler, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ adler = adler32_neon(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+#endif
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c
new file mode 100644
index 0000000000..8f179526ef
--- /dev/null
+++ b/neozip/arch/arm/arm_features.c
@@ -0,0 +1,334 @@
+#ifdef ARM_FEATURES
+
+#include "zbuild.h"
+#include "arm_features.h"
+
+#if defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+# ifdef ARM_ASM_HWCAP
+# include <asm/hwcap.h>
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# ifndef ID_AA64ISAR0_CRC32_VAL
+# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# include <machine/cpu.h>
+# include <sys/sysctl.h>
+# include <sys/types.h>
+#elif defined(__APPLE__)
+# if !defined(_DARWIN_C_SOURCE)
+# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+# endif
+# include <sys/sysctl.h>
+#elif defined(_WIN32)
+# include <windows.h>
+#endif
+
+static int arm_has_crc32(void) {
+ int has_crc32 = 0;
+#if defined(__ARM_FEATURE_CRC32)
+ /* Compile-time check */
+ has_crc32 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_crc32 = (hwcap & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ has_crc32 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#endif
+ return has_crc32;
+}
+
+static int arm_has_pmull(void) {
+ int has_pmull = 0;
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
+ /* Compile-time check */
+ has_pmull = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_AES) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* Check for AES feature as PMULL is part of crypto extension */
+ has_pmull = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ /* Windows checks for crypto/AES support */
+# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE
+ has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_pmull;
+}
+
+static int arm_has_eor3(void) {
+ int has_eor3 = 0;
+#if defined(__ARM_FEATURE_SHA3)
+ /* Compile-time check */
+ has_eor3 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */
+# ifdef HWCAP2_SHA3
+ has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP2_SHA3
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_eor3 = (hwcap & HWCAP_SHA3) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */
+# ifdef ID_AA64ISAR0_SHA3_VAL
+ has_eor3 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_SHA3_BASE;
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# ifdef ID_AA64ISAR0_SHA3
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL;
+ }
+# endif
+#elif defined(__APPLE__)
+ /* All Apple Silicon (M1+) has SHA3/EOR3 support */
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ /* Fallback to legacy name for older macOS versions */
+ if (!has_eor3) {
+ size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ }
+#elif defined(_WIN32)
+# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE
+ has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_eor3;
+}
+
+/* AArch64 has neon. */
+#ifdef ARCH_32BIT
+static inline int arm_has_neon(void) {
+ int has_neon = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+ /* Compile-time check */
+ has_neon = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_ARM_NEON
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+# elif defined(HWCAP_NEON)
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_NEON
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_neon = (hwcap & HWCAP_NEON) != 0;
+# endif
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+ has_neon = 1; /* Always supported */
+# endif
+#endif
+ return has_neon;
+}
+#endif
+
+/* AArch64 does not have ARMv6 SIMD. */
+#ifdef ARCH_32BIT
+static inline int arm_has_simd(void) {
+ int has_simd = 0;
+#if defined(__ARM_FEATURE_SIMD32)
+ /* Compile-time check for ARMv6 SIMD */
+ has_simd = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ const char *platform = (const char *)getauxval(AT_PLATFORM);
+ has_simd = platform
+ && (strncmp(platform, "v6l", 3) == 0
+ || strncmp(platform, "v7l", 3) == 0
+ || strncmp(platform, "v8l", 3) == 0);
+#endif
+ return has_simd;
+}
+#endif
+
+#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32)
+/* MIDR_EL1 bit field definitions */
+#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24)
+#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4)
+
+/* ARM CPU Implementer IDs */
+#define ARM_IMPLEMENTER_ARM 0x41
+#define ARM_IMPLEMENTER_QUALCOMM 0x51
+#define ARM_IMPLEMENTER_APPLE 0x61
+
+/* ARM CPU Part Numbers */
+
+/* Cortex-X series - Multiple PMULL lanes */
+#define ARM_PART_CORTEX_X1 0xd44
+#define ARM_PART_CORTEX_X1C 0xd4c
+#define ARM_PART_CORTEX_X2 0xd48
+#define ARM_PART_CORTEX_X3 0xd4e
+#define ARM_PART_CORTEX_X4 0xd82
+#define ARM_PART_CORTEX_X925 0xd85
+
+/* Neoverse V/N2 series - Multiple PMULL lanes */
+#define ARM_PART_NEOVERSE_N2 0xd49
+#define ARM_PART_NEOVERSE_V1 0xd40
+#define ARM_PART_NEOVERSE_V2 0xd4f
+#define ARM_PART_NEOVERSE_V3 0xd8e
+
+/* Snapdragon X Elite/Plus - Custom core */
+#define QUALCOMM_PART_ORYON 0x001
+
+static inline int arm_has_cpuid(void) {
+ int has_cpuid = 0;
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0;
+# elif defined(HWCAP2_CPUID)
+ has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_cpuid = (hwcap & HWCAP_CPUID) != 0;
+# endif
+#endif
+ return has_cpuid;
+}
+#endif
+
+/* Determine if CPU has fast PMULL (multiple execution units) */
+static inline int arm_cpu_has_fast_pmull(void) {
+ int has_fast_pmull = 0;
+#if defined(__APPLE__)
+ /* On macOS, all Apple Silicon has fast PMULL */
+ has_fast_pmull = 1;
+#elif defined(ARCH_64BIT) && !defined(_WIN32)
+ /* We need CPUID feature to read MIDR register */
+ if (!arm_has_cpuid())
+ return has_fast_pmull;
+
+ uint64_t midr;
+ __asm__ ("mrs %0, midr_el1" : "=r" (midr));
+
+ uint32_t implementer = MIDR_IMPLEMENTOR(midr);
+ uint32_t part = MIDR_PARTNUM(midr);
+
+ if (implementer == ARM_IMPLEMENTER_APPLE) {
+ /* All Apple Silicon (M1+) have fast PMULL */
+ has_fast_pmull = 1;
+ } else if (implementer == ARM_IMPLEMENTER_ARM) {
+ /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */
+ switch (part) {
+ case ARM_PART_CORTEX_X1:
+ case ARM_PART_CORTEX_X1C:
+ case ARM_PART_CORTEX_X2:
+ case ARM_PART_CORTEX_X3:
+ case ARM_PART_CORTEX_X4:
+ case ARM_PART_CORTEX_X925:
+ case ARM_PART_NEOVERSE_N2:
+ case ARM_PART_NEOVERSE_V1:
+ case ARM_PART_NEOVERSE_V2:
+ case ARM_PART_NEOVERSE_V3:
+ has_fast_pmull = 1;
+ }
+ } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) {
+ /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */
+ if (part == QUALCOMM_PART_ORYON)
+ has_fast_pmull = 1;
+ }
+#endif
+ return has_fast_pmull;
+}
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#ifdef ARCH_64BIT
+ features->has_simd = 0; /* never available */
+ features->has_neon = 1; /* always available */
+#else
+ features->has_simd = arm_has_simd();
+ features->has_neon = arm_has_neon();
+#endif
+ features->has_crc32 = arm_has_crc32();
+ features->has_pmull = arm_has_pmull();
+ features->has_eor3 = arm_has_eor3();
+ features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull();
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h
new file mode 100644
index 0000000000..2f17a9ddf0
--- /dev/null
+++ b/neozip/arch/arm/arm_features.h
@@ -0,0 +1,19 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_
+
+struct arm_cpu_features {
+ int has_simd;
+ int has_neon;
+ int has_crc32;
+ int has_pmull;
+ int has_eor3;
+ int has_fast_pmull;
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_FEATURES_H_ */
diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h
new file mode 100644
index 0000000000..bc77adb977
--- /dev/null
+++ b/neozip/arch/arm/arm_functions.h
@@ -0,0 +1,75 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#include "arm_natives.h"
+
+#ifdef ARM_NEON
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
+void slide_hash_neon(deflate_state *s);
+#endif
+
+#ifdef ARM_CRC32
+uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef ARM_PMULL_EOR3
+uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef ARM_SIMD
+void slide_hash_armv6(deflate_state *s);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// ARM - SIMD
+# ifdef ARM_SIMD_NATIVE
+# undef native_slide_hash
+# define native_slide_hash slide_hash_armv6
+# endif
+// ARM - NEON
+# ifdef ARM_NEON_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_neon
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_neon
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
+# undef native_slide_hash
+# define native_slide_hash slide_hash_neon
+# endif
+// ARM - CRC32
+# ifdef ARM_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8
+# endif
+// ARM - PMULL EOR3
+# ifdef ARM_PMULL_EOR3_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8_pmull_eor3
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8_pmull_eor3
+# endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h
new file mode 100644
index 0000000000..311e33e958
--- /dev/null
+++ b/neozip/arch/arm/arm_natives.h
@@ -0,0 +1,31 @@
+/* arm_natives.h -- ARM compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_NATIVES_H_
+#define ARM_NATIVES_H_
+
+/* ARMv6 SIMD32 (uqsub16 & friends); __ARM_FEATURE_SIMD32 is the ACLE macro. */
+#if defined(__ARM_FEATURE_SIMD32)
+# ifdef ARM_SIMD
+# define ARM_SIMD_NATIVE
+# endif
+#endif
+/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT)
+# ifdef ARM_NEON
+# define ARM_NEON_NATIVE
+# endif
+#endif
+/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */
+/* NOTE(review): the ">= 801" test assumes the newer ACLE encoding where
+ * __ARM_ARCH is major*100+minor (8.1 -> 801); older compilers define plain
+ * "8" for all v8.x, in which case only __ARM_FEATURE_CRC32 triggers here —
+ * confirm against the supported-compiler matrix. */
+#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801)
+# ifdef ARM_CRC32
+# define ARM_CRC32_NATIVE
+# endif
+#endif
+/* PMULL+EOR3 path needs CRC32, crypto (PMULL) and SHA3 (EOR3) extensions. */
+#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)
+# ifdef ARM_PMULL_EOR3
+# define ARM_PMULL_EOR3_NATIVE
+# endif
+#endif
+
+#endif /* ARM_NATIVES_H_ */
diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c
new file mode 100644
index 0000000000..a891f10fa5
--- /dev/null
+++ b/neozip/arch/arm/chunkset_neon.c
@@ -0,0 +1,81 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+#include "neon_intrins.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef uint8x16_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+/* Broadcast the 2-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t pattern = zng_memread_2(from);
+    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));
+}
+
+/* Broadcast the 4-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t pattern = zng_memread_4(from);
+    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(pattern));
+}
+
+/* Broadcast the 8-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t pattern = zng_memread_8(from);
+    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(pattern));
+}
+
+#define CHUNKSIZE chunksize_neon
+#define CHUNKCOPY chunkcopy_neon
+#define CHUNKUNROLL chunkunroll_neon
+#define CHUNKMEMSET chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+/* Load one 16-byte chunk from "s" (unaligned load is fine on NEON). */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    chunk_t loaded = vld1q_u8(s);
+    *chunk = loaded;
+}
+
+/* Store the 16-byte chunk to "out" (unaligned store). */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    chunk_t value = *chunk;
+    vst1q_u8(out, value);
+}
+
+/* Build a 16-byte "magazine" repeating the dist-byte pattern at buf across
+ * the whole chunk, via the permute LUT shared with the x86 implementations
+ * (indexing starts at dist == 3).  *chunk_rem receives how many pattern
+ * bytes the final, partial repetition leaves over. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    *chunk_rem = lut_rem.remval;
+
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 16 - dist);
+
+    /* This version of table is only available on aarch64 */
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+    uint8x16_t ret_vec = vld1q_u8(buf);
+
+    uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
+    return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+    /* 32-bit NEON lacks a single 16-byte table lookup, so permute in two
+     * 8-byte halves with vtbl1/vtbl2. */
+    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+    perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+    perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
+    a = vld1_u8(buf);
+    b = vld1_u8(buf + 8);
+    /* NOTE(review): vtbl1 can only select from 8 source bytes, so the low
+     * half of the table presumably never indexes past byte 7 — verify
+     * against chunk_128bit_perm_idx_lut.h. */
+    ret0 = vtbl1_u8(a, perm_vec0);
+    uint8x8x2_t ab;
+    ab.val[0] = a;
+    ab.val[1] = b;
+    ret1 = vtbl2_u8(ab, perm_vec1);
+    return vcombine_u8(ret0, ret1);
+#endif
+}
+
+/* Instantiate chunksize_neon/chunkcopy_neon/chunkmemset_neon/... from the
+ * shared chunkset template, using the NEON primitives defined above. */
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_neon
+
+/* Instantiate inflate_fast_neon from the shared inflate fast-path template. */
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c
new file mode 100644
index 0000000000..4ced9fc9ca
--- /dev/null
+++ b/neozip/arch/arm/compare256_neon.c
@@ -0,0 +1,56 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON)
+#include "neon_intrins.h"
+
+/* Return the number of leading bytes (0..256) that match between src0 and
+ * src1.  Compares 16 bytes per iteration with a NEON XOR, then uses
+ * zng_ctz64 on each 64-bit half to locate the first mismatching byte
+ * (assumes little-endian lane/byte order, standard on ARM). */
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t pos = 0;
+
+    while (pos < 256) {
+        uint8x16_t xorv = veorq_u8(vld1q_u8(src0), vld1q_u8(src1));
+        uint64_t half;
+
+        /* Mismatch in the low 8 bytes? */
+        half = vgetq_lane_u64(vreinterpretq_u64_u8(xorv), 0);
+        if (half)
+            return pos + zng_ctz64(half) / 8;
+        pos += 8;
+
+        /* ...or in the high 8 bytes? */
+        half = vgetq_lane_u64(vreinterpretq_u64_u8(xorv), 1);
+        if (half)
+            return pos + zng_ctz64(half) / 8;
+        pos += 8;
+
+        src0 += 16;
+        src1 += 16;
+    }
+
+    return 256;
+}
+
+/* Exported entry point; the static implementation above is also reused by
+ * the match templates below. */
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_neon_static(src0, src1);
+}
+
+/* Instantiate longest_match_neon from the shared match template. */
+#define LONGEST_MATCH longest_match_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+/* Instantiate the "slow" variant.  NOTE(review): assumes match_tpl.h
+ * consumes/undefines LONGEST_MATCH and COMPARE256 between inclusions. */
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c
new file mode 100644
index 0000000000..59f2b65009
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8.c
@@ -0,0 +1,81 @@
+/* crc32_armv8.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_CRC32
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Shared body for crc32_armv8 / crc32_copy_armv8.
+ *
+ * Computes the zlib CRC-32 of src[0..len) with the ARMv8 CRC32 instructions,
+ * optionally (COPY != 0) copying the input to dst as it is consumed.  COPY
+ * is a compile-time constant at each call site, so the copy branches fold
+ * away after inlining.  The running CRC is kept inverted (zlib pre/post
+ * conditioning): ~crc on entry, re-inverted on return. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len,
+                                                           const int COPY) {
+    uint32_t c = ~crc;
+
+    /* Fast path for single-byte updates. */
+    if (UNLIKELY(len == 1)) {
+        if (COPY)
+            *dst = *src;
+        c = __crc32b(c, *src);
+        return ~c;
+    }
+
+    /* Align to 8-byte boundary for tail processing */
+    uintptr_t align_diff = ALIGN_DIFF(src, 8);
+    if (align_diff)
+        c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY);
+
+    /* Main loop: 64 bytes per iteration.  src is 8-byte aligned by now
+     * (or len < 8 and the loop is skipped), so these loads are aligned. */
+    while (len >= 64) {
+        uint64_t d0 = *(const uint64_t *)src;
+        uint64_t d1 = *(const uint64_t *)(src + 8);
+        uint64_t d2 = *(const uint64_t *)(src + 16);
+        uint64_t d3 = *(const uint64_t *)(src + 24);
+        uint64_t d4 = *(const uint64_t *)(src + 32);
+        uint64_t d5 = *(const uint64_t *)(src + 40);
+        uint64_t d6 = *(const uint64_t *)(src + 48);
+        uint64_t d7 = *(const uint64_t *)(src + 56);
+
+        if (COPY) {
+            memcpy(dst, &d0, 8);
+            memcpy(dst + 8, &d1, 8);
+            memcpy(dst + 16, &d2, 8);
+            memcpy(dst + 24, &d3, 8);
+            memcpy(dst + 32, &d4, 8);
+            memcpy(dst + 40, &d5, 8);
+            memcpy(dst + 48, &d6, 8);
+            memcpy(dst + 56, &d7, 8);
+            dst += 64;
+        }
+
+        c = __crc32d(c, d0);
+        c = __crc32d(c, d1);
+        c = __crc32d(c, d2);
+        c = __crc32d(c, d3);
+        c = __crc32d(c, d4);
+        c = __crc32d(c, d5);
+        c = __crc32d(c, d6);
+        c = __crc32d(c, d7);
+
+        src += 64;
+        len -= 64;
+    }
+
+    /* Remaining 0..63 bytes; the tail also re-inverts the CRC. */
+    return crc32_armv8_tail(c, dst, src, len, COPY);
+}
+
+/* Compute the CRC-32 of buf[0..len) using ARMv8 CRC32 instructions. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Compute the CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+    return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+    /* Without unaligned access, interleaved stores get decomposed into byte ops */
+    crc = crc32_armv8(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h
new file mode 100644
index 0000000000..e72c4c0ad1
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_p.h
@@ -0,0 +1,103 @@
+/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CRC32_ARMV8_P_H
+#define CRC32_ARMV8_P_H
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+
+/* Consume up to align_diff leading bytes from *buf in 1/2/4/8-byte steps,
+ * folding them into the running (inverted) CRC and, when COPY is set,
+ * copying them to *dst.  buf/dst/len are advanced in place.  Each step is
+ * guarded by the remaining length, so buffers too short to fully align are
+ * still handled safely.  Loads grow in size as alignment increases, so each
+ * is naturally aligned. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf,
+                                                             size_t *len, uintptr_t align_diff, const int COPY) {
+    /* 1-byte step */
+    if (*len && (align_diff & 1)) {
+        uint8_t val = **buf;
+        if (COPY) {
+            **dst = val;
+            *dst += 1;
+        }
+        crc = __crc32b(crc, val);
+        *buf += 1;
+        *len -= 1;
+    }
+
+    /* 2-byte step */
+    if (*len >= 2 && (align_diff & 2)) {
+        uint16_t val = *((uint16_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 2);
+            *dst += 2;
+        }
+        crc = __crc32h(crc, val);
+        *buf += 2;
+        *len -= 2;
+    }
+
+    /* 4-byte step */
+    if (*len >= 4 && (align_diff & 4)) {
+        uint32_t val = *((uint32_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 4);
+            *dst += 4;
+        }
+        crc = __crc32w(crc, val);
+        *buf += 4;
+        *len -= 4;
+    }
+
+    /* 8-byte step (only when aligning to a 16-byte boundary). */
+    if (*len >= 8 && (align_diff & 8)) {
+        uint64_t val = *((uint64_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 8);
+            *dst += 8;
+        }
+        crc = __crc32d(crc, val);
+        *buf += 8;
+        *len -= 8;
+    }
+
+    return crc;
+}
+
+/* Fold the remaining len bytes into the (inverted) running CRC using the
+ * widest loads possible (8, then 4/2/1 for the last few bytes), copying to
+ * dst when COPY is set.  Returns the finalized, re-inverted CRC, so this
+ * must be the last step of a computation. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf,
+                                                            size_t len, const int COPY) {
+    while (len >= 8) {
+        uint64_t val = *((uint64_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 8);
+            dst += 8;
+        }
+        crc = __crc32d(crc, val);
+        buf += 8;
+        len -= 8;
+    }
+
+    if (len & 4) {
+        uint32_t val = *((uint32_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 4);
+            dst += 4;
+        }
+        crc = __crc32w(crc, val);
+        buf += 4;
+    }
+
+    if (len & 2) {
+        uint16_t val = *((uint16_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 2);
+            dst += 2;
+        }
+        crc = __crc32h(crc, val);
+        buf += 2;
+    }
+
+    if (len & 1) {
+        uint8_t val = *buf;
+        if (COPY)
+            *dst = val;
+        crc = __crc32b(crc, val);
+    }
+
+    /* Final inversion (zlib post-conditioning). */
+    return ~crc;
+}
+
+#endif /* CRC32_ARMV8_P_H */
diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
new file mode 100644
index 0000000000..e0d5bf043b
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
@@ -0,0 +1,366 @@
+/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension)
+ * Copyright (C) 2025 Peter Cawley
+ * https://github.com/corsix/fast-crc32
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions.
+ * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter.
+ */
+
+#ifdef ARM_PMULL_EOR3
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "acle_intrins.h"
+#include "neon_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
+#ifdef _MSC_VER
+    /* NOTE(review): presumably MSVC's vmull_p64 takes poly64x1_t operands
+     * rather than scalar poly64_t — confirm against arm64_neon.h. */
+    return vreinterpretq_u64_p128(vmull_p64(
+        vget_low_p64(vreinterpret_p64_u64(a)),
+        vget_low_p64(vreinterpret_p64_u64(b))));
+#else
+    return vreinterpretq_u64_p128(vmull_p64(
+        vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0),
+        vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0)));
+#endif
+}
+
+/* Carryless multiply high 64 bits: a[1] * b[1] */
+static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
+    /* vmull_high_p64 multiplies the upper polynomial lanes of a and b. */
+    return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
+}
+
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
+#ifdef _MSC_VER
+    /* MSVC variant: broadcast the scalars into poly64 vectors first. */
+    return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b)));
+#else
+    return vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b));
+#endif
+}
+
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+static uint32_t xnmodp(uint64_t n) {
+    uint64_t stack = ~(uint64_t)1;
+    uint32_t acc, low;
+    /* Reduce n below 192, recording the halving path in "stack" (one bit per
+     * step) so it can be replayed bottom-up with squarings afterwards. */
+    for (; n > 191; n = (n >> 1) - 16) {
+        stack = (stack << 1) + (n & 1);
+    }
+    stack = ~stack;
+    acc = ((uint32_t)0x80000000) >> (n & 31);
+    /* Each __crc32w(acc, 0) advances acc by x^32 in the CRC domain. */
+    for (n >>= 5; n; --n) {
+        acc = __crc32w(acc, 0);
+    }
+    /* Replay the recorded path: one carryless squaring per recorded bit
+     * (vmull_p8(x, x) squares a GF(2) polynomial), shifting in the bit. */
+    while ((low = stack & 1), stack >>= 1) {
+        poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
+        uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
+        acc = __crc32d(0, y << low);
+    }
+    return acc;
+}
+
+/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */
+static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
+    /* Multiplies crc by x^(8*nbytes) mod P via one carryless multiply.
+     * NOTE(review): the "- 33" exponent offset comes from the fast-crc32
+     * generator (accounts for the CRC bit-reflection and implicit x^32);
+     * confirm against upstream before changing. */
+    Assert(nbytes >= 5, "crc_shift requires nbytes >= 5");
+    return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
+}
+
+/* Shared body for crc32_armv8_pmull_eor3 / crc32_copy_armv8_pmull_eor3.
+ *
+ * Strategy (from the corsix/fast-crc32 generator): split the input into
+ * three scalar CRC lanes plus nine 16-byte PMULL folding lanes (192 bytes
+ * per iteration), then merge the partial CRCs with carryless multiplies
+ * (crc_shift).  COPY is a compile-time flag; when set, the data is also
+ * copied to dst as it is consumed.  The running CRC is kept inverted per
+ * zlib convention. */
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+                                                                  size_t len, const int COPY) {
+    uint32_t crc0 = ~crc;
+
+    /* Fast path for single-byte updates. */
+    if (UNLIKELY(len == 1)) {
+        if (COPY)
+            *dst = *src;
+        crc0 = __crc32b(crc0, *src);
+        return ~crc0;
+    }
+
+    /* Align to 16-byte boundary for vector path */
+    uintptr_t align_diff = ALIGN_DIFF(src, 16);
+    if (align_diff)
+        crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
+
+    /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
+    if (len >= 192) {
+        size_t blk = len / 192;  /* Number of 192-byte blocks */
+        size_t klen = blk * 16;  /* Scalar stride per CRC lane */
+        const uint8_t *end = src + len;
+        const uint8_t *src0 = src;
+        const uint8_t *src1 = src + klen;
+        const uint8_t *src2 = src + klen * 2;
+        const uint8_t *srcv = src + klen * 3;  /* Vector data starts after scalar lanes */
+        uint32_t crc1 = 0, crc2 = 0;
+        uint64x2_t vc0, vc1, vc2;
+        uint64_t vc;
+
+        /* Load first 9 vector chunks (144 bytes) */
+        uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+        uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+        uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+        uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+        uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+        uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+        uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+        uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+        uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
+        uint64x2_t k;
+        /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
+        { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
+
+        /* Per-region dst pointers */
+        uint8_t *dst0 = dst;
+        uint8_t *dst1 = NULL;
+        uint8_t *dst2 = NULL;
+        uint8_t *dst_v = NULL;
+
+        if (COPY) {
+            dst1 = dst + klen;
+            dst2 = dst + klen * 2;
+            dst_v = dst + klen * 3;
+            vst1q_u8(dst_v, vreinterpretq_u8_u64(x0));
+            vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1));
+            vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2));
+            vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3));
+            vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4));
+            vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5));
+            vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6));
+            vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7));
+            vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8));
+            dst_v += 144;
+        }
+        srcv += 144;
+
+        /* Fold 9 vectors + 3-way parallel scalar CRC */
+        if (blk > 1) {
+            /* Only form a limit pointer when we have at least 2 blocks. */
+            const uint8_t *limit = src0 + klen - 32;
+            while (src0 <= limit) {
+                /* Fold all 9 vector lanes using PMULL */
+                y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+                y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+                y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+                y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+                y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+                y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+                y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+                y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+                y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
+
+                /* EOR3: combine hi*k, lo*k, and new data in one instruction */
+                {
+                    uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128);
+                    uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128);
+                    uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128);
+                    uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128);
+                    uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128);
+                    uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128);
+                    uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128);
+                    uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128);
+                    uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128);
+                    if (COPY) {
+                        vst1q_u8(dst_v, vreinterpretq_u8_u64(d0));
+                        vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1));
+                        vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2));
+                        vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3));
+                        vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4));
+                        vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5));
+                        vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6));
+                        vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7));
+                        vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8));
+                        dst_v += 144;
+                    }
+                    x0 = veor3q_u64(x0, y0, d0);
+                    x1 = veor3q_u64(x1, y1, d1);
+                    x2 = veor3q_u64(x2, y2, d2);
+                    x3 = veor3q_u64(x3, y3, d3);
+                    x4 = veor3q_u64(x4, y4, d4);
+                    x5 = veor3q_u64(x5, y5, d5);
+                    x6 = veor3q_u64(x6, y6, d6);
+                    x7 = veor3q_u64(x7, y7, d7);
+                    x8 = veor3q_u64(x8, y8, d8);
+                }
+
+                /* 3-way parallel scalar CRC (16 bytes each) */
+                {
+                    uint64_t s0a = *(const uint64_t*)src0;
+                    uint64_t s0b = *(const uint64_t*)(src0 + 8);
+                    uint64_t s1a = *(const uint64_t*)src1;
+                    uint64_t s1b = *(const uint64_t*)(src1 + 8);
+                    uint64_t s2a = *(const uint64_t*)src2;
+                    uint64_t s2b = *(const uint64_t*)(src2 + 8);
+                    if (COPY) {
+                        memcpy(dst0, &s0a, 8);
+                        memcpy(dst0 + 8, &s0b, 8);
+                        dst0 += 16;
+                        memcpy(dst1, &s1a, 8);
+                        memcpy(dst1 + 8, &s1b, 8);
+                        dst1 += 16;
+                        memcpy(dst2, &s2a, 8);
+                        memcpy(dst2 + 8, &s2b, 8);
+                        dst2 += 16;
+                    }
+                    crc0 = __crc32d(crc0, s0a);
+                    crc0 = __crc32d(crc0, s0b);
+                    crc1 = __crc32d(crc1, s1a);
+                    crc1 = __crc32d(crc1, s1b);
+                    crc2 = __crc32d(crc2, s2a);
+                    crc2 = __crc32d(crc2, s2b);
+                }
+                src0 += 16;
+                src1 += 16;
+                src2 += 16;
+                srcv += 144;
+            }
+        }
+
+        /* Reduce 9 vectors to 1 using tree reduction */
+        /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
+        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        x0 = veor3q_u64(x0, y0, x1);
+        x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
+
+        /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+        y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+        y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+        x0 = veor3q_u64(x0, y0, x1);
+        x2 = veor3q_u64(x2, y2, x3);
+        x4 = veor3q_u64(x4, y4, x5);
+        x6 = veor3q_u64(x6, y6, x7);
+
+        /* Step 3: fold pairs (x0,x2), (x4,x6) */
+        { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+        x0 = veor3q_u64(x0, y0, x2);
+        x4 = veor3q_u64(x4, y4, x6);
+
+        /* Step 4: final fold (x0, x4) -> x0 */
+        { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        x0 = veor3q_u64(x0, y0, x4);
+
+        /* Process final scalar chunk */
+        {
+            uint64_t s0a = *(const uint64_t*)src0;
+            uint64_t s0b = *(const uint64_t*)(src0 + 8);
+            uint64_t s1a = *(const uint64_t*)src1;
+            uint64_t s1b = *(const uint64_t*)(src1 + 8);
+            uint64_t s2a = *(const uint64_t*)src2;
+            uint64_t s2b = *(const uint64_t*)(src2 + 8);
+            if (COPY) {
+                memcpy(dst0, &s0a, 8);
+                memcpy(dst0 + 8, &s0b, 8);
+                memcpy(dst1, &s1a, 8);
+                memcpy(dst1 + 8, &s1b, 8);
+                memcpy(dst2, &s2a, 8);
+                memcpy(dst2 + 8, &s2b, 8);
+            }
+            crc0 = __crc32d(crc0, s0a);
+            crc0 = __crc32d(crc0, s0b);
+            crc1 = __crc32d(crc1, s1a);
+            crc1 = __crc32d(crc1, s1b);
+            crc2 = __crc32d(crc2, s2a);
+            crc2 = __crc32d(crc2, s2b);
+        }
+
+        /* Shift and combine 3 scalar CRCs (each lane is shifted by the number
+         * of bytes that follow it in stream order). */
+        vc0 = crc_shift(crc0, klen * 2 + blk * 144);
+        vc1 = crc_shift(crc1, klen + blk * 144);
+        vc2 = crc_shift(crc2, blk * 144);
+        vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
+
+        /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
+        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
+        crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
+        if (COPY)
+            dst += blk * 192;
+        src = srcv;
+        len = end - srcv;
+    }
+
+    /* 3-way scalar CRC (24 bytes/iter) */
+    if (len >= 80) {
+        size_t klen = ((len - 8) / 24) * 8;  /* Stride for 3-way parallel */
+        const uint8_t *buf0 = src;
+        const uint8_t *buf1 = src + klen;
+        const uint8_t *buf2 = src + klen * 2;
+        uint32_t crc1 = 0, crc2 = 0;
+        uint64x2_t vc0, vc1;
+        uint64_t vc;
+
+        /* Per-lane dst pointers */
+        uint8_t *dst0 = dst;
+        uint8_t *dst1 = NULL;
+        uint8_t *dst2 = NULL;
+        if (COPY) {
+            dst1 = dst + klen;
+            dst2 = dst + klen * 2;
+        }
+
+        /* 3-way parallel scalar CRC */
+        do {
+            uint64_t v0 = *(const uint64_t*)buf0;
+            uint64_t v1 = *(const uint64_t*)buf1;
+            uint64_t v2 = *(const uint64_t*)buf2;
+            if (COPY) {
+                memcpy(dst0, &v0, 8);
+                dst0 += 8;
+                memcpy(dst1, &v1, 8);
+                dst1 += 8;
+                memcpy(dst2, &v2, 8);
+                dst2 += 8;
+            }
+            crc0 = __crc32d(crc0, v0);
+            crc1 = __crc32d(crc1, v1);
+            crc2 = __crc32d(crc2, v2);
+            buf0 += 8;
+            buf1 += 8;
+            buf2 += 8;
+            len -= 24;
+        } while (len >= 32);
+
+        /* Combine the 3 CRCs */
+        vc0 = crc_shift(crc0, klen * 2 + 8);
+        vc1 = crc_shift(crc1, klen + 8);
+        vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
+
+        /* Process final 8 bytes with combined CRC */
+        crc0 = crc2;
+        {
+            uint64_t vf = *(const uint64_t*)buf2;
+            if (COPY)
+                memcpy(dst2, &vf, 8);
+            crc0 = __crc32d(crc0, vf ^ vc);
+        }
+        src = buf2 + 8;
+        len -= 8;
+        if (COPY)
+            dst = dst2 + 8;
+    }
+
+    /* Process remaining bytes */
+    return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+/* Compute the CRC-32 of buf[0..len) using PMULL + EOR3 folding. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Compute the CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+    return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+    /* Without unaligned access, interleaved stores get decomposed into byte ops */
+    crc = crc32_armv8_pmull_eor3(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h
new file mode 100644
index 0000000000..449916e0b7
--- /dev/null
+++ b/neozip/arch/arm/neon_intrins.h
@@ -0,0 +1,79 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT)
+/* arm64_neon.h is MSVC specific */
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+ out.val[0] = vqsubq_u16(a.val[0], b); \
+ out.val[1] = vqsubq_u16(a.val[1], b); \
+ out.val[2] = vqsubq_u16(a.val[2], b); \
+ out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \
+ (!defined(__clang_major__) || __clang_major__ < 20)
+/* Clang versions before 20 have too strict of an
+ * alignment requirement (:256) for x4 NEON intrinsics */
+# undef ARM_NEON_HASLD4
+# undef vld1q_u16_x4
+# undef vld1q_u8_x4
+# undef vst1q_u16_x4
+# endif
+
+# ifndef ARM_NEON_HASLD4
+
+/* Fallback for toolchains lacking the vld1q_u16_x4 intrinsic: four
+ * consecutive 8-lane loads (32 uint16_t in total). */
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+    uint16x8x4_t ret;
+    int i;
+    for (i = 0; i < 4; i++) {
+        ret.val[i] = vld1q_u16(a);
+        a += 8;
+    }
+    return ret;
+}
+
+/* Fallback for toolchains lacking the vld1q_u8_x4 intrinsic: four
+ * consecutive 16-lane loads (64 bytes in total). */
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+    uint8x16x4_t ret;
+    int i;
+    for (i = 0; i < 4; i++) {
+        ret.val[i] = vld1q_u8(a);
+        a += 16;
+    }
+    return ret;
+}
+
+/* Fallback for toolchains lacking the vst1q_u16_x4 intrinsic: four
+ * consecutive 8-lane stores (32 uint16_t in total). */
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    int i;
+    for (i = 0; i < 4; i++) {
+        vst1q_u16(p, a.val[i]);
+        p += 8;
+    }
+}
+# endif // HASLD4 check
+
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c
new file mode 100644
index 0000000000..b241e6c5e6
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,49 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_SIMD
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* SIMD version of hash_chain rebase
+ *
+ * Saturating-subtracts wsize from every 16-bit table entry (entries that
+ * would go negative clamp to zero), two entries per __uqsub16 and
+ * 8 entries (16 bytes) per loop iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x2_t v;
+    uint16x2_t p0, p1, p2, p3;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    /* The loop consumes sizeof(uint16x2_t) * 4 = 16 bytes per iteration. */
+    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    /* Broadcast wsize into both 16-bit halves of the 32-bit SIMD word. */
+    v = wsize | (wsize << 16);
+
+    n = size / (sizeof(uint16x2_t) * 4);
+    do {
+        p0 = *((const uint16x2_t *)(table));
+        p1 = *((const uint16x2_t *)(table+2));
+        p2 = *((const uint16x2_t *)(table+4));
+        p3 = *((const uint16x2_t *)(table+6));
+        p0 = __uqsub16(p0, v);
+        p1 = __uqsub16(p1, v);
+        p2 = __uqsub16(p2, v);
+        p3 = __uqsub16(p3, v);
+        *((uint16x2_t *)(table)) = p0;
+        *((uint16x2_t *)(table+2)) = p1;
+        *((uint16x2_t *)(table+4)) = p2;
+        *((uint16x2_t *)(table+6)) = p3;
+        table += 8;
+    } while (--n);
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides by
+ * w_size: every stored position is reduced by w_size, saturating at zero. */
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c
new file mode 100644
index 0000000000..2f9e94a33d
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_neon.c
@@ -0,0 +1,48 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* SIMD version of hash_chain rebase
+ *
+ * Saturating-subtracts wsize from every 16-bit table entry (entries that
+ * would go negative clamp to zero), 64 entries (128 bytes) per iteration
+ * using eight NEON registers. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x8_t v;
+    uint16x8x4_t p0, p1;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    /* The loop consumes sizeof(uint16x8_t) * 8 = 128 bytes per iteration,
+     * so size must be a multiple of that.  The multiplication must be
+     * parenthesized: "size % sizeof(uint16x8_t) * 8" parses as
+     * "(size % sizeof(uint16x8_t)) * 8" (% and * are left-associative with
+     * equal precedence) and would only assert 16-byte divisibility.
+     * This now matches the equivalent assert in slide_hash_armv6.c. */
+    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(wsize);
+
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p0 = vld1q_u16_x4_ex(table, 256);
+        p1 = vld1q_u16_x4_ex(table+32, 256);
+        vqsubq_u16_x4_x1(p0, p0, v);
+        vqsubq_u16_x4_x1(p1, p1, v);
+        vst1q_u16_x4_ex(table, p0, 256);
+        vst1q_u16_x4_ex(table+32, p1, 256);
+        table += 64;
+    } while (--n);
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides by
+ * w_size: every stored position is reduced by w_size, saturating at zero. */
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif