summary refs log tree commit diff
path: root/neozip/arch/arm
diff options
context:
space:
mode:
Diffstat (limited to 'neozip/arch/arm')
-rw-r--r--  neozip/arch/arm/Makefile.in                 86
-rw-r--r--  neozip/arch/arm/acle_intrins.h              90
-rw-r--r--  neozip/arch/arm/adler32_neon.c             346
-rw-r--r--  neozip/arch/arm/arm_features.c             334
-rw-r--r--  neozip/arch/arm/arm_features.h              19
-rw-r--r--  neozip/arch/arm/arm_functions.h             75
-rw-r--r--  neozip/arch/arm/arm_natives.h               31
-rw-r--r--  neozip/arch/arm/chunkset_neon.c             81
-rw-r--r--  neozip/arch/arm/compare256_neon.c           56
-rw-r--r--  neozip/arch/arm/crc32_armv8.c               81
-rw-r--r--  neozip/arch/arm/crc32_armv8_p.h            103
-rw-r--r--  neozip/arch/arm/crc32_armv8_pmull_eor3.c   366
-rw-r--r--  neozip/arch/arm/neon_intrins.h              79
-rw-r--r--  neozip/arch/arm/slide_hash_armv6.c          49
-rw-r--r--  neozip/arch/arm/slide_hash_neon.c           48
15 files changed, 1844 insertions, 0 deletions
diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in
new file mode 100644
index 0000000000..d0bfe0e172
--- /dev/null
+++ b/neozip/arch/arm/Makefile.in
@@ -0,0 +1,86 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ARMV8FLAG=
+PMULLEOR3FLAG=
+NEONFLAG=
+ARMV6FLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+ adler32_neon.o adler32_neon.lo \
+ arm_features.o arm_features.lo \
+ chunkset_neon.o chunkset_neon.lo \
+ compare256_neon.o compare256_neon.lo \
+ crc32_armv8.o crc32_armv8.lo \
+ crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \
+ slide_hash_neon.o slide_hash_neon.lo \
+ slide_hash_armv6.o slide_hash_armv6.lo
+
+adler32_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_armv8.o:
+ $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8.lo:
+ $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c
+
+crc32_armv8_pmull_eor3.o:
+ $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+crc32_armv8_pmull_eor3.lo:
+ $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c
+
+slide_hash_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_armv6.o:
+ $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+ $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h
new file mode 100644
index 0000000000..16f5e2c77c
--- /dev/null
+++ b/neozip/arch/arm/acle_intrins.h
@@ -0,0 +1,90 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+# include <arm_acle.h>
+#endif
+
+#ifdef ARM_CRC32
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+# define Z_TARGET_CRC Z_TARGET("+crc")
+#else
+# define Z_TARGET_CRC
+#endif
+#ifdef ARM_PMULL_EOR3
+# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3")
+#else
+# define Z_TARGET_PMULL_EOR3
+#endif
+
+#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER)
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ uint32_t __c;
+ __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#else
+static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) {
+ uint32_t __c;
+ __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) {
+ uint32_t __c;
+ __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) {
+ uint32_t __c;
+ __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+
+static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) {
+ return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32);
+}
+#endif
+#endif
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+ uint16x2_t __c;
+ __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c
new file mode 100644
index 0000000000..48532e6cd1
--- /dev/null
+++ b/neozip/arch/arm/adler32_neon.c
@@ -0,0 +1,346 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "adler32_p.h"
+
+static const uint16_t ALIGNED_(64) taps[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57,
+ 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41,
+ 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4, 3, 2, 1 };
+
+Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
+
+ vst1q_u8(dst, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += 64;
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make do summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0);
+ hsum.val[1] = vpaddlq_u8(d1);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0));
+ s2_1 = vaddw_high_u8(s2_1, d0);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d1));
+ s2_3 = vaddw_high_u8(s2_3, d1);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d2));
+ s2_5 = vaddw_high_u8(s2_5, d2);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d3));
+ s2_7 = vaddw_high_u8(s2_7, d3);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ vst1q_u8(dst, d0);
+ dst += 16;
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ uint32x4_t adacc = vdupq_n_u32(0);
+ uint32x4_t s2acc = vdupq_n_u32(0);
+ uint32x4_t s2acc_0 = vdupq_n_u32(0);
+ uint32x4_t s2acc_1 = vdupq_n_u32(0);
+ uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+ adacc = vsetq_lane_u32(s[0], adacc, 0);
+ s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+ uint32x4_t s3acc = vdupq_n_u32(0);
+ uint32x4_t adacc_prev = adacc;
+
+ uint16x8_t s2_0, s2_1, s2_2, s2_3;
+ s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+ uint16x8_t s2_4, s2_5, s2_6, s2_7;
+ s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+ size_t num_iter = len >> 2;
+ int rem = len & 3;
+
+ for (size_t i = 0; i < num_iter; ++i) {
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make do summing to 16 bits first */
+ uint16x8x2_t hsum, hsum_fold;
+ hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+ hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+ hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+ hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ s3acc = vaddq_u32(s3acc, adacc_prev);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+ /* If we do straight widening additions to the 16 bit values, we don't incur
+ * the usual penalties of a pairwise add. We can defer the multiplications
+ * until the very end. These will not overflow because we are incurring at
+ * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+ * summed into once. This means for the maximum input size, the largest value
+ * we will see is 255 * 102 = 26010, safely under uint16 max */
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+ s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+ s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+ s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+ adacc_prev = adacc;
+ buf += 64;
+ }
+
+ s3acc = vshlq_n_u32(s3acc, 6);
+
+ if (rem) {
+ uint32x4_t s3acc_0 = vdupq_n_u32(0);
+ while (rem--) {
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint16x8_t adler;
+ adler = vpaddlq_u8(d0);
+ s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+ s2_7 = vaddw_high_u8(s2_7, d0);
+ adacc = vpadalq_u16(adacc, adler);
+ s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+ adacc_prev = adacc;
+ buf += 16;
+ }
+
+ s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+ s3acc = vaddq_u32(s3acc_0, s3acc);
+ }
+
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+ s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+ s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+ s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+ s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+ s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+ s2acc = vaddq_u32(s2acc_0, s2acc);
+ s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+ s2acc = vaddq_u32(s2acc, s2acc_2);
+
+ uint32x2_t adacc2, s2acc2, as;
+ s2acc = vaddq_u32(s2acc, s3acc);
+ adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+ s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+ as = vpadd_u32(adacc2, s2acc2);
+ s[0] = vget_lane_u32(as, 0);
+ s[1] = vget_lane_u32(as, 1);
+}
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ uint32_t pair[2];
+
+ /* Split Adler-32 into component sums, it can be supplied by
+ * the caller sites (e.g. in a PNG file).
+ */
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* If memory is not SIMD aligned, do scalar sums to an aligned
+ * offset, provided that doing so doesn't completely eliminate
+ * SIMD operation. Aligned loads are still faster on ARM, even
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we use fallback to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
+ size_t align_diff = MIN(ALIGN_DIFF(src, 32), len);
+ size_t n = NMAX_ALIGNED32;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY);
+
+ if (COPY)
+ dst += align_diff;
+ src += align_diff;
+ len -= align_diff;
+ n = ALIGN_DOWN(n - align_diff, 32);
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ if (COPY)
+ NEON_accum32_copy(pair, dst, src, n >> 4);
+ else
+ NEON_accum32(pair, src, n >> 4);
+
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n >> 4) << 4;
+ src += k;
+ if (COPY)
+ dst += k;
+ len -= k;
+ n = NMAX_ALIGNED32;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+ return adler32_copy_impl(adler, dst, src, len, 1);
+#else
+ /* Without unaligned access, interleaved stores get decomposed into byte ops */
+ adler = adler32_neon(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+#endif
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c
new file mode 100644
index 0000000000..8f179526ef
--- /dev/null
+++ b/neozip/arch/arm/arm_features.c
@@ -0,0 +1,334 @@
+#ifdef ARM_FEATURES
+
+#include "zbuild.h"
+#include "arm_features.h"
+
+#if defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+# ifdef ARM_ASM_HWCAP
+# include <asm/hwcap.h>
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# ifndef ID_AA64ISAR0_CRC32_VAL
+# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# include <machine/armreg.h>
+# include <machine/cpu.h>
+# include <sys/sysctl.h>
+# include <sys/types.h>
+#elif defined(__APPLE__)
+# if !defined(_DARWIN_C_SOURCE)
+# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+# endif
+# include <sys/sysctl.h>
+#elif defined(_WIN32)
+# include <windows.h>
+#endif
+
+static int arm_has_crc32(void) {
+ int has_crc32 = 0;
+#if defined(__ARM_FEATURE_CRC32)
+ /* Compile-time check */
+ has_crc32 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CRC32
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_crc32 = (hwcap & HWCAP_CRC32) != 0;
+# elif defined(HWCAP2_CRC32)
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ has_crc32 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#endif
+ return has_crc32;
+}
+
+static int arm_has_pmull(void) {
+ int has_pmull = 0;
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
+ /* Compile-time check */
+ has_pmull = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_PMULL
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_PMULL) != 0;
+# elif defined(HWCAP_AES)
+ /* PMULL is part of crypto extension, check for AES as proxy */
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_pmull = (hwcap & HWCAP_AES) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* Check for AES feature as PMULL is part of crypto extension */
+ has_pmull = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE;
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE;
+ }
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_WIN32)
+ /* Windows checks for crypto/AES support */
+# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE
+ has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_pmull;
+}
+
+static int arm_has_eor3(void) {
+ int has_eor3 = 0;
+#if defined(__ARM_FEATURE_SHA3)
+ /* Compile-time check */
+ has_eor3 = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */
+# ifdef HWCAP2_SHA3
+ has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP2_SHA3
+ unsigned long hwcap2 = 0;
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+ has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0;
+# elif defined(HWCAP_SHA3)
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_eor3 = (hwcap & HWCAP_SHA3) != 0;
+# endif
+#elif defined(__FreeBSD__) && defined(ARCH_64BIT)
+ /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */
+# ifdef ID_AA64ISAR0_SHA3_VAL
+ has_eor3 = getenv("QEMU_EMULATING") == NULL
+ && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_SHA3_BASE;
+# endif
+#elif defined(__OpenBSD__) && defined(ARCH_64BIT)
+# ifdef ID_AA64ISAR0_SHA3
+ int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+ uint64_t isar0 = 0;
+ size_t len = sizeof(isar0);
+ if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+ has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL;
+ }
+# endif
+#elif defined(__APPLE__)
+ /* All Apple Silicon (M1+) has SHA3/EOR3 support */
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ /* Fallback to legacy name for older macOS versions */
+ if (!has_eor3) {
+ size = sizeof(has_feat);
+ has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+ }
+#elif defined(_WIN32)
+# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE
+ has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE);
+# endif
+#endif
+ return has_eor3;
+}
+
+/* AArch64 has neon. */
+#ifdef ARCH_32BIT
+static inline int arm_has_neon(void) {
+ int has_neon = 0;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+ /* Compile-time check */
+ has_neon = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_ARM_NEON
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+# elif defined(HWCAP_NEON)
+ has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_NEON
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_neon = (hwcap & HWCAP_NEON) != 0;
+# endif
+#elif defined(__APPLE__)
+ int has_feat = 0;
+ size_t size = sizeof(has_feat);
+ has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0
+ && has_feat == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+ has_neon = 1; /* Always supported */
+# endif
+#endif
+ return has_neon;
+}
+#endif
+
+/* AArch64 does not have ARMv6 SIMD. */
+#ifdef ARCH_32BIT
+static inline int arm_has_simd(void) {
+ int has_simd = 0;
+#if defined(__ARM_FEATURE_SIMD32)
+ /* Compile-time check for ARMv6 SIMD */
+ has_simd = 1;
+#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ const char *platform = (const char *)getauxval(AT_PLATFORM);
+ has_simd = platform
+ && (strncmp(platform, "v6l", 3) == 0
+ || strncmp(platform, "v7l", 3) == 0
+ || strncmp(platform, "v8l", 3) == 0);
+#endif
+ return has_simd;
+}
+#endif
+
+#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32)
+/* MIDR_EL1 bit field definitions */
+#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24)
+#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4)
+
+/* ARM CPU Implementer IDs */
+#define ARM_IMPLEMENTER_ARM 0x41
+#define ARM_IMPLEMENTER_QUALCOMM 0x51
+#define ARM_IMPLEMENTER_APPLE 0x61
+
+/* ARM CPU Part Numbers */
+
+/* Cortex-X series - Multiple PMULL lanes */
+#define ARM_PART_CORTEX_X1 0xd44
+#define ARM_PART_CORTEX_X1C 0xd4c
+#define ARM_PART_CORTEX_X2 0xd48
+#define ARM_PART_CORTEX_X3 0xd4e
+#define ARM_PART_CORTEX_X4 0xd82
+#define ARM_PART_CORTEX_X925 0xd85
+
+/* Neoverse V/N2 series - Multiple PMULL lanes */
+#define ARM_PART_NEOVERSE_N2 0xd49
+#define ARM_PART_NEOVERSE_V1 0xd40
+#define ARM_PART_NEOVERSE_V2 0xd4f
+#define ARM_PART_NEOVERSE_V3 0xd8e
+
+/* Snapdragon X Elite/Plus - Custom core */
+#define QUALCOMM_PART_ORYON 0x001
+
+static inline int arm_has_cpuid(void) {
+ int has_cpuid = 0;
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0;
+# elif defined(HWCAP2_CPUID)
+ has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0;
+# endif
+#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H)
+# ifdef HWCAP_CPUID
+ unsigned long hwcap = 0;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ has_cpuid = (hwcap & HWCAP_CPUID) != 0;
+# endif
+#endif
+ return has_cpuid;
+}
+#endif
+
+/* Determine if CPU has fast PMULL (multiple execution units) */
+static inline int arm_cpu_has_fast_pmull(void) {
+ int has_fast_pmull = 0;
+#if defined(__APPLE__)
+ /* On macOS, all Apple Silicon has fast PMULL */
+ has_fast_pmull = 1;
+#elif defined(ARCH_64BIT) && !defined(_WIN32)
+ /* We need CPUID feature to read MIDR register */
+ if (!arm_has_cpuid())
+ return has_fast_pmull;
+
+ uint64_t midr;
+ __asm__ ("mrs %0, midr_el1" : "=r" (midr));
+
+ uint32_t implementer = MIDR_IMPLEMENTOR(midr);
+ uint32_t part = MIDR_PARTNUM(midr);
+
+ if (implementer == ARM_IMPLEMENTER_APPLE) {
+ /* All Apple Silicon (M1+) have fast PMULL */
+ has_fast_pmull = 1;
+ } else if (implementer == ARM_IMPLEMENTER_ARM) {
+ /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */
+ switch (part) {
+ case ARM_PART_CORTEX_X1:
+ case ARM_PART_CORTEX_X1C:
+ case ARM_PART_CORTEX_X2:
+ case ARM_PART_CORTEX_X3:
+ case ARM_PART_CORTEX_X4:
+ case ARM_PART_CORTEX_X925:
+ case ARM_PART_NEOVERSE_N2:
+ case ARM_PART_NEOVERSE_V1:
+ case ARM_PART_NEOVERSE_V2:
+ case ARM_PART_NEOVERSE_V3:
+ has_fast_pmull = 1;
+ }
+ } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) {
+ /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */
+ if (part == QUALCOMM_PART_ORYON)
+ has_fast_pmull = 1;
+ }
+#endif
+ return has_fast_pmull;
+}
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#ifdef ARCH_64BIT
+ features->has_simd = 0; /* never available */
+ features->has_neon = 1; /* always available */
+#else
+ features->has_simd = arm_has_simd();
+ features->has_neon = arm_has_neon();
+#endif
+ features->has_crc32 = arm_has_crc32();
+ features->has_pmull = arm_has_pmull();
+ features->has_eor3 = arm_has_eor3();
+ features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull();
+}
+
+#endif
diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h
new file mode 100644
index 0000000000..2f17a9ddf0
--- /dev/null
+++ b/neozip/arch/arm/arm_features.h
@@ -0,0 +1,19 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FEATURES_H_
+#define ARM_FEATURES_H_
+
+struct arm_cpu_features {
+ int has_simd;
+ int has_neon;
+ int has_crc32;
+ int has_pmull;
+ int has_eor3;
+ int has_fast_pmull;
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_FEATURES_H_ */
diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h
new file mode 100644
index 0000000000..bc77adb977
--- /dev/null
+++ b/neozip/arch/arm/arm_functions.h
@@ -0,0 +1,75 @@
+/* arm_functions.h -- ARM implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_FUNCTIONS_H_
+#define ARM_FUNCTIONS_H_
+
+#include "arm_natives.h"
+
+#ifdef ARM_NEON
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match);
+void slide_hash_neon(deflate_state *s);
+#endif
+
+#ifdef ARM_CRC32
+uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef ARM_PMULL_EOR3
+uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef ARM_SIMD
+void slide_hash_armv6(deflate_state *s);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// ARM - SIMD
+# ifdef ARM_SIMD_NATIVE
+# undef native_slide_hash
+# define native_slide_hash slide_hash_armv6
+# endif
+// ARM - NEON
+# ifdef ARM_NEON_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_neon
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_neon
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_neon
+# undef native_compare256
+# define native_compare256 compare256_neon
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_neon
+# undef native_longest_match
+# define native_longest_match longest_match_neon
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_neon
+# undef native_slide_hash
+# define native_slide_hash slide_hash_neon
+# endif
+// ARM - CRC32
+# ifdef ARM_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8
+# endif
+// ARM - PMULL EOR3
+# ifdef ARM_PMULL_EOR3_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_armv8_pmull_eor3
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_armv8_pmull_eor3
+# endif
+#endif
+
+#endif /* ARM_FUNCTIONS_H_ */
diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h
new file mode 100644
index 0000000000..311e33e958
--- /dev/null
+++ b/neozip/arch/arm/arm_natives.h
@@ -0,0 +1,31 @@
+/* arm_natives.h -- ARM compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_NATIVES_H_
+#define ARM_NATIVES_H_
+
+/* ARMv6 SIMD32 (uqsub16 & friends); __ARM_FEATURE_SIMD32 is the ACLE macro. */
+#if defined(__ARM_FEATURE_SIMD32)
+# ifdef ARM_SIMD
+# define ARM_SIMD_NATIVE
+# endif
+#endif
+/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT)
+# ifdef ARM_NEON
+# define ARM_NEON_NATIVE
+# endif
+#endif
+/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */
+/* NOTE(review): the ">= 801" test assumes the newer ACLE encoding where
+ * __ARM_ARCH is major*100+minor (8.1 -> 801); older compilers define plain
+ * "8" for all v8.x, in which case only __ARM_FEATURE_CRC32 triggers here —
+ * confirm against the supported-compiler matrix. */
+#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801)
+# ifdef ARM_CRC32
+# define ARM_CRC32_NATIVE
+# endif
+#endif
+/* PMULL+EOR3 path needs CRC32, crypto (PMULL) and SHA3 (EOR3) extensions. */
+#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3)
+# ifdef ARM_PMULL_EOR3
+# define ARM_PMULL_EOR3_NATIVE
+# endif
+#endif
+
+#endif /* ARM_NATIVES_H_ */
diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c
new file mode 100644
index 0000000000..a891f10fa5
--- /dev/null
+++ b/neozip/arch/arm/chunkset_neon.c
@@ -0,0 +1,81 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "zsanitizer.h"
+#include "zmemory.h"
+#include "neon_intrins.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef uint8x16_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+
+/* Broadcast the 2-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t pattern = zng_memread_2(from);
+    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));
+}
+
+/* Broadcast the 4-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t pattern = zng_memread_4(from);
+    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(pattern));
+}
+
+/* Broadcast the 8-byte pattern at "from" across every lane of the chunk. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t pattern = zng_memread_8(from);
+    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(pattern));
+}
+
+#define CHUNKSIZE chunksize_neon
+#define CHUNKCOPY chunkcopy_neon
+#define CHUNKUNROLL chunkunroll_neon
+#define CHUNKMEMSET chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+/* Load one 16-byte chunk from "s" (unaligned load is fine on NEON). */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    chunk_t loaded = vld1q_u8(s);
+    *chunk = loaded;
+}
+
+/* Store the 16-byte chunk to "out" (unaligned store). */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    chunk_t value = *chunk;
+    vst1q_u8(out, value);
+}
+
+/* Build a 16-byte "magazine" repeating the dist-byte pattern at buf across
+ * the whole chunk, via the permute LUT shared with the x86 implementations
+ * (indexing starts at dist == 3).  *chunk_rem receives how many pattern
+ * bytes the final, partial repetition leaves over. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    *chunk_rem = lut_rem.remval;
+
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 16 - dist);
+
+    /* This version of table is only available on aarch64 */
+#if defined(ARCH_ARM) && defined(ARCH_64BIT)
+    uint8x16_t ret_vec = vld1q_u8(buf);
+
+    uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
+    return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+    /* 32-bit NEON lacks a single 16-byte table lookup, so permute in two
+     * 8-byte halves with vtbl1/vtbl2. */
+    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+    perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+    perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
+    a = vld1_u8(buf);
+    b = vld1_u8(buf + 8);
+    /* NOTE(review): vtbl1 can only select from 8 source bytes, so the low
+     * half of the table presumably never indexes past byte 7 — verify
+     * against chunk_128bit_perm_idx_lut.h. */
+    ret0 = vtbl1_u8(a, perm_vec0);
+    uint8x8x2_t ab;
+    ab.val[0] = a;
+    ab.val[1] = b;
+    ret1 = vtbl2_u8(ab, perm_vec1);
+    return vcombine_u8(ret0, ret1);
+#endif
+}
+
+/* Instantiate chunksize_neon/chunkcopy_neon/chunkmemset_neon/... from the
+ * shared chunkset template, using the NEON primitives defined above. */
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_neon
+
+/* Instantiate inflate_fast_neon from the shared inflate fast-path template. */
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c
new file mode 100644
index 0000000000..4ced9fc9ca
--- /dev/null
+++ b/neozip/arch/arm/compare256_neon.c
@@ -0,0 +1,56 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON)
+#include "neon_intrins.h"
+
+/* Return the number of leading bytes (0..256) that match between src0 and
+ * src1.  Compares 16 bytes per iteration with a NEON XOR, then uses
+ * zng_ctz64 on each 64-bit half to locate the first mismatching byte
+ * (assumes little-endian lane/byte order, standard on ARM). */
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t pos = 0;
+
+    while (pos < 256) {
+        uint8x16_t xorv = veorq_u8(vld1q_u8(src0), vld1q_u8(src1));
+        uint64_t half;
+
+        /* Mismatch in the low 8 bytes? */
+        half = vgetq_lane_u64(vreinterpretq_u64_u8(xorv), 0);
+        if (half)
+            return pos + zng_ctz64(half) / 8;
+        pos += 8;
+
+        /* ...or in the high 8 bytes? */
+        half = vgetq_lane_u64(vreinterpretq_u64_u8(xorv), 1);
+        if (half)
+            return pos + zng_ctz64(half) / 8;
+        pos += 8;
+
+        src0 += 16;
+        src1 += 16;
+    }
+
+    return 256;
+}
+
+/* Exported entry point; the static implementation above is also reused by
+ * the match templates below. */
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_neon_static(src0, src1);
+}
+
+/* Instantiate longest_match_neon from the shared match template. */
+#define LONGEST_MATCH longest_match_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+/* Instantiate the "slow" variant.  NOTE(review): assumes match_tpl.h
+ * consumes/undefines LONGEST_MATCH and COMPARE256 between inclusions. */
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c
new file mode 100644
index 0000000000..59f2b65009
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8.c
@@ -0,0 +1,81 @@
+/* crc32_armv8.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_CRC32
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Shared body for crc32_armv8 / crc32_copy_armv8.
+ *
+ * Computes the zlib CRC-32 of src[0..len) with the ARMv8 CRC32 instructions,
+ * optionally (COPY != 0) copying the input to dst as it is consumed.  COPY
+ * is a compile-time constant at each call site, so the copy branches fold
+ * away after inlining.  The running CRC is kept inverted (zlib pre/post
+ * conditioning): ~crc on entry, re-inverted on return. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len,
+                                                           const int COPY) {
+    uint32_t c = ~crc;
+
+    /* Fast path for single-byte updates. */
+    if (UNLIKELY(len == 1)) {
+        if (COPY)
+            *dst = *src;
+        c = __crc32b(c, *src);
+        return ~c;
+    }
+
+    /* Align to 8-byte boundary for tail processing */
+    uintptr_t align_diff = ALIGN_DIFF(src, 8);
+    if (align_diff)
+        c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY);
+
+    /* Main loop: 64 bytes per iteration.  src is 8-byte aligned by now
+     * (or len < 8 and the loop is skipped), so these loads are aligned. */
+    while (len >= 64) {
+        uint64_t d0 = *(const uint64_t *)src;
+        uint64_t d1 = *(const uint64_t *)(src + 8);
+        uint64_t d2 = *(const uint64_t *)(src + 16);
+        uint64_t d3 = *(const uint64_t *)(src + 24);
+        uint64_t d4 = *(const uint64_t *)(src + 32);
+        uint64_t d5 = *(const uint64_t *)(src + 40);
+        uint64_t d6 = *(const uint64_t *)(src + 48);
+        uint64_t d7 = *(const uint64_t *)(src + 56);
+
+        if (COPY) {
+            memcpy(dst, &d0, 8);
+            memcpy(dst + 8, &d1, 8);
+            memcpy(dst + 16, &d2, 8);
+            memcpy(dst + 24, &d3, 8);
+            memcpy(dst + 32, &d4, 8);
+            memcpy(dst + 40, &d5, 8);
+            memcpy(dst + 48, &d6, 8);
+            memcpy(dst + 56, &d7, 8);
+            dst += 64;
+        }
+
+        c = __crc32d(c, d0);
+        c = __crc32d(c, d1);
+        c = __crc32d(c, d2);
+        c = __crc32d(c, d3);
+        c = __crc32d(c, d4);
+        c = __crc32d(c, d5);
+        c = __crc32d(c, d6);
+        c = __crc32d(c, d7);
+
+        src += 64;
+        len -= 64;
+    }
+
+    /* Remaining 0..63 bytes; the tail also re-inverts the CRC. */
+    return crc32_armv8_tail(c, dst, src, len, COPY);
+}
+
+/* Compute the CRC-32 of buf[0..len) using ARMv8 CRC32 instructions. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Compute the CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+    return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+    /* Without unaligned access, interleaved stores get decomposed into byte ops */
+    crc = crc32_armv8(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h
new file mode 100644
index 0000000000..e72c4c0ad1
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_p.h
@@ -0,0 +1,103 @@
+/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CRC32_ARMV8_P_H
+#define CRC32_ARMV8_P_H
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+
+/* Consume up to align_diff leading bytes from *buf in 1/2/4/8-byte steps,
+ * folding them into the running (inverted) CRC and, when COPY is set,
+ * copying them to *dst.  buf/dst/len are advanced in place.  Each step is
+ * guarded by the remaining length, so buffers too short to fully align are
+ * still handled safely.  Loads grow in size as alignment increases, so each
+ * is naturally aligned. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf,
+                                                             size_t *len, uintptr_t align_diff, const int COPY) {
+    /* 1-byte step */
+    if (*len && (align_diff & 1)) {
+        uint8_t val = **buf;
+        if (COPY) {
+            **dst = val;
+            *dst += 1;
+        }
+        crc = __crc32b(crc, val);
+        *buf += 1;
+        *len -= 1;
+    }
+
+    /* 2-byte step */
+    if (*len >= 2 && (align_diff & 2)) {
+        uint16_t val = *((uint16_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 2);
+            *dst += 2;
+        }
+        crc = __crc32h(crc, val);
+        *buf += 2;
+        *len -= 2;
+    }
+
+    /* 4-byte step */
+    if (*len >= 4 && (align_diff & 4)) {
+        uint32_t val = *((uint32_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 4);
+            *dst += 4;
+        }
+        crc = __crc32w(crc, val);
+        *buf += 4;
+        *len -= 4;
+    }
+
+    /* 8-byte step (only when aligning to a 16-byte boundary). */
+    if (*len >= 8 && (align_diff & 8)) {
+        uint64_t val = *((uint64_t*)*buf);
+        if (COPY) {
+            memcpy(*dst, &val, 8);
+            *dst += 8;
+        }
+        crc = __crc32d(crc, val);
+        *buf += 8;
+        *len -= 8;
+    }
+
+    return crc;
+}
+
+/* Fold the remaining len bytes into the (inverted) running CRC using the
+ * widest loads possible (8, then 4/2/1 for the last few bytes), copying to
+ * dst when COPY is set.  Returns the finalized, re-inverted CRC, so this
+ * must be the last step of a computation. */
+Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf,
+                                                            size_t len, const int COPY) {
+    while (len >= 8) {
+        uint64_t val = *((uint64_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 8);
+            dst += 8;
+        }
+        crc = __crc32d(crc, val);
+        buf += 8;
+        len -= 8;
+    }
+
+    if (len & 4) {
+        uint32_t val = *((uint32_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 4);
+            dst += 4;
+        }
+        crc = __crc32w(crc, val);
+        buf += 4;
+    }
+
+    if (len & 2) {
+        uint16_t val = *((uint16_t*)buf);
+        if (COPY) {
+            memcpy(dst, &val, 2);
+            dst += 2;
+        }
+        crc = __crc32h(crc, val);
+        buf += 2;
+    }
+
+    if (len & 1) {
+        uint8_t val = *buf;
+        if (COPY)
+            *dst = val;
+        crc = __crc32b(crc, val);
+    }
+
+    /* Final inversion (zlib post-conditioning). */
+    return ~crc;
+}
+
+#endif /* CRC32_ARMV8_P_H */
diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
new file mode 100644
index 0000000000..e0d5bf043b
--- /dev/null
+++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c
@@ -0,0 +1,366 @@
+/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension)
+ * Copyright (C) 2025 Peter Cawley
+ * https://github.com/corsix/fast-crc32
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions.
+ * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter.
+ */
+
+#ifdef ARM_PMULL_EOR3
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "acle_intrins.h"
+#include "neon_intrins.h"
+#include "crc32_armv8_p.h"
+
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+/* Carryless multiply low 64 bits: a[0] * b[0] */
+static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
+#ifdef _MSC_VER
+    /* NOTE(review): presumably MSVC's vmull_p64 takes poly64x1_t operands
+     * rather than scalar poly64_t — confirm against arm64_neon.h. */
+    return vreinterpretq_u64_p128(vmull_p64(
+        vget_low_p64(vreinterpret_p64_u64(a)),
+        vget_low_p64(vreinterpret_p64_u64(b))));
+#else
+    return vreinterpretq_u64_p128(vmull_p64(
+        vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0),
+        vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0)));
+#endif
+}
+
+/* Carryless multiply high 64 bits: a[1] * b[1] */
+static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
+    /* vmull_high_p64 multiplies the upper polynomial lanes of a and b. */
+    return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
+}
+
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */
+static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
+#ifdef _MSC_VER
+    /* MSVC variant: broadcast the scalars into poly64 vectors first. */
+    return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b)));
+#else
+    return vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b));
+#endif
+}
+
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */
+static uint32_t xnmodp(uint64_t n) {
+    uint64_t stack = ~(uint64_t)1;
+    uint32_t acc, low;
+    /* Reduce n below 192, recording the halving path in "stack" (one bit per
+     * step) so it can be replayed bottom-up with squarings afterwards. */
+    for (; n > 191; n = (n >> 1) - 16) {
+        stack = (stack << 1) + (n & 1);
+    }
+    stack = ~stack;
+    acc = ((uint32_t)0x80000000) >> (n & 31);
+    /* Each __crc32w(acc, 0) advances acc by x^32 in the CRC domain. */
+    for (n >>= 5; n; --n) {
+        acc = __crc32w(acc, 0);
+    }
+    /* Replay the recorded path: one carryless squaring per recorded bit
+     * (vmull_p8(x, x) squares a GF(2) polynomial), shifting in the bit. */
+    while ((low = stack & 1), stack >>= 1) {
+        poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
+        uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
+        acc = __crc32d(0, y << low);
+    }
+    return acc;
+}
+
+/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */
+static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
+    /* Multiplies crc by x^(8*nbytes) mod P via one carryless multiply.
+     * NOTE(review): the "- 33" exponent offset comes from the fast-crc32
+     * generator (accounts for the CRC bit-reflection and implicit x^32);
+     * confirm against upstream before changing. */
+    Assert(nbytes >= 5, "crc_shift requires nbytes >= 5");
+    return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
+}
+
+/* Shared body for crc32_armv8_pmull_eor3 / crc32_copy_armv8_pmull_eor3.
+ *
+ * Strategy (from the corsix/fast-crc32 generator): split the input into
+ * three scalar CRC lanes plus nine 16-byte PMULL folding lanes (192 bytes
+ * per iteration), then merge the partial CRCs with carryless multiplies
+ * (crc_shift).  COPY is a compile-time flag; when set, the data is also
+ * copied to dst as it is consumed.  The running CRC is kept inverted per
+ * zlib convention. */
+Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src,
+                                                                  size_t len, const int COPY) {
+    uint32_t crc0 = ~crc;
+
+    /* Fast path for single-byte updates. */
+    if (UNLIKELY(len == 1)) {
+        if (COPY)
+            *dst = *src;
+        crc0 = __crc32b(crc0, *src);
+        return ~crc0;
+    }
+
+    /* Align to 16-byte boundary for vector path */
+    uintptr_t align_diff = ALIGN_DIFF(src, 16);
+    if (align_diff)
+        crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY);
+
+    /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */
+    if (len >= 192) {
+        size_t blk = len / 192;  /* Number of 192-byte blocks */
+        size_t klen = blk * 16;  /* Scalar stride per CRC lane */
+        const uint8_t *end = src + len;
+        const uint8_t *src0 = src;
+        const uint8_t *src1 = src + klen;
+        const uint8_t *src2 = src + klen * 2;
+        const uint8_t *srcv = src + klen * 3;  /* Vector data starts after scalar lanes */
+        uint32_t crc1 = 0, crc2 = 0;
+        uint64x2_t vc0, vc1, vc2;
+        uint64_t vc;
+
+        /* Load first 9 vector chunks (144 bytes) */
+        uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)srcv, 128), y0;
+        uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1;
+        uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2;
+        uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3;
+        uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4;
+        uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5;
+        uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6;
+        uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7;
+        uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8;
+        uint64x2_t k;
+        /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
+        { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
+
+        /* Per-region dst pointers */
+        uint8_t *dst0 = dst;
+        uint8_t *dst1 = NULL;
+        uint8_t *dst2 = NULL;
+        uint8_t *dst_v = NULL;
+
+        if (COPY) {
+            dst1 = dst + klen;
+            dst2 = dst + klen * 2;
+            dst_v = dst + klen * 3;
+            vst1q_u8(dst_v, vreinterpretq_u8_u64(x0));
+            vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1));
+            vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2));
+            vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3));
+            vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4));
+            vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5));
+            vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6));
+            vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7));
+            vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8));
+            dst_v += 144;
+        }
+        srcv += 144;
+
+        /* Fold 9 vectors + 3-way parallel scalar CRC */
+        if (blk > 1) {
+            /* Only form a limit pointer when we have at least 2 blocks. */
+            const uint8_t *limit = src0 + klen - 32;
+            while (src0 <= limit) {
+                /* Fold all 9 vector lanes using PMULL */
+                y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+                y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+                y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+                y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+                y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+                y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+                y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+                y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+                y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
+
+                /* EOR3: combine hi*k, lo*k, and new data in one instruction */
+                {
+                    uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128);
+                    uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128);
+                    uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128);
+                    uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128);
+                    uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128);
+                    uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128);
+                    uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128);
+                    uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128);
+                    uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128);
+                    if (COPY) {
+                        vst1q_u8(dst_v, vreinterpretq_u8_u64(d0));
+                        vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1));
+                        vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2));
+                        vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3));
+                        vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4));
+                        vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5));
+                        vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6));
+                        vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7));
+                        vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8));
+                        dst_v += 144;
+                    }
+                    x0 = veor3q_u64(x0, y0, d0);
+                    x1 = veor3q_u64(x1, y1, d1);
+                    x2 = veor3q_u64(x2, y2, d2);
+                    x3 = veor3q_u64(x3, y3, d3);
+                    x4 = veor3q_u64(x4, y4, d4);
+                    x5 = veor3q_u64(x5, y5, d5);
+                    x6 = veor3q_u64(x6, y6, d6);
+                    x7 = veor3q_u64(x7, y7, d7);
+                    x8 = veor3q_u64(x8, y8, d8);
+                }
+
+                /* 3-way parallel scalar CRC (16 bytes each) */
+                {
+                    uint64_t s0a = *(const uint64_t*)src0;
+                    uint64_t s0b = *(const uint64_t*)(src0 + 8);
+                    uint64_t s1a = *(const uint64_t*)src1;
+                    uint64_t s1b = *(const uint64_t*)(src1 + 8);
+                    uint64_t s2a = *(const uint64_t*)src2;
+                    uint64_t s2b = *(const uint64_t*)(src2 + 8);
+                    if (COPY) {
+                        memcpy(dst0, &s0a, 8);
+                        memcpy(dst0 + 8, &s0b, 8);
+                        dst0 += 16;
+                        memcpy(dst1, &s1a, 8);
+                        memcpy(dst1 + 8, &s1b, 8);
+                        dst1 += 16;
+                        memcpy(dst2, &s2a, 8);
+                        memcpy(dst2 + 8, &s2b, 8);
+                        dst2 += 16;
+                    }
+                    crc0 = __crc32d(crc0, s0a);
+                    crc0 = __crc32d(crc0, s0b);
+                    crc1 = __crc32d(crc1, s1a);
+                    crc1 = __crc32d(crc1, s1b);
+                    crc2 = __crc32d(crc2, s2a);
+                    crc2 = __crc32d(crc2, s2b);
+                }
+                src0 += 16;
+                src1 += 16;
+                src2 += 16;
+                srcv += 144;
+            }
+        }
+
+        /* Reduce 9 vectors to 1 using tree reduction */
+        /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
+        { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        x0 = veor3q_u64(x0, y0, x1);
+        x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
+
+        /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+        y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+        y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+        x0 = veor3q_u64(x0, y0, x1);
+        x2 = veor3q_u64(x2, y2, x3);
+        x4 = veor3q_u64(x4, y4, x5);
+        x6 = veor3q_u64(x6, y6, x7);
+
+        /* Step 3: fold pairs (x0,x2), (x4,x6) */
+        { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+        x0 = veor3q_u64(x0, y0, x2);
+        x4 = veor3q_u64(x4, y4, x6);
+
+        /* Step 4: final fold (x0, x4) -> x0 */
+        { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        x0 = veor3q_u64(x0, y0, x4);
+
+        /* Process final scalar chunk */
+        {
+            uint64_t s0a = *(const uint64_t*)src0;
+            uint64_t s0b = *(const uint64_t*)(src0 + 8);
+            uint64_t s1a = *(const uint64_t*)src1;
+            uint64_t s1b = *(const uint64_t*)(src1 + 8);
+            uint64_t s2a = *(const uint64_t*)src2;
+            uint64_t s2b = *(const uint64_t*)(src2 + 8);
+            if (COPY) {
+                memcpy(dst0, &s0a, 8);
+                memcpy(dst0 + 8, &s0b, 8);
+                memcpy(dst1, &s1a, 8);
+                memcpy(dst1 + 8, &s1b, 8);
+                memcpy(dst2, &s2a, 8);
+                memcpy(dst2 + 8, &s2b, 8);
+            }
+            crc0 = __crc32d(crc0, s0a);
+            crc0 = __crc32d(crc0, s0b);
+            crc1 = __crc32d(crc1, s1a);
+            crc1 = __crc32d(crc1, s1b);
+            crc2 = __crc32d(crc2, s2a);
+            crc2 = __crc32d(crc2, s2b);
+        }
+
+        /* Shift and combine 3 scalar CRCs (each lane is shifted by the number
+         * of bytes that follow it in stream order). */
+        vc0 = crc_shift(crc0, klen * 2 + blk * 144);
+        vc1 = crc_shift(crc1, klen + blk * 144);
+        vc2 = crc_shift(crc2, blk * 144);
+        vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
+
+        /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */
+        crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
+        crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
+        if (COPY)
+            dst += blk * 192;
+        src = srcv;
+        len = end - srcv;
+    }
+
+    /* 3-way scalar CRC (24 bytes/iter) */
+    if (len >= 80) {
+        size_t klen = ((len - 8) / 24) * 8;  /* Stride for 3-way parallel */
+        const uint8_t *buf0 = src;
+        const uint8_t *buf1 = src + klen;
+        const uint8_t *buf2 = src + klen * 2;
+        uint32_t crc1 = 0, crc2 = 0;
+        uint64x2_t vc0, vc1;
+        uint64_t vc;
+
+        /* Per-lane dst pointers */
+        uint8_t *dst0 = dst;
+        uint8_t *dst1 = NULL;
+        uint8_t *dst2 = NULL;
+        if (COPY) {
+            dst1 = dst + klen;
+            dst2 = dst + klen * 2;
+        }
+
+        /* 3-way parallel scalar CRC */
+        do {
+            uint64_t v0 = *(const uint64_t*)buf0;
+            uint64_t v1 = *(const uint64_t*)buf1;
+            uint64_t v2 = *(const uint64_t*)buf2;
+            if (COPY) {
+                memcpy(dst0, &v0, 8);
+                dst0 += 8;
+                memcpy(dst1, &v1, 8);
+                dst1 += 8;
+                memcpy(dst2, &v2, 8);
+                dst2 += 8;
+            }
+            crc0 = __crc32d(crc0, v0);
+            crc1 = __crc32d(crc1, v1);
+            crc2 = __crc32d(crc2, v2);
+            buf0 += 8;
+            buf1 += 8;
+            buf2 += 8;
+            len -= 24;
+        } while (len >= 32);
+
+        /* Combine the 3 CRCs */
+        vc0 = crc_shift(crc0, klen * 2 + 8);
+        vc1 = crc_shift(crc1, klen + 8);
+        vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
+
+        /* Process final 8 bytes with combined CRC */
+        crc0 = crc2;
+        {
+            uint64_t vf = *(const uint64_t*)buf2;
+            if (COPY)
+                memcpy(dst2, &vf, 8);
+            crc0 = __crc32d(crc0, vf ^ vc);
+        }
+        src = buf2 + 8;
+        len -= 8;
+        if (COPY)
+            dst = dst2 + 8;
+    }
+
+    /* Process remaining bytes */
+    return crc32_armv8_tail(crc0, dst, src, len, COPY);
+}
+
+/* Compute the CRC-32 of buf[0..len) using PMULL + EOR3 folding. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+/* Compute the CRC-32 of src[0..len) while simultaneously copying it to dst. */
+Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#if OPTIMAL_CMP >= 32
+    return crc32_copy_impl(crc, dst, src, len, 1);
+#else
+    /* Without unaligned access, interleaved stores get decomposed into byte ops */
+    crc = crc32_armv8_pmull_eor3(crc, src, len);
+    memcpy(dst, src, len);
+    return crc;
+#endif
+}
+#endif
diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h
new file mode 100644
index 0000000000..449916e0b7
--- /dev/null
+++ b/neozip/arch/arm/neon_intrins.h
@@ -0,0 +1,79 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT)
+/* arm64_neon.h is MSVC specific */
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+ out.val[0] = vqsubq_u16(a.val[0], b); \
+ out.val[1] = vqsubq_u16(a.val[1], b); \
+ out.val[2] = vqsubq_u16(a.val[2], b); \
+ out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \
+ (!defined(__clang_major__) || __clang_major__ < 20)
+/* Clang versions before 20 have too strict of an
+ * alignment requirement (:256) for x4 NEON intrinsics */
+# undef ARM_NEON_HASLD4
+# undef vld1q_u16_x4
+# undef vld1q_u8_x4
+# undef vst1q_u16_x4
+# endif
+
+# ifndef ARM_NEON_HASLD4
+
+/* Fallback for toolchains lacking the vld1q_u16_x4 intrinsic: four
+ * consecutive 8-lane loads (32 uint16_t in total). */
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+    uint16x8x4_t ret;
+    int i;
+    for (i = 0; i < 4; i++) {
+        ret.val[i] = vld1q_u16(a);
+        a += 8;
+    }
+    return ret;
+}
+
+/* Fallback for toolchains lacking the vld1q_u8_x4 intrinsic: four
+ * consecutive 16-lane loads (64 bytes in total). */
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+    uint8x16x4_t ret;
+    int i;
+    for (i = 0; i < 4; i++) {
+        ret.val[i] = vld1q_u8(a);
+        a += 16;
+    }
+    return ret;
+}
+
+/* Fallback for toolchains lacking the vst1q_u16_x4 intrinsic: four
+ * consecutive 8-lane stores (32 uint16_t in total). */
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    int i;
+    for (i = 0; i < 4; i++) {
+        vst1q_u16(p, a.val[i]);
+        p += 8;
+    }
+}
+# endif // HASLD4 check
+
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c
new file mode 100644
index 0000000000..b241e6c5e6
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,49 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_SIMD
+
+#include "zbuild.h"
+#include "acle_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* SIMD version of hash_chain rebase
+ *
+ * Saturating-subtracts wsize from every 16-bit table entry (entries that
+ * would go negative clamp to zero), two entries per __uqsub16 and
+ * 8 entries (16 bytes) per loop iteration. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x2_t v;
+    uint16x2_t p0, p1, p2, p3;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    /* The loop consumes sizeof(uint16x2_t) * 4 = 16 bytes per iteration. */
+    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    /* Broadcast wsize into both 16-bit halves of the 32-bit SIMD word. */
+    v = wsize | (wsize << 16);
+
+    n = size / (sizeof(uint16x2_t) * 4);
+    do {
+        p0 = *((const uint16x2_t *)(table));
+        p1 = *((const uint16x2_t *)(table+2));
+        p2 = *((const uint16x2_t *)(table+4));
+        p3 = *((const uint16x2_t *)(table+6));
+        p0 = __uqsub16(p0, v);
+        p1 = __uqsub16(p1, v);
+        p2 = __uqsub16(p2, v);
+        p3 = __uqsub16(p3, v);
+        *((uint16x2_t *)(table)) = p0;
+        *((uint16x2_t *)(table+2)) = p1;
+        *((uint16x2_t *)(table+4)) = p2;
+        *((uint16x2_t *)(table+6)) = p3;
+        table += 8;
+    } while (--n);
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides by
+ * w_size: every stored position is reduced by w_size, saturating at zero. */
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c
new file mode 100644
index 0000000000..2f9e94a33d
--- /dev/null
+++ b/neozip/arch/arm/slide_hash_neon.c
@@ -0,0 +1,48 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+
+#include "zbuild.h"
+#include "neon_intrins.h"
+#include "deflate.h"
+
+/* SIMD version of hash_chain rebase */
+/* SIMD version of hash_chain rebase
+ *
+ * Saturating-subtracts wsize from every 16-bit table entry (entries that
+ * would go negative clamp to zero), 64 entries (128 bytes) per iteration
+ * using eight NEON registers. */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x8_t v;
+    uint16x8x4_t p0, p1;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    /* The loop consumes sizeof(uint16x8_t) * 8 = 128 bytes per iteration,
+     * so size must be a multiple of that.  The multiplication must be
+     * parenthesized: "size % sizeof(uint16x8_t) * 8" parses as
+     * "(size % sizeof(uint16x8_t)) * 8" (% and * are left-associative with
+     * equal precedence) and would only assert 16-byte divisibility.
+     * This now matches the equivalent assert in slide_hash_armv6.c. */
+    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(wsize);
+
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p0 = vld1q_u16_x4_ex(table, 256);
+        p1 = vld1q_u16_x4_ex(table+32, 256);
+        vqsubq_u16_x4_x1(p0, p0, v);
+        vqsubq_u16_x4_x1(p1, p1, v);
+        vst1q_u16_x4_ex(table, p0, 256);
+        vst1q_u16_x4_ex(table+32, p1, 256);
+        table += 64;
+    } while (--n);
+}
+
+/* Rebase the deflate head and prev hash tables after the window slides by
+ * w_size: every stored position is reduced by w_size, saturating at zero. */
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif