diff options
| author | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
|---|---|---|
| committer | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
| commit | 7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch) | |
| tree | b43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/arm | |
| parent | b1e34e861b5d732afe828d58aad2c638135061fd (diff) | |
| parent | c2712b8a345191f6ed79558c089777df94590087 (diff) | |
| download | Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip | |
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip
git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd
git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/arm')
| -rw-r--r-- | neozip/arch/arm/Makefile.in | 86 | ||||
| -rw-r--r-- | neozip/arch/arm/acle_intrins.h | 90 | ||||
| -rw-r--r-- | neozip/arch/arm/adler32_neon.c | 346 | ||||
| -rw-r--r-- | neozip/arch/arm/arm_features.c | 334 | ||||
| -rw-r--r-- | neozip/arch/arm/arm_features.h | 19 | ||||
| -rw-r--r-- | neozip/arch/arm/arm_functions.h | 75 | ||||
| -rw-r--r-- | neozip/arch/arm/arm_natives.h | 31 | ||||
| -rw-r--r-- | neozip/arch/arm/chunkset_neon.c | 81 | ||||
| -rw-r--r-- | neozip/arch/arm/compare256_neon.c | 56 | ||||
| -rw-r--r-- | neozip/arch/arm/crc32_armv8.c | 81 | ||||
| -rw-r--r-- | neozip/arch/arm/crc32_armv8_p.h | 103 | ||||
| -rw-r--r-- | neozip/arch/arm/crc32_armv8_pmull_eor3.c | 366 | ||||
| -rw-r--r-- | neozip/arch/arm/neon_intrins.h | 79 | ||||
| -rw-r--r-- | neozip/arch/arm/slide_hash_armv6.c | 49 | ||||
| -rw-r--r-- | neozip/arch/arm/slide_hash_neon.c | 48 |
15 files changed, 1844 insertions, 0 deletions
diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in new file mode 100644 index 0000000000..d0bfe0e172 --- /dev/null +++ b/neozip/arch/arm/Makefile.in @@ -0,0 +1,86 @@ +# Makefile for zlib +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +ARMV8FLAG= +PMULLEOR3FLAG= +NEONFLAG= +ARMV6FLAG= +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +all: \ + adler32_neon.o adler32_neon.lo \ + arm_features.o arm_features.lo \ + chunkset_neon.o chunkset_neon.lo \ + compare256_neon.o compare256_neon.lo \ + crc32_armv8.o crc32_armv8.lo \ + crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \ + slide_hash_neon.o slide_hash_neon.lo \ + slide_hash_armv6.o slide_hash_armv6.lo \ + +adler32_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +adler32_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +arm_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c + +arm_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c + +chunkset_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c + +chunkset_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c + +compare256_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + +compare256_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + +crc32_armv8.o: + $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c + +crc32_armv8.lo: + $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c + +crc32_armv8_pmull_eor3.o: + $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ 
$(SRCDIR)/crc32_armv8_pmull_eor3.c + +crc32_armv8_pmull_eor3.lo: + $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c + +slide_hash_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c + +slide_hash_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c + +slide_hash_armv6.o: + $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c + +slide_hash_armv6.lo: + $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h new file mode 100644 index 0000000000..16f5e2c77c --- /dev/null +++ b/neozip/arch/arm/acle_intrins.h @@ -0,0 +1,90 @@ +#ifndef ARM_ACLE_INTRINS_H +#define ARM_ACLE_INTRINS_H + +#include <stdint.h> +#ifdef _MSC_VER +# include <intrin.h> +#elif defined(HAVE_ARM_ACLE_H) +# include <arm_acle.h> +#endif + +#ifdef ARM_CRC32 +#if defined(ARCH_ARM) && defined(ARCH_64BIT) +# define Z_TARGET_CRC Z_TARGET("+crc") +#else +# define Z_TARGET_CRC +#endif +#ifdef ARM_PMULL_EOR3 +# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3") +#else +# define Z_TARGET_PMULL_EOR3 +#endif + +#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER) +#if defined(ARCH_ARM) && defined(ARCH_64BIT) +static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) { + uint32_t __c; + __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) { + uint32_t __c; + __asm__("crc32h %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) { + uint32_t __c; + __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b)); + 
return __c; +} + +static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) { + uint32_t __c; + __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} +#else +static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) { + uint32_t __c; + __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) { + uint32_t __c; + __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) { + uint32_t __c; + __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) { + return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32); +} +#endif +#endif +#endif + +#ifdef ARM_SIMD +#ifdef _MSC_VER +typedef uint32_t uint16x2_t; + +#define __uqsub16 _arm_uqsub16 +#elif !defined(ARM_SIMD_INTRIN) +typedef uint32_t uint16x2_t; + +static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) { + uint16x2_t __c; + __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} +#endif +#endif + +#endif // include guard ARM_ACLE_INTRINS_H diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c new file mode 100644 index 0000000000..48532e6cd1 --- /dev/null +++ b/neozip/arch/arm/adler32_neon.c @@ -0,0 +1,346 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. 
+ * Authors: + * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com> + * Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "neon_intrins.h" +#include "adler32_p.h" + +static const uint16_t ALIGNED_(64) taps[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, + 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, + 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 }; + +Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) { + uint32x4_t adacc = vdupq_n_u32(0); + uint32x4_t s2acc = vdupq_n_u32(0); + uint32x4_t s2acc_0 = vdupq_n_u32(0); + uint32x4_t s2acc_1 = vdupq_n_u32(0); + uint32x4_t s2acc_2 = vdupq_n_u32(0); + + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + uint32x4_t s3acc = vdupq_n_u32(0); + uint32x4_t adacc_prev = adacc; + + uint16x8_t s2_0, s2_1, s2_2, s2_3; + s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); + + uint16x8_t s2_4, s2_5, s2_6, s2_7; + s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); + + size_t num_iter = len >> 2; + int rem = len & 3; + + for (size_t i = 0; i < num_iter; ++i) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128); + uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128); + uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += 64; + + /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32 + * bit instruction, we'll have to make due summing to 16 bits first */ + uint16x8x2_t hsum, hsum_fold; + hsum.val[0] = vpaddlq_u8(d0); + hsum.val[1] = vpaddlq_u8(d1); + + hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2); + hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3); + + adacc = vpadalq_u16(adacc, 
hsum_fold.val[0]); + s3acc = vaddq_u32(s3acc, adacc_prev); + adacc = vpadalq_u16(adacc, hsum_fold.val[1]); + + /* If we do straight widening additions to the 16 bit values, we don't incur + * the usual penalties of a pairwise add. We can defer the multiplications + * until the very end. These will not overflow because we are incurring at + * most 408 loop iterations (NMAX / 64), and a given lane is only going to be + * summed into once. This means for the maximum input size, the largest value + * we will see is 255 * 102 = 26010, safely under uint16 max */ + s2_0 = vaddw_u8(s2_0, vget_low_u8(d0)); + s2_1 = vaddw_high_u8(s2_1, d0); + s2_2 = vaddw_u8(s2_2, vget_low_u8(d1)); + s2_3 = vaddw_high_u8(s2_3, d1); + s2_4 = vaddw_u8(s2_4, vget_low_u8(d2)); + s2_5 = vaddw_high_u8(s2_5, d2); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d3)); + s2_7 = vaddw_high_u8(s2_7, d3); + + adacc_prev = adacc; + buf += 64; + } + + s3acc = vshlq_n_u32(s3acc, 6); + + if (rem) { + uint32x4_t s3acc_0 = vdupq_n_u32(0); + while (rem--) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + vst1q_u8(dst, d0); + dst += 16; + uint16x8_t adler; + adler = vpaddlq_u8(d0); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); + s2_7 = vaddw_high_u8(s2_7, d0); + adacc = vpadalq_u16(adacc, adler); + s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); + adacc_prev = adacc; + buf += 16; + } + + s3acc_0 = vshlq_n_u32(s3acc_0, 4); + s3acc = vaddq_u32(s3acc_0, s3acc); + } + + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); + s2acc_2 = 
vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); + + s2acc = vaddq_u32(s2acc_0, s2acc); + s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); + s2acc = vaddq_u32(s2acc, s2acc_2); + + uint32x2_t adacc2, s2acc2, as; + s2acc = vaddq_u32(s2acc, s3acc); + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { + uint32x4_t adacc = vdupq_n_u32(0); + uint32x4_t s2acc = vdupq_n_u32(0); + uint32x4_t s2acc_0 = vdupq_n_u32(0); + uint32x4_t s2acc_1 = vdupq_n_u32(0); + uint32x4_t s2acc_2 = vdupq_n_u32(0); + + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + uint32x4_t s3acc = vdupq_n_u32(0); + uint32x4_t adacc_prev = adacc; + + uint16x8_t s2_0, s2_1, s2_2, s2_3; + s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); + + uint16x8_t s2_4, s2_5, s2_6, s2_7; + s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); + + size_t num_iter = len >> 2; + int rem = len & 3; + + for (size_t i = 0; i < num_iter; ++i) { + uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256); + + /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32 + * bit instruction, we'll have to make due summing to 16 bits first */ + uint16x8x2_t hsum, hsum_fold; + hsum.val[0] = 
vpaddlq_u8(d0_d3.val[0]); + hsum.val[1] = vpaddlq_u8(d0_d3.val[1]); + + hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]); + hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]); + + adacc = vpadalq_u16(adacc, hsum_fold.val[0]); + s3acc = vaddq_u32(s3acc, adacc_prev); + adacc = vpadalq_u16(adacc, hsum_fold.val[1]); + + /* If we do straight widening additions to the 16 bit values, we don't incur + * the usual penalties of a pairwise add. We can defer the multiplications + * until the very end. These will not overflow because we are incurring at + * most 408 loop iterations (NMAX / 64), and a given lane is only going to be + * summed into once. This means for the maximum input size, the largest value + * we will see is 255 * 102 = 26010, safely under uint16 max */ + s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); + s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]); + s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1])); + s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]); + s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); + s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3])); + s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]); + + adacc_prev = adacc; + buf += 64; + } + + s3acc = vshlq_n_u32(s3acc, 6); + + if (rem) { + uint32x4_t s3acc_0 = vdupq_n_u32(0); + while (rem--) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + uint16x8_t adler; + adler = vpaddlq_u8(d0); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); + s2_7 = vaddw_high_u8(s2_7, d0); + adacc = vpadalq_u16(adacc, adler); + s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); + adacc_prev = adacc; + buf += 16; + } + + s3acc_0 = vshlq_n_u32(s3acc_0, 4); + s3acc = vaddq_u32(s3acc_0, s3acc); + } + + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); + s2acc_2 = 
vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); + + s2acc = vaddq_u32(s2acc_0, s2acc); + s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); + s2acc = vaddq_u32(s2acc, s2acc_2); + + uint32x2_t adacc2, s2acc2, as; + s2acc = vaddq_u32(s2acc, s3acc); + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); + + uint32_t pair[2]; + + /* Split Adler-32 into component sums, it can be supplied by + * the caller sites (e.g. in a PNG file). 
+ */ + pair[0] = adler; + pair[1] = sum2; + + /* If memory is not SIMD aligned, do scalar sums to an aligned + * offset, provided that doing so doesn't completely eliminate + * SIMD operation. Aligned loads are still faster on ARM, even + * when there's no explicit aligned load instruction. Note: + * the code currently emits an alignment hint in the instruction + * for exactly 256 bits when supported by the compiler. Several ARM + * SIPs have small penalties for cacheline crossing loads as well (so + * really 512 bits is the optimal alignment of the buffer). 32 bytes + * should strike a balance, though. The Cortex-A8 and Cortex-A9 + * processors are documented to benefit from 128 bit and 64 bit + * alignment, but it's unclear which other SIPs will benefit from it. + * In the copying variant we use fallback to 4x loads and 4x stores, + * as ld1x4 seems to block ILP when stores are in the mix */ + size_t align_diff = MIN(ALIGN_DIFF(src, 32), len); + size_t n = NMAX_ALIGNED32; + if (align_diff) { + adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY); + + if (COPY) + dst += align_diff; + src += align_diff; + len -= align_diff; + n = ALIGN_DOWN(n - align_diff, 32); + } + + while (len >= 16) { + n = MIN(len, n); + + if (COPY) + NEON_accum32_copy(pair, dst, src, n >> 4); + else + NEON_accum32(pair, src, n >> 4); + + pair[0] %= BASE; + pair[1] %= BASE; + + size_t k = (n >> 4) << 4; + src += k; + if (COPY) + dst += k; + len -= k; + n = NMAX_ALIGNED32; + } + + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY); +} + +Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return adler32_copy_impl(adler, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + adler = adler32_neon(adler, src, len); + memcpy(dst, src, len); + return adler; +#endif +} + +#endif diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c new file mode 100644 index 0000000000..8f179526ef --- /dev/null +++ b/neozip/arch/arm/arm_features.c @@ -0,0 +1,334 @@ +#ifdef ARM_FEATURES + +#include "zbuild.h" +#include "arm_features.h" + +#if defined(HAVE_SYS_AUXV_H) +# include <sys/auxv.h> +# ifdef ARM_ASM_HWCAP +# include <asm/hwcap.h> +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) +# include <machine/armreg.h> +# ifndef ID_AA64ISAR0_CRC32_VAL +# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32 +# endif +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) +# include <machine/armreg.h> +# include <machine/cpu.h> +# include <sys/sysctl.h> +# include <sys/types.h> +#elif defined(__APPLE__) +# if !defined(_DARWIN_C_SOURCE) +# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */ +# endif +# include <sys/sysctl.h> +#elif defined(_WIN32) +# include <windows.h> +#endif + +static int arm_has_crc32(void) { + int has_crc32 = 0; +#if defined(__ARM_FEATURE_CRC32) + /* Compile-time check */ + has_crc32 = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CRC32 + has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; +# elif defined(HWCAP2_CRC32) + has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef 
HWCAP_CRC32 + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_crc32 = (hwcap & HWCAP_CRC32) != 0; +# elif defined(HWCAP2_CRC32) + unsigned long hwcap2 = 0; + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); + has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0; +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) + has_crc32 = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE; +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE; + } +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_WIN32) + has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); +#endif + return has_crc32; +} + +static int arm_has_pmull(void) { + int has_pmull = 0; +#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES) + /* Compile-time check */ + has_pmull = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_PMULL + has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +# elif defined(HWCAP_AES) + /* PMULL is part of crypto extension, check for AES as proxy */ + has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_PMULL + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_pmull = (hwcap & HWCAP_PMULL) != 0; +# elif defined(HWCAP_AES) + /* PMULL is part of crypto extension, check for AES as proxy */ + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_pmull = (hwcap & HWCAP_AES) != 0; +# endif +#elif 
defined(__FreeBSD__) && defined(ARCH_64BIT) + /* Check for AES feature as PMULL is part of crypto extension */ + has_pmull = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE; +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE; + } +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_WIN32) + /* Windows checks for crypto/AES support */ +# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE + has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +# endif +#endif + return has_pmull; +} + +static int arm_has_eor3(void) { + int has_eor3 = 0; +#if defined(__ARM_FEATURE_SHA3) + /* Compile-time check */ + has_eor3 = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) + /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */ +# ifdef HWCAP2_SHA3 + has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0; +# elif defined(HWCAP_SHA3) + has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP2_SHA3 + unsigned long hwcap2 = 0; + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); + has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0; +# elif defined(HWCAP_SHA3) + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_eor3 = (hwcap & HWCAP_SHA3) != 0; +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) + /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */ +# ifdef ID_AA64ISAR0_SHA3_VAL + has_eor3 = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= 
ID_AA64ISAR0_SHA3_BASE; +# endif +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) +# ifdef ID_AA64ISAR0_SHA3 + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL; + } +# endif +#elif defined(__APPLE__) + /* All Apple Silicon (M1+) has SHA3/EOR3 support */ + int has_feat = 0; + size_t size = sizeof(has_feat); + has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; + /* Fallback to legacy name for older macOS versions */ + if (!has_eor3) { + size = sizeof(has_feat); + has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; + } +#elif defined(_WIN32) +# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE + has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE); +# endif +#endif + return has_eor3; +} + +/* AArch64 has neon. 
*/ +#ifdef ARCH_32BIT +static inline int arm_has_neon(void) { + int has_neon = 0; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + /* Compile-time check */ + has_neon = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_ARM_NEON + has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; +# elif defined(HWCAP_NEON) + has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_NEON + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_neon = (hwcap & HWCAP_NEON) != 0; +# endif +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION) +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) + has_neon = 1; /* Always supported */ +# endif +#endif + return has_neon; +} +#endif + +/* AArch64 does not have ARMv6 SIMD. 
*/ +#ifdef ARCH_32BIT +static inline int arm_has_simd(void) { + int has_simd = 0; +#if defined(__ARM_FEATURE_SIMD32) + /* Compile-time check for ARMv6 SIMD */ + has_simd = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) + const char *platform = (const char *)getauxval(AT_PLATFORM); + has_simd = platform + && (strncmp(platform, "v6l", 3) == 0 + || strncmp(platform, "v7l", 3) == 0 + || strncmp(platform, "v8l", 3) == 0); +#endif + return has_simd; +} +#endif + +#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32) +/* MIDR_EL1 bit field definitions */ +#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24) +#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4) + +/* ARM CPU Implementer IDs */ +#define ARM_IMPLEMENTER_ARM 0x41 +#define ARM_IMPLEMENTER_QUALCOMM 0x51 +#define ARM_IMPLEMENTER_APPLE 0x61 + +/* ARM CPU Part Numbers */ + +/* Cortex-X series - Multiple PMULL lanes */ +#define ARM_PART_CORTEX_X1 0xd44 +#define ARM_PART_CORTEX_X1C 0xd4c +#define ARM_PART_CORTEX_X2 0xd48 +#define ARM_PART_CORTEX_X3 0xd4e +#define ARM_PART_CORTEX_X4 0xd82 +#define ARM_PART_CORTEX_X925 0xd85 + +/* Neoverse V/N2 series - Multiple PMULL lanes */ +#define ARM_PART_NEOVERSE_N2 0xd49 +#define ARM_PART_NEOVERSE_V1 0xd40 +#define ARM_PART_NEOVERSE_V2 0xd4f +#define ARM_PART_NEOVERSE_V3 0xd8e + +/* Snapdragon X Elite/Plus - Custom core */ +#define QUALCOMM_PART_ORYON 0x001 + +static inline int arm_has_cpuid(void) { + int has_cpuid = 0; +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CPUID + has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0; +# elif defined(HWCAP2_CPUID) + has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CPUID + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_cpuid = (hwcap & HWCAP_CPUID) != 0; +# endif +#endif + return has_cpuid; +} +#endif + +/* Determine if CPU has fast 
PMULL (multiple execution units) */ +static inline int arm_cpu_has_fast_pmull(void) { + int has_fast_pmull = 0; +#if defined(__APPLE__) + /* On macOS, all Apple Silicon has fast PMULL */ + has_fast_pmull = 1; +#elif defined(ARCH_64BIT) && !defined(_WIN32) + /* We need CPUID feature to read MIDR register */ + if (!arm_has_cpuid()) + return has_fast_pmull; + + uint64_t midr; + __asm__ ("mrs %0, midr_el1" : "=r" (midr)); + + uint32_t implementer = MIDR_IMPLEMENTOR(midr); + uint32_t part = MIDR_PARTNUM(midr); + + if (implementer == ARM_IMPLEMENTER_APPLE) { + /* All Apple Silicon (M1+) have fast PMULL */ + has_fast_pmull = 1; + } else if (implementer == ARM_IMPLEMENTER_ARM) { + /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */ + switch (part) { + case ARM_PART_CORTEX_X1: + case ARM_PART_CORTEX_X1C: + case ARM_PART_CORTEX_X2: + case ARM_PART_CORTEX_X3: + case ARM_PART_CORTEX_X4: + case ARM_PART_CORTEX_X925: + case ARM_PART_NEOVERSE_N2: + case ARM_PART_NEOVERSE_V1: + case ARM_PART_NEOVERSE_V2: + case ARM_PART_NEOVERSE_V3: + has_fast_pmull = 1; + } + } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) { + /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */ + if (part == QUALCOMM_PART_ORYON) + has_fast_pmull = 1; + } +#endif + return has_fast_pmull; +} + +void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) { +#ifdef ARCH_64BIT + features->has_simd = 0; /* never available */ + features->has_neon = 1; /* always available */ +#else + features->has_simd = arm_has_simd(); + features->has_neon = arm_has_neon(); +#endif + features->has_crc32 = arm_has_crc32(); + features->has_pmull = arm_has_pmull(); + features->has_eor3 = arm_has_eor3(); + features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull(); +} + +#endif diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h new file mode 100644 index 0000000000..2f17a9ddf0 --- /dev/null +++ b/neozip/arch/arm/arm_features.h @@ -0,0 +1,19 @@ +/* arm_features.h 
-- check for ARM features. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FEATURES_H_ +#define ARM_FEATURES_H_ + +struct arm_cpu_features { + int has_simd; + int has_neon; + int has_crc32; + int has_pmull; + int has_eor3; + int has_fast_pmull; +}; + +void Z_INTERNAL arm_check_features(struct arm_cpu_features *features); + +#endif /* ARM_FEATURES_H_ */ diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h new file mode 100644 index 0000000000..bc77adb977 --- /dev/null +++ b/neozip/arch/arm/arm_functions.h @@ -0,0 +1,75 @@ +/* arm_functions.h -- ARM implementations for arch-specific functions. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FUNCTIONS_H_ +#define ARM_FUNCTIONS_H_ + +#include "arm_natives.h" + +#ifdef ARM_NEON +uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match); +void slide_hash_neon(deflate_state *s); +#endif + +#ifdef ARM_CRC32 +uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef ARM_PMULL_EOR3 +uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef ARM_SIMD +void slide_hash_armv6(deflate_state *s); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// ARM - SIMD +# ifdef ARM_SIMD_NATIVE +# undef 
native_slide_hash +# define native_slide_hash slide_hash_armv6 +# endif +// ARM - NEON +# ifdef ARM_NEON_NATIVE +# undef native_adler32 +# define native_adler32 adler32_neon +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_neon +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_compare256 +# define native_compare256 compare256_neon +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon +# undef native_slide_hash +# define native_slide_hash slide_hash_neon +# endif +// ARM - CRC32 +# ifdef ARM_CRC32_NATIVE +# undef native_crc32 +# define native_crc32 crc32_armv8 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_armv8 +# endif +// ARM - PMULL EOR3 +# ifdef ARM_PMULL_EOR3_NATIVE +# undef native_crc32 +# define native_crc32 crc32_armv8_pmull_eor3 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_armv8_pmull_eor3 +# endif +#endif + +#endif /* ARM_FUNCTIONS_H_ */ diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h new file mode 100644 index 0000000000..311e33e958 --- /dev/null +++ b/neozip/arch/arm/arm_natives.h @@ -0,0 +1,31 @@ +/* arm_natives.h -- ARM compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_NATIVES_H_ +#define ARM_NATIVES_H_ + +#if defined(__ARM_FEATURE_SIMD32) +# ifdef ARM_SIMD +# define ARM_SIMD_NATIVE +# endif +#endif +/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT) +# ifdef ARM_NEON +# define ARM_NEON_NATIVE +# endif +#endif +/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */ +#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801) +# ifdef ARM_CRC32 +# define ARM_CRC32_NATIVE +# endif +#endif +#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3) +# ifdef ARM_PMULL_EOR3 +# define ARM_PMULL_EOR3_NATIVE +# endif +#endif + +#endif /* ARM_NATIVES_H_ */ diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c new file mode 100644 index 0000000000..a891f10fa5 --- /dev/null +++ b/neozip/arch/arm/chunkset_neon.c @@ -0,0 +1,81 @@ +/* chunkset_neon.c -- NEON inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" +#include "neon_intrins.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef uint8x16_t chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from))); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from))); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from))); +} + +#define CHUNKSIZE chunksize_neon +#define CHUNKCOPY chunkcopy_neon +#define CHUNKUNROLL chunkunroll_neon +#define CHUNKMEMSET chunkmemset_neon +#define CHUNKMEMSET_SAFE chunkmemset_safe_neon + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = vld1q_u8(s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + vst1q_u8(out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + *chunk_rem = lut_rem.remval; + + /* See note in chunkset_ssse3.c for why this is ok */ + __msan_unpoison(buf + dist, 16 - dist); + + /* This version of table is only available on aarch64 */ +#if defined(ARCH_ARM) && defined(ARCH_64BIT) + uint8x16_t ret_vec = vld1q_u8(buf); + + uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128); + return vqtbl1q_u8(ret_vec, perm_vec); +#else + uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1; + perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64); + perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64); + a = vld1_u8(buf); + b = vld1_u8(buf + 8); + ret0 = vtbl1_u8(a, perm_vec0); + uint8x8x2_t ab; + 
ab.val[0] = a; + ab.val[1] = b; + ret1 = vtbl2_u8(ab, perm_vec1); + return vcombine_u8(ret0, ret1); +#endif +} + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_neon + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c new file mode 100644 index 0000000000..4ced9fc9ca --- /dev/null +++ b/neozip/arch/arm/compare256_neon.c @@ -0,0 +1,56 @@ +/* compare256_neon.c - NEON version of compare256 + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#if defined(ARM_NEON) +#include "neon_intrins.h" + +static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + uint8x16_t a, b, cmp; + uint64_t lane; + + a = vld1q_u8(src0); + b = vld1q_u8(src1); + + cmp = veorq_u8(a, b); + + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0); + if (lane) + return len + zng_ctz64(lane) / 8; + len += 8; + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1); + if (lane) + return len + zng_ctz64(lane) / 8; + len += 8; + + src0 += 16, src1 += 16; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) { + return compare256_neon_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c new file mode 100644 index 0000000000..59f2b65009 --- /dev/null +++ b/neozip/arch/arm/crc32_armv8.c @@ -0,0 +1,81 @@ +/* crc32_armv8.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler + * 
Copyright (C) 2016 Yang Zhang + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_CRC32 + +#include "zbuild.h" +#include "acle_intrins.h" +#include "crc32_armv8_p.h" + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, + const int COPY) { + uint32_t c = ~crc; + + if (UNLIKELY(len == 1)) { + if (COPY) + *dst = *src; + c = __crc32b(c, *src); + return ~c; + } + + /* Align to 8-byte boundary for tail processing */ + uintptr_t align_diff = ALIGN_DIFF(src, 8); + if (align_diff) + c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY); + + while (len >= 64) { + uint64_t d0 = *(const uint64_t *)src; + uint64_t d1 = *(const uint64_t *)(src + 8); + uint64_t d2 = *(const uint64_t *)(src + 16); + uint64_t d3 = *(const uint64_t *)(src + 24); + uint64_t d4 = *(const uint64_t *)(src + 32); + uint64_t d5 = *(const uint64_t *)(src + 40); + uint64_t d6 = *(const uint64_t *)(src + 48); + uint64_t d7 = *(const uint64_t *)(src + 56); + + if (COPY) { + memcpy(dst, &d0, 8); + memcpy(dst + 8, &d1, 8); + memcpy(dst + 16, &d2, 8); + memcpy(dst + 24, &d3, 8); + memcpy(dst + 32, &d4, 8); + memcpy(dst + 40, &d5, 8); + memcpy(dst + 48, &d6, 8); + memcpy(dst + 56, &d7, 8); + dst += 64; + } + + c = __crc32d(c, d0); + c = __crc32d(c, d1); + c = __crc32d(c, d2); + c = __crc32d(c, d3); + c = __crc32d(c, d4); + c = __crc32d(c, d5); + c = __crc32d(c, d6); + c = __crc32d(c, d7); + + src += 64; + len -= 64; + } + + return crc32_armv8_tail(c, dst, src, len, COPY); +} + +Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return crc32_copy_impl(crc, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + crc 
= crc32_armv8(crc, src, len); + memcpy(dst, src, len); + return crc; +#endif +} +#endif diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h new file mode 100644 index 0000000000..e72c4c0ad1 --- /dev/null +++ b/neozip/arch/arm/crc32_armv8_p.h @@ -0,0 +1,103 @@ +/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CRC32_ARMV8_P_H +#define CRC32_ARMV8_P_H + +#include "zbuild.h" +#include "acle_intrins.h" + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf, + size_t *len, uintptr_t align_diff, const int COPY) { + if (*len && (align_diff & 1)) { + uint8_t val = **buf; + if (COPY) { + **dst = val; + *dst += 1; + } + crc = __crc32b(crc, val); + *buf += 1; + *len -= 1; + } + + if (*len >= 2 && (align_diff & 2)) { + uint16_t val = *((uint16_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 2); + *dst += 2; + } + crc = __crc32h(crc, val); + *buf += 2; + *len -= 2; + } + + if (*len >= 4 && (align_diff & 4)) { + uint32_t val = *((uint32_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 4); + *dst += 4; + } + crc = __crc32w(crc, val); + *buf += 4; + *len -= 4; + } + + if (*len >= 8 && (align_diff & 8)) { + uint64_t val = *((uint64_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 8); + *dst += 8; + } + crc = __crc32d(crc, val); + *buf += 8; + *len -= 8; + } + + return crc; +} + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf, + size_t len, const int COPY) { + while (len >= 8) { + uint64_t val = *((uint64_t*)buf); + if (COPY) { + memcpy(dst, &val, 8); + dst += 8; + } + crc = __crc32d(crc, val); + buf += 8; + len -= 8; + } + + if (len & 4) { + uint32_t val = *((uint32_t*)buf); + if (COPY) { + memcpy(dst, &val, 4); + dst += 4; + } + crc = __crc32w(crc, val); + buf += 4; + } + + if (len & 2) { + uint16_t val = *((uint16_t*)buf); + if (COPY) { + 
memcpy(dst, &val, 2); + dst += 2; + } + crc = __crc32h(crc, val); + buf += 2; + } + + if (len & 1) { + uint8_t val = *buf; + if (COPY) + *dst = val; + crc = __crc32b(crc, val); + } + + return ~crc; +} + +#endif /* CRC32_ARMV8_P_H */ diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c new file mode 100644 index 0000000000..e0d5bf043b --- /dev/null +++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c @@ -0,0 +1,366 @@ +/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension) + * Copyright (C) 2025 Peter Cawley + * https://github.com/corsix/fast-crc32 + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions. + * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter. + */ + +#ifdef ARM_PMULL_EOR3 + +#include "zbuild.h" +#include "zutil.h" +#include "acle_intrins.h" +#include "neon_intrins.h" +#include "crc32_armv8_p.h" + +/* Carryless multiply low 64 bits: a[0] * b[0] */ +static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { +#ifdef _MSC_VER + return vreinterpretq_u64_p128(vmull_p64( + vget_low_p64(vreinterpret_p64_u64(a)), + vget_low_p64(vreinterpret_p64_u64(b)))); +#else + return vreinterpretq_u64_p128(vmull_p64( + vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0), + vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0))); +#endif +} + +/* Carryless multiply high 64 bits: a[1] * b[1] */ +static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { + return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b))); +} + +/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */ +static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { +#ifdef _MSC_VER + return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b))); +#else + return 
vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b)); +#endif +} + +/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */ +static uint32_t xnmodp(uint64_t n) { + uint64_t stack = ~(uint64_t)1; + uint32_t acc, low; + for (; n > 191; n = (n >> 1) - 16) { + stack = (stack << 1) + (n & 1); + } + stack = ~stack; + acc = ((uint32_t)0x80000000) >> (n & 31); + for (n >>= 5; n; --n) { + acc = __crc32w(acc, 0); + } + while ((low = stack & 1), stack >>= 1) { + poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); + uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); + acc = __crc32d(0, y << low); + } + return acc; +} + +/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */ +static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { + Assert(nbytes >= 5, "crc_shift requires nbytes >= 5"); + return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); +} + +Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, + size_t len, const int COPY) { + uint32_t crc0 = ~crc; + + if (UNLIKELY(len == 1)) { + if (COPY) + *dst = *src; + crc0 = __crc32b(crc0, *src); + return ~crc0; + } + + /* Align to 16-byte boundary for vector path */ + uintptr_t align_diff = ALIGN_DIFF(src, 16); + if (align_diff) + crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY); + + /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */ + if (len >= 192) { + size_t blk = len / 192; /* Number of 192-byte blocks */ + size_t klen = blk * 16; /* Scalar stride per CRC lane */ + const uint8_t *end = src + len; + const uint8_t *src0 = src; + const uint8_t *src1 = src + klen; + const uint8_t *src2 = src + klen * 2; + const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */ + uint32_t crc1 = 0, crc2 = 0; + uint64x2_t vc0, vc1, vc2; + uint64_t vc; + + /* Load first 9 vector chunks (144 bytes) */ + uint64x2_t x0 = vld1q_u64_ex((const 
uint64_t*)srcv, 128), y0; + uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1; + uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2; + uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3; + uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4; + uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5; + uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6; + uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7; + uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8; + uint64x2_t k; + /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */ + { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); } + + /* Per-region dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + uint8_t *dst_v = NULL; + + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + dst_v = dst + klen * 3; + vst1q_u8(dst_v, vreinterpretq_u8_u64(x0)); + vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1)); + vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2)); + vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3)); + vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4)); + vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5)); + vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6)); + vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7)); + vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8)); + dst_v += 144; + } + srcv += 144; + + /* Fold 9 vectors + 3-way parallel scalar CRC */ + if (blk > 1) { + /* Only form a limit pointer when we have at least 2 blocks. 
*/ + const uint8_t *limit = src0 + klen - 32; + while (src0 <= limit) { + /* Fold all 9 vector lanes using PMULL */ + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); + y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); + y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); + y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); + + /* EOR3: combine hi*k, lo*k, and new data in one instruction */ + { + uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128); + uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128); + uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128); + uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128); + uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128); + uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128); + uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128); + uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128); + uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128); + if (COPY) { + vst1q_u8(dst_v, vreinterpretq_u8_u64(d0)); + vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1)); + vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2)); + vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3)); + vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4)); + vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5)); + vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6)); + vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7)); + vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8)); + dst_v += 144; + } + x0 = veor3q_u64(x0, y0, d0); + x1 = veor3q_u64(x1, y1, d1); + x2 = veor3q_u64(x2, y2, d2); + x3 = veor3q_u64(x3, y3, d3); + x4 = veor3q_u64(x4, y4, d4); + x5 = veor3q_u64(x5, y5, d5); + x6 = veor3q_u64(x6, y6, d6); + x7 = veor3q_u64(x7, y7, d7); + x8 = veor3q_u64(x8, y8, d8); + } + + /* 3-way parallel scalar CRC 
(16 bytes each) */ + { + uint64_t s0a = *(const uint64_t*)src0; + uint64_t s0b = *(const uint64_t*)(src0 + 8); + uint64_t s1a = *(const uint64_t*)src1; + uint64_t s1b = *(const uint64_t*)(src1 + 8); + uint64_t s2a = *(const uint64_t*)src2; + uint64_t s2b = *(const uint64_t*)(src2 + 8); + if (COPY) { + memcpy(dst0, &s0a, 8); + memcpy(dst0 + 8, &s0b, 8); + dst0 += 16; + memcpy(dst1, &s1a, 8); + memcpy(dst1 + 8, &s1b, 8); + dst1 += 16; + memcpy(dst2, &s2a, 8); + memcpy(dst2 + 8, &s2b, 8); + dst2 += 16; + } + crc0 = __crc32d(crc0, s0a); + crc0 = __crc32d(crc0, s0b); + crc1 = __crc32d(crc1, s1a); + crc1 = __crc32d(crc1, s1b); + crc2 = __crc32d(crc2, s2a); + crc2 = __crc32d(crc2, s2b); + } + src0 += 16; + src1 += 16; + src2 += 16; + srcv += 144; + } + } + + /* Reduce 9 vectors to 1 using tree reduction */ + /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */ + { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; + + /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */ + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + /* Step 3: fold pairs (x0,x2), (x4,x6) */ + { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + /* Step 4: final fold (x0, x4) -> x0 */ + { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, 
x4); + + /* Process final scalar chunk */ + { + uint64_t s0a = *(const uint64_t*)src0; + uint64_t s0b = *(const uint64_t*)(src0 + 8); + uint64_t s1a = *(const uint64_t*)src1; + uint64_t s1b = *(const uint64_t*)(src1 + 8); + uint64_t s2a = *(const uint64_t*)src2; + uint64_t s2b = *(const uint64_t*)(src2 + 8); + if (COPY) { + memcpy(dst0, &s0a, 8); + memcpy(dst0 + 8, &s0b, 8); + memcpy(dst1, &s1a, 8); + memcpy(dst1 + 8, &s1b, 8); + memcpy(dst2, &s2a, 8); + memcpy(dst2 + 8, &s2b, 8); + } + crc0 = __crc32d(crc0, s0a); + crc0 = __crc32d(crc0, s0b); + crc1 = __crc32d(crc1, s1a); + crc1 = __crc32d(crc1, s1b); + crc2 = __crc32d(crc2, s2a); + crc2 = __crc32d(crc2, s2b); + } + + /* Shift and combine 3 scalar CRCs */ + vc0 = crc_shift(crc0, klen * 2 + blk * 144); + vc1 = crc_shift(crc1, klen + blk * 144); + vc2 = crc_shift(crc2, blk * 144); + vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */ + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + if (COPY) + dst += blk * 192; + src = srcv; + len = end - srcv; + } + + /* 3-way scalar CRC (24 bytes/iter) */ + if (len >= 80) { + size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */ + const uint8_t *buf0 = src; + const uint8_t *buf1 = src + klen; + const uint8_t *buf2 = src + klen * 2; + uint32_t crc1 = 0, crc2 = 0; + uint64x2_t vc0, vc1; + uint64_t vc; + + /* Per-lane dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + } + + /* 3-way parallel scalar CRC */ + do { + uint64_t v0 = *(const uint64_t*)buf0; + uint64_t v1 = *(const uint64_t*)buf1; + uint64_t v2 = *(const uint64_t*)buf2; + if (COPY) { + memcpy(dst0, &v0, 8); + dst0 += 8; + memcpy(dst1, &v1, 8); + dst1 += 8; + memcpy(dst2, &v2, 8); + dst2 += 8; + } + crc0 = __crc32d(crc0, v0); + crc1 = __crc32d(crc1, v1); + crc2 = __crc32d(crc2, v2); + buf0 += 8; + 
buf1 += 8; + buf2 += 8; + len -= 24; + } while (len >= 32); + + /* Combine the 3 CRCs */ + vc0 = crc_shift(crc0, klen * 2 + 8); + vc1 = crc_shift(crc1, klen + 8); + vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + /* Process final 8 bytes with combined CRC */ + crc0 = crc2; + { + uint64_t vf = *(const uint64_t*)buf2; + if (COPY) + memcpy(dst2, &vf, 8); + crc0 = __crc32d(crc0, vf ^ vc); + } + src = buf2 + 8; + len -= 8; + if (COPY) + dst = dst2 + 8; + } + + /* Process remaining bytes */ + return crc32_armv8_tail(crc0, dst, src, len, COPY); +} + +Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return crc32_copy_impl(crc, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + crc = crc32_armv8_pmull_eor3(crc, src, len); + memcpy(dst, src, len); + return crc; +#endif +} +#endif diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h new file mode 100644 index 0000000000..449916e0b7 --- /dev/null +++ b/neozip/arch/arm/neon_intrins.h @@ -0,0 +1,79 @@ +#ifndef ARM_NEON_INTRINS_H +#define ARM_NEON_INTRINS_H + +#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT) +/* arm64_neon.h is MSVC specific */ +# include <arm64_neon.h> +#else +# include <arm_neon.h> +#endif + +#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT) +/* Compatibility shim for the _high family of functions */ +#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) +#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) +#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) +#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) +#endif + +#ifdef ARM_NEON + +#define 
vqsubq_u16_x4_x1(out, a, b) do { \ + out.val[0] = vqsubq_u16(a.val[0], b); \ + out.val[1] = vqsubq_u16(a.val[1], b); \ + out.val[2] = vqsubq_u16(a.val[2], b); \ + out.val[3] = vqsubq_u16(a.val[3], b); \ +} while (0) + +# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \ + (!defined(__clang_major__) || __clang_major__ < 20) +/* Clang versions before 20 have too strict of an + * alignment requirement (:256) for x4 NEON intrinsics */ +# undef ARM_NEON_HASLD4 +# undef vld1q_u16_x4 +# undef vld1q_u8_x4 +# undef vst1q_u16_x4 +# endif + +# ifndef ARM_NEON_HASLD4 + +static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) { + uint16x8x4_t ret; + ret.val[0] = vld1q_u16(a); + ret.val[1] = vld1q_u16(a+8); + ret.val[2] = vld1q_u16(a+16); + ret.val[3] = vld1q_u16(a+24); + return ret; +} + +static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) { + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(a); + ret.val[1] = vld1q_u8(a+16); + ret.val[2] = vld1q_u8(a+32); + ret.val[3] = vld1q_u8(a+48); + return ret; +} + +static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { + vst1q_u16(p, a.val[0]); + vst1q_u16(p + 8, a.val[1]); + vst1q_u16(p + 16, a.val[2]); + vst1q_u16(p + 24, a.val[3]); +} +# endif // HASLD4 check + +# ifndef _MSC_VER +# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8)) +# endif +# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4) +# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8)) +# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a) +# endif + +#endif + +#endif // include guard ARM_NEON_INTRINS_H diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c new file mode 100644 index 0000000000..b241e6c5e6 --- /dev/null +++ 
b/neozip/arch/arm/slide_hash_armv6.c @@ -0,0 +1,49 @@ +/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions + * Copyright (C) 2023 Cameron Cawley + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_SIMD + +#include "zbuild.h" +#include "acle_intrins.h" +#include "deflate.h" + +/* SIMD version of hash_chain rebase */ +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + Z_REGISTER uint16x2_t v; + uint16x2_t p0, p1, p2, p3; + Z_REGISTER size_t n; + + size_t size = entries*sizeof(table[0]); + Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err"); + + Assert(sizeof(Pos) == 2, "Wrong Pos size"); + v = wsize | (wsize << 16); + + n = size / (sizeof(uint16x2_t) * 4); + do { + p0 = *((const uint16x2_t *)(table)); + p1 = *((const uint16x2_t *)(table+2)); + p2 = *((const uint16x2_t *)(table+4)); + p3 = *((const uint16x2_t *)(table+6)); + p0 = __uqsub16(p0, v); + p1 = __uqsub16(p1, v); + p2 = __uqsub16(p2, v); + p3 = __uqsub16(p3, v); + *((uint16x2_t *)(table)) = p0; + *((uint16x2_t *)(table+2)) = p1; + *((uint16x2_t *)(table+4)) = p2; + *((uint16x2_t *)(table+6)) = p3; + table += 8; + } while (--n); +} + +Z_INTERNAL void slide_hash_armv6(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} +#endif diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c new file mode 100644 index 0000000000..2f9e94a33d --- /dev/null +++ b/neozip/arch/arm/slide_hash_neon.c @@ -0,0 +1,48 @@ +/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions + * Copyright (C) 2017-2020 Mika T. Lindqvist + * + * Authors: + * Mika T. 
Lindqvist <postmaster@raasu.org> + * Jun He <jun.he@arm.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "neon_intrins.h" +#include "deflate.h" + +/* SIMD version of hash_chain rebase */ +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + Z_REGISTER uint16x8_t v; + uint16x8x4_t p0, p1; + Z_REGISTER size_t n; + + size_t size = entries*sizeof(table[0]); + /* Each iteration rewrites 64 entries (2 x 4 x uint16x8_t = 128 bytes), + * so the table size must be a multiple of 128 bytes. Parenthesized + * explicitly: % and * have equal precedence and bind left-to-right. */ + Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err"); + + Assert(sizeof(Pos) == 2, "Wrong Pos size"); + v = vdupq_n_u16(wsize); + + n = size / (sizeof(uint16x8_t) * 8); + do { + p0 = vld1q_u16_x4_ex(table, 256); + p1 = vld1q_u16_x4_ex(table+32, 256); + vqsubq_u16_x4_x1(p0, p0, v); + vqsubq_u16_x4_x1(p1, p1, v); + vst1q_u16_x4_ex(table, p0, 256); + vst1q_u16_x4_ex(table+32, p1, 256); + table += 64; + } while (--n); +} + +Z_INTERNAL void slide_hash_neon(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} +#endif |
