diff options
Diffstat (limited to 'neozip/arch')
117 files changed, 14578 insertions, 0 deletions
diff --git a/neozip/arch/.gitignore b/neozip/arch/.gitignore new file mode 100644 index 0000000000..2c3af0a08c --- /dev/null +++ b/neozip/arch/.gitignore @@ -0,0 +1,2 @@ +# ignore Makefiles; they're all automatically generated +Makefile diff --git a/neozip/arch/arm/Makefile.in b/neozip/arch/arm/Makefile.in new file mode 100644 index 0000000000..d0bfe0e172 --- /dev/null +++ b/neozip/arch/arm/Makefile.in @@ -0,0 +1,86 @@ +# Makefile for zlib +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +ARMV8FLAG= +PMULLEOR3FLAG= +NEONFLAG= +ARMV6FLAG= +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +all: \ + adler32_neon.o adler32_neon.lo \ + arm_features.o arm_features.lo \ + chunkset_neon.o chunkset_neon.lo \ + compare256_neon.o compare256_neon.lo \ + crc32_armv8.o crc32_armv8.lo \ + crc32_armv8_pmull_eor3.o crc32_armv8_pmull_eor3.lo \ + slide_hash_neon.o slide_hash_neon.lo \ + slide_hash_armv6.o slide_hash_armv6.lo \ + +adler32_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +adler32_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +arm_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c + +arm_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c + +chunkset_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c + +chunkset_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c + +compare256_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + +compare256_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c + +crc32_armv8.o: + $(CC) $(CFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c 
+ +crc32_armv8.lo: + $(CC) $(SFLAGS) $(ARMV8FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8.c + +crc32_armv8_pmull_eor3.o: + $(CC) $(CFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c + +crc32_armv8_pmull_eor3.lo: + $(CC) $(SFLAGS) $(PMULLEOR3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_armv8_pmull_eor3.c + +slide_hash_neon.o: + $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c + +slide_hash_neon.lo: + $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c + +slide_hash_armv6.o: + $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c + +slide_hash_armv6.lo: + $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/arm/acle_intrins.h b/neozip/arch/arm/acle_intrins.h new file mode 100644 index 0000000000..16f5e2c77c --- /dev/null +++ b/neozip/arch/arm/acle_intrins.h @@ -0,0 +1,90 @@ +#ifndef ARM_ACLE_INTRINS_H +#define ARM_ACLE_INTRINS_H + +#include <stdint.h> +#ifdef _MSC_VER +# include <intrin.h> +#elif defined(HAVE_ARM_ACLE_H) +# include <arm_acle.h> +#endif + +#ifdef ARM_CRC32 +#if defined(ARCH_ARM) && defined(ARCH_64BIT) +# define Z_TARGET_CRC Z_TARGET("+crc") +#else +# define Z_TARGET_CRC +#endif +#ifdef ARM_PMULL_EOR3 +# define Z_TARGET_PMULL_EOR3 Z_TARGET("+crc+crypto+sha3") +#else +# define Z_TARGET_PMULL_EOR3 +#endif + +#if !defined(ARM_CRC32_INTRIN) && !defined(_MSC_VER) +#if defined(ARCH_ARM) && defined(ARCH_64BIT) +static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) { + uint32_t __c; + __asm__("crc32b %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) { + uint32_t __c; + __asm__("crc32h %w0, %w1, %w2" : 
"=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) { + uint32_t __c; + __asm__("crc32w %w0, %w1, %w2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) { + uint32_t __c; + __asm__("crc32x %w0, %w1, %x2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} +#else +static inline uint32_t __crc32b(uint32_t __a, uint8_t __b) { + uint32_t __c; + __asm__("crc32b %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32h(uint32_t __a, uint16_t __b) { + uint32_t __c; + __asm__("crc32h %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32w(uint32_t __a, uint32_t __b) { + uint32_t __c; + __asm__("crc32w %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} + +static inline uint32_t __crc32d(uint32_t __a, uint64_t __b) { + return __crc32w (__crc32w (__a, __b & 0xffffffffULL), __b >> 32); +} +#endif +#endif +#endif + +#ifdef ARM_SIMD +#ifdef _MSC_VER +typedef uint32_t uint16x2_t; + +#define __uqsub16 _arm_uqsub16 +#elif !defined(ARM_SIMD_INTRIN) +typedef uint32_t uint16x2_t; + +static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) { + uint16x2_t __c; + __asm__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b)); + return __c; +} +#endif +#endif + +#endif // include guard ARM_ACLE_INTRINS_H diff --git a/neozip/arch/arm/adler32_neon.c b/neozip/arch/arm/adler32_neon.c new file mode 100644 index 0000000000..48532e6cd1 --- /dev/null +++ b/neozip/arch/arm/adler32_neon.c @@ -0,0 +1,346 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. 
+ * Authors: + * Adenilson Cavalcanti <adenilson.cavalcanti@arm.com> + * Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "neon_intrins.h" +#include "adler32_p.h" + +static const uint16_t ALIGNED_(64) taps[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, + 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, + 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 }; + +Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uint8_t *buf, size_t len) { + uint32x4_t adacc = vdupq_n_u32(0); + uint32x4_t s2acc = vdupq_n_u32(0); + uint32x4_t s2acc_0 = vdupq_n_u32(0); + uint32x4_t s2acc_1 = vdupq_n_u32(0); + uint32x4_t s2acc_2 = vdupq_n_u32(0); + + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + uint32x4_t s3acc = vdupq_n_u32(0); + uint32x4_t adacc_prev = adacc; + + uint16x8_t s2_0, s2_1, s2_2, s2_3; + s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); + + uint16x8_t s2_4, s2_5, s2_6, s2_7; + s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); + + size_t num_iter = len >> 2; + int rem = len & 3; + + for (size_t i = 0; i < num_iter; ++i) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128); + uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128); + uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += 64; + + /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32 + * bit instruction, we'll have to make due summing to 16 bits first */ + uint16x8x2_t hsum, hsum_fold; + hsum.val[0] = vpaddlq_u8(d0); + hsum.val[1] = vpaddlq_u8(d1); + + hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d2); + hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d3); + + adacc = vpadalq_u16(adacc, 
hsum_fold.val[0]); + s3acc = vaddq_u32(s3acc, adacc_prev); + adacc = vpadalq_u16(adacc, hsum_fold.val[1]); + + /* If we do straight widening additions to the 16 bit values, we don't incur + * the usual penalties of a pairwise add. We can defer the multiplications + * until the very end. These will not overflow because we are incurring at + * most 408 loop iterations (NMAX / 64), and a given lane is only going to be + * summed into once. This means for the maximum input size, the largest value + * we will see is 255 * 102 = 26010, safely under uint16 max */ + s2_0 = vaddw_u8(s2_0, vget_low_u8(d0)); + s2_1 = vaddw_high_u8(s2_1, d0); + s2_2 = vaddw_u8(s2_2, vget_low_u8(d1)); + s2_3 = vaddw_high_u8(s2_3, d1); + s2_4 = vaddw_u8(s2_4, vget_low_u8(d2)); + s2_5 = vaddw_high_u8(s2_5, d2); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d3)); + s2_7 = vaddw_high_u8(s2_7, d3); + + adacc_prev = adacc; + buf += 64; + } + + s3acc = vshlq_n_u32(s3acc, 6); + + if (rem) { + uint32x4_t s3acc_0 = vdupq_n_u32(0); + while (rem--) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + vst1q_u8(dst, d0); + dst += 16; + uint16x8_t adler; + adler = vpaddlq_u8(d0); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); + s2_7 = vaddw_high_u8(s2_7, d0); + adacc = vpadalq_u16(adacc, adler); + s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); + adacc_prev = adacc; + buf += 16; + } + + s3acc_0 = vshlq_n_u32(s3acc_0, 4); + s3acc = vaddq_u32(s3acc_0, s3acc); + } + + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); + s2acc_2 = 
vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); + + s2acc = vaddq_u32(s2acc_0, s2acc); + s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); + s2acc = vaddq_u32(s2acc, s2acc_2); + + uint32x2_t adacc2, s2acc2, as; + s2acc = vaddq_u32(s2acc, s3acc); + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) { + uint32x4_t adacc = vdupq_n_u32(0); + uint32x4_t s2acc = vdupq_n_u32(0); + uint32x4_t s2acc_0 = vdupq_n_u32(0); + uint32x4_t s2acc_1 = vdupq_n_u32(0); + uint32x4_t s2acc_2 = vdupq_n_u32(0); + + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + uint32x4_t s3acc = vdupq_n_u32(0); + uint32x4_t adacc_prev = adacc; + + uint16x8_t s2_0, s2_1, s2_2, s2_3; + s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0); + + uint16x8_t s2_4, s2_5, s2_6, s2_7; + s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0); + + size_t num_iter = len >> 2; + int rem = len & 3; + + for (size_t i = 0; i < num_iter; ++i) { + uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256); + + /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32 + * bit instruction, we'll have to make due summing to 16 bits first */ + uint16x8x2_t hsum, hsum_fold; + hsum.val[0] = 
vpaddlq_u8(d0_d3.val[0]); + hsum.val[1] = vpaddlq_u8(d0_d3.val[1]); + + hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]); + hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]); + + adacc = vpadalq_u16(adacc, hsum_fold.val[0]); + s3acc = vaddq_u32(s3acc, adacc_prev); + adacc = vpadalq_u16(adacc, hsum_fold.val[1]); + + /* If we do straight widening additions to the 16 bit values, we don't incur + * the usual penalties of a pairwise add. We can defer the multiplications + * until the very end. These will not overflow because we are incurring at + * most 408 loop iterations (NMAX / 64), and a given lane is only going to be + * summed into once. This means for the maximum input size, the largest value + * we will see is 255 * 102 = 26010, safely under uint16 max */ + s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); + s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]); + s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1])); + s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]); + s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); + s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3])); + s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]); + + adacc_prev = adacc; + buf += 64; + } + + s3acc = vshlq_n_u32(s3acc, 6); + + if (rem) { + uint32x4_t s3acc_0 = vdupq_n_u32(0); + while (rem--) { + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + uint16x8_t adler; + adler = vpaddlq_u8(d0); + s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); + s2_7 = vaddw_high_u8(s2_7, d0); + adacc = vpadalq_u16(adacc, adler); + s3acc_0 = vaddq_u32(s3acc_0, adacc_prev); + adacc_prev = adacc; + buf += 16; + } + + s3acc_0 = vshlq_n_u32(s3acc_0, 4); + s3acc = vaddq_u32(s3acc_0, s3acc); + } + + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1); + s2acc_2 = 
vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1)); + + s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2)); + s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5)); + + s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6); + s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6)); + s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7); + s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7)); + + s2acc = vaddq_u32(s2acc_0, s2acc); + s2acc_2 = vaddq_u32(s2acc_1, s2acc_2); + s2acc = vaddq_u32(s2acc, s2acc_2); + + uint32x2_t adacc2, s2acc2, as; + s2acc = vaddq_u32(s2acc, s3acc); + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); + + uint32_t pair[2]; + + /* Split Adler-32 into component sums, it can be supplied by + * the caller sites (e.g. in a PNG file). 
+ */ + pair[0] = adler; + pair[1] = sum2; + + /* If memory is not SIMD aligned, do scalar sums to an aligned + * offset, provided that doing so doesn't completely eliminate + * SIMD operation. Aligned loads are still faster on ARM, even + * when there's no explicit aligned load instruction. Note: + * the code currently emits an alignment hint in the instruction + * for exactly 256 bits when supported by the compiler. Several ARM + * SIPs have small penalties for cacheline crossing loads as well (so + * really 512 bits is the optimal alignment of the buffer). 32 bytes + * should strike a balance, though. The Cortex-A8 and Cortex-A9 + * processors are documented to benefit from 128 bit and 64 bit + * alignment, but it's unclear which other SIPs will benefit from it. + * In the copying variant we use fallback to 4x loads and 4x stores, + * as ld1x4 seems to block ILP when stores are in the mix */ + size_t align_diff = MIN(ALIGN_DIFF(src, 32), len); + size_t n = NMAX_ALIGNED32; + if (align_diff) { + adler32_copy_align(&pair[0], dst, src, align_diff, &pair[1], 31, COPY); + + if (COPY) + dst += align_diff; + src += align_diff; + len -= align_diff; + n = ALIGN_DOWN(n - align_diff, 32); + } + + while (len >= 16) { + n = MIN(len, n); + + if (COPY) + NEON_accum32_copy(pair, dst, src, n >> 4); + else + NEON_accum32(pair, src, n >> 4); + + pair[0] %= BASE; + pair[1] %= BASE; + + size_t k = (n >> 4) << 4; + src += k; + if (COPY) + dst += k; + len -= k; + n = NMAX_ALIGNED32; + } + + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], dst, src, len, pair[1], len != 0 || align_diff, 15, COPY); +} + +Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return adler32_copy_impl(adler, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + adler = adler32_neon(adler, src, len); + memcpy(dst, src, len); + return adler; +#endif +} + +#endif diff --git a/neozip/arch/arm/arm_features.c b/neozip/arch/arm/arm_features.c new file mode 100644 index 0000000000..8f179526ef --- /dev/null +++ b/neozip/arch/arm/arm_features.c @@ -0,0 +1,334 @@ +#ifdef ARM_FEATURES + +#include "zbuild.h" +#include "arm_features.h" + +#if defined(HAVE_SYS_AUXV_H) +# include <sys/auxv.h> +# ifdef ARM_ASM_HWCAP +# include <asm/hwcap.h> +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) +# include <machine/armreg.h> +# ifndef ID_AA64ISAR0_CRC32_VAL +# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32 +# endif +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) +# include <machine/armreg.h> +# include <machine/cpu.h> +# include <sys/sysctl.h> +# include <sys/types.h> +#elif defined(__APPLE__) +# if !defined(_DARWIN_C_SOURCE) +# define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */ +# endif +# include <sys/sysctl.h> +#elif defined(_WIN32) +# include <windows.h> +#endif + +static int arm_has_crc32(void) { + int has_crc32 = 0; +#if defined(__ARM_FEATURE_CRC32) + /* Compile-time check */ + has_crc32 = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CRC32 + has_crc32 = (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; +# elif defined(HWCAP2_CRC32) + has_crc32 = (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef 
HWCAP_CRC32 + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_crc32 = (hwcap & HWCAP_CRC32) != 0; +# elif defined(HWCAP2_CRC32) + unsigned long hwcap2 = 0; + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); + has_crc32 = (hwcap2 & HWCAP2_CRC32) != 0; +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) + has_crc32 = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE; +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_crc32 = ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE; + } +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_crc32 = sysctlbyname("hw.optional.armv8_crc32", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_WIN32) + has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); +#endif + return has_crc32; +} + +static int arm_has_pmull(void) { + int has_pmull = 0; +#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES) + /* Compile-time check */ + has_pmull = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_PMULL + has_pmull = (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +# elif defined(HWCAP_AES) + /* PMULL is part of crypto extension, check for AES as proxy */ + has_pmull = (getauxval(AT_HWCAP) & HWCAP_AES) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_PMULL + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_pmull = (hwcap & HWCAP_PMULL) != 0; +# elif defined(HWCAP_AES) + /* PMULL is part of crypto extension, check for AES as proxy */ + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_pmull = (hwcap & HWCAP_AES) != 0; +# endif +#elif 
defined(__FreeBSD__) && defined(ARCH_64BIT) + /* Check for AES feature as PMULL is part of crypto extension */ + has_pmull = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_AES_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_AES_BASE; +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_pmull = ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_BASE; + } +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_pmull = sysctlbyname("hw.optional.arm.FEAT_PMULL", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_WIN32) + /* Windows checks for crypto/AES support */ +# ifdef PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE + has_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +# endif +#endif + return has_pmull; +} + +static int arm_has_eor3(void) { + int has_eor3 = 0; +#if defined(__ARM_FEATURE_SHA3) + /* Compile-time check */ + has_eor3 = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) + /* EOR3 is part of SHA3 extension, check HWCAP2_SHA3 */ +# ifdef HWCAP2_SHA3 + has_eor3 = (getauxval(AT_HWCAP2) & HWCAP2_SHA3) != 0; +# elif defined(HWCAP_SHA3) + has_eor3 = (getauxval(AT_HWCAP) & HWCAP_SHA3) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP2_SHA3 + unsigned long hwcap2 = 0; + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); + has_eor3 = (hwcap2 & HWCAP2_SHA3) != 0; +# elif defined(HWCAP_SHA3) + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_eor3 = (hwcap & HWCAP_SHA3) != 0; +# endif +#elif defined(__FreeBSD__) && defined(ARCH_64BIT) + /* FreeBSD: check for SHA3 in id_aa64isar0_el1 */ +# ifdef ID_AA64ISAR0_SHA3_VAL + has_eor3 = getenv("QEMU_EMULATING") == NULL + && ID_AA64ISAR0_SHA3_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= 
ID_AA64ISAR0_SHA3_BASE; +# endif +#elif defined(__OpenBSD__) && defined(ARCH_64BIT) +# ifdef ID_AA64ISAR0_SHA3 + int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0 = 0; + size_t len = sizeof(isar0); + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + has_eor3 = ID_AA64ISAR0_SHA3(isar0) >= ID_AA64ISAR0_SHA3_IMPL; + } +# endif +#elif defined(__APPLE__) + /* All Apple Silicon (M1+) has SHA3/EOR3 support */ + int has_feat = 0; + size_t size = sizeof(has_feat); + has_eor3 = sysctlbyname("hw.optional.arm.FEAT_SHA3", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; + /* Fallback to legacy name for older macOS versions */ + if (!has_eor3) { + size = sizeof(has_feat); + has_eor3 = sysctlbyname("hw.optional.armv8_2_sha3", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; + } +#elif defined(_WIN32) +# ifdef PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE + has_eor3 = IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE); +# endif +#endif + return has_eor3; +} + +/* AArch64 has neon. 
*/ +#ifdef ARCH_32BIT +static inline int arm_has_neon(void) { + int has_neon = 0; +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + /* Compile-time check */ + has_neon = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_ARM_NEON + has_neon = (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; +# elif defined(HWCAP_NEON) + has_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_NEON + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_neon = (hwcap & HWCAP_NEON) != 0; +# endif +#elif defined(__APPLE__) + int has_feat = 0; + size_t size = sizeof(has_feat); + has_neon = sysctlbyname("hw.optional.neon", &has_feat, &size, NULL, 0) == 0 + && has_feat == 1; +#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION) +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) + has_neon = 1; /* Always supported */ +# endif +#endif + return has_neon; +} +#endif + +/* AArch64 does not have ARMv6 SIMD. 
*/ +#ifdef ARCH_32BIT +static inline int arm_has_simd(void) { + int has_simd = 0; +#if defined(__ARM_FEATURE_SIMD32) + /* Compile-time check for ARMv6 SIMD */ + has_simd = 1; +#elif defined(__linux__) && defined(HAVE_SYS_AUXV_H) + const char *platform = (const char *)getauxval(AT_PLATFORM); + has_simd = platform + && (strncmp(platform, "v6l", 3) == 0 + || strncmp(platform, "v7l", 3) == 0 + || strncmp(platform, "v8l", 3) == 0); +#endif + return has_simd; +} +#endif + +#if defined(ARCH_64BIT) && !defined(__APPLE__) && !defined(_WIN32) +/* MIDR_EL1 bit field definitions */ +#define MIDR_IMPLEMENTOR(midr) (((midr) & (0xffU << 24)) >> 24) +#define MIDR_PARTNUM(midr) (((midr) & (0xfffU << 4)) >> 4) + +/* ARM CPU Implementer IDs */ +#define ARM_IMPLEMENTER_ARM 0x41 +#define ARM_IMPLEMENTER_QUALCOMM 0x51 +#define ARM_IMPLEMENTER_APPLE 0x61 + +/* ARM CPU Part Numbers */ + +/* Cortex-X series - Multiple PMULL lanes */ +#define ARM_PART_CORTEX_X1 0xd44 +#define ARM_PART_CORTEX_X1C 0xd4c +#define ARM_PART_CORTEX_X2 0xd48 +#define ARM_PART_CORTEX_X3 0xd4e +#define ARM_PART_CORTEX_X4 0xd82 +#define ARM_PART_CORTEX_X925 0xd85 + +/* Neoverse V/N2 series - Multiple PMULL lanes */ +#define ARM_PART_NEOVERSE_N2 0xd49 +#define ARM_PART_NEOVERSE_V1 0xd40 +#define ARM_PART_NEOVERSE_V2 0xd4f +#define ARM_PART_NEOVERSE_V3 0xd8e + +/* Snapdragon X Elite/Plus - Custom core */ +#define QUALCOMM_PART_ORYON 0x001 + +static inline int arm_has_cpuid(void) { + int has_cpuid = 0; +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CPUID + has_cpuid = (getauxval(AT_HWCAP) & HWCAP_CPUID) != 0; +# elif defined(HWCAP2_CPUID) + has_cpuid = (getauxval(AT_HWCAP2) & HWCAP2_CPUID) != 0; +# endif +#elif (defined(__FreeBSD__) || defined(__OpenBSD__)) && defined(HAVE_SYS_AUXV_H) +# ifdef HWCAP_CPUID + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + has_cpuid = (hwcap & HWCAP_CPUID) != 0; +# endif +#endif + return has_cpuid; +} +#endif + +/* Determine if CPU has fast 
PMULL (multiple execution units) */ +static inline int arm_cpu_has_fast_pmull(void) { + int has_fast_pmull = 0; +#if defined(__APPLE__) + /* On macOS, all Apple Silicon has fast PMULL */ + has_fast_pmull = 1; +#elif defined(ARCH_64BIT) && !defined(_WIN32) + /* We need CPUID feature to read MIDR register */ + if (!arm_has_cpuid()) + return has_fast_pmull; + + uint64_t midr; + __asm__ ("mrs %0, midr_el1" : "=r" (midr)); + + uint32_t implementer = MIDR_IMPLEMENTOR(midr); + uint32_t part = MIDR_PARTNUM(midr); + + if (implementer == ARM_IMPLEMENTER_APPLE) { + /* All Apple Silicon (M1+) have fast PMULL */ + has_fast_pmull = 1; + } else if (implementer == ARM_IMPLEMENTER_ARM) { + /* ARM Cortex-X and Neoverse V/N2 series have multi-lane PMULL */ + switch (part) { + case ARM_PART_CORTEX_X1: + case ARM_PART_CORTEX_X1C: + case ARM_PART_CORTEX_X2: + case ARM_PART_CORTEX_X3: + case ARM_PART_CORTEX_X4: + case ARM_PART_CORTEX_X925: + case ARM_PART_NEOVERSE_N2: + case ARM_PART_NEOVERSE_V1: + case ARM_PART_NEOVERSE_V2: + case ARM_PART_NEOVERSE_V3: + has_fast_pmull = 1; + } + } else if (implementer == ARM_IMPLEMENTER_QUALCOMM) { + /* Qualcomm Oryon (Snapdragon X Elite/Plus) has fast PMULL */ + if (part == QUALCOMM_PART_ORYON) + has_fast_pmull = 1; + } +#endif + return has_fast_pmull; +} + +void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) { +#ifdef ARCH_64BIT + features->has_simd = 0; /* never available */ + features->has_neon = 1; /* always available */ +#else + features->has_simd = arm_has_simd(); + features->has_neon = arm_has_neon(); +#endif + features->has_crc32 = arm_has_crc32(); + features->has_pmull = arm_has_pmull(); + features->has_eor3 = arm_has_eor3(); + features->has_fast_pmull = features->has_pmull && arm_cpu_has_fast_pmull(); +} + +#endif diff --git a/neozip/arch/arm/arm_features.h b/neozip/arch/arm/arm_features.h new file mode 100644 index 0000000000..2f17a9ddf0 --- /dev/null +++ b/neozip/arch/arm/arm_features.h @@ -0,0 +1,19 @@ +/* arm_features.h 
-- check for ARM features. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FEATURES_H_ +#define ARM_FEATURES_H_ + +struct arm_cpu_features { + int has_simd; + int has_neon; + int has_crc32; + int has_pmull; + int has_eor3; + int has_fast_pmull; +}; + +void Z_INTERNAL arm_check_features(struct arm_cpu_features *features); + +#endif /* ARM_FEATURES_H_ */ diff --git a/neozip/arch/arm/arm_functions.h b/neozip/arch/arm/arm_functions.h new file mode 100644 index 0000000000..bc77adb977 --- /dev/null +++ b/neozip/arch/arm/arm_functions.h @@ -0,0 +1,75 @@ +/* arm_functions.h -- ARM implementations for arch-specific functions. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_FUNCTIONS_H_ +#define ARM_FUNCTIONS_H_ + +#include "arm_natives.h" + +#ifdef ARM_NEON +uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_neon(deflate_state *const s, uint32_t cur_match); +void slide_hash_neon(deflate_state *s); +#endif + +#ifdef ARM_CRC32 +uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif +#ifdef ARM_PMULL_EOR3 +uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef ARM_SIMD +void slide_hash_armv6(deflate_state *s); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// ARM - SIMD +# ifdef ARM_SIMD_NATIVE +# undef 
native_slide_hash +# define native_slide_hash slide_hash_armv6 +# endif +// ARM - NEON +# ifdef ARM_NEON_NATIVE +# undef native_adler32 +# define native_adler32 adler32_neon +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_neon +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_compare256 +# define native_compare256 compare256_neon +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon +# undef native_slide_hash +# define native_slide_hash slide_hash_neon +# endif +// ARM - CRC32 +# ifdef ARM_CRC32_NATIVE +# undef native_crc32 +# define native_crc32 crc32_armv8 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_armv8 +# endif +// ARM - PMULL EOR3 +# ifdef ARM_PMULL_EOR3_NATIVE +# undef native_crc32 +# define native_crc32 crc32_armv8_pmull_eor3 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_armv8_pmull_eor3 +# endif +#endif + +#endif /* ARM_FUNCTIONS_H_ */ diff --git a/neozip/arch/arm/arm_natives.h b/neozip/arch/arm/arm_natives.h new file mode 100644 index 0000000000..311e33e958 --- /dev/null +++ b/neozip/arch/arm/arm_natives.h @@ -0,0 +1,31 @@ +/* arm_natives.h -- ARM compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ARM_NATIVES_H_ +#define ARM_NATIVES_H_ + +#if defined(__ARM_FEATURE_SIMD32) +# ifdef ARM_SIMD +# define ARM_SIMD_NATIVE +# endif +#endif +/* NEON is guaranteed on ARM64 (like SSE2 on x86-64) */ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(ARCH_64BIT) +# ifdef ARM_NEON +# define ARM_NEON_NATIVE +# endif +#endif +/* CRC32 is optional in ARMv8.0, mandatory in ARMv8.1+ */ +#if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && __ARM_ARCH >= 801) +# ifdef ARM_CRC32 +# define ARM_CRC32_NATIVE +# endif +#endif +#if defined(__ARM_FEATURE_CRC32) && defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_SHA3) +# ifdef ARM_PMULL_EOR3 +# define ARM_PMULL_EOR3_NATIVE +# endif +#endif + +#endif /* ARM_NATIVES_H_ */ diff --git a/neozip/arch/arm/chunkset_neon.c b/neozip/arch/arm/chunkset_neon.c new file mode 100644 index 0000000000..a891f10fa5 --- /dev/null +++ b/neozip/arch/arm/chunkset_neon.c @@ -0,0 +1,81 @@ +/* chunkset_neon.c -- NEON inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" +#include "neon_intrins.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef uint8x16_t chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from))); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u32(vdupq_n_u32(zng_memread_4(from))); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(zng_memread_8(from))); +} + +#define CHUNKSIZE chunksize_neon +#define CHUNKCOPY chunkcopy_neon +#define CHUNKUNROLL chunkunroll_neon +#define CHUNKMEMSET chunkmemset_neon +#define CHUNKMEMSET_SAFE chunkmemset_safe_neon + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = vld1q_u8(s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + vst1q_u8(out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + *chunk_rem = lut_rem.remval; + + /* See note in chunkset_ssse3.c for why this is ok */ + __msan_unpoison(buf + dist, 16 - dist); + + /* This version of table is only available on aarch64 */ +#if defined(ARCH_ARM) && defined(ARCH_64BIT) + uint8x16_t ret_vec = vld1q_u8(buf); + + uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128); + return vqtbl1q_u8(ret_vec, perm_vec); +#else + uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1; + perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64); + perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64); + a = vld1_u8(buf); + b = vld1_u8(buf + 8); + ret0 = vtbl1_u8(a, perm_vec0); + uint8x8x2_t ab; + 
ab.val[0] = a; + ab.val[1] = b; + ret1 = vtbl2_u8(ab, perm_vec1); + return vcombine_u8(ret0, ret1); +#endif +} + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_neon + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/arm/compare256_neon.c b/neozip/arch/arm/compare256_neon.c new file mode 100644 index 0000000000..4ced9fc9ca --- /dev/null +++ b/neozip/arch/arm/compare256_neon.c @@ -0,0 +1,56 @@ +/* compare256_neon.c - NEON version of compare256 + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#if defined(ARM_NEON) +#include "neon_intrins.h" + +static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + uint8x16_t a, b, cmp; + uint64_t lane; + + a = vld1q_u8(src0); + b = vld1q_u8(src1); + + cmp = veorq_u8(a, b); + + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0); + if (lane) + return len + zng_ctz64(lane) / 8; + len += 8; + lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1); + if (lane) + return len + zng_ctz64(lane) / 8; + len += 8; + + src0 += 16, src1 += 16; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) { + return compare256_neon_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_neon +#define COMPARE256 compare256_neon_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/arm/crc32_armv8.c b/neozip/arch/arm/crc32_armv8.c new file mode 100644 index 0000000000..59f2b65009 --- /dev/null +++ b/neozip/arch/arm/crc32_armv8.c @@ -0,0 +1,81 @@ +/* crc32_armv8.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler + * 
Copyright (C) 2016 Yang Zhang + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_CRC32 + +#include "zbuild.h" +#include "acle_intrins.h" +#include "crc32_armv8_p.h" + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, + const int COPY) { + uint32_t c = ~crc; + + if (UNLIKELY(len == 1)) { + if (COPY) + *dst = *src; + c = __crc32b(c, *src); + return ~c; + } + + /* Align to 8-byte boundary for tail processing */ + uintptr_t align_diff = ALIGN_DIFF(src, 8); + if (align_diff) + c = crc32_armv8_align(c, &dst, &src, &len, align_diff, COPY); + + while (len >= 64) { + uint64_t d0 = *(const uint64_t *)src; + uint64_t d1 = *(const uint64_t *)(src + 8); + uint64_t d2 = *(const uint64_t *)(src + 16); + uint64_t d3 = *(const uint64_t *)(src + 24); + uint64_t d4 = *(const uint64_t *)(src + 32); + uint64_t d5 = *(const uint64_t *)(src + 40); + uint64_t d6 = *(const uint64_t *)(src + 48); + uint64_t d7 = *(const uint64_t *)(src + 56); + + if (COPY) { + memcpy(dst, &d0, 8); + memcpy(dst + 8, &d1, 8); + memcpy(dst + 16, &d2, 8); + memcpy(dst + 24, &d3, 8); + memcpy(dst + 32, &d4, 8); + memcpy(dst + 40, &d5, 8); + memcpy(dst + 48, &d6, 8); + memcpy(dst + 56, &d7, 8); + dst += 64; + } + + c = __crc32d(c, d0); + c = __crc32d(c, d1); + c = __crc32d(c, d2); + c = __crc32d(c, d3); + c = __crc32d(c, d4); + c = __crc32d(c, d5); + c = __crc32d(c, d6); + c = __crc32d(c, d7); + + src += 64; + len -= 64; + } + + return crc32_armv8_tail(c, dst, src, len, COPY); +} + +Z_INTERNAL Z_TARGET_CRC uint32_t crc32_armv8(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL Z_TARGET_CRC uint32_t crc32_copy_armv8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return crc32_copy_impl(crc, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + crc 
= crc32_armv8(crc, src, len); + memcpy(dst, src, len); + return crc; +#endif +} +#endif diff --git a/neozip/arch/arm/crc32_armv8_p.h b/neozip/arch/arm/crc32_armv8_p.h new file mode 100644 index 0000000000..e72c4c0ad1 --- /dev/null +++ b/neozip/arch/arm/crc32_armv8_p.h @@ -0,0 +1,103 @@ +/* crc32_armv8_p.h -- Private shared inline ARMv8 CRC32 functions + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CRC32_ARMV8_P_H +#define CRC32_ARMV8_P_H + +#include "zbuild.h" +#include "acle_intrins.h" + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_align(uint32_t crc, uint8_t **dst, const uint8_t **buf, + size_t *len, uintptr_t align_diff, const int COPY) { + if (*len && (align_diff & 1)) { + uint8_t val = **buf; + if (COPY) { + **dst = val; + *dst += 1; + } + crc = __crc32b(crc, val); + *buf += 1; + *len -= 1; + } + + if (*len >= 2 && (align_diff & 2)) { + uint16_t val = *((uint16_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 2); + *dst += 2; + } + crc = __crc32h(crc, val); + *buf += 2; + *len -= 2; + } + + if (*len >= 4 && (align_diff & 4)) { + uint32_t val = *((uint32_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 4); + *dst += 4; + } + crc = __crc32w(crc, val); + *buf += 4; + *len -= 4; + } + + if (*len >= 8 && (align_diff & 8)) { + uint64_t val = *((uint64_t*)*buf); + if (COPY) { + memcpy(*dst, &val, 8); + *dst += 8; + } + crc = __crc32d(crc, val); + *buf += 8; + *len -= 8; + } + + return crc; +} + +Z_FORCEINLINE static Z_TARGET_CRC uint32_t crc32_armv8_tail(uint32_t crc, uint8_t *dst, const uint8_t *buf, + size_t len, const int COPY) { + while (len >= 8) { + uint64_t val = *((uint64_t*)buf); + if (COPY) { + memcpy(dst, &val, 8); + dst += 8; + } + crc = __crc32d(crc, val); + buf += 8; + len -= 8; + } + + if (len & 4) { + uint32_t val = *((uint32_t*)buf); + if (COPY) { + memcpy(dst, &val, 4); + dst += 4; + } + crc = __crc32w(crc, val); + buf += 4; + } + + if (len & 2) { + uint16_t val = *((uint16_t*)buf); + if (COPY) { + 
memcpy(dst, &val, 2); + dst += 2; + } + crc = __crc32h(crc, val); + buf += 2; + } + + if (len & 1) { + uint8_t val = *buf; + if (COPY) + *dst = val; + crc = __crc32b(crc, val); + } + + return ~crc; +} + +#endif /* CRC32_ARMV8_P_H */ diff --git a/neozip/arch/arm/crc32_armv8_pmull_eor3.c b/neozip/arch/arm/crc32_armv8_pmull_eor3.c new file mode 100644 index 0000000000..e0d5bf043b --- /dev/null +++ b/neozip/arch/arm/crc32_armv8_pmull_eor3.c @@ -0,0 +1,366 @@ +/* crc32_armv8_pmull_eor3.c -- ARMv8 CRC32 using PMULL + EOR3 (SHA3 extension) + * Copyright (C) 2025 Peter Cawley + * https://github.com/corsix/fast-crc32 + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This uses EOR3 (3-way XOR) from ARMv8.2-A SHA3 extension to save instructions. + * Uses 3-way parallel scalar CRC + 9 PMULL vector lanes, processing 192 bytes/iter. + */ + +#ifdef ARM_PMULL_EOR3 + +#include "zbuild.h" +#include "zutil.h" +#include "acle_intrins.h" +#include "neon_intrins.h" +#include "crc32_armv8_p.h" + +/* Carryless multiply low 64 bits: a[0] * b[0] */ +static inline uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { +#ifdef _MSC_VER + return vreinterpretq_u64_p128(vmull_p64( + vget_low_p64(vreinterpret_p64_u64(a)), + vget_low_p64(vreinterpret_p64_u64(b)))); +#else + return vreinterpretq_u64_p128(vmull_p64( + vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(a)), 0), + vget_lane_p64(vreinterpret_p64_u64(vget_low_u64(b)), 0))); +#endif +} + +/* Carryless multiply high 64 bits: a[1] * b[1] */ +static inline uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { + return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b))); +} + +/* Carryless multiply of two 32-bit scalars: a * b (returns 64-bit result in 128-bit vector) */ +static inline uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { +#ifdef _MSC_VER + return vreinterpretq_u64_p128(vmull_p64(vdup_n_p64((poly64_t)a), vdup_n_p64((poly64_t)b))); +#else + return 
vreinterpretq_u64_p128(vmull_p64((poly64_t)a, (poly64_t)b)); +#endif +} + +/* Compute x^n mod P (CRC-32 polynomial) in log(n) time, where P = 0x104c11db7 */ +static uint32_t xnmodp(uint64_t n) { + uint64_t stack = ~(uint64_t)1; + uint32_t acc, low; + for (; n > 191; n = (n >> 1) - 16) { + stack = (stack << 1) + (n & 1); + } + stack = ~stack; + acc = ((uint32_t)0x80000000) >> (n & 31); + for (n >>= 5; n; --n) { + acc = __crc32w(acc, 0); + } + while ((low = stack & 1), stack >>= 1) { + poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); + uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); + acc = __crc32d(0, y << low); + } + return acc; +} + +/* Shift CRC forward by nbytes: equivalent to appending nbytes of zeros to the data stream */ +static inline uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { + Assert(nbytes >= 5, "crc_shift requires nbytes >= 5"); + return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); +} + +Z_FORCEINLINE static Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, + size_t len, const int COPY) { + uint32_t crc0 = ~crc; + + if (UNLIKELY(len == 1)) { + if (COPY) + *dst = *src; + crc0 = __crc32b(crc0, *src); + return ~crc0; + } + + /* Align to 16-byte boundary for vector path */ + uintptr_t align_diff = ALIGN_DIFF(src, 16); + if (align_diff) + crc0 = crc32_armv8_align(crc0, &dst, &src, &len, align_diff, COPY); + + /* 3-way scalar CRC + 9-way PMULL folding (192 bytes/iter) */ + if (len >= 192) { + size_t blk = len / 192; /* Number of 192-byte blocks */ + size_t klen = blk * 16; /* Scalar stride per CRC lane */ + const uint8_t *end = src + len; + const uint8_t *src0 = src; + const uint8_t *src1 = src + klen; + const uint8_t *src2 = src + klen * 2; + const uint8_t *srcv = src + klen * 3; /* Vector data starts after scalar lanes */ + uint32_t crc1 = 0, crc2 = 0; + uint64x2_t vc0, vc1, vc2; + uint64_t vc; + + /* Load first 9 vector chunks (144 bytes) */ + uint64x2_t x0 = vld1q_u64_ex((const 
uint64_t*)srcv, 128), y0; + uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128), y1; + uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128), y2; + uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128), y3; + uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128), y4; + uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128), y5; + uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128), y6; + uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128), y7; + uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128), y8; + uint64x2_t k; + /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */ + { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); } + + /* Per-region dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + uint8_t *dst_v = NULL; + + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + dst_v = dst + klen * 3; + vst1q_u8(dst_v, vreinterpretq_u8_u64(x0)); + vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(x1)); + vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(x2)); + vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(x3)); + vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(x4)); + vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(x5)); + vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(x6)); + vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(x7)); + vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(x8)); + dst_v += 144; + } + srcv += 144; + + /* Fold 9 vectors + 3-way parallel scalar CRC */ + if (blk > 1) { + /* Only form a limit pointer when we have at least 2 blocks. 
*/ + const uint8_t *limit = src0 + klen - 32; + while (src0 <= limit) { + /* Fold all 9 vector lanes using PMULL */ + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); + y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); + y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); + y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); + + /* EOR3: combine hi*k, lo*k, and new data in one instruction */ + { + uint64x2_t d0 = vld1q_u64_ex((const uint64_t*)srcv, 128); + uint64x2_t d1 = vld1q_u64_ex((const uint64_t*)(srcv + 16), 128); + uint64x2_t d2 = vld1q_u64_ex((const uint64_t*)(srcv + 32), 128); + uint64x2_t d3 = vld1q_u64_ex((const uint64_t*)(srcv + 48), 128); + uint64x2_t d4 = vld1q_u64_ex((const uint64_t*)(srcv + 64), 128); + uint64x2_t d5 = vld1q_u64_ex((const uint64_t*)(srcv + 80), 128); + uint64x2_t d6 = vld1q_u64_ex((const uint64_t*)(srcv + 96), 128); + uint64x2_t d7 = vld1q_u64_ex((const uint64_t*)(srcv + 112), 128); + uint64x2_t d8 = vld1q_u64_ex((const uint64_t*)(srcv + 128), 128); + if (COPY) { + vst1q_u8(dst_v, vreinterpretq_u8_u64(d0)); + vst1q_u8(dst_v + 16, vreinterpretq_u8_u64(d1)); + vst1q_u8(dst_v + 32, vreinterpretq_u8_u64(d2)); + vst1q_u8(dst_v + 48, vreinterpretq_u8_u64(d3)); + vst1q_u8(dst_v + 64, vreinterpretq_u8_u64(d4)); + vst1q_u8(dst_v + 80, vreinterpretq_u8_u64(d5)); + vst1q_u8(dst_v + 96, vreinterpretq_u8_u64(d6)); + vst1q_u8(dst_v + 112, vreinterpretq_u8_u64(d7)); + vst1q_u8(dst_v + 128, vreinterpretq_u8_u64(d8)); + dst_v += 144; + } + x0 = veor3q_u64(x0, y0, d0); + x1 = veor3q_u64(x1, y1, d1); + x2 = veor3q_u64(x2, y2, d2); + x3 = veor3q_u64(x3, y3, d3); + x4 = veor3q_u64(x4, y4, d4); + x5 = veor3q_u64(x5, y5, d5); + x6 = veor3q_u64(x6, y6, d6); + x7 = veor3q_u64(x7, y7, d7); + x8 = veor3q_u64(x8, y8, d8); + } + + /* 3-way parallel scalar CRC 
(16 bytes each) */ + { + uint64_t s0a = *(const uint64_t*)src0; + uint64_t s0b = *(const uint64_t*)(src0 + 8); + uint64_t s1a = *(const uint64_t*)src1; + uint64_t s1b = *(const uint64_t*)(src1 + 8); + uint64_t s2a = *(const uint64_t*)src2; + uint64_t s2b = *(const uint64_t*)(src2 + 8); + if (COPY) { + memcpy(dst0, &s0a, 8); + memcpy(dst0 + 8, &s0b, 8); + dst0 += 16; + memcpy(dst1, &s1a, 8); + memcpy(dst1 + 8, &s1b, 8); + dst1 += 16; + memcpy(dst2, &s2a, 8); + memcpy(dst2 + 8, &s2b, 8); + dst2 += 16; + } + crc0 = __crc32d(crc0, s0a); + crc0 = __crc32d(crc0, s0b); + crc1 = __crc32d(crc1, s1a); + crc1 = __crc32d(crc1, s1b); + crc2 = __crc32d(crc2, s2a); + crc2 = __crc32d(crc2, s2b); + } + src0 += 16; + src1 += 16; + src2 += 16; + srcv += 144; + } + } + + /* Reduce 9 vectors to 1 using tree reduction */ + /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */ + { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; + + /* Step 2: fold pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) */ + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + /* Step 3: fold pairs (x0,x2), (x4,x6) */ + { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + /* Step 4: final fold (x0, x4) -> x0 */ + { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); } + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = veor3q_u64(x0, y0, 
x4); + + /* Process final scalar chunk */ + { + uint64_t s0a = *(const uint64_t*)src0; + uint64_t s0b = *(const uint64_t*)(src0 + 8); + uint64_t s1a = *(const uint64_t*)src1; + uint64_t s1b = *(const uint64_t*)(src1 + 8); + uint64_t s2a = *(const uint64_t*)src2; + uint64_t s2b = *(const uint64_t*)(src2 + 8); + if (COPY) { + memcpy(dst0, &s0a, 8); + memcpy(dst0 + 8, &s0b, 8); + memcpy(dst1, &s1a, 8); + memcpy(dst1 + 8, &s1b, 8); + memcpy(dst2, &s2a, 8); + memcpy(dst2 + 8, &s2b, 8); + } + crc0 = __crc32d(crc0, s0a); + crc0 = __crc32d(crc0, s0b); + crc1 = __crc32d(crc1, s1a); + crc1 = __crc32d(crc1, s1b); + crc2 = __crc32d(crc2, s2a); + crc2 = __crc32d(crc2, s2b); + } + + /* Shift and combine 3 scalar CRCs */ + vc0 = crc_shift(crc0, klen * 2 + blk * 144); + vc1 = crc_shift(crc1, klen + blk * 144); + vc2 = crc_shift(crc2, blk * 144); + vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + /* Final reduction: 128-bit vector + scalar CRCs -> 32-bit */ + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + if (COPY) + dst += blk * 192; + src = srcv; + len = end - srcv; + } + + /* 3-way scalar CRC (24 bytes/iter) */ + if (len >= 80) { + size_t klen = ((len - 8) / 24) * 8; /* Stride for 3-way parallel */ + const uint8_t *buf0 = src; + const uint8_t *buf1 = src + klen; + const uint8_t *buf2 = src + klen * 2; + uint32_t crc1 = 0, crc2 = 0; + uint64x2_t vc0, vc1; + uint64_t vc; + + /* Per-lane dst pointers */ + uint8_t *dst0 = dst; + uint8_t *dst1 = NULL; + uint8_t *dst2 = NULL; + if (COPY) { + dst1 = dst + klen; + dst2 = dst + klen * 2; + } + + /* 3-way parallel scalar CRC */ + do { + uint64_t v0 = *(const uint64_t*)buf0; + uint64_t v1 = *(const uint64_t*)buf1; + uint64_t v2 = *(const uint64_t*)buf2; + if (COPY) { + memcpy(dst0, &v0, 8); + dst0 += 8; + memcpy(dst1, &v1, 8); + dst1 += 8; + memcpy(dst2, &v2, 8); + dst2 += 8; + } + crc0 = __crc32d(crc0, v0); + crc1 = __crc32d(crc1, v1); + crc2 = __crc32d(crc2, v2); + buf0 += 8; + 
buf1 += 8; + buf2 += 8; + len -= 24; + } while (len >= 32); + + /* Combine the 3 CRCs */ + vc0 = crc_shift(crc0, klen * 2 + 8); + vc1 = crc_shift(crc1, klen + 8); + vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + /* Process final 8 bytes with combined CRC */ + crc0 = crc2; + { + uint64_t vf = *(const uint64_t*)buf2; + if (COPY) + memcpy(dst2, &vf, 8); + crc0 = __crc32d(crc0, vf ^ vc); + } + src = buf2 + 8; + len -= 8; + if (COPY) + dst = dst2 + 8; + } + + /* Process remaining bytes */ + return crc32_armv8_tail(crc0, dst, src, len, COPY); +} + +Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_copy_armv8_pmull_eor3(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { +#if OPTIMAL_CMP >= 32 + return crc32_copy_impl(crc, dst, src, len, 1); +#else + /* Without unaligned access, interleaved stores get decomposed into byte ops */ + crc = crc32_armv8_pmull_eor3(crc, src, len); + memcpy(dst, src, len); + return crc; +#endif +} +#endif diff --git a/neozip/arch/arm/neon_intrins.h b/neozip/arch/arm/neon_intrins.h new file mode 100644 index 0000000000..449916e0b7 --- /dev/null +++ b/neozip/arch/arm/neon_intrins.h @@ -0,0 +1,79 @@ +#ifndef ARM_NEON_INTRINS_H +#define ARM_NEON_INTRINS_H + +#if defined(_MSC_VER) && defined(ARCH_ARM) && defined(ARCH_64BIT) +/* arm64_neon.h is MSVC specific */ +# include <arm64_neon.h> +#else +# include <arm_neon.h> +#endif + +#if defined(ARM_NEON) && defined(ARCH_ARM) && defined(ARCH_32BIT) +/* Compatibility shim for the _high family of functions */ +#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b)) +#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c)) +#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c)) +#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b)) +#endif + +#ifdef ARM_NEON + +#define 
vqsubq_u16_x4_x1(out, a, b) do { \ + out.val[0] = vqsubq_u16(a.val[0], b); \ + out.val[1] = vqsubq_u16(a.val[1], b); \ + out.val[2] = vqsubq_u16(a.val[2], b); \ + out.val[3] = vqsubq_u16(a.val[3], b); \ +} while (0) + +# if defined(ARCH_ARM) && defined(ARCH_32BIT) && defined(__clang__) && \ + (!defined(__clang_major__) || __clang_major__ < 20) +/* Clang versions before 20 have too strict of an + * alignment requirement (:256) for x4 NEON intrinsics */ +# undef ARM_NEON_HASLD4 +# undef vld1q_u16_x4 +# undef vld1q_u8_x4 +# undef vst1q_u16_x4 +# endif + +# ifndef ARM_NEON_HASLD4 + +static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) { + uint16x8x4_t ret; + ret.val[0] = vld1q_u16(a); + ret.val[1] = vld1q_u16(a+8); + ret.val[2] = vld1q_u16(a+16); + ret.val[3] = vld1q_u16(a+24); + return ret; +} + +static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) { + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(a); + ret.val[1] = vld1q_u8(a+16); + ret.val[2] = vld1q_u8(a+32); + ret.val[3] = vld1q_u8(a+48); + return ret; +} + +static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { + vst1q_u16(p, a.val[0]); + vst1q_u16(p + 8, a.val[1]); + vst1q_u16(p + 16, a.val[2]); + vst1q_u16(p + 24, a.val[3]); +} +# endif // HASLD4 check + +# ifndef _MSC_VER +# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8)) +# endif +# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4) +# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8)) +# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a) +# endif + +#endif + +#endif // include guard ARM_NEON_INTRINS_H diff --git a/neozip/arch/arm/slide_hash_armv6.c b/neozip/arch/arm/slide_hash_armv6.c new file mode 100644 index 0000000000..b241e6c5e6 --- /dev/null +++ 
b/neozip/arch/arm/slide_hash_armv6.c @@ -0,0 +1,49 @@ +/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions + * Copyright (C) 2023 Cameron Cawley + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_SIMD + +#include "zbuild.h" +#include "acle_intrins.h" +#include "deflate.h" + +/* SIMD version of hash_chain rebase */ +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + Z_REGISTER uint16x2_t v; + uint16x2_t p0, p1, p2, p3; + Z_REGISTER size_t n; + + size_t size = entries*sizeof(table[0]); + Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err"); + + Assert(sizeof(Pos) == 2, "Wrong Pos size"); + v = wsize | (wsize << 16); + + n = size / (sizeof(uint16x2_t) * 4); + do { + p0 = *((const uint16x2_t *)(table)); + p1 = *((const uint16x2_t *)(table+2)); + p2 = *((const uint16x2_t *)(table+4)); + p3 = *((const uint16x2_t *)(table+6)); + p0 = __uqsub16(p0, v); + p1 = __uqsub16(p1, v); + p2 = __uqsub16(p2, v); + p3 = __uqsub16(p3, v); + *((uint16x2_t *)(table)) = p0; + *((uint16x2_t *)(table+2)) = p1; + *((uint16x2_t *)(table+4)) = p2; + *((uint16x2_t *)(table+6)) = p3; + table += 8; + } while (--n); +} + +Z_INTERNAL void slide_hash_armv6(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} +#endif diff --git a/neozip/arch/arm/slide_hash_neon.c b/neozip/arch/arm/slide_hash_neon.c new file mode 100644 index 0000000000..2f9e94a33d --- /dev/null +++ b/neozip/arch/arm/slide_hash_neon.c @@ -0,0 +1,48 @@ +/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions + * Copyright (C) 2017-2020 Mika T. Lindqvist + * + * Authors: + * Mika T. 
Lindqvist <postmaster@raasu.org> + * Jun He <jun.he@arm.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef ARM_NEON + +#include "zbuild.h" +#include "neon_intrins.h" +#include "deflate.h" + +/* SIMD version of hash_chain rebase */ +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + Z_REGISTER uint16x8_t v; + uint16x8x4_t p0, p1; + Z_REGISTER size_t n; + + size_t size = entries*sizeof(table[0]); + Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); + + Assert(sizeof(Pos) == 2, "Wrong Pos size"); + v = vdupq_n_u16(wsize); + + n = size / (sizeof(uint16x8_t) * 8); + do { + p0 = vld1q_u16_x4_ex(table, 256); + p1 = vld1q_u16_x4_ex(table+32, 256); + vqsubq_u16_x4_x1(p0, p0, v); + vqsubq_u16_x4_x1(p1, p1, v); + vst1q_u16_x4_ex(table, p0, 256); + vst1q_u16_x4_ex(table+32, p1, 256); + table += 64; + } while (--n); +} + +Z_INTERNAL void slide_hash_neon(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} +#endif diff --git a/neozip/arch/generic/Makefile.in b/neozip/arch/generic/Makefile.in new file mode 100644 index 0000000000..1d9cc4df5b --- /dev/null +++ b/neozip/arch/generic/Makefile.in @@ -0,0 +1,68 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= + +SRCDIR=. +SRCTOP=../.. 
+TOPDIR=$(SRCTOP) + +all: \ + adler32_c.o adler32_c.lo \ + chunkset_c.o chunkset_c.lo \ + compare256_c.o compare256_c.lo \ + crc32_braid_c.o crc32_braid_c.lo \ + crc32_chorba_c.o crc32_chorba_c.lo \ + slide_hash_c.o slide_hash_c.lo + + +adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +adler32_c.lo: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c + +chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c + +compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c + +crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c + +crc32_chorba_c.lo: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(SFLAGS) $(INCLUDES) 
-c -o $@ $(SRCDIR)/crc32_chorba_c.c + +slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c + +slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c + + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/generic/adler32_c.c b/neozip/arch/generic/adler32_c.c new file mode 100644 index 0000000000..84c946f452 --- /dev/null +++ b/neozip/arch/generic/adler32_c.c @@ -0,0 +1,55 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "functable.h" +#include "adler32_p.h" + +Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t sum2; + unsigned n; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); + + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; +#ifdef UNROLL_MORE + n = NMAX / 16; /* NMAX is divisible by 16 */ +#else + n = NMAX / 8; /* NMAX is divisible by 8 */ +#endif + do { +#ifdef UNROLL_MORE + ADLER_DO16(adler, sum2, buf); /* 16 sums unrolled */ + buf += 16; +#else + ADLER_DO8(adler, sum2, buf, 0); /* 8 sums unrolled */ + buf += 8; +#endif + } while (--n); + adler %= BASE; + sum2 %= BASE; + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ + return 
adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0); +} + +Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler = FUNCTABLE_CALL(adler32)(adler, src, len); + memcpy(dst, src, len); + return adler; +} diff --git a/neozip/arch/generic/chunk_128bit_perm_idx_lut.h b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h new file mode 100644 index 0000000000..6e5098bf26 --- /dev/null +++ b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h @@ -0,0 +1,26 @@ +/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation idx lut for use with chunkmemset family of functions. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_ +#define CHUNK_128BIT_PERM_IDX_LUT_H_ + +#include "chunk_permute_table.h" + +static const lut_rem_pair perm_idx_lut[13] = { + {0, 1}, /* 3 */ + {0, 0}, /* don't care */ + {1 * 32, 1}, /* 5 */ + {2 * 32, 4}, /* 6 */ + {3 * 32, 2}, /* 7 */ + {0 * 32, 0}, /* don't care */ + {4 * 32, 7}, /* 9 */ + {5 * 32, 6}, /* 10 */ + {6 * 32, 5}, /* 11 */ + {7 * 32, 4}, /* 12 */ + {8 * 32, 3}, /* 13 */ + {9 * 32, 2}, /* 14 */ + {10 * 32, 1},/* 15 */ +}; + +#endif diff --git a/neozip/arch/generic/chunk_256bit_perm_idx_lut.h b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h new file mode 100644 index 0000000000..796a7df120 --- /dev/null +++ b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h @@ -0,0 +1,47 @@ +/* chunk_256bit_perm_idx_lut.h - shared AVX512/AVX2/LASX permutation idx lut for use with chunkmemset family of functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef CHUNK_256BIT_PERM_IDX_LUT_H_ +#define CHUNK_256BIT_PERM_IDX_LUT_H_ + +#include "chunk_permute_table.h" + +/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can + * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */ +static const lut_rem_pair perm_idx_lut[29] = { + { 0, 2}, /* 3 */ + { 0, 0}, /* don't care */ + { 1 * 32, 2}, /* 5 */ + { 2 * 32, 2}, /* 6 */ + { 3 * 32, 4}, /* 7 */ + { 0 * 32, 0}, /* don't care */ + { 4 * 32, 5}, /* 9 */ + { 5 * 32, 22}, /* 10 */ + { 6 * 32, 21}, /* 11 */ + { 7 * 32, 20}, /* 12 */ + { 8 * 32, 6}, /* 13 */ + { 9 * 32, 4}, /* 14 */ + {10 * 32, 2}, /* 15 */ + { 0 * 32, 0}, /* don't care */ + {11 * 32, 15}, /* 17 */ + {11 * 32 + 16, 14}, /* 18 */ + {11 * 32 + 16 * 2, 13}, /* 19 */ + {11 * 32 + 16 * 3, 12}, /* 20 */ + {11 * 32 + 16 * 4, 11}, /* 21 */ + {11 * 32 + 16 * 5, 10}, /* 22 */ + {11 * 32 + 16 * 6, 9}, /* 23 */ + {11 * 32 + 16 * 7, 8}, /* 24 */ + {11 * 32 + 16 * 8, 7}, /* 25 */ + {11 * 32 + 16 * 9, 6}, /* 26 */ + {11 * 32 + 16 * 10, 5}, /* 27 */ + {11 * 32 + 16 * 11, 4}, /* 28 */ + {11 * 32 + 16 * 12, 3}, /* 29 */ + {11 * 32 + 16 * 13, 2}, /* 30 */ + {11 * 32 + 16 * 14, 1} /* 31 */ +}; + +static const uint16_t half_rem_vals[13] = { + 1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1 +}; + +#endif diff --git a/neozip/arch/generic/chunk_permute_table.h b/neozip/arch/generic/chunk_permute_table.h new file mode 100644 index 0000000000..bad66ccc77 --- /dev/null +++ b/neozip/arch/generic/chunk_permute_table.h @@ -0,0 +1,53 @@ +/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_PERMUTE_TABLE_H_ +#define CHUNK_PERMUTE_TABLE_H_ + +#include "zbuild.h" + +/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ +static const ALIGNED_(32) uint8_t permute_table[26*32] = { + 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ + 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ + + /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute + * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual + * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, + * we'll use absolute indexing here to derive a blend vector. 
This is actually a lot simpler with ARM's TBL, but, + * this is what we're dealt. + */ + + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ + 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ + 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ + 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ + 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ + 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ + 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ + 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ +}; + +typedef struct lut_rem_pair_s { + uint16_t idx; + uint16_t remval; +} lut_rem_pair; + +#endif diff --git a/neozip/arch/generic/chunkset_c.c b/neozip/arch/generic/chunkset_c.c new file mode 100644 index 0000000000..ff9b1cb5fb --- /dev/null +++ b/neozip/arch/generic/chunkset_c.c @@ -0,0 +1,40 @@ +/* chunkset.c -- inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zmemory.h" + +typedef uint64_t chunk_t; + +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + uint32_t tmp = zng_memread_4(from); + *chunk = tmp | ((chunk_t)tmp << 32); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = zng_memread_8(from); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = zng_memread_8(s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + zng_memwrite_8(out, *chunk); +} + +#define CHUNKSIZE chunksize_c +#define CHUNKCOPY chunkcopy_c +#define CHUNKUNROLL chunkunroll_c +#define CHUNKMEMSET chunkmemset_c +#define CHUNKMEMSET_SAFE chunkmemset_safe_c + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_c + +#include "inffast_tpl.h" diff --git a/neozip/arch/generic/compare256_c.c b/neozip/arch/generic/compare256_c.c new file mode 100644 index 0000000000..6934a55565 --- /dev/null +++ b/neozip/arch/generic/compare256_c.c @@ -0,0 +1,88 @@ +/* compare256.c -- 256 byte memory comparison with match length return + * Copyright (C) 2020 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "deflate.h" +#include "fallback_builtins.h" + +/* 8-bit integer comparison for hardware without unaligned loads */ +static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + if (src0[0] != src1[0]) + return len; + if (src0[1] != src1[1]) + return len + 1; + if (src0[2] != src1[2]) + return len + 2; + if (src0[3] != src1[3]) + return len + 3; + if (src0[4] != src1[4]) + return len + 4; + if (src0[5] != src1[5]) + return len + 5; + if (src0[6] != src1[6]) + return len + 6; + if (src0[7] != src1[7]) + return len + 7; + src0 += 8, src1 += 8, 
len += 8; + } while (len < 256); + + return 256; +} + +/* 64-bit integer comparison for hardware with unaligned loads */ +static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + uint64_t sv = zng_memread_8(src0); + uint64_t mv = zng_memread_8(src1); + uint64_t diff = sv ^ mv; + if (diff) + return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8; + src0 += 8, src1 += 8, len += 8; + + sv = zng_memread_8(src0); + mv = zng_memread_8(src1); + diff = sv ^ mv; + if (diff) + return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8; + src0 += 8, src1 += 8, len += 8; + } while (len < 256); + + return 256; +} + +#if OPTIMAL_CMP == 8 +# define COMPARE256 compare256_8_static +#else +# define COMPARE256 compare256_64_static +#endif + +#ifdef WITH_ALL_FALLBACKS +Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) { + return compare256_8_static(src0, src1); +} + +Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) { + return compare256_64_static(src0, src1); +} +#endif + +Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) { + return COMPARE256(src0, src1); +} + +// Generate longest_match_c +#define LONGEST_MATCH longest_match_c +#include "match_tpl.h" + +// Generate longest_match_slow_c +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_c +#include "match_tpl.h" diff --git a/neozip/arch/generic/compare256_p.h b/neozip/arch/generic/compare256_p.h new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/neozip/arch/generic/compare256_p.h diff --git a/neozip/arch/generic/crc32_braid_c.c b/neozip/arch/generic/crc32_braid_c.c new file mode 100644 index 0000000000..bda4a249bb --- /dev/null +++ b/neozip/arch/generic/crc32_braid_c.c @@ -0,0 +1,213 @@ +/* crc32_braid.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2022 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This interleaved 
implementation of a CRC makes use of pipelined multiple + * arithmetic-logic units, commonly found in modern CPU cores. It is due to + * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. + */ + +#include "zbuild.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include "crc32_p.h" + +/* + A CRC of a message is computed on BRAID_N braids of words in the message, where + each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then + three running sparse CRCs are calculated respectively on each braid, at these + indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... + This is done starting at a word boundary, and continues until as many blocks of + BRAID_N * BRAID_W bytes as are available have been processed. The results are + combined into a single CRC at the end. For this code, BRAID_N must be in the + range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased + if desired by adding more #if blocks, extending the patterns apparent in the code. + In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N + value is increased. + + BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time + on a given processor. The choices for BRAID_N and BRAID_W below were based on + testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC + POWER9, and MIPS64 Octeon II processors. + The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8. + The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4. + They were all tested with either gcc or clang, all using the -O3 optimization + level. Your mileage may vary. +*/ + +/* ========================================================================= */ +#ifdef BRAID_W +/* + Return the CRC of the BRAID_W bytes in the word_t data, taking the + least-significant byte of the word as the first byte of data, without any pre + or post conditioning. 
This is used to combine the CRCs of each braid. + */ +# if BYTE_ORDER == LITTLE_ENDIAN +static uint32_t crc_word(z_word_t data) { + int k; + for (k = 0; k < BRAID_W; k++) + data = (data >> 8) ^ crc_table[data & 0xff]; + return (uint32_t)data; +} +# elif BYTE_ORDER == BIG_ENDIAN +static z_word_t crc_word(z_word_t data) { + int k; + for (k = 0; k < BRAID_W; k++) + data = (data << 8) ^ + crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff]; + return data; +} +# endif /* BYTE_ORDER */ +#endif /* BRAID_W */ + +/* ========================================================================= */ +Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len) { + crc = ~crc; + +#ifdef BRAID_W + /* If provided enough bytes, do a braided CRC calculation. */ + if (len >= BRAID_N * BRAID_W + BRAID_W - 1) { + size_t blks; + z_word_t const *words; + int k; + + /* Compute the CRC up to a z_word_t boundary. */ + size_t align_diff = (size_t)MIN(ALIGN_DIFF(buf, BRAID_W), len); + if (align_diff) { + crc = crc32_copy_small(crc, NULL, buf, align_diff, BRAID_W - 1, 0); + len -= align_diff; + buf += align_diff; + } + + /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */ + blks = len / (BRAID_N * BRAID_W); + len -= blks * BRAID_N * BRAID_W; + words = (z_word_t const *)buf; + + z_word_t crc0, word0, comb; +#if BRAID_N > 1 + z_word_t crc1, word1; +#if BRAID_N > 2 + z_word_t crc2, word2; +#if BRAID_N > 3 + z_word_t crc3, word3; +#if BRAID_N > 4 + z_word_t crc4, word4; +#if BRAID_N > 5 + z_word_t crc5, word5; +#endif +#endif +#endif +#endif +#endif + /* Initialize the CRC for each braid. */ + crc0 = Z_WORD_FROM_LE(crc); +#if BRAID_N > 1 + crc1 = 0; +#if BRAID_N > 2 + crc2 = 0; +#if BRAID_N > 3 + crc3 = 0; +#if BRAID_N > 4 + crc4 = 0; +#if BRAID_N > 5 + crc5 = 0; +#endif +#endif +#endif +#endif +#endif + /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */ + while (--blks) { + /* Load the word for each braid into registers. 
*/ + word0 = crc0 ^ words[0]; +#if BRAID_N > 1 + word1 = crc1 ^ words[1]; +#if BRAID_N > 2 + word2 = crc2 ^ words[2]; +#if BRAID_N > 3 + word3 = crc3 ^ words[3]; +#if BRAID_N > 4 + word4 = crc4 ^ words[4]; +#if BRAID_N > 5 + word5 = crc5 ^ words[5]; +#endif +#endif +#endif +#endif +#endif + words += BRAID_N; + + /* Compute and update the CRC for each word. The loop should get unrolled. */ + crc0 = BRAID_TABLE[0][word0 & 0xff]; +#if BRAID_N > 1 + crc1 = BRAID_TABLE[0][word1 & 0xff]; +#if BRAID_N > 2 + crc2 = BRAID_TABLE[0][word2 & 0xff]; +#if BRAID_N > 3 + crc3 = BRAID_TABLE[0][word3 & 0xff]; +#if BRAID_N > 4 + crc4 = BRAID_TABLE[0][word4 & 0xff]; +#if BRAID_N > 5 + crc5 = BRAID_TABLE[0][word5 & 0xff]; +#endif +#endif +#endif +#endif +#endif + for (k = 1; k < BRAID_W; k++) { + crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff]; +#if BRAID_N > 1 + crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff]; +#if BRAID_N > 2 + crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff]; +#if BRAID_N > 3 + crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff]; +#if BRAID_N > 4 + crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff]; +#if BRAID_N > 5 + crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff]; +#endif +#endif +#endif +#endif +#endif + } + } + + /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */ + comb = crc_word(crc0 ^ words[0]); +#if BRAID_N > 1 + comb = crc_word(crc1 ^ words[1] ^ comb); +#if BRAID_N > 2 + comb = crc_word(crc2 ^ words[2] ^ comb); +#if BRAID_N > 3 + comb = crc_word(crc3 ^ words[3] ^ comb); +#if BRAID_N > 4 + comb = crc_word(crc4 ^ words[4] ^ comb); +#if BRAID_N > 5 + comb = crc_word(crc5 ^ words[5] ^ comb); +#endif +#endif +#endif +#endif +#endif + words += BRAID_N; + Assert(comb <= UINT32_MAX, "comb should fit in uint32_t"); + crc = (uint32_t)Z_WORD_FROM_LE(comb); + + /* Update the pointer to the remaining bytes to process. 
*/ + buf = (const unsigned char *)words; + } + +#endif /* BRAID_W */ + + /* Complete the computation of the CRC on any remaining bytes. */ + return ~crc32_copy_small(crc, NULL, buf, len, (BRAID_N * BRAID_W) - 1, 0); +} + +Z_INTERNAL uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_braid(crc, src, len); + memcpy(dst, src, len); + return crc; +} diff --git a/neozip/arch/generic/crc32_chorba_c.c b/neozip/arch/generic/crc32_chorba_c.c new file mode 100644 index 0000000000..693972da11 --- /dev/null +++ b/neozip/arch/generic/crc32_chorba_c.c @@ -0,0 +1,1275 @@ +#include "zbuild.h" +#include "zendian.h" +#if defined(__EMSCRIPTEN__) +# include "zutil_p.h" +#endif +#include "zmemory.h" +#include "crc32_chorba_p.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include "generic_functions.h" + +/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */ +#define bitbuffer_size_bytes (16 * 1024 * sizeof(chorba_word_t)) +#define bitbuffer_size_zwords (bitbuffer_size_bytes / sizeof(chorba_word_t)) +#define bitbuffer_size_qwords (bitbuffer_size_bytes / sizeof(uint64_t)) + +#if defined(HAVE_MAY_ALIAS) && CHORBA_W != 8 + typedef uint64_t __attribute__ ((__may_alias__)) uint64a_t; +#else + typedef uint64_t uint64a_t; +#endif + +/** + * Implements the Chorba algorithm for CRC32 computation (https://arxiv.org/abs/2412.16398). + * + * This implementation processes data in three phases: + * 1. Initial pass: Zeros out bitbuffer + * 2. Intermediate pass: Processes half the values + * 3. 
Main pass: Processes remaining data + * + * @param crc Initial CRC value + * @param input Input data buffer + * @param len Length of input data + * @return Computed CRC32 value + * + * @note Requires minimum input size of 118960 + 512 bytes + * @note Uses 128KB temporary buffer + */ +Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) { +#if defined(__EMSCRIPTEN__) + chorba_word_t *bitbuffer = (chorba_word_t*)zng_alloc(bitbuffer_size_bytes); +#else + ALIGNED_(16) chorba_word_t bitbuffer[bitbuffer_size_zwords]; +#endif + const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer; + uint64a_t *bitbuffer_qwords = (uint64a_t*)bitbuffer; + /* The calling function ensured that this is aligned correctly */ + const chorba_word_t* input = (const chorba_word_t*)buf; + const uint64a_t* input_qwords = (const uint64a_t*)buf; + + size_t i = 0; + + chorba_word_t next1 = CHORBA_WORD_FROM_LE(~crc); + + chorba_word_t next2 = 0; + chorba_word_t next3 = 0; + chorba_word_t next4 = 0; + chorba_word_t next5 = 0; + chorba_word_t next6 = 0; + chorba_word_t next7 = 0; + chorba_word_t next8 = 0; + chorba_word_t next9 = 0; + chorba_word_t next10 = 0; + chorba_word_t next11 = 0; + chorba_word_t next12 = 0; + chorba_word_t next13 = 0; + chorba_word_t next14 = 0; + chorba_word_t next15 = 0; + chorba_word_t next16 = 0; + chorba_word_t next17 = 0; + chorba_word_t next18 = 0; + chorba_word_t next19 = 0; + chorba_word_t next20 = 0; + chorba_word_t next21 = 0; + chorba_word_t next22 = 0; + crc = 0; + + // do a first pass to zero out bitbuffer + for (; i < (14848 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) { + chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8; + chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16; + chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24; + chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32; + int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords; + 
int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords; + + in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1; + in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2; + in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3; + in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4; + in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5; + in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6; + in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7; + in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1; + in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2; + in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3; + in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4; + in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5; + in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6; + in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7; + in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8; + in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9; + in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10; + in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11; + in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12; + in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13; + in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14; + in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15; + in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16; + in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17; + in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18; + in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19; + in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20; + in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21; + in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22; + in30 = input[i / sizeof(chorba_word_t) + 29] ^ 
in8 ^ in19 ^ in23; + in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24; + in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25; + + next1 = in11 ^ in22 ^ in26; + next2 = in12 ^ in23 ^ in27; + next3 = in13 ^ in24 ^ in28; + next4 = in14 ^ in25 ^ in29; + next5 = in15 ^ in26 ^ in30; + next6 = in16 ^ in27 ^ in31; + next7 = in17 ^ in28 ^ in32; + next8 = in18 ^ in29; + next9 = in19 ^ in30; + next10 = in20 ^ in31; + next11 = in21 ^ in32; + next12 = in22; + next13 = in23; + next14 = in24; + next15 = in25; + next16 = in26; + next17 = in27; + next18 = in28; + next19 = in29; + next20 = in30; + next21 = in31; + next22 = in32; + + bitbuffer[out_offset1 + 22] = in1; + bitbuffer[out_offset1 + 23] = in2; + bitbuffer[out_offset1 + 24] = in3; + bitbuffer[out_offset1 + 25] = in4; + bitbuffer[out_offset1 + 26] = in5; + bitbuffer[out_offset1 + 27] = in6; + bitbuffer[out_offset1 + 28] = in7; + bitbuffer[out_offset1 + 29] = in8; + bitbuffer[out_offset1 + 30] = in9; + bitbuffer[out_offset1 + 31] = in10; + bitbuffer[out_offset2 + 0] = in11; + bitbuffer[out_offset2 + 1] = in12; + bitbuffer[out_offset2 + 2] = in13; + bitbuffer[out_offset2 + 3] = in14; + bitbuffer[out_offset2 + 4] = in15; + bitbuffer[out_offset2 + 5] = in16; + bitbuffer[out_offset2 + 6] = in17; + bitbuffer[out_offset2 + 7] = in18; + bitbuffer[out_offset2 + 8] = in19; + bitbuffer[out_offset2 + 9] = in20; + bitbuffer[out_offset2 + 10] = in21; + bitbuffer[out_offset2 + 11] = in22; + bitbuffer[out_offset2 + 12] = in23; + bitbuffer[out_offset2 + 13] = in24; + bitbuffer[out_offset2 + 14] = in25; + bitbuffer[out_offset2 + 15] = in26; + bitbuffer[out_offset2 + 16] = in27; + bitbuffer[out_offset2 + 17] = in28; + bitbuffer[out_offset2 + 18] = in29; + bitbuffer[out_offset2 + 19] = in30; + bitbuffer[out_offset2 + 20] = in31; + bitbuffer[out_offset2 + 21] = in32; + } + + // one intermediate pass where we pull half the values + for (; i < (14880 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) { + 
chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8; + chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16; + chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24; + chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32; + int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords; + int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords; + int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords; + + in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1; + in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2; + in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3; + in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4; + in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5; + in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6; + in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7; + in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1; + in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2; + in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3; + in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4; + in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5; + in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6; + in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7; + in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8; + in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9; + in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10; + in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11; + in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12; + in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13; + in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14; + in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15; + in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22]; + in24 = input[i / 
sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23]; + in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24]; + in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25]; + in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26]; + in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27]; + in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28]; + in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29]; + in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30]; + in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31]; + + next1 = in11 ^ in22 ^ in26; + next2 = in12 ^ in23 ^ in27; + next3 = in13 ^ in24 ^ in28; + next4 = in14 ^ in25 ^ in29; + next5 = in15 ^ in26 ^ in30; + next6 = in16 ^ in27 ^ in31; + next7 = in17 ^ in28 ^ in32; + next8 = in18 ^ in29; + next9 = in19 ^ in30; + next10 = in20 ^ in31; + next11 = in21 ^ in32; + next12 = in22; + next13 = in23; + next14 = in24; + next15 = in25; + next16 = in26; + next17 = in27; + next18 = in28; + next19 = in29; + next20 = in30; + next21 = in31; + next22 = in32; + + bitbuffer[out_offset1 + 22] = in1; + bitbuffer[out_offset1 + 23] = in2; + bitbuffer[out_offset1 + 24] = in3; + bitbuffer[out_offset1 + 25] = in4; + bitbuffer[out_offset1 + 26] = in5; + bitbuffer[out_offset1 + 27] = in6; + bitbuffer[out_offset1 + 28] = in7; + bitbuffer[out_offset1 + 29] = in8; + bitbuffer[out_offset1 + 30] = in9; + bitbuffer[out_offset1 + 31] = in10; + bitbuffer[out_offset2 + 0] = in11; + bitbuffer[out_offset2 + 1] = in12; + bitbuffer[out_offset2 + 2] = in13; + bitbuffer[out_offset2 + 3] = in14; + bitbuffer[out_offset2 + 4] = in15; + bitbuffer[out_offset2 + 5] = in16; + bitbuffer[out_offset2 + 6] = in17; + bitbuffer[out_offset2 + 7] 
= in18; + bitbuffer[out_offset2 + 8] = in19; + bitbuffer[out_offset2 + 9] = in20; + bitbuffer[out_offset2 + 10] = in21; + bitbuffer[out_offset2 + 11] = in22; + bitbuffer[out_offset2 + 12] = in23; + bitbuffer[out_offset2 + 13] = in24; + bitbuffer[out_offset2 + 14] = in25; + bitbuffer[out_offset2 + 15] = in26; + bitbuffer[out_offset2 + 16] = in27; + bitbuffer[out_offset2 + 17] = in28; + bitbuffer[out_offset2 + 18] = in29; + bitbuffer[out_offset2 + 19] = in30; + bitbuffer[out_offset2 + 20] = in31; + bitbuffer[out_offset2 + 21] = in32; + } + + for (; (i + (14870 + 64) * sizeof(chorba_word_t)) < len; i += (32 * sizeof(chorba_word_t))) { + chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8; + chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16; + chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24; + chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32; + int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords; + int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords; + int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords; + + in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1 ^ bitbuffer[in_offset + 0]; + in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2 ^ bitbuffer[in_offset + 1]; + in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3 ^ bitbuffer[in_offset + 2]; + in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4 ^ bitbuffer[in_offset + 3]; + in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5 ^ bitbuffer[in_offset + 4]; + in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6 ^ bitbuffer[in_offset + 5]; + in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7 ^ bitbuffer[in_offset + 6]; + in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1 ^ bitbuffer[in_offset + 7]; + in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2 ^ bitbuffer[in_offset + 8]; + in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3 ^ bitbuffer[in_offset + 9]; + in11 = input[i / sizeof(chorba_word_t) + 
10] ^ next11 ^ in4 ^ bitbuffer[in_offset + 10]; + in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5 ^ bitbuffer[in_offset + 11]; + in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6 ^ bitbuffer[in_offset + 12]; + in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7 ^ bitbuffer[in_offset + 13]; + in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8 ^ bitbuffer[in_offset + 14]; + in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9 ^ bitbuffer[in_offset + 15]; + in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10 ^ bitbuffer[in_offset + 16]; + in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11 ^ bitbuffer[in_offset + 17]; + in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12 ^ bitbuffer[in_offset + 18]; + in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13 ^ bitbuffer[in_offset + 19]; + in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14 ^ bitbuffer[in_offset + 20]; + in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15 ^ bitbuffer[in_offset + 21]; + in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22]; + in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23]; + in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24]; + in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25]; + in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26]; + in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27]; + in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28]; + in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29]; + in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30]; + in32 = 
input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31]; + + next1 = in11 ^ in22 ^ in26; + next2 = in12 ^ in23 ^ in27; + next3 = in13 ^ in24 ^ in28; + next4 = in14 ^ in25 ^ in29; + next5 = in15 ^ in26 ^ in30; + next6 = in16 ^ in27 ^ in31; + next7 = in17 ^ in28 ^ in32; + next8 = in18 ^ in29; + next9 = in19 ^ in30; + next10 = in20 ^ in31; + next11 = in21 ^ in32; + next12 = in22; + next13 = in23; + next14 = in24; + next15 = in25; + next16 = in26; + next17 = in27; + next18 = in28; + next19 = in29; + next20 = in30; + next21 = in31; + next22 = in32; + + bitbuffer[out_offset1 + 22] = in1; + bitbuffer[out_offset1 + 23] = in2; + bitbuffer[out_offset1 + 24] = in3; + bitbuffer[out_offset1 + 25] = in4; + bitbuffer[out_offset1 + 26] = in5; + bitbuffer[out_offset1 + 27] = in6; + bitbuffer[out_offset1 + 28] = in7; + bitbuffer[out_offset1 + 29] = in8; + bitbuffer[out_offset1 + 30] = in9; + bitbuffer[out_offset1 + 31] = in10; + bitbuffer[out_offset2 + 0] = in11; + bitbuffer[out_offset2 + 1] = in12; + bitbuffer[out_offset2 + 2] = in13; + bitbuffer[out_offset2 + 3] = in14; + bitbuffer[out_offset2 + 4] = in15; + bitbuffer[out_offset2 + 5] = in16; + bitbuffer[out_offset2 + 6] = in17; + bitbuffer[out_offset2 + 7] = in18; + bitbuffer[out_offset2 + 8] = in19; + bitbuffer[out_offset2 + 9] = in20; + bitbuffer[out_offset2 + 10] = in21; + bitbuffer[out_offset2 + 11] = in22; + bitbuffer[out_offset2 + 12] = in23; + bitbuffer[out_offset2 + 13] = in24; + bitbuffer[out_offset2 + 14] = in25; + bitbuffer[out_offset2 + 15] = in26; + bitbuffer[out_offset2 + 16] = in27; + bitbuffer[out_offset2 + 17] = in28; + bitbuffer[out_offset2 + 18] = in29; + bitbuffer[out_offset2 + 19] = in30; + bitbuffer[out_offset2 + 20] = in31; + bitbuffer[out_offset2 + 21] = in32; + } + + bitbuffer[(i / sizeof(chorba_word_t) + 0) % bitbuffer_size_zwords] ^= next1; + bitbuffer[(i / sizeof(chorba_word_t) + 1) % bitbuffer_size_zwords] ^= next2; + bitbuffer[(i / sizeof(chorba_word_t) + 2) % 
bitbuffer_size_zwords] ^= next3; + bitbuffer[(i / sizeof(chorba_word_t) + 3) % bitbuffer_size_zwords] ^= next4; + bitbuffer[(i / sizeof(chorba_word_t) + 4) % bitbuffer_size_zwords] ^= next5; + bitbuffer[(i / sizeof(chorba_word_t) + 5) % bitbuffer_size_zwords] ^= next6; + bitbuffer[(i / sizeof(chorba_word_t) + 6) % bitbuffer_size_zwords] ^= next7; + bitbuffer[(i / sizeof(chorba_word_t) + 7) % bitbuffer_size_zwords] ^= next8; + bitbuffer[(i / sizeof(chorba_word_t) + 8) % bitbuffer_size_zwords] ^= next9; + bitbuffer[(i / sizeof(chorba_word_t) + 9) % bitbuffer_size_zwords] ^= next10; + bitbuffer[(i / sizeof(chorba_word_t) + 10) % bitbuffer_size_zwords] ^= next11; + bitbuffer[(i / sizeof(chorba_word_t) + 11) % bitbuffer_size_zwords] ^= next12; + bitbuffer[(i / sizeof(chorba_word_t) + 12) % bitbuffer_size_zwords] ^= next13; + bitbuffer[(i / sizeof(chorba_word_t) + 13) % bitbuffer_size_zwords] ^= next14; + bitbuffer[(i / sizeof(chorba_word_t) + 14) % bitbuffer_size_zwords] ^= next15; + bitbuffer[(i / sizeof(chorba_word_t) + 15) % bitbuffer_size_zwords] ^= next16; + bitbuffer[(i / sizeof(chorba_word_t) + 16) % bitbuffer_size_zwords] ^= next17; + bitbuffer[(i / sizeof(chorba_word_t) + 17) % bitbuffer_size_zwords] ^= next18; + bitbuffer[(i / sizeof(chorba_word_t) + 18) % bitbuffer_size_zwords] ^= next19; + bitbuffer[(i / sizeof(chorba_word_t) + 19) % bitbuffer_size_zwords] ^= next20; + bitbuffer[(i / sizeof(chorba_word_t) + 20) % bitbuffer_size_zwords] ^= next21; + bitbuffer[(i / sizeof(chorba_word_t) + 21) % bitbuffer_size_zwords] ^= next22; + + for (int j = 14870; j < 14870 + 64; j++) { + bitbuffer[(j + (i / sizeof(chorba_word_t))) % bitbuffer_size_zwords] = 0; + } + + uint64_t next1_64 = 0; + uint64_t next2_64 = 0; + uint64_t next3_64 = 0; + uint64_t next4_64 = 0; + uint64_t next5_64 = 0; + uint64_t final[9] = {0}; + + for (; (i + 72 < len); i += 32) { + uint64_t in1; + uint64_t in2; + uint64_t in3; + uint64_t in4; + uint64_t a1, a2, a3, a4; + uint64_t b1, b2, b3, b4; + 
uint64_t c1, c2, c3, c4; + uint64_t d1, d2, d3, d4; + + uint64_t out1; + uint64_t out2; + uint64_t out3; + uint64_t out4; + uint64_t out5; + + in1 = input_qwords[i / sizeof(uint64_t)] ^ bitbuffer_qwords[(i / sizeof(uint64_t)) % bitbuffer_size_qwords]; + in2 = input_qwords[i / sizeof(uint64_t) + 1] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 1) % bitbuffer_size_qwords]; + in1 = Z_U64_FROM_LE(in1) ^ next1_64; + in2 = Z_U64_FROM_LE(in2) ^ next2_64; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = input_qwords[i / sizeof(uint64_t) + 2] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 2) % bitbuffer_size_qwords]; + in4 = input_qwords[i / sizeof(uint64_t) + 3] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 3) % bitbuffer_size_qwords]; + in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1; + in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1_64 = next5_64 ^ out1; + next2_64 = out2; + next3_64 = out3; + next4_64 = out4; + next5_64 = out5; + + } + + memcpy(final, input_qwords + (i / sizeof(uint64_t)), len-i); + final[0] ^= Z_U64_TO_LE(next1_64); + final[1] ^= Z_U64_TO_LE(next2_64); + final[2] ^= Z_U64_TO_LE(next3_64); + final[3] ^= Z_U64_TO_LE(next4_64); + final[4] ^= Z_U64_TO_LE(next5_64); + + uint8_t *final_bytes = (uint8_t*)final; + + for (size_t j = 0; j < (len-i); j++) { + crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i) % bitbuffer_size_bytes]) & 0xff] ^ (crc >> 8); 
    } /* end of the final byte-at-a-time table loop of crc32_chorba_118960_nondestructive */

#if defined(__EMSCRIPTEN__)
    /* NOTE(review): on Emscripten the bitbuffer is presumably heap-allocated
     * earlier in this function (small wasm stack) — released here; confirm
     * against the allocation site above this chunk. */
    zng_free(bitbuffer);
#endif
    return ~crc;
}

# if CHORBA_W == 8
/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
 *
 * Medium-length variant: uses a 32 KiB on-stack scratch buffer seeded with
 * the inverted incoming CRC, then folds each 64-byte chunk of input forward
 * into the buffer at fixed qword offsets before a 32-byte-per-iteration
 * shift-fold pass and a final table-driven tail.
 * The caller guarantees buf is 8-byte aligned and len fits the 32 KiB window. */
Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
    /* The calling function ensured that this is aligned correctly */
    const uint64_t* input = (const uint64_t*)buf;
    /* Scratch accumulator for delayed polynomial feedback; byte view is used
     * by the final per-byte loop. */
    uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
    memset(bitbuffer, 0, 32768);
    /* Fold the (inverted) incoming CRC into the message front, stored LE. */
    bitbuffer[0] = Z_U64_TO_LE(~crc);

    crc = 0;

    size_t i = 0;

    /* Main pass: consume 64 bytes (8 qwords) per iteration while at least
     * 300 qwords + one 64-byte chunk of runway remain (300 is the furthest
     * forward scatter offset used below). */
    for(; i + 300*8+64 < len; i += 64) {
        uint64_t in1, in2, in3, in4;
        uint64_t in5, in6, in7, in8;
        size_t in_offset = (i/8);

        in1 = input[i / sizeof(uint64_t) + 0] ^ bitbuffer[in_offset + 0];
        in2 = input[i / sizeof(uint64_t) + 1] ^ bitbuffer[in_offset + 1];
        in3 = input[i / sizeof(uint64_t) + 2] ^ bitbuffer[in_offset + 2];
        in4 = input[i / sizeof(uint64_t) + 3] ^ bitbuffer[in_offset + 3];
        in5 = input[i / sizeof(uint64_t) + 4] ^ bitbuffer[in_offset + 4];
        in6 = input[i / sizeof(uint64_t) + 5] ^ bitbuffer[in_offset + 5];
        in7 = input[i / sizeof(uint64_t) + 6] ^ bitbuffer[in_offset + 6];
        in8 = input[i / sizeof(uint64_t) + 7] ^ bitbuffer[in_offset + 7];

        // [0, 145, 183, 211]
        /* NOTE(review): the qword offsets 145/183/211 (XOR-scatter) and 300
         * (overwrite) are the sparse polynomial terms from the Chorba paper;
         * confirm against the reference constants before editing. */

        bitbuffer[(i/8 + 0 + 145)] ^= in1;
        bitbuffer[(i/8 + 1 + 145)] ^= in2;
        bitbuffer[(i/8 + 2 + 145)] ^= in3;
        bitbuffer[(i/8 + 3 + 145)] ^= in4;
        bitbuffer[(i/8 + 4 + 145)] ^= in5;
        bitbuffer[(i/8 + 5 + 145)] ^= in6;
        bitbuffer[(i/8 + 6 + 145)] ^= in7;
        bitbuffer[(i/8 + 7 + 145)] ^= in8;

        bitbuffer[(i/8 + 0 + 183)] ^= in1;
        bitbuffer[(i/8 + 1 + 183)] ^= in2;
        bitbuffer[(i/8 + 2 + 183)] ^= in3;
        bitbuffer[(i/8 + 3 + 183)] ^= in4;
        bitbuffer[(i/8 + 4 + 183)] ^= in5;
        bitbuffer[(i/8 + 5 + 183)] ^= in6;
        bitbuffer[(i/8 + 6 + 183)] ^= in7;
        bitbuffer[(i/8 + 7 + 183)] ^= in8;

        bitbuffer[(i/8 + 0 + 211)] ^= in1;
        bitbuffer[(i/8 + 1 + 211)] ^= in2;
        bitbuffer[(i/8 + 2 + 211)] ^= in3;
        bitbuffer[(i/8 + 3 + 211)] ^= in4;
        bitbuffer[(i/8 + 4 + 211)] ^= in5;
        bitbuffer[(i/8 + 5 + 211)] ^= in6;
        bitbuffer[(i/8 + 6 + 211)] ^= in7;
        bitbuffer[(i/8 + 7 + 211)] ^= in8;

        /* Offset +300 is a plain store: that region has not been touched yet,
         * so overwriting (rather than XOR) initializes it. */
        bitbuffer[(i/8 + 0 + 300)] = in1;
        bitbuffer[(i/8 + 1 + 300)] = in2;
        bitbuffer[(i/8 + 2 + 300)] = in3;
        bitbuffer[(i/8 + 3 + 300)] = in4;
        bitbuffer[(i/8 + 4 + 300)] = in5;
        bitbuffer[(i/8 + 5 + 300)] = in6;
        bitbuffer[(i/8 + 6 + 300)] = in7;
        bitbuffer[(i/8 + 7 + 300)] = in8;
    }

    /* Carry registers for the shift-fold pass: next*_64 hold the folded
     * contribution of previous iterations into the next five qwords. */
    uint64_t next1_64 = 0;
    uint64_t next2_64 = 0;
    uint64_t next3_64 = 0;
    uint64_t next4_64 = 0;
    uint64_t next5_64 = 0;
    uint64_t final[9] = {0};

    /* Shift-fold pass: 32 bytes per iteration; needs 72 bytes of runway so
     * the 9-qword `final` staging below can absorb the remainder. */
    for (; (i + 72 < len); i += 32) {
        uint64_t in1;
        uint64_t in2;
        uint64_t in3;
        uint64_t in4;
        uint64_t a1, a2, a3, a4;
        uint64_t b1, b2, b3, b4;
        uint64_t c1, c2, c3, c4;
        uint64_t d1, d2, d3, d4;

        uint64_t out1;
        uint64_t out2;
        uint64_t out3;
        uint64_t out4;
        uint64_t out5;

        in1 = input[i / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t))];
        in2 = input[(i + 8) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 1)];
        in1 = Z_U64_FROM_LE(in1) ^ next1_64;
        in2 = Z_U64_FROM_LE(in2) ^ next2_64;

        /* NOTE(review): the shift pairs (17/55, 47/9/19, 45/44, 20) expand
         * multiplication by the CRC-32 reduction polynomial across qword
         * lanes — presumably derived in the Chorba paper; do not change
         * independently of each other. */
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);

        in3 = input[(i + 16) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 2)];
        in4 = input[(i + 24) / sizeof(uint64_t)] ^ bitbuffer[(i / sizeof(uint64_t) + 3)];
        /* in3/in4 also absorb the low-lane spill (a1, a2^b1) from in1/in2. */
        in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
        in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);

        /* Diagonal gather of the four expansions into five carry qwords. */
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1_64 = next5_64 ^ out1;
        next2_64 = out2;
        next3_64 = out3;
        next4_64 = out4;
        next5_64 = out5;

    }

    /* Stage the (< 72-byte) tail into `final`, fold the pending carries in,
     * then finish byte-at-a-time through the lookup table. */
    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
    final[0] ^= Z_U64_TO_LE(next1_64);
    final[1] ^= Z_U64_TO_LE(next2_64);
    final[2] ^= Z_U64_TO_LE(next3_64);
    final[3] ^= Z_U64_TO_LE(next4_64);
    final[4] ^= Z_U64_TO_LE(next5_64);

    uint8_t *final_bytes = (uint8_t*)final;

    for (size_t j = 0; j < (len-i); j++) {
        crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
    }

    return ~crc;
}

/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
/* Small-length variant: no scratch buffer; the sparse polynomial terms are
 * carried directly in the chorba1..chorba8 / next1..next5 registers. */
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
    /* The calling function ensured that this is aligned correctly */
    const uint64_t* input = (const uint64_t*)buf;
    uint64_t final[9] = {0};
    uint64_t next1 = ~crc;
    crc = 0;
    uint64_t next2 = 0;
    uint64_t next3 = 0;
    uint64_t next4 = 0;
    uint64_t next5 = 0;

    size_t i = 0;

    /* This is weird, doing for vs while drops 10% off the exec time */
    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
        uint64_t in1;
        uint64_t in2;
        uint64_t in3;
        uint64_t in4;
        uint64_t a1, a2, a3, a4;
        uint64_t b1, b2, b3, b4;
        uint64_t c1, c2, c3, c4;
        uint64_t d1, d2, d3, d4;

        uint64_t out1;
        uint64_t out2;
        uint64_t out3;
        uint64_t out4;
        uint64_t out5;

        uint64_t chorba1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
        uint64_t chorba2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
        uint64_t chorba3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3;
        uint64_t chorba4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4;
        uint64_t chorba5 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 4]) ^ next5;
        uint64_t chorba6 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 5]);
        uint64_t chorba7 = Z_U64_FROM_LE(input[i /
sizeof(uint64_t) + 6]) ^ chorba1; + uint64_t chorba8 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 7]) ^ chorba2; + + i += 8 * 8; + + /* 0-3 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ chorba3; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ chorba4 ^ chorba1; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ a1 ^ chorba5 ^ chorba2 ^ chorba1; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ a2 ^ b1 ^ chorba6 ^ chorba3 ^ chorba2; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 4-7 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba4 ^ chorba3; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba4; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba6 ^ chorba5; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba7 ^ chorba6; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 
= (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 8-11 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba2; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba3; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 12-15 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba2 ^ chorba1; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ 
next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 16-19 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 20-23 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ 
chorba5 ^ chorba2 ^ chorba1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 24-27 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + 
c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + + i += 32; + + /* 28-31 */ + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba6 ^ chorba5; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba7 ^ chorba6; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + } + + for (; (i + 40 + 32) < len; i += 32) { + uint64_t in1; + uint64_t in2; + uint64_t in3; + uint64_t in4; + uint64_t a1, a2, a3, a4; + uint64_t b1, b2, b3, b4; + uint64_t c1, c2, c3, c4; + uint64_t d1, d2, d3, d4; + + uint64_t out1; + uint64_t out2; + uint64_t out3; + uint64_t out4; + uint64_t out5; + + in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1; + in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 
>> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1; + in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + } + + memcpy(final, input+(i / sizeof(uint64_t)), len-i); + final[0] ^= Z_U64_TO_LE(next1); + final[1] ^= Z_U64_TO_LE(next2); + final[2] ^= Z_U64_TO_LE(next3); + final[3] ^= Z_U64_TO_LE(next4); + final[4] ^= Z_U64_TO_LE(next5); + + return crc32_braid(~crc, (uint8_t*)final, len-i); +} + +#else // CHORBA_W == 8 + +Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit(uint32_t crc, const uint8_t *buf, size_t len) { + /* The calling function ensured that this is aligned correctly */ + const uint32_t* input = (const uint32_t*)buf; + uint32_t final[20] = {0}; + + uint32_t next1 = ~crc; + crc = 0; + uint32_t next2 = 0; + uint32_t next3 = 0; + uint32_t next4 = 0; + uint32_t next5 = 0; + uint32_t next6 = 0; + uint32_t next7 = 0; + uint32_t next8 = 0; + uint32_t next9 = 0; + uint32_t next10 = 0; + + size_t i = 0; + for (; i + 80 < len; i += 40) { + uint32_t in1; + uint32_t in2; + uint32_t in3; + uint32_t in4; + uint32_t in5; + uint32_t in6; + uint32_t in7; + uint32_t in8; + uint32_t in9; + uint32_t in10; + + uint32_t a1, a2, a3, a4, a6, a7; + uint32_t b1, b2, b3, b4, b6, b7; + uint32_t c1, c2, c3, c4, c6, c7; + uint32_t d1, d2, d3, d4, d6, 
d7; + uint32_t e1, e2, e3, e4, e6, e7; + uint32_t f1, f2, f3, f4, f6, f7; + uint32_t g1, g2, g3, g4, g6, g7; + uint32_t h1, h2, h3, h4, h6, h7; + uint32_t i1, i2, i3, i4, i6, i7; + uint32_t j1, j2, j3, j4, j6, j7; + + uint32_t out1; + uint32_t out2; + uint32_t out3; + uint32_t out4; + uint32_t out5; + uint32_t out6; + uint32_t out7; + uint32_t out8; + uint32_t out9; + uint32_t out10; + + in1 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 0]) ^ next1; + in2 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 1]) ^ next2; + in3 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 2]) ^ next3; + in4 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 3]) ^ next4; + + a1 = (in1 << 17); + a2 = (in1 >> 15) ^ (in1 << 23); + a3 = (in1 >> 9) ^ (in1 << 19); + a4 = (in1 >> 13); + a6 = (in1 << 12); + a7 = (in1 >> 20); + + b1 = (in2 << 17); + b2 = (in2 >> 15) ^ (in2 << 23); + b3 = (in2 >> 9) ^ (in2 << 19); + b4 = (in2 >> 13); + b6 = (in2 << 12); + b7 = (in2 >> 20); + + c1 = (in3 << 17); + c2 = (in3 >> 15) ^ (in3 << 23); + c3 = (in3 >> 9) ^ (in3 << 19); + c4 = (in3 >> 13); + c6 = (in3 << 12); + c7 = (in3 >> 20); + + d1 = (in4 << 17); + d2 = (in4 >> 15) ^ (in4 << 23); + d3 = (in4 >> 9) ^ (in4 << 19); + d4 = (in4 >> 13); + d6 = (in4 << 12); + d7 = (in4 >> 20); + + in5 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 4]) ^ next5 ^ a1; + in6 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 5]) ^ next6 ^ a2 ^ b1; + in7 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 6]) ^ next7 ^ a3 ^ b2 ^ c1; + in8 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 7]) ^ next8 ^ a4 ^ b3 ^ c2 ^ d1; + + e1 = (in5 << 17); + e2 = (in5 >> 15) ^ (in5 << 23); + e3 = (in5 >> 9) ^ (in5 << 19); + e4 = (in5 >> 13); + e6 = (in5 << 12); + e7 = (in5 >> 20); + + f1 = (in6 << 17); + f2 = (in6 >> 15) ^ (in6 << 23); + f3 = (in6 >> 9) ^ (in6 << 19); + f4 = (in6 >> 13); + f6 = (in6 << 12); + f7 = (in6 >> 20); + + g1 = (in7 << 17); + g2 = (in7 >> 15) ^ (in7 << 23); + g3 = (in7 >> 9) ^ (in7 << 19); + g4 = (in7 >> 13); + g6 = (in7 << 12); + g7 = (in7 >> 20); + + h1 = (in8 << 
17); + h2 = (in8 >> 15) ^ (in8 << 23); + h3 = (in8 >> 9) ^ (in8 << 19); + h4 = (in8 >> 13); + h6 = (in8 << 12); + h7 = (in8 >> 20); + + in9 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 8]) ^ next9 ^ b4 ^ c3 ^ d2 ^ e1; + in10 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 9]) ^ next10 ^ a6 ^ c4 ^ d3 ^ e2 ^ f1; + + i1 = (in9 << 17); + i2 = (in9 >> 15) ^ (in9 << 23); + i3 = (in9 >> 9) ^ (in9 << 19); + i4 = (in9 >> 13); + i6 = (in9 << 12); + i7 = (in9 >> 20); + + j1 = (in10 << 17); + j2 = (in10 >> 15) ^ (in10 << 23); + j3 = (in10 >> 9) ^ (in10 << 19); + j4 = (in10 >> 13); + j6 = (in10 << 12); + j7 = (in10 >> 20); + + out1 = a7 ^ b6 ^ d4 ^ e3 ^ f2 ^ g1; + out2 = b7 ^ c6 ^ e4 ^ f3 ^ g2 ^ h1; + out3 = c7 ^ d6 ^ f4 ^ g3 ^ h2 ^ i1; + out4 = d7 ^ e6 ^ g4 ^ h3 ^ i2 ^ j1; + out5 = e7 ^ f6 ^ h4 ^ i3 ^ j2; + out6 = f7 ^ g6 ^ i4 ^ j3; + out7 = g7 ^ h6 ^ j4; + out8 = h7 ^ i6; + out9 = i7 ^ j6; + out10 = j7; + + next1 = out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + next6 = out6; + next7 = out7; + next8 = out8; + next9 = out9; + next10 = out10; + + } + + memcpy(final, input+(i/sizeof(uint32_t)), len-i); + final[0] ^= Z_U32_TO_LE(next1); + final[1] ^= Z_U32_TO_LE(next2); + final[2] ^= Z_U32_TO_LE(next3); + final[3] ^= Z_U32_TO_LE(next4); + final[4] ^= Z_U32_TO_LE(next5); + final[5] ^= Z_U32_TO_LE(next6); + final[6] ^= Z_U32_TO_LE(next7); + final[7] ^= Z_U32_TO_LE(next8); + final[8] ^= Z_U32_TO_LE(next9); + final[9] ^= Z_U32_TO_LE(next10); + + return crc32_braid(~crc, (uint8_t*)final, len-i); +} +#endif // CHORBA_W == 8 + +Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) { + uintptr_t align_diff = ALIGN_DIFF(buf, 8); + if (len <= align_diff + CHORBA_SMALL_THRESHOLD) + return crc32_braid(crc, buf, len); + + if (align_diff) { + crc = crc32_braid(crc, buf, align_diff); + len -= align_diff; + buf += align_diff; + } + if (len > CHORBA_LARGE_THRESHOLD) + return crc32_chorba_118960_nondestructive(crc, buf, len); +#if CHORBA_W == 8 + if (len > 
CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
        return crc32_chorba_32768_nondestructive(crc, buf, len);
    /* Lengths outside the medium window (but <= CHORBA_LARGE_THRESHOLD)
     * fall through to the register-carried small variant. */
    return crc32_chorba_small_nondestructive(crc, buf, len);
#else
    /* 32-bit word build: only the 32-bit small variant is available below
     * the large threshold. */
    return crc32_chorba_small_nondestructive_32bit(crc, buf, len);
#endif
}

/* Compute the Chorba CRC-32 of src, then copy src to dst.
 * NOTE(review): CRC is computed before the memcpy — presumably so the copy
 * reads warm cache lines; verify before reordering. */
uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    crc = crc32_chorba(crc, src, len);
    memcpy(dst, src, len);
    return crc;
}
diff --git a/neozip/arch/generic/generic_functions.h b/neozip/arch/generic/generic_functions.h
new file mode 100644
index 0000000000..c150a2f010
--- /dev/null
+++ b/neozip/arch/generic/generic_functions.h
@@ -0,0 +1,64 @@
/* generic_functions.h -- generic C implementations for arch-specific functions.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef GENERIC_FUNCTIONS_H_
#define GENERIC_FUNCTIONS_H_

#include "zendian.h"
#include "deflate.h"

/* Function-pointer signatures shared by the generic implementations below
 * and their arch-specific counterparts (NOTE(review): presumably consumed by
 * a runtime dispatch table defined elsewhere). */
typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
typedef uint32_t (*adler32_copy_func)(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);
typedef uint32_t (*crc32_copy_func)(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
typedef void (*slide_hash_func)(deflate_state *s);


uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);

uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left);

#ifdef WITH_ALL_FALLBACKS
uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
#endif
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);

uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t
crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);

/* Chorba CRC entry points; compiled out when WITHOUT_CHORBA is defined. */
#ifndef WITHOUT_CHORBA
    uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
    uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif

void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);

uint32_t longest_match_c(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_c(deflate_state *const s, uint32_t cur_match);

void slide_hash_c(deflate_state *s);

/* With runtime CPU detection disabled, the native_* names resolve directly
 * to the generic C implementations (Chorba CRC when available, braid CRC
 * otherwise). */
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code
# define native_adler32 adler32_c
# define native_adler32_copy adler32_copy_c
# define native_chunkmemset_safe chunkmemset_safe_c
#ifndef WITHOUT_CHORBA
# define native_crc32 crc32_chorba
# define native_crc32_copy crc32_copy_chorba
#else
# define native_crc32 crc32_braid
# define native_crc32_copy crc32_copy_braid
#endif
# define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c
# define native_longest_match longest_match_c
# define native_longest_match_slow longest_match_slow_c
# define native_compare256 compare256_c
#endif

#endif
diff --git a/neozip/arch/generic/slide_hash_c.c b/neozip/arch/generic/slide_hash_c.c
new file mode 100644
index 0000000000..8345b9e36b
--- /dev/null
+++ b/neozip/arch/generic/slide_hash_c.c
@@ -0,0 +1,52 @@
/* slide_hash.c -- slide hash table C implementation
 *
 * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "deflate.h"

/* ===========================================================================
 * Slide the hash table when sliding the window down (could be avoided with 32
 * bit values at the expense of memory usage). We slide even when level == 0 to
 * keep the hash table consistent if we switch back to level > 0 later.
+ */ +static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) { +#ifdef NOT_TWEAK_COMPILER + table += entries; + do { + unsigned m; + m = *--table; + *table = (Pos)(m >= wsize ? m-wsize : 0); + /* If entries is not on any hash chain, prev[entries] is garbage but + * its value will never be used. + */ + } while (--entries); +#else + { + /* As of I make this change, gcc (4.8.*) isn't able to vectorize + * this hot loop using saturated-subtraction on x86-64 architecture. + * To avoid this defect, we can change the loop such that + * o. the pointer advance forward, and + * o. demote the variable 'm' to be local to the loop, and + * choose type "Pos" (instead of 'unsigned int') for the + * variable to avoid unnecessary zero-extension. + */ + unsigned int i; + Pos *q = table; + for (i = 0; i < entries; i++) { + Pos m = *q; + Pos t = (Pos)wsize; + *q++ = (Pos)(m >= t ? m-t: 0); + } + } +#endif /* NOT_TWEAK_COMPILER */ +} + +Z_INTERNAL void slide_hash_c(deflate_state *s) { + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_c_chain(s->head, HASH_SIZE, wsize); + slide_hash_c_chain(s->prev, wsize, wsize); +} diff --git a/neozip/arch/loongarch/Makefile.in b/neozip/arch/loongarch/Makefile.in new file mode 100644 index 0000000000..86baed1553 --- /dev/null +++ b/neozip/arch/loongarch/Makefile.in @@ -0,0 +1,99 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach +# Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +LSXFLAG=-mlsx +LASXFLAG=-mlasx + +SRCDIR=. +SRCTOP=../.. 
+TOPDIR=$(SRCTOP) + +all: \ + loongarch_features.o loongarch_features.lo \ + crc32_la.o crc32_la.lo \ + adler32_lasx.o adler32_lasx.lo \ + adler32_lsx.o adler32_lsx.lo \ + chunkset_lasx.o chunkset_lasx.lo \ + chunkset_lsx.o chunkset_lsx.lo \ + compare256_lasx.o compare256_lasx.lo \ + compare256_lsx.o compare256_lsx.lo \ + slide_hash_lasx.o slide_hash_lasx.lo \ + slide_hash_lsx.o slide_hash_lsx.lo + +loongarch_features.o: $(SRCDIR)/loongarch_features.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c + +loongarch_features.lo: $(SRCDIR)/loongarch_features.c + $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c + +crc32_la.o: $(SRCDIR)/crc32_la.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c + +crc32_la.lo: $(SRCDIR)/crc32_la.c + $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c + +adler32_lasx.o: + $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c + +adler32_lasx.lo: + $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lasx.c + +adler32_lsx.o: + $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c + +adler32_lsx.lo: + $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_lsx.c + +chunkset_lasx.o: + $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c + +chunkset_lasx.lo: + $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c + +chunkset_lsx.o: + $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c + +chunkset_lsx.lo: + $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c + +compare256_lasx.o: + $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c + +compare256_lasx.lo: + $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c + 
+compare256_lsx.o: + $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c + +compare256_lsx.lo: + $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c + +slide_hash_lasx.o: + $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c + +slide_hash_lasx.lo: + $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c + +slide_hash_lsx.o: + $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c + +slide_hash_lsx.lo: + $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/loongarch/adler32_lasx.c b/neozip/arch/loongarch/adler32_lasx.c new file mode 100644 index 0000000000..a7268e73ff --- /dev/null +++ b/neozip/arch/loongarch/adler32_lasx.c @@ -0,0 +1,154 @@ +/* adler32_lasx.c -- compute the Adler-32 checksum of a data stream, based on Intel AVX2 implementation + * Copyright (C) 1995-2011 Mark Adler + * Copyright (C) 2022 Adam Stylinski + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * Authors: + * Brian Bockelman <bockelman@gmail.com> + * Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LASX + +#include "zbuild.h" +#include "adler32_p.h" + +#include <lasxintrin.h> +#include "lasxintrin_ext.h" + + +/* 32 bit horizontal sum */ +static inline uint32_t hsum256(__m256i x) { + __m256i sum1 = __lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8)); + __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2)); + __m256i sum3 = __lasx_xvadd_w(sum2, __lasx_xvbsrl_v(sum2, 4)); + return (uint32_t)__lasx_xvpickve2gr_wu(sum3, 0); +} + +static inline uint32_t partial_hsum256(__m256i x) { + __m256i sum1 = 
__lasx_xvadd_w(x, __lasx_xvbsrl_v(x, 8)); + __m256i sum2 = __lasx_xvadd_w(sum1, __lasx_xvpermi_d(sum1, 0x2)); + return (uint32_t)__lasx_xvpickve2gr_wu(sum2, 0); +} + +extern uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); + } else if (len < 32) { + if (COPY) { + return adler32_copy_lsx(adler, dst, src, len); + } else { + return adler32_lsx(adler, src, len); + } + } + + __m256i vs1, vs2, vs2_0; + + const __m256i dot2v = (__m256i)((v32i8){ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, + 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33 }); + const __m256i dot2v_0 = (__m256i)((v32i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 }); + const __m256i dot3v = __lasx_xvreplgr2vr_h(1); + const __m256i zero = __lasx_xvldi(0); + + while (len >= 32) { + vs1 = __lasx_xvinsgr2vr_w(zero, adler0, 0); + vs2 = __lasx_xvinsgr2vr_w(zero, adler1, 0); + + __m256i vs1_0 = vs1; + __m256i vs3 = __lasx_xvldi(0); + vs2_0 = vs3; + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); + len -= k; + + while (k >= 64) { + __m256i vbuf = __lasx_xvld(src, 0); + __m256i vbuf_0 = __lasx_xvld(src, 32); + src += 64; + k -= 64; + + __m256i vs1_sad = lasx_sad_bu(vbuf, zero); + __m256i vs1_sad2 = lasx_sad_bu(vbuf_0, zero); + + if (COPY) { + __lasx_xvst(vbuf, dst, 0); + __lasx_xvst(vbuf_0, dst, 32); + dst += 64; + } + + vs1 = __lasx_xvadd_w(vs1, vs1_sad); + vs3 = __lasx_xvadd_w(vs3, vs1_0); + __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v); // sum 32 uint8s to 16 shorts 
+ __m256i v_short_sum2_0 = lasx_maddubs_w_h(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + __m256i vsum2_0 = lasx_madd_w_h(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s + vs1 = __lasx_xvadd_w(vs1_sad2, vs1); + vs2 = __lasx_xvadd_w(vsum2, vs2); + vs2_0 = __lasx_xvadd_w(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = __lasx_xvadd_w(vs2_0, vs2); + vs3 = __lasx_xvslli_w(vs3, 6); + vs2 = __lasx_xvadd_w(vs3, vs2); + vs3 = __lasx_xvldi(0); + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] ) + */ + __m256i vbuf = __lasx_xvld(src, 0); + src += 32; + k -= 32; + + __m256i vs1_sad = lasx_sad_bu(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's + + if (COPY) { + __lasx_xvst(vbuf, dst, 0); + dst += 32; + } + + vs1 = __lasx_xvadd_w(vs1, vs1_sad); + vs3 = __lasx_xvadd_w(vs3, vs1_0); + __m256i v_short_sum2 = lasx_maddubs_w_h(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = lasx_madd_w_h(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + vs2 = __lasx_xvadd_w(vsum2, vs2); + vs1_0 = vs1; + } + + /* Defer the multiplication with 32 to outside of the loop */ + vs3 = __lasx_xvslli_w(vs3, 5); + vs2 = __lasx_xvadd_w(vs2, vs3); + + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/loongarch/adler32_lsx.c b/neozip/arch/loongarch/adler32_lsx.c new file mode 100644 index 0000000000..389f74c683 --- /dev/null +++ b/neozip/arch/loongarch/adler32_lsx.c @@ -0,0 +1,147 @@ +/* adler32_lsx.c 
-- compute the Adler-32 checksum of a data stream, based on Intel SSE4.2 implementation + * Copyright (C) 1995-2011 Mark Adler + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LSX + +#include "zbuild.h" +#include "adler32_p.h" + +#include <lsxintrin.h> +#include "lsxintrin_ext.h" + +static inline uint32_t partial_hsum(__m128i x) { + __m128i second_int = __lsx_vbsrl_v(x, 8); + __m128i sum = __lsx_vadd_w(x, second_int); + return __lsx_vpickve2gr_w(sum, 0); +} + +static inline uint32_t hsum(__m128i x) { + __m128i sum1 = __lsx_vilvh_d(x, x); + __m128i sum2 = __lsx_vadd_w(x, sum1); + __m128i sum3 = __lsx_vshuf4i_w(sum2, 0x01); + __m128i sum4 = __lsx_vadd_w(sum2, sum3); + return __lsx_vpickve2gr_w(sum4, 0); +} + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); + + __m128i vbuf, vbuf_0; + __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + v_sad_sum2, vsum2, vsum2_0; + __m128i zero = __lsx_vldi(0); + const __m128i dot2v = (__m128i)((v16i8){ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 }); + const __m128i dot2v_0 = (__m128i)((v16i8){ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 }); + const __m128i dot3v = __lsx_vreplgr2vr_h(1); + size_t k; + + while (len >= 16) { + + k = ALIGN_DOWN(MIN(len, NMAX), 16); + len -= k; + + vs1 = __lsx_vinsgr2vr_w(zero, adler0, 0); + vs2 = __lsx_vinsgr2vr_w(zero, adler1, 0); + + vs3 = __lsx_vldi(0); + vs2_0 = __lsx_vldi(0); + vs1_0 = vs1; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 
+ sum( (16-i+1) c[i] ) + */ + vbuf = __lsx_vld(src, 0); + vbuf_0 = __lsx_vld(src, 16); + src += 32; + k -= 32; + + v_sad_sum1 = lsx_sad_bu(vbuf, zero); + v_sad_sum2 = lsx_sad_bu(vbuf_0, zero); + + if (COPY) { + __lsx_vst(vbuf, dst, 0); + __lsx_vst(vbuf_0, dst, 16); + dst += 32; + } + + v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v), __lsx_vmulwod_h_bu_b(vbuf, dot2v)); + v_short_sum2_0 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf_0, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf_0, dot2v_0)); + + vs1 = __lsx_vadd_w(v_sad_sum1, vs1); + vs3 = __lsx_vadd_w(vs1_0, vs3); + + vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v); + vsum2_0 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2_0, dot3v), v_short_sum2_0, dot3v); + vs1 = __lsx_vadd_w(v_sad_sum2, vs1); + vs2 = __lsx_vadd_w(vsum2, vs2); + vs2_0 = __lsx_vadd_w(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = __lsx_vadd_w(vs2_0, vs2); + vs3 = __lsx_vslli_w(vs3, 5); + vs2 = __lsx_vadd_w(vs3, vs2); + vs3 = __lsx_vldi(0); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = __lsx_vld(src, 0); + src += 16; + k -= 16; + + v_sad_sum1 = lsx_sad_bu(vbuf, zero); + v_short_sum2 = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(vbuf, dot2v_0), __lsx_vmulwod_h_bu_b(vbuf, dot2v_0)); + + vs1 = __lsx_vadd_w(v_sad_sum1, vs1); + vs3 = __lsx_vadd_w(vs1_0, vs3); + vsum2 = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(v_short_sum2, dot3v), v_short_sum2, dot3v); + vs2 = __lsx_vadd_w(vsum2, vs2); + vs1_0 = vs1; + + if (COPY) { + __lsx_vst(vbuf, dst, 0); + dst += 16; + } + } + + vs3 = __lsx_vslli_w(vs3, 4); + vs2 = __lsx_vadd_w(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = hsum(vs2) % BASE; + } + + /* If this is true, there's fewer than 16 elements remaining */ + if (len) { + goto rem_peel; + } + + return adler0 | (adler1 << 16); +} + +Z_INTERNAL uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, 
NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/loongarch/chunkset_lasx.c b/neozip/arch/loongarch/chunkset_lasx.c new file mode 100644 index 0000000000..905704172d --- /dev/null +++ b/neozip/arch/loongarch/chunkset_lasx.c @@ -0,0 +1,126 @@ +/* chunkset_lasx.c -- LASX inline functions to copy small data chunks, based on Intel AVX2 implementation + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LASX + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include <lasxintrin.h> +#include "lasxintrin_ext.h" +#include "lsxintrin_ext.h" + +#include "arch/generic/chunk_256bit_perm_idx_lut.h" + +typedef __m256i chunk_t; +typedef __m128i halfchunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 +#define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = __lasx_xvreplgr2vr_h(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = __lasx_xvreplgr2vr_w(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = __lasx_xvreplgr2vr_d(zng_memread_8(from)); +} + +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + *chunk = lasx_broadcast_128(__lsx_vld(from, 0)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = __lasx_xvld(s, 0); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + __lasx_xvst(*chunk, out, 0); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically 
we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + + /* See note in chunkset_ssse3.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0); + __m128i ret_vec0 = __lsx_vld(buf, 0); + ret_vec = __lasx_concat_128(ret_vec0, ret_vec0); + ret_vec = lasx_shuffle_b(ret_vec, perm_vec); + } else { + __m128i ret_vec0 = __lsx_vld(buf, 0); + __m128i ret_vec1 = __lsx_vld(buf, 16); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = __lsx_vld(permute_table + lut_rem.idx, 0); + __m128i xlane_permutes = __lsx_vslt_b(perm_vec1, __lsx_vreplgr2vr_b(16)); + __m128i xlane_res = lsx_shuffle_b(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes); + ret_vec = __lasx_concat_128(ret_vec0, latter_half); + } + + return ret_vec; +} + +static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { + *chunk = __lsx_vld(s, 0); +} + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + __lsx_vst(*chunk, out, 0); +} + +static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { + /* We zero extend mostly to appease some memory sanitizers. 
These bytes are ultimately + * unlikely to be actually written or read from */ + return lasx_zext_128(*chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = __lsx_vld(buf, 0); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0); + ret_vec = lsx_shuffle_b(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_lasx +#define CHUNKCOPY chunkcopy_lasx +#define CHUNKUNROLL chunkunroll_lasx +#define CHUNKMEMSET chunkmemset_lasx +#define CHUNKMEMSET_SAFE chunkmemset_safe_lasx + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_lasx + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/loongarch/chunkset_lsx.c b/neozip/arch/loongarch/chunkset_lsx.c new file mode 100644 index 0000000000..23dabfba51 --- /dev/null +++ b/neozip/arch/loongarch/chunkset_lsx.c @@ -0,0 +1,74 @@ +/* chunkset_lsx.c -- LSX inline functions to copy small data chunks, based on Intel SSSE3 implementation + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LSX + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include <lsxintrin.h> +#include "lsxintrin_ext.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = __lsx_vreplgr2vr_h(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = __lsx_vreplgr2vr_w(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = 
__lsx_vreplgr2vr_d(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = __lsx_vld(s, 0); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + __lsx_vst(*chunk, out, 0); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load uninitialized that we swizzle over + * in a vector register, anyway. If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = __lsx_vld(buf, 0); + *chunk_rem = lut_rem.remval; + + perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0); + ret_vec = lsx_shuffle_b(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_lsx +#define CHUNKMEMSET chunkmemset_lsx +#define CHUNKMEMSET_SAFE chunkmemset_safe_lsx +#define CHUNKCOPY chunkcopy_lsx +#define CHUNKUNROLL chunkunroll_lsx + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_lsx + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/loongarch/compare256_lasx.c b/neozip/arch/loongarch/compare256_lasx.c new file mode 100644 index 0000000000..d61d6e57b3 --- /dev/null +++ b/neozip/arch/loongarch/compare256_lasx.c @@ -0,0 +1,60 @@ +/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation + * Copyright Mika T. 
Lindqvist <postmaster@raasu.org> + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef LOONGARCH_LASX + +#include <lasxintrin.h> +#include "lasxintrin_ext.h" + +static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + __m256i ymm_src0, ymm_src1, ymm_cmp; + ymm_src0 = __lasx_xvld(src0, 0); + ymm_src1 = __lasx_xvld(src1, 0); + ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ + unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */ + + src0 += 32, src1 += 32, len += 32; + + ymm_src0 = __lasx_xvld(src0, 0); + ymm_src1 = __lasx_xvld(src1, 0); + ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); + mask = (unsigned)lasx_movemask_b(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); + + src0 += 32, src1 += 32, len += 32; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) { + return compare256_lasx_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_lasx +#define COMPARE256 compare256_lasx_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_lasx +#define COMPARE256 compare256_lasx_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/loongarch/compare256_lsx.c b/neozip/arch/loongarch/compare256_lsx.c new file mode 100644 index 0000000000..4afd261e76 --- /dev/null +++ b/neozip/arch/loongarch/compare256_lsx.c @@ -0,0 +1,88 @@ +/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation + * Copyright Adam Stylinski <kungfujesus06@gmail.com> + * Copyright (C) 2025 Vladislav 
Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef LOONGARCH_LSX + +#include <lsxintrin.h> +#include "lsxintrin_ext.h" + +static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) { + __m128i xmm_src0, xmm_src1, xmm_cmp; + + /* Do the first load unaligned, than all subsequent ones we have at least + * one aligned load. Sadly aligning both loads is probably unrealistic */ + xmm_src0 = __lsx_vld(src0, 0); + xmm_src1 = __lsx_vld(src1, 0); + xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1); + + unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return zng_ctz32(~mask); + + const uint8_t *last0 = src0 + 240; + const uint8_t *last1 = src1 + 240; + + int align_offset = ((uintptr_t)src0) & 15; + int align_adv = 16 - align_offset; + uint32_t len = align_adv; + + src0 += align_adv; + src1 += align_adv; + + for (int i = 0; i < 15; i++) { + xmm_src0 = __lsx_vld(src0, 0); + xmm_src1 = __lsx_vld(src1, 0); + xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1); + + mask = (unsigned)lsx_movemask_b(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return len + zng_ctz32(~mask); + + len += 16, src0 += 16, src1 += 16; + } + + if (align_offset) { + xmm_src0 = __lsx_vld(last0, 0); + xmm_src1 = __lsx_vld(last1, 0); + xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1); + + mask = (unsigned)lsx_movemask_b(xmm_cmp); + + if (mask != 0xFFFF) + return 240 + zng_ctz32(~mask); + } + + return 256; +} + +Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) { + return compare256_lsx_static(src0, src1); +} + +#define LONGEST_MATCH 
longest_match_lsx +#define COMPARE256 compare256_lsx_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_lsx +#define COMPARE256 compare256_lsx_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/loongarch/crc32_la.c b/neozip/arch/loongarch/crc32_la.c new file mode 100644 index 0000000000..f1bd314e65 --- /dev/null +++ b/neozip/arch/loongarch/crc32_la.c @@ -0,0 +1,71 @@ +/* crc32_la.c - LoongArch version of crc32 + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_CRC + +#include "zbuild.h" + +#include <larchintrin.h> + +Z_INTERNAL uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len) { + uint32_t c = ~crc; + + if (UNLIKELY(len == 1)) { + c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c); + c = ~c; + return c; + } + + uintptr_t align_diff = ALIGN_DIFF(buf, 8); + if (align_diff) { + if (len && (align_diff & 1)) { + c = (uint32_t)__crc_w_b_w((char)(*buf++), (int)c); + len--; + } + + if (len >= 2 && (align_diff & 2)) { + c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c); + buf += 2; + len -= 2; + } + + if (len >= 4 && (align_diff & 4)) { + c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c); + len -= 4; + buf += 4; + } + + } + + while (len >= 8) { + c = (uint32_t)__crc_w_d_w((long int)*((uint64_t*)buf), (int)c); + len -= 8; + buf += 8; + } + + if (len & 4) { + c = (uint32_t)__crc_w_w_w((int)*((uint32_t*)buf), (int)c); + buf += 4; + } + + if (len & 2) { + c = (uint32_t)__crc_w_h_w((short)*((uint16_t*)buf), (int)c); + buf += 2; + } + + if (len & 1) { + c = (uint32_t)__crc_w_b_w((char)(*buf), (int)c); + } + + c = ~c; + return c; +} + +Z_INTERNAL uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_loongarch64(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git 
a/neozip/arch/loongarch/lasxintrin_ext.h b/neozip/arch/loongarch/lasxintrin_ext.h new file mode 100644 index 0000000000..b1e72cff86 --- /dev/null +++ b/neozip/arch/loongarch/lasxintrin_ext.h @@ -0,0 +1,61 @@ +/* lasxintrin_ext.h + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef LASXINTRIN_EXT_H +#define LASXINTRIN_EXT_H + +#include <lsxintrin.h> +#include <lasxintrin.h> + + +static inline __m256i lasx_zext_128(__m128i src) { +#ifdef __loongarch_asx_sx_conv + return __lasx_insert_128_lo(__lasx_xvldi(0), src); +#else + __m256i dest = __lasx_xvldi(0); + __asm__ volatile ("xvpermi.q %u0,%u2,0x30\n" : "=f"(dest) : "0"(dest), "f"(src)); + return dest; +#endif +} + +#ifndef __loongarch_asx_sx_conv +static inline __m256i __lasx_concat_128(__m128i lo, __m128i hi) { + __m256i dest; + __asm__ volatile ("xvpermi.q %u0,%u2,0x02\n" : "=f"(dest) : "0"(lo), "f"(hi)); + return dest; +} +#endif + +static inline __m256i lasx_broadcast_128(__m128i in) { + return __lasx_concat_128(in, in); +} + +static inline __m256i lasx_sad_bu(__m256i a, __m256i b) { + __m256i tmp = __lasx_xvabsd_bu(a, b); + tmp = __lasx_xvhaddw_hu_bu(tmp, tmp); + tmp = __lasx_xvhaddw_wu_hu(tmp, tmp); + return __lasx_xvhaddw_du_wu(tmp, tmp); +} + +static inline __m256i lasx_maddubs_w_h(__m256i a, __m256i b) { + return __lasx_xvsadd_h(__lasx_xvmulwod_h_bu_b(a, b), __lasx_xvmulwev_h_bu_b(a, b)); +} + +static inline __m256i lasx_madd_w_h(__m256i a, __m256i b) { + return __lasx_xvmaddwod_w_h(__lasx_xvmulwev_w_h(a, b), a, b); +} + +static inline int lasx_movemask_b(__m256i v) { + v = __lasx_xvmskltz_b(v); + return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16); +} + +/* See: lsx_shuffle_b */ +static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) { + __m256i msb_mask = __lasx_xvslti_b(b, 0); + __m256i dst = __lasx_xvshuf_b(a, a, __lasx_xvandi_b(b, 0xF)); + return __lasx_xvand_v(dst, 
__lasx_xvnor_v(msb_mask, msb_mask)); +} + +#endif // include guard LASXINTRIN_EXT_H diff --git a/neozip/arch/loongarch/loongarch_features.c b/neozip/arch/loongarch/loongarch_features.c new file mode 100644 index 0000000000..bedf8499f7 --- /dev/null +++ b/neozip/arch/loongarch/loongarch_features.c @@ -0,0 +1,31 @@ +/* loongarch_features.c -- check for LoongArch features. + * + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_FEATURES + +#include "zbuild.h" +#include "loongarch_features.h" + +#include <larchintrin.h> + +/* + * https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html + * + * Word number Bit number Annotation Implication + * 0x1 25 CRC 1 indicates support for CRC instruction + * 0x1 6 LSX 1 indicates support for 128-bit vector extension + * 0x1 7 LASX 1 indicates support for 256-bit vector expansion + */ + +void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features) { + unsigned int w1 = __cpucfg(0x1); + features->has_crc = w1 & 0x2000000; + features->has_lsx = w1 & 0x40; + features->has_lasx = w1 & 0x80; +} + +#endif diff --git a/neozip/arch/loongarch/loongarch_features.h b/neozip/arch/loongarch/loongarch_features.h new file mode 100644 index 0000000000..27c90b14b3 --- /dev/null +++ b/neozip/arch/loongarch/loongarch_features.h @@ -0,0 +1,19 @@ +/* loongarch_features.h -- check for LoongArch features. 
+ * + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef LOONGARCH_FEATURES_H_ +#define LOONGARCH_FEATURES_H_ + +struct loongarch_cpu_features { + int has_crc; + int has_lsx; + int has_lasx; +}; + +void Z_INTERNAL loongarch_check_features(struct loongarch_cpu_features *features); + +#endif /* LOONGARCH_FEATURES_H_ */ diff --git a/neozip/arch/loongarch/loongarch_functions.h b/neozip/arch/loongarch/loongarch_functions.h new file mode 100644 index 0000000000..0ec8bd66d7 --- /dev/null +++ b/neozip/arch/loongarch/loongarch_functions.h @@ -0,0 +1,86 @@ +/* loongarch_functions.h -- LoongArch implementations for arch-specific functions. + * + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef LOONGARCH_FUNCTIONS_H_ +#define LOONGARCH_FUNCTIONS_H_ + +#include "loongarch_natives.h" + +#ifdef LOONGARCH_CRC +uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_loongarch64(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef LOONGARCH_LSX +uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len); +uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_lsx(deflate_state *const s, uint32_t cur_match); +void slide_hash_lsx(deflate_state *s); +#endif + +#ifdef LOONGARCH_LASX +uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len); +uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* 
chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1); +void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start); +uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_lasx(deflate_state *const s, uint32_t cur_match); +void slide_hash_lasx(deflate_state *s); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// LOONGARCH - CRC32 +# ifdef LOONGARCH_CRC_NATIVE +# undef native_crc32 +# define native_crc32 crc32_loongarch64 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_loongarch64 +# endif +# ifdef LOONGARCH_LSX_NATIVE +# undef native_adler32 +# define native_adler32 adler32_lsx +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_lsx +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_lsx +# undef native_compare256 +# define native_compare256 compare256_lsx +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_lsx +# undef native_longest_match +# define native_longest_match longest_match_lsx +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_lsx +# undef native_slide_hash +# define native_slide_hash slide_hash_lsx +# endif +# ifdef LOONGARCH_LASX_NATIVE +# undef native_adler32 +# define native_adler32 adler32_lasx +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_lasx +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_lasx +# undef native_compare256 +# define native_compare256 compare256_lasx +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_lasx +# undef native_longest_match +# define native_longest_match longest_match_lasx +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_lasx +# undef native_slide_hash +# define native_slide_hash slide_hash_lasx +# endif +#endif + +#endif /* 
LOONGARCH_FUNCTIONS_H_ */ diff --git a/neozip/arch/loongarch/loongarch_natives.h b/neozip/arch/loongarch/loongarch_natives.h new file mode 100644 index 0000000000..35f6d3c7bd --- /dev/null +++ b/neozip/arch/loongarch/loongarch_natives.h @@ -0,0 +1,25 @@ +/* loongarch_natives.h -- LoongArch compile-time feature detection macros. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef LOONGARCH_NATIVES_H_ +#define LOONGARCH_NATIVES_H_ + +#if defined(__loongarch__) +// All known CPUs have crc instructions +# ifdef LOONGARCH_CRC +# define LOONGARCH_CRC_NATIVE +# endif +#endif +#if defined(__loongarch_sx) +# ifdef LOONGARCH_LSX +# define LOONGARCH_LSX_NATIVE +# endif +#endif +#if defined(__loongarch_asx) +# ifdef LOONGARCH_LASX +# define LOONGARCH_LASX_NATIVE +# endif +#endif + +#endif /* LOONGARCH_NATIVES_H_ */ diff --git a/neozip/arch/loongarch/lsxintrin_ext.h b/neozip/arch/loongarch/lsxintrin_ext.h new file mode 100644 index 0000000000..0a0503b9f9 --- /dev/null +++ b/neozip/arch/loongarch/lsxintrin_ext.h @@ -0,0 +1,33 @@ +/* lsxintrin_ext.h + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef LSXINTRIN_EXT_H +#define LSXINTRIN_EXT_H + +#include <lsxintrin.h> + + +static inline __m128i lsx_sad_bu(__m128i a, __m128i b) { + __m128i tmp = __lsx_vabsd_bu(a, b); + tmp = __lsx_vhaddw_hu_bu(tmp, tmp); + tmp = __lsx_vhaddw_wu_hu(tmp, tmp); + return __lsx_vhaddw_du_wu(tmp, tmp); +} + +static inline int lsx_movemask_b(__m128i v) { + return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0); +} + +static inline __m128i lsx_shuffle_b(__m128i a, __m128i b) { + /* most significant bit is set - negative 8-bit integer */ + __m128i msb_mask = __lsx_vslti_b(b, 0); + + /* shuffle, clear msb in indices vector b */ + __m128i dst = __lsx_vshuf_b(a, a, __lsx_vandi_b(b, 0xF)); + + /* invert and apply mask - clear dst-element if b-msb is set */ + return 
__lsx_vand_v(dst, __lsx_vnor_v(msb_mask, msb_mask)); +} + +#endif // include guard LSXINTRIN_EXT_H diff --git a/neozip/arch/loongarch/slide_hash_lasx.c b/neozip/arch/loongarch/slide_hash_lasx.c new file mode 100644 index 0000000000..f464779090 --- /dev/null +++ b/neozip/arch/loongarch/slide_hash_lasx.c @@ -0,0 +1,49 @@ +/* + * LASX optimized hash slide, based on Intel AVX2 implementation + * + * Copyright (C) 2017 Intel Corporation + * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru> + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * Mika T. Lindqvist <postmaster@raasu.org> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LASX + +#include "zbuild.h" +#include "deflate.h" + +#include <lasxintrin.h> + +static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) { + table += entries; + table -= 32; + + do { + __m256i value1, value2, result1, result2; + + value1 = __lasx_xvld(table, 0); + value2 = __lasx_xvld(table, 32); + result1 = __lasx_xvssub_hu(value1, wsize); + result2 = __lasx_xvssub_hu(value2, wsize); + __lasx_xvst(result1, table, 0); + __lasx_xvst(result2, table, 32); + + table -= 32; + entries -= 32; + } while (entries > 0); +} + +Z_INTERNAL void slide_hash_lasx(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize); + + slide_hash_chain(s->head, HASH_SIZE, ymm_wsize); + slide_hash_chain(s->prev, wsize, ymm_wsize); +} + +#endif diff --git a/neozip/arch/loongarch/slide_hash_lsx.c b/neozip/arch/loongarch/slide_hash_lsx.c new file mode 100644 index 0000000000..f4c94ea70d --- /dev/null +++ b/neozip/arch/loongarch/slide_hash_lsx.c @@ -0,0 +1,54 @@ +/* + * LSX optimized hash slide, based on Intel SSE implementation + * + * Copyright (C) 2017 Intel Corporation + * Copyright (C) 
2025 Vladislav Shchapov <vladislav@shchapov.ru> + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef LOONGARCH_LSX + +#include "zbuild.h" +#include "deflate.h" + +#include <lsxintrin.h> +#include <assert.h> + +static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m128i wsize) { + table += entries; + table -= 16; + + /* ZALLOC allocates this pointer unless the user chose a custom allocator. + * Our alloc function is aligned to 64 byte boundaries */ + do { + __m128i value0, value1, result0, result1; + + value0 = __lsx_vld(table, 0); + value1 = __lsx_vld(table, 16); + result0 = __lsx_vssub_hu(value0, wsize); + result1 = __lsx_vssub_hu(value1, wsize); + __lsx_vst(result0, table, 0); + __lsx_vst(result1, table, 16); + + table -= 16; + entries -= 16; + } while (entries > 0); +} + +Z_INTERNAL void slide_hash_lsx(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize); + + assert(((uintptr_t)s->head & 15) == 0); + assert(((uintptr_t)s->prev & 15) == 0); + + slide_hash_chain(s->head, HASH_SIZE, xmm_wsize); + slide_hash_chain(s->prev, wsize, xmm_wsize); +} + +#endif diff --git a/neozip/arch/power/Makefile.in b/neozip/arch/power/Makefile.in new file mode 100644 index 0000000000..e2bec5e510 --- /dev/null +++ b/neozip/arch/power/Makefile.in @@ -0,0 +1,93 @@ +# Makefile for POWER-specific files +# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM +# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org> +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +P8FLAGS=-mcpu=power8 +P9FLAGS=-mcpu=power9 +PPCFLAGS=-maltivec +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. 
+TOPDIR=$(SRCTOP) + +all: power_features.o \ + power_features.lo \ + adler32_power8.o \ + adler32_power8.lo \ + adler32_vmx.o \ + adler32_vmx.lo \ + chunkset_power8.o \ + chunkset_power8.lo \ + compare256_power9.o \ + compare256_power9.lo \ + crc32_power8.o \ + crc32_power8.lo \ + slide_hash_power8.o \ + slide_hash_power8.lo \ + slide_hash_vmx.o \ + slide_hash_vmx.lo + +power_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c + +power_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c + +adler32_power8.o: + $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c + +adler32_power8.lo: + $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c + +adler32_vmx.o: + $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c + +adler32_vmx.lo: + $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c + +chunkset_power8.o: + $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c + +chunkset_power8.lo: + $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c + +compare256_power9.o: + $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c + +compare256_power9.lo: + $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c + +crc32_power8.o: + $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c + +crc32_power8.lo: + $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c + +slide_hash_power8.o: + $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c + +slide_hash_power8.lo: + $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c + +slide_hash_vmx.o: + $(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ 
$(SRCDIR)/slide_hash_vmx.c + +slide_hash_vmx.lo: + $(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/power/adler32_power8.c b/neozip/arch/power/adler32_power8.c new file mode 100644 index 0000000000..39b3cf399c --- /dev/null +++ b/neozip/arch/power/adler32_power8.c @@ -0,0 +1,160 @@ +/* Adler32 for POWER8 using VSX instructions. + * Copyright (C) 2020 IBM Corporation + * Author: Rogerio Alves <rcardoso@linux.ibm.com> + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector) + * instructions. + * + * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means + * iteration n) is the initial value of adler - at start _0 is 1 unless + * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after + * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on. + * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on + * after iteration N. + * + * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] + + * N-1*c[1] + ... + c[N] + * + * In a more general way: + * + * s1_N = s1_0 + sum(i=1 to N)c[i] + * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i] + * + * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we + * can process N-bit at time we can do this at once. + * + * Since VSX can support 16-bit vector instructions, we can process + * 16-bit at time using N = 16 we have: + * + * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i] + * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i] + * + * After the first iteration we calculate the adler32 checksum for 16 bytes. 
+ * + * For more background about adler32 please check the RFC: + * https://www.ietf.org/rfc/rfc1950.txt + */ + +#ifdef POWER8_VSX + +#include "zbuild.h" +#include "adler32_p.h" + +#include <altivec.h> + +/* Vector across sum unsigned int (saturate). */ +static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) { + __b = vec_sld(__a, __a, 8); + __b = vec_add(__b, __a); + __a = vec_sld(__b, __b, 4); + __a = vec_add(__a, __b); + + return __a; +} + +Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) { + uint32_t s1 = adler & 0xffff; + uint32_t s2 = (adler >> 16) & 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0); + + /* This is faster than VSX code for len < 64. */ + if (len < 64) + return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0); + + /* Use POWER VSX instructions for len >= 64. */ + const vector unsigned int v_zeros = { 0 }; + const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 6, 5, 4, 3, 2, 1}; + const vector unsigned char vsh = vec_splat_u8(4); + const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0}; + vector unsigned int vs1 = { 0 }; + vector unsigned int vs2 = { 0 }; + vector unsigned int vs1_save = { 0 }; + vector unsigned int vsum1, vsum2; + vector unsigned char vbuf; + int n; + + vs1[0] = s1; + vs2[0] = s2; + + /* Do length bigger than NMAX in blocks of NMAX size. */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; + do { + vbuf = vec_xl(0, (unsigned char *) buf); + vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */ + /* sum(i=1 to 16) buf[i]*(16-i+1). */ + vsum2 = vec_msum(vbuf, v_mul, v_zeros); + /* Save vs1. */ + vs1_save = vec_add(vs1_save, vs1); + /* Accumulate the sums. */ + vs1 = vec_add(vsum1, vs1); + vs2 = vec_add(vsum2, vs2); + + buf += 16; + } while (--n); + /* Once each block of NMAX size. 
*/ + vs1 = vec_sumsu(vs1, vsum1); + vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ + vs2 = vec_add(vs1_save, vs2); + vs2 = vec_sumsu(vs2, vsum2); + + /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */ + vs1[0] = vs1[0] % BASE; + /* vs2[0] = s2_i + 16*s1_save + + sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */ + vs2[0] = vs2[0] % BASE; + + vs1 = vec_and(vs1, vmask); + vs2 = vec_and(vs2, vmask); + vs1_save = v_zeros; + } + + /* len is less than NMAX one modulo is needed. */ + if (len >= 16) { + while (len >= 16) { + len -= 16; + + vbuf = vec_xl(0, (unsigned char *) buf); + + vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */ + /* sum(i=1 to 16) buf[i]*(16-i+1). */ + vsum2 = vec_msum(vbuf, v_mul, v_zeros); + /* Save vs1. */ + vs1_save = vec_add(vs1_save, vs1); + /* Accumulate the sums. */ + vs1 = vec_add(vsum1, vs1); + vs2 = vec_add(vsum2, vs2); + + buf += 16; + } + /* Since the size will be always less than NMAX we do this once. */ + vs1 = vec_sumsu(vs1, vsum1); + vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ + vs2 = vec_add(vs1_save, vs2); + vs2 = vec_sumsu(vs2, vsum2); + } + /* Copy result back to s1, s2 (mod 65521). */ + s1 = vs1[0] % BASE; + s2 = vs2[0] % BASE; + + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0); +} + +Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) { + return adler32_impl(adler, buf, len); +} + +/* VSX/VMX stores can have higher latency than optimized memcpy on POWER8+ */ +Z_INTERNAL uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len) { + adler = adler32_impl(adler, buf, len); + memcpy(dst, buf, len); + return adler; +} +#endif /* POWER8_VSX */ diff --git a/neozip/arch/power/adler32_vmx.c b/neozip/arch/power/adler32_vmx.c new file mode 100644 index 0000000000..5171bab35b --- /dev/null +++ b/neozip/arch/power/adler32_vmx.c @@ -0,0 +1,168 @@ +/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org> + * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef PPC_VMX + +#include "zbuild.h" +#include "zendian.h" +#include "adler32_p.h" + +#include <altivec.h> + +#define vmx_zero() (vec_splat_u32(0)) + +static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) { + /* Different taps for the separable components of sums */ + const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49}; + const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33}; + const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17}; + const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + /* As silly and inefficient as it seems, creating 1 permutation vector to permute + * a 2 element vector from a single load + a subsequent shift is just barely faster + * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. 
*/ + const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2)); + vector unsigned int adacc, s2acc; + vector unsigned int pair_vec = vec_ld(0, s); + adacc = vec_perm(pair_vec, pair_vec, s0_perm); +#if BYTE_ORDER == LITTLE_ENDIAN + s2acc = vec_sro(pair_vec, shift_vec); +#else + s2acc = vec_slo(pair_vec, shift_vec); +#endif + + vector unsigned int zero = vmx_zero(); + vector unsigned int s3acc = zero; + vector unsigned int s3acc_0 = zero; + vector unsigned int adacc_prev = adacc; + vector unsigned int adacc_prev_0 = zero; + + vector unsigned int s2acc_0 = zero; + vector unsigned int s2acc_1 = zero; + vector unsigned int s2acc_2 = zero; + + /* Maintain a running sum of a second half, this might help us break yet another + * data dependency bubble in the sum */ + vector unsigned int adacc_0 = zero; + + int num_iter = len / 4; + int rem = len & 3; + + for (int i = 0; i < num_iter; ++i) { + vector unsigned char d0 = vec_ld(0, buf); + vector unsigned char d1 = vec_ld(16, buf); + vector unsigned char d2 = vec_ld(32, buf); + vector unsigned char d3 = vec_ld(48, buf); + + /* The core operation of the loop, basically + * what is being unrolled below */ + adacc = vec_sum4s(d0, adacc); + s3acc = vec_add(s3acc, adacc_prev); + s3acc_0 = vec_add(s3acc_0, adacc_prev_0); + s2acc = vec_msum(t0, d0, s2acc); + + /* interleave dependent sums in here */ + adacc_0 = vec_sum4s(d1, adacc_0); + s2acc_0 = vec_msum(t1, d1, s2acc_0); + adacc = vec_sum4s(d2, adacc); + s2acc_1 = vec_msum(t2, d2, s2acc_1); + s2acc_2 = vec_msum(t3, d3, s2acc_2); + adacc_0 = vec_sum4s(d3, adacc_0); + + adacc_prev = adacc; + adacc_prev_0 = adacc_0; + buf += 64; + } + + adacc = vec_add(adacc, adacc_0); + s3acc = vec_add(s3acc, s3acc_0); + s3acc = vec_sl(s3acc, vec_splat_u32(6)); + + if (rem) { + adacc_prev = vec_add(adacc_prev_0, adacc_prev); + adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4)); + while 
(rem--) { + vector unsigned char d0 = vec_ld(0, buf); + adacc = vec_sum4s(d0, adacc); + s3acc = vec_add(s3acc, adacc_prev); + s2acc = vec_msum(t3, d0, s2acc); + adacc_prev = vec_sl(adacc, vec_splat_u32(4)); + buf += 16; + } + } + + + /* Sum up independent second sums */ + s2acc = vec_add(s2acc, s2acc_0); + s2acc_2 = vec_add(s2acc_1, s2acc_2); + s2acc = vec_add(s2acc, s2acc_2); + + s2acc = vec_add(s2acc, s3acc); + + adacc = vec_add(adacc, vec_sld(adacc, adacc, 8)); + s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8)); + adacc = vec_add(adacc, vec_sld(adacc, adacc, 4)); + s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4)); + + vec_ste(adacc, 0, s); + vec_ste(s2acc, 0, s+1); +} + +Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) { + /* Split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); + + uint32_t pair[4] ALIGNED_(16); + pair[0] = adler; + pair[1] = sum2; + pair[2] = 0; + pair[3] = 0; + + // Align buffer + size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len); + size_t n = NMAX; + if (align_diff) { + adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0); + + buf += align_diff; + len -= align_diff; + n -= align_diff; + } + + while (len >= 16) { + n = MIN(len, n); + + vmx_accum32(pair, buf, n / 16); + pair[0] %= BASE; + pair[1] %= BASE; + + size_t k = (n / 16) * 16; + buf += k; + len -= k; + n = NMAX; + } + + /* Process tail (len < 16). 
*/ + return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0); +} + +/* VMX stores can have higher latency than optimized memcpy */ +Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler = adler32_vmx(adler, src, len); + memcpy(dst, src, len); + return adler; +} +#endif diff --git a/neozip/arch/power/chunkset_power8.c b/neozip/arch/power/chunkset_power8.c new file mode 100644 index 0000000000..f9855e677e --- /dev/null +++ b/neozip/arch/power/chunkset_power8.c @@ -0,0 +1,50 @@ +/* chunkset_power8.c -- VSX inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef POWER8_VSX + +#include "zbuild.h" +#include "zmemory.h" + +#include <altivec.h> + +typedef vector unsigned char chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = (vector unsigned char)vec_splats(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = (vector unsigned char)vec_splats(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = vec_xl(0, s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + vec_xst(*chunk, 0, out); +} + +#define CHUNKSIZE chunksize_power8 +#define CHUNKCOPY chunkcopy_power8 +#define CHUNKUNROLL chunkunroll_power8 +#define CHUNKMEMSET chunkmemset_power8 +#define CHUNKMEMSET_SAFE chunkmemset_safe_power8 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_power8 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/power/compare256_power9.c b/neozip/arch/power/compare256_power9.c new file mode 100644 index 
0000000000..99c3b0b6d1 --- /dev/null +++ b/neozip/arch/power/compare256_power9.c @@ -0,0 +1,68 @@ +/* compare256_power9.c - Power9 version of compare256 + * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef POWER9 + +#include "zbuild.h" +#include "zmemory.h" +#include "deflate.h" +#include "zendian.h" + +#include <altivec.h> + +/* Older versions of GCC misimplemented semantics for these bit counting builtins. + * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */ +#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12) +#if BYTE_ORDER == LITTLE_ENDIAN +# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc) +#else +# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc) +#endif +#else +# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc) +#endif + +static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0, cmplen; + + do { + vector unsigned char vsrc0, vsrc1, vc; + + vsrc0 = *((vector unsigned char *)src0); + vsrc1 = *((vector unsigned char *)src1); + + /* Compare 16 bytes at a time. Each byte of vc will be either + * all ones or all zeroes, depending on the result of the comparison. */ + vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1); + + /* Since the index of matching bytes will contain only zeroes + * on vc (since we used cmpne), counting the number of consecutive + * bytes where LSB == 0 is the same as counting the length of the match. 
*/ + zng_vec_vctzlsbb(vc, cmplen); + if (cmplen != 16) + return len + cmplen; + + src0 += 16, src1 += 16, len += 16; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) { + return compare256_power9_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_power9 +#define COMPARE256 compare256_power9_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_power9 +#define COMPARE256 compare256_power9_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/power/crc32_constants.h b/neozip/arch/power/crc32_constants.h new file mode 100644 index 0000000000..8c8f2153b6 --- /dev/null +++ b/neozip/arch/power/crc32_constants.h @@ -0,0 +1,1123 @@ +/* Constants table used by crc32_power8.c + * Copyright (C) 2021 IBM Corporation + * + * This file was automatically generated, DO NOT EDIT IT MANUALLY. + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zendian.h" +#include "zbuild.h" + +/* Reduce 262144 kbits to 1024 bits */ +static const __vector unsigned long long vcrc_const[255] ALIGNED_(16) = { +#if BYTE_ORDER == LITTLE_ENDIAN + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + { 0x0000000099ea94a8, 0x00000001651797d2 }, + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + { 0x00000000945a8420, 0x0000000021e0d56c }, + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + { 0x0000000030762706, 0x000000000f95ecaa }, + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + { 0x00000001a52fc582, 0x00000001ebd224ac }, + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + { 0x00000001a4a7167a, 0x000000000ccb97ca }, + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + { 0x000000000c18249a, 0x00000001006ec8a8 }, + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + { 0x00000000a924ae7c, 0x000000014f58f196 }, + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + { 
0x00000001e12ccc12, 0x00000001a7192ca6 }, + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + { 0x00000000a0b9d4ac, 0x000000019a64bab2 }, + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + { 0x0000000095e8ddfe, 0x0000000014f4ed2e }, + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + { 0x00000000233fddc4, 0x000000011092b6a2 }, + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + { 0x00000001b4529b62, 0x00000000c8a1629c }, + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + { 0x00000001a7fa0e64, 0x000000017bf32e8e }, + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + { 0x00000001b5334592, 0x00000001f8cc6582 }, + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + { 0x000000011f8ee1b4, 0x000000008631ddf0 }, + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + { 0x000000006252e632, 0x000000007e5a76d0 }, + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + { 0x00000000ab973e84, 0x000000002b09b31c }, + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + { 0x000000007734f5ec, 0x00000001b2df1f84 }, + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + { 0x000000007c547798, 0x00000001d6f56afc }, + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + { 0x000000007ec40210, 0x00000001b9b5e70c }, + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + { 0x00000001ab1695a8, 0x0000000034b626d2 }, + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + { 0x0000000090494bba, 0x000000014c53479a }, + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + { 0x00000001123fb816, 0x00000001a6d179a4 }, + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + { 0x00000001e188c74c, 0x000000015abd16b4 }, + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + { 0x00000001c2d3451c, 0x00000000018f9852 }, + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + { 0x00000000f55cf1ca, 0x000000001fb3084a }, + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + { 0x00000001a0531540, 0x00000000c53dfb04 }, 
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + { 0x0000000132cd7ebc, 0x00000000e10c9ad6 }, + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + { 0x0000000073ab7f36, 0x0000000025aa994a }, + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + { 0x0000000041aed1c2, 0x00000000fa3a74c4 }, + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + { 0x0000000136c53800, 0x0000000033eb3f40 }, + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + { 0x0000000126835a30, 0x000000017193f296 }, + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + { 0x000000006241b502, 0x0000000043f6c86a }, + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + { 0x00000000d5196ad4, 0x000000016b513ec6 }, + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + { 0x000000009cfa769a, 0x00000000c8f25b4e }, + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + { 0x00000000920e5df4, 0x00000001a45048ec }, + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + { 0x0000000169dc310e, 0x000000000c441004 }, + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + { 0x0000000009fc331c, 0x000000000e17cad6 }, + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + { 0x000000010d94a81e, 0x00000001253ae964 }, + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + { 0x0000000027a20ab2, 0x00000001d7c88ebc }, + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + { 0x0000000114f87504, 0x00000001e7ca913a }, + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + { 0x000000004b076d96, 0x0000000033ed078a }, + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + { 0x00000000da4d1e74, 0x00000000e1839c78 }, + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + { 0x000000001b81f672, 0x00000001322b267e }, + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + { 0x000000009367c988, 0x00000000638231b6 }, + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + { 0x00000001717214ca, 0x00000001ee7f16f4 }, + /* x^214016 mod p(x)` << 1, x^214080 mod 
p(x)` << 1 */ + { 0x000000009f47d820, 0x0000000117d9924a }, + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + { 0x000000010d9a47d2, 0x00000000e1a9e0c4 }, + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + { 0x00000000a696c58c, 0x00000001403731dc }, + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + { 0x000000002aa28ec6, 0x00000001a5ea9682 }, + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + { 0x00000001fe18fd9a, 0x0000000101c5c578 }, + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + { 0x000000019d4fc1ae, 0x00000000dddf6494 }, + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + { 0x00000001ba0e3dea, 0x00000000f1c3db28 }, + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + { 0x0000000074b59a5e, 0x000000013112fb9c }, + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + { 0x00000000f2b5ea98, 0x00000000b680b906 }, + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + { 0x0000000187132676, 0x000000001a282932 }, + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + { 0x000000010a8c6ad4, 0x0000000089406e7e }, + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + { 0x00000001e21dfe70, 0x00000001def6be8c }, + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + { 0x00000001da0050e4, 0x0000000075258728 }, + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + { 0x00000000772172ae, 0x000000019536090a }, + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + { 0x00000000e47724aa, 0x00000000f2455bfc }, + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + { 0x000000003cd63ac4, 0x000000018c40baf4 }, + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + { 0x00000001bf47d352, 0x000000004cd390d4 }, + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + { 0x000000018dc1d708, 0x00000001e4ece95a }, + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + { 0x000000002d4620a4, 0x000000001a3ee918 }, + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + { 0x0000000058fd1740, 
0x000000007c652fb8 }, + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + { 0x00000000dadd9bfc, 0x000000011c67842c }, + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + { 0x00000001ea2140be, 0x00000000254f759c }, + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + { 0x000000009de128ba, 0x000000007ece94ca }, + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + { 0x000000013ac3aa8e, 0x0000000038f258c2 }, + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + { 0x0000000099980562, 0x00000001cdf17b00 }, + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + { 0x00000001c1579c86, 0x000000011f882c16 }, + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + { 0x0000000068dbbf94, 0x0000000100093fc8 }, + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + { 0x000000004509fb04, 0x00000001cd684f16 }, + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + { 0x00000001202f6398, 0x000000004bc6a70a }, + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + { 0x000000013aea243e, 0x000000004fc7e8e4 }, + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + { 0x00000001b4052ae6, 0x0000000130103f1c }, + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + { 0x00000001cd2a0ae8, 0x0000000111b0024c }, + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + { 0x00000001fe4aa8b4, 0x000000010b3079da }, + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + { 0x00000001d1559a42, 0x000000010192bcc2 }, + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + { 0x00000001f3e05ecc, 0x0000000074838d50 }, + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + { 0x0000000104ddd2cc, 0x000000001b20f520 }, + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + { 0x000000015393153c, 0x0000000050c3590a }, + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + { 0x0000000057e942c6, 0x00000000b41cac8e }, + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + { 0x000000012c633850, 0x000000000c72cc78 }, + /* x^174080 mod 
p(x)` << 1, x^174144 mod p(x)` << 1 */ + { 0x00000000ebcaae4c, 0x0000000030cdb032 }, + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + { 0x000000013ee532a6, 0x000000013e09fc32 }, + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + { 0x00000001bf0cbc7e, 0x000000001ed624d2 }, + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + { 0x00000000d50b7a5a, 0x00000000781aee1a }, + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + { 0x0000000002fca6e8, 0x00000001c4d8348c }, + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + { 0x000000007af40044, 0x0000000057a40336 }, + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + { 0x0000000016178744, 0x0000000085544940 }, + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + { 0x000000014c177458, 0x000000019cd21e80 }, + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + { 0x000000011b6ddf04, 0x000000013eb95bc0 }, + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + { 0x00000001f3e29ccc, 0x00000001dfc9fdfc }, + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + { 0x0000000135ae7562, 0x00000000cd028bc2 }, + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + { 0x0000000190ef812c, 0x0000000090db8c44 }, + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + { 0x0000000067a2c786, 0x000000010010a4ce }, + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + { 0x0000000048b9496c, 0x00000001c8f4c72c }, + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + { 0x000000015a422de6, 0x000000001c26170c }, + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + { 0x00000001ef0e3640, 0x00000000e3fccf68 }, + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + { 0x00000001006d2d26, 0x00000000d513ed24 }, + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + { 0x00000001170d56d6, 0x00000000141beada }, + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + { 0x00000000a5fb613c, 0x000000011071aea0 }, + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + { 
0x0000000040bbf7fc, 0x000000012e19080a }, + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + { 0x000000016ac3a5b2, 0x0000000100ecf826 }, + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + { 0x00000000abf16230, 0x0000000069b09412 }, + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + { 0x00000001ebe23fac, 0x0000000122297bac }, + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + { 0x000000008b6a0894, 0x00000000e9e4b068 }, + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + { 0x00000001288ea478, 0x000000004b38651a }, + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + { 0x000000016619c442, 0x00000001468360e2 }, + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + { 0x0000000086230038, 0x00000000121c2408 }, + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + { 0x000000017746a756, 0x00000000da7e7d08 }, + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + { 0x0000000191b8f8f8, 0x00000001058d7652 }, + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + { 0x000000008e167708, 0x000000014a098a90 }, + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + { 0x0000000148b22d54, 0x0000000020dbe72e }, + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + { 0x0000000044ba2c3c, 0x000000011e7323e8 }, + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + { 0x00000000b54d2b52, 0x00000000d5d4bf94 }, + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + { 0x0000000005a4fd8a, 0x0000000199d8746c }, + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + { 0x0000000139f9fc46, 0x00000000ce9ca8a0 }, + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + { 0x000000015a1fa824, 0x00000000136edece }, + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + { 0x000000000a61ae4c, 0x000000019b92a068 }, + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + { 0x0000000145e9113e, 0x0000000071d62206 }, + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + { 0x000000006a348448, 0x00000000dfc50158 }, 
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + { 0x000000004d80a08c, 0x00000001517626bc }, + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + { 0x000000014b6837a0, 0x0000000148d1e4fa }, + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + { 0x000000016896a7fc, 0x0000000094d8266e }, + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + { 0x000000014f187140, 0x00000000606c5e34 }, + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + { 0x000000019581b9da, 0x000000019766beaa }, + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + { 0x00000001091bc984, 0x00000001d80c506c }, + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + { 0x000000001067223c, 0x000000001e73837c }, + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + { 0x00000001ab16ea02, 0x0000000064d587de }, + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + { 0x000000013c4598a8, 0x00000000f4a507b0 }, + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + { 0x00000000b3735430, 0x0000000040e342fc }, + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + { 0x00000001bb3fc0c0, 0x00000001d5ad9c3a }, + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + { 0x00000001570ae19c, 0x0000000094a691a4 }, + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + { 0x00000001ea910712, 0x00000001271ecdfa }, + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + { 0x0000000167127128, 0x000000009e54475a }, + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + { 0x0000000019e790a2, 0x00000000c9c099ee }, + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + { 0x000000003788f710, 0x000000009a2f736c }, + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + { 0x00000001682a160e, 0x00000000bb9f4996 }, + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + { 0x000000007f0ebd2e, 0x00000001db688050 }, + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + { 0x000000002b032080, 0x00000000e9b10af4 }, + /* x^114688 mod p(x)` << 1, x^114752 mod 
p(x)` << 1 */ + { 0x00000000cfd1664a, 0x000000012d4545e4 }, + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + { 0x00000000aa1181c2, 0x000000000361139c }, + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + { 0x00000000ddd08002, 0x00000001a5a1a3a8 }, + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + { 0x00000000e8dd0446, 0x000000006844e0b0 }, + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + { 0x00000001bbd94a00, 0x00000000c3762f28 }, + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + { 0x00000000ab6cd180, 0x00000001d26287a2 }, + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + { 0x0000000031803ce2, 0x00000001f6f0bba8 }, + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + { 0x0000000024f40b0c, 0x000000002ffabd62 }, + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + { 0x00000001ba1d9834, 0x00000000fb4516b8 }, + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + { 0x0000000104de61aa, 0x000000018cfa961c }, + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + { 0x0000000113e40d46, 0x000000019e588d52 }, + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + { 0x00000001415598a0, 0x00000001180f0bbc }, + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + { 0x00000000bf6c8c90, 0x00000000e1d9177a }, + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + { 0x00000001788b0504, 0x0000000105abc27c }, + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + { 0x0000000038385d02, 0x00000000972e4a58 }, + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + { 0x00000001b6c83844, 0x0000000183499a5e }, + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + { 0x0000000051061a8a, 0x00000001c96a8cca }, + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + { 0x000000017351388a, 0x00000001a1a5b60c }, + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + { 0x0000000132928f92, 0x00000000e4b6ac9c }, + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + { 0x00000000e6b4f48a, 
0x00000001807e7f5a }, + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + { 0x0000000039d15e90, 0x000000017a7e3bc8 }, + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + { 0x00000000312d6074, 0x00000000d73975da }, + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + { 0x000000017bbb2cc4, 0x000000017375d038 }, + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + { 0x000000016ded3e18, 0x00000000193680bc }, + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + { 0x00000000f1638b16, 0x00000000999b06f6 }, + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + { 0x00000001d38b9ecc, 0x00000001f685d2b8 }, + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + { 0x000000018b8d09dc, 0x00000001f4ecbed2 }, + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + { 0x00000000e7bc27d2, 0x00000000ba16f1a0 }, + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + { 0x00000000275e1e96, 0x0000000115aceac4 }, + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + { 0x00000000e2e3031e, 0x00000001aeff6292 }, + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + { 0x00000001041c84d8, 0x000000009640124c }, + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + { 0x00000000706ce672, 0x0000000114f41f02 }, + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + { 0x000000015d5070da, 0x000000009c5f3586 }, + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + { 0x0000000038f9493a, 0x00000001878275fa }, + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + { 0x00000000a3348a76, 0x00000000ddc42ce8 }, + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + { 0x00000001ad0aab92, 0x0000000181d2c73a }, + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + { 0x000000019e85f712, 0x0000000141c9320a }, + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + { 0x000000005a871e76, 0x000000015235719a }, + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + { 0x000000017249c662, 0x00000000be27d804 }, + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + { 
0x000000003a084712, 0x000000006242d45a }, + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + { 0x00000000ed438478, 0x000000009a53638e }, + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + { 0x00000000abac34cc, 0x00000001001ecfb6 }, + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + { 0x000000005f35ef3e, 0x000000016d7c2d64 }, + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + { 0x0000000047d6608c, 0x00000001d0ce46c0 }, + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + { 0x000000002d01470e, 0x0000000124c907b4 }, + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + { 0x0000000158bbc7b0, 0x0000000018a555ca }, + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + { 0x00000000c0a23e8e, 0x000000006b0980bc }, + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + { 0x00000001ebd85c88, 0x000000008bbba964 }, + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + { 0x000000019ee20bb2, 0x00000001070a5a1e }, + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + { 0x00000001acabf2d6, 0x000000002204322a }, + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + { 0x00000001b7963d56, 0x00000000a27524d0 }, + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + { 0x000000017bffa1fe, 0x0000000020b1e4ba }, + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + { 0x000000001f15333e, 0x0000000032cc27fc }, + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + { 0x000000018593129e, 0x0000000044dd22b8 }, + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + { 0x000000019cb32602, 0x00000000dffc9e0a }, + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + { 0x0000000142b05cc8, 0x00000001b7a0ed14 }, + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + { 0x00000001be49e7a4, 0x00000000c7842488 }, + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + { 0x0000000108f69d6c, 0x00000001c02a4fee }, + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + { 0x000000006c0971f0, 0x000000003c273778 }, + /* x^54272 mod p(x)` << 1, x^54336 
mod p(x)` << 1 */ + { 0x000000005b16467a, 0x00000001d63f8894 }, + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + { 0x00000001551a628e, 0x000000006be557d6 }, + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + { 0x000000019e42ea92, 0x000000006a7806ea }, + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + { 0x000000012fa83ff2, 0x000000016155aa0c }, + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + { 0x000000011ca9cde0, 0x00000000908650ac }, + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + { 0x00000000c8e5cd74, 0x00000000aa5a8084 }, + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + { 0x0000000096c27f0c, 0x0000000191bb500a }, + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + { 0x000000002baed926, 0x0000000064e9bed0 }, + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + { 0x000000017c8de8d2, 0x000000009444f302 }, + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + { 0x00000000d43d6068, 0x000000019db07d3c }, + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + { 0x00000000cb2c4b26, 0x00000001359e3e6e }, + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + { 0x0000000145b8da26, 0x00000001e4f10dd2 }, + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + { 0x000000018fff4b08, 0x0000000124f5735e }, + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + { 0x0000000150b58ed0, 0x0000000124760a4c }, + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + { 0x00000001549f39bc, 0x000000000f1fc186 }, + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + { 0x00000000ef4d2f42, 0x00000000150e4cc4 }, + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + { 0x00000001b1468572, 0x000000002a6204e8 }, + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + { 0x000000013d7403b2, 0x00000000beb1d432 }, + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + { 0x00000001a4681842, 0x0000000135f3f1f0 }, + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + { 0x0000000167714492, 0x0000000074fe2232 }, + /* x^33792 mod 
p(x)` << 1, x^33856 mod p(x)` << 1 */ + { 0x00000001e599099a, 0x000000001ac6e2ba }, + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + { 0x00000000fe128194, 0x0000000013fca91e }, + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + { 0x0000000077e8b990, 0x0000000183f4931e }, + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + { 0x00000001a267f63a, 0x00000000b6d9b4e4 }, + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + { 0x00000001945c245a, 0x00000000b5188656 }, + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + { 0x0000000149002e76, 0x0000000027a81a84 }, + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + { 0x00000001bb8310a4, 0x0000000125699258 }, + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + { 0x000000019ec60bcc, 0x00000001b23de796 }, + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + { 0x000000012d8590ae, 0x00000000fe4365dc }, + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + { 0x0000000065b00684, 0x00000000c68f497a }, + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + { 0x000000015e5aeadc, 0x00000000fbf521ee }, + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + { 0x00000000b77ff2b0, 0x000000015eac3378 }, + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + { 0x0000000188da2ff6, 0x0000000134914b90 }, + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + { 0x0000000063da929a, 0x0000000016335cfe }, + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + { 0x00000001389caa80, 0x000000010372d10c }, + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + { 0x000000013db599d2, 0x000000015097b908 }, + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + { 0x0000000122505a86, 0x00000001227a7572 }, + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + { 0x000000016bd72746, 0x000000009a8f75c0 }, + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + { 0x00000001c3faf1d4, 0x00000000682c77a2 }, + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + { 0x00000001111c826c, 0x00000000231f091c 
}, + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + { 0x00000000153e9fb2, 0x000000007d4439f2 }, + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + { 0x000000002b1f7b60, 0x000000017e221efc }, + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + { 0x00000000b1dba570, 0x0000000167457c38 }, + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + { 0x00000001f6397b76, 0x00000000bdf081c4 }, + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + { 0x0000000156335214, 0x000000016286d6b0 }, + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + { 0x00000001d70e3986, 0x00000000c84f001c }, + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + { 0x000000003701a774, 0x0000000064efe7c0 }, + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + { 0x00000000ac81ef72, 0x000000000ac2d904 }, + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + { 0x0000000133212464, 0x00000000fd226d14 }, + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + { 0x00000000e4e45610, 0x000000011cfd42e0 }, + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + { 0x000000000c1bd370, 0x000000016e5a5678 }, + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + { 0x00000001a7b9e7a6, 0x00000001d888fe22 }, + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + { 0x000000007d657a10, 0x00000001af77fcd4 } +#else /* BYTE_ORDER == LITTLE_ENDIAN */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + { 0x00000001651797d2, 0x0000000099ea94a8 }, + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + { 0x0000000021e0d56c, 0x00000000945a8420 }, + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + { 0x000000000f95ecaa, 0x0000000030762706 }, + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + { 0x00000001ebd224ac, 0x00000001a52fc582 }, + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + { 0x000000000ccb97ca, 0x00000001a4a7167a }, + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + { 0x00000001006ec8a8, 0x000000000c18249a }, + /* x^254976 mod p(x)` << 1, x^255040 mod 
p(x)` << 1 */ + { 0x000000014f58f196, 0x00000000a924ae7c }, + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + { 0x00000001a7192ca6, 0x00000001e12ccc12 }, + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + { 0x000000019a64bab2, 0x00000000a0b9d4ac }, + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + { 0x0000000014f4ed2e, 0x0000000095e8ddfe }, + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + { 0x000000011092b6a2, 0x00000000233fddc4 }, + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + { 0x00000000c8a1629c, 0x00000001b4529b62 }, + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + { 0x000000017bf32e8e, 0x00000001a7fa0e64 }, + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + { 0x00000001f8cc6582, 0x00000001b5334592 }, + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + { 0x000000008631ddf0, 0x000000011f8ee1b4 }, + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + { 0x000000007e5a76d0, 0x000000006252e632 }, + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + { 0x000000002b09b31c, 0x00000000ab973e84 }, + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + { 0x00000001b2df1f84, 0x000000007734f5ec }, + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + { 0x00000001d6f56afc, 0x000000007c547798 }, + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + { 0x00000001b9b5e70c, 0x000000007ec40210 }, + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + { 0x0000000034b626d2, 0x00000001ab1695a8 }, + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + { 0x000000014c53479a, 0x0000000090494bba }, + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + { 0x00000001a6d179a4, 0x00000001123fb816 }, + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + { 0x000000015abd16b4, 0x00000001e188c74c }, + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + { 0x00000000018f9852, 0x00000001c2d3451c }, + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + { 0x000000001fb3084a, 
0x00000000f55cf1ca }, + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + { 0x00000000c53dfb04, 0x00000001a0531540 }, + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + { 0x00000000e10c9ad6, 0x0000000132cd7ebc }, + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + { 0x0000000025aa994a, 0x0000000073ab7f36 }, + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + { 0x00000000fa3a74c4, 0x0000000041aed1c2 }, + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + { 0x0000000033eb3f40, 0x0000000136c53800 }, + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + { 0x000000017193f296, 0x0000000126835a30 }, + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + { 0x0000000043f6c86a, 0x000000006241b502 }, + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + { 0x000000016b513ec6, 0x00000000d5196ad4 }, + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + { 0x00000000c8f25b4e, 0x000000009cfa769a }, + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + { 0x00000001a45048ec, 0x00000000920e5df4 }, + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + { 0x000000000c441004, 0x0000000169dc310e }, + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + { 0x000000000e17cad6, 0x0000000009fc331c }, + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + { 0x00000001253ae964, 0x000000010d94a81e }, + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + { 0x00000001d7c88ebc, 0x0000000027a20ab2 }, + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + { 0x00000001e7ca913a, 0x0000000114f87504 }, + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + { 0x0000000033ed078a, 0x000000004b076d96 }, + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + { 0x00000000e1839c78, 0x00000000da4d1e74 }, + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + { 0x00000001322b267e, 0x000000001b81f672 }, + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + { 0x00000000638231b6, 0x000000009367c988 }, + /* x^215040 mod 
p(x)` << 1, x^215104 mod p(x)` << 1 */ + { 0x00000001ee7f16f4, 0x00000001717214ca }, + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + { 0x0000000117d9924a, 0x000000009f47d820 }, + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + { 0x00000000e1a9e0c4, 0x000000010d9a47d2 }, + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + { 0x00000001403731dc, 0x00000000a696c58c }, + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + { 0x00000001a5ea9682, 0x000000002aa28ec6 }, + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + { 0x0000000101c5c578, 0x00000001fe18fd9a }, + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + { 0x00000000dddf6494, 0x000000019d4fc1ae }, + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + { 0x00000000f1c3db28, 0x00000001ba0e3dea }, + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + { 0x000000013112fb9c, 0x0000000074b59a5e }, + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + { 0x00000000b680b906, 0x00000000f2b5ea98 }, + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + { 0x000000001a282932, 0x0000000187132676 }, + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + { 0x0000000089406e7e, 0x000000010a8c6ad4 }, + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + { 0x00000001def6be8c, 0x00000001e21dfe70 }, + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + { 0x0000000075258728, 0x00000001da0050e4 }, + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + { 0x000000019536090a, 0x00000000772172ae }, + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + { 0x00000000f2455bfc, 0x00000000e47724aa }, + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + { 0x000000018c40baf4, 0x000000003cd63ac4 }, + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + { 0x000000004cd390d4, 0x00000001bf47d352 }, + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + { 0x00000001e4ece95a, 0x000000018dc1d708 }, + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + { 
0x000000001a3ee918, 0x000000002d4620a4 }, + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + { 0x000000007c652fb8, 0x0000000058fd1740 }, + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + { 0x000000011c67842c, 0x00000000dadd9bfc }, + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + { 0x00000000254f759c, 0x00000001ea2140be }, + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + { 0x000000007ece94ca, 0x000000009de128ba }, + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + { 0x0000000038f258c2, 0x000000013ac3aa8e }, + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + { 0x00000001cdf17b00, 0x0000000099980562 }, + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + { 0x000000011f882c16, 0x00000001c1579c86 }, + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + { 0x0000000100093fc8, 0x0000000068dbbf94 }, + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + { 0x00000001cd684f16, 0x000000004509fb04 }, + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + { 0x000000004bc6a70a, 0x00000001202f6398 }, + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + { 0x000000004fc7e8e4, 0x000000013aea243e }, + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + { 0x0000000130103f1c, 0x00000001b4052ae6 }, + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + { 0x0000000111b0024c, 0x00000001cd2a0ae8 }, + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + { 0x000000010b3079da, 0x00000001fe4aa8b4 }, + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + { 0x000000010192bcc2, 0x00000001d1559a42 }, + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + { 0x0000000074838d50, 0x00000001f3e05ecc }, + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + { 0x000000001b20f520, 0x0000000104ddd2cc }, + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + { 0x0000000050c3590a, 0x000000015393153c }, + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + { 0x00000000b41cac8e, 0x0000000057e942c6 }, 
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + { 0x000000000c72cc78, 0x000000012c633850 }, + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + { 0x0000000030cdb032, 0x00000000ebcaae4c }, + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + { 0x000000013e09fc32, 0x000000013ee532a6 }, + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + { 0x000000001ed624d2, 0x00000001bf0cbc7e }, + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + { 0x00000000781aee1a, 0x00000000d50b7a5a }, + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + { 0x00000001c4d8348c, 0x0000000002fca6e8 }, + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + { 0x0000000057a40336, 0x000000007af40044 }, + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + { 0x0000000085544940, 0x0000000016178744 }, + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + { 0x000000019cd21e80, 0x000000014c177458 }, + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + { 0x000000013eb95bc0, 0x000000011b6ddf04 }, + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + { 0x00000001dfc9fdfc, 0x00000001f3e29ccc }, + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + { 0x00000000cd028bc2, 0x0000000135ae7562 }, + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + { 0x0000000090db8c44, 0x0000000190ef812c }, + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + { 0x000000010010a4ce, 0x0000000067a2c786 }, + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + { 0x00000001c8f4c72c, 0x0000000048b9496c }, + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + { 0x000000001c26170c, 0x000000015a422de6 }, + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + { 0x00000000e3fccf68, 0x00000001ef0e3640 }, + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + { 0x00000000d513ed24, 0x00000001006d2d26 }, + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + { 0x00000000141beada, 0x00000001170d56d6 }, + /* x^155648 mod p(x)` << 1, x^155712 mod 
p(x)` << 1 */ + { 0x000000011071aea0, 0x00000000a5fb613c }, + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + { 0x000000012e19080a, 0x0000000040bbf7fc }, + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + { 0x0000000100ecf826, 0x000000016ac3a5b2 }, + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + { 0x0000000069b09412, 0x00000000abf16230 }, + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + { 0x0000000122297bac, 0x00000001ebe23fac }, + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + { 0x00000000e9e4b068, 0x000000008b6a0894 }, + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + { 0x000000004b38651a, 0x00000001288ea478 }, + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + { 0x00000001468360e2, 0x000000016619c442 }, + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + { 0x00000000121c2408, 0x0000000086230038 }, + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + { 0x00000000da7e7d08, 0x000000017746a756 }, + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + { 0x00000001058d7652, 0x0000000191b8f8f8 }, + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + { 0x000000014a098a90, 0x000000008e167708 }, + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + { 0x0000000020dbe72e, 0x0000000148b22d54 }, + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + { 0x000000011e7323e8, 0x0000000044ba2c3c }, + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + { 0x00000000d5d4bf94, 0x00000000b54d2b52 }, + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + { 0x0000000199d8746c, 0x0000000005a4fd8a }, + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + { 0x00000000ce9ca8a0, 0x0000000139f9fc46 }, + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + { 0x00000000136edece, 0x000000015a1fa824 }, + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + { 0x000000019b92a068, 0x000000000a61ae4c }, + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + { 0x0000000071d62206, 
0x0000000145e9113e }, + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + { 0x00000000dfc50158, 0x000000006a348448 }, + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + { 0x00000001517626bc, 0x000000004d80a08c }, + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + { 0x0000000148d1e4fa, 0x000000014b6837a0 }, + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + { 0x0000000094d8266e, 0x000000016896a7fc }, + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + { 0x00000000606c5e34, 0x000000014f187140 }, + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + { 0x000000019766beaa, 0x000000019581b9da }, + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + { 0x00000001d80c506c, 0x00000001091bc984 }, + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + { 0x000000001e73837c, 0x000000001067223c }, + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + { 0x0000000064d587de, 0x00000001ab16ea02 }, + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + { 0x00000000f4a507b0, 0x000000013c4598a8 }, + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + { 0x0000000040e342fc, 0x00000000b3735430 }, + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + { 0x00000001d5ad9c3a, 0x00000001bb3fc0c0 }, + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + { 0x0000000094a691a4, 0x00000001570ae19c }, + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + { 0x00000001271ecdfa, 0x00000001ea910712 }, + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + { 0x000000009e54475a, 0x0000000167127128 }, + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + { 0x00000000c9c099ee, 0x0000000019e790a2 }, + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + { 0x000000009a2f736c, 0x000000003788f710 }, + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + { 0x00000000bb9f4996, 0x00000001682a160e }, + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + { 0x00000001db688050, 0x000000007f0ebd2e }, + /* x^115712 mod 
p(x)` << 1, x^115776 mod p(x)` << 1 */ + { 0x00000000e9b10af4, 0x000000002b032080 }, + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + { 0x000000012d4545e4, 0x00000000cfd1664a }, + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + { 0x000000000361139c, 0x00000000aa1181c2 }, + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + { 0x00000001a5a1a3a8, 0x00000000ddd08002 }, + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + { 0x000000006844e0b0, 0x00000000e8dd0446 }, + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + { 0x00000000c3762f28, 0x00000001bbd94a00 }, + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + { 0x00000001d26287a2, 0x00000000ab6cd180 }, + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + { 0x00000001f6f0bba8, 0x0000000031803ce2 }, + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + { 0x000000002ffabd62, 0x0000000024f40b0c }, + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + { 0x00000000fb4516b8, 0x00000001ba1d9834 }, + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + { 0x000000018cfa961c, 0x0000000104de61aa }, + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + { 0x000000019e588d52, 0x0000000113e40d46 }, + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + { 0x00000001180f0bbc, 0x00000001415598a0 }, + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + { 0x00000000e1d9177a, 0x00000000bf6c8c90 }, + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + { 0x0000000105abc27c, 0x00000001788b0504 }, + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + { 0x00000000972e4a58, 0x0000000038385d02 }, + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + { 0x0000000183499a5e, 0x00000001b6c83844 }, + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + { 0x00000001c96a8cca, 0x0000000051061a8a }, + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + { 0x00000001a1a5b60c, 0x000000017351388a }, + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + { 
0x00000000e4b6ac9c, 0x0000000132928f92 }, + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + { 0x00000001807e7f5a, 0x00000000e6b4f48a }, + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + { 0x000000017a7e3bc8, 0x0000000039d15e90 }, + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + { 0x00000000d73975da, 0x00000000312d6074 }, + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + { 0x000000017375d038, 0x000000017bbb2cc4 }, + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + { 0x00000000193680bc, 0x000000016ded3e18 }, + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + { 0x00000000999b06f6, 0x00000000f1638b16 }, + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + { 0x00000001f685d2b8, 0x00000001d38b9ecc }, + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + { 0x00000001f4ecbed2, 0x000000018b8d09dc }, + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + { 0x00000000ba16f1a0, 0x00000000e7bc27d2 }, + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + { 0x0000000115aceac4, 0x00000000275e1e96 }, + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + { 0x00000001aeff6292, 0x00000000e2e3031e }, + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + { 0x000000009640124c, 0x00000001041c84d8 }, + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + { 0x0000000114f41f02, 0x00000000706ce672 }, + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + { 0x000000009c5f3586, 0x000000015d5070da }, + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + { 0x00000001878275fa, 0x0000000038f9493a }, + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + { 0x00000000ddc42ce8, 0x00000000a3348a76 }, + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + { 0x0000000181d2c73a, 0x00000001ad0aab92 }, + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + { 0x0000000141c9320a, 0x000000019e85f712 }, + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + { 0x000000015235719a, 0x000000005a871e76 }, + /* x^75776 mod p(x)` << 1, x^75840 
mod p(x)` << 1 */ + { 0x00000000be27d804, 0x000000017249c662 }, + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + { 0x000000006242d45a, 0x000000003a084712 }, + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + { 0x000000009a53638e, 0x00000000ed438478 }, + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + { 0x00000001001ecfb6, 0x00000000abac34cc }, + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + { 0x000000016d7c2d64, 0x000000005f35ef3e }, + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + { 0x00000001d0ce46c0, 0x0000000047d6608c }, + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + { 0x0000000124c907b4, 0x000000002d01470e }, + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + { 0x0000000018a555ca, 0x0000000158bbc7b0 }, + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + { 0x000000006b0980bc, 0x00000000c0a23e8e }, + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + { 0x000000008bbba964, 0x00000001ebd85c88 }, + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + { 0x00000001070a5a1e, 0x000000019ee20bb2 }, + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + { 0x000000002204322a, 0x00000001acabf2d6 }, + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + { 0x00000000a27524d0, 0x00000001b7963d56 }, + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + { 0x0000000020b1e4ba, 0x000000017bffa1fe }, + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + { 0x0000000032cc27fc, 0x000000001f15333e }, + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + { 0x0000000044dd22b8, 0x000000018593129e }, + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + { 0x00000000dffc9e0a, 0x000000019cb32602 }, + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + { 0x00000001b7a0ed14, 0x0000000142b05cc8 }, + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + { 0x00000000c7842488, 0x00000001be49e7a4 }, + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + { 0x00000001c02a4fee, 0x0000000108f69d6c }, + /* x^55296 mod 
p(x)` << 1, x^55360 mod p(x)` << 1 */ + { 0x000000003c273778, 0x000000006c0971f0 }, + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + { 0x00000001d63f8894, 0x000000005b16467a }, + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + { 0x000000006be557d6, 0x00000001551a628e }, + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + { 0x000000006a7806ea, 0x000000019e42ea92 }, + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + { 0x000000016155aa0c, 0x000000012fa83ff2 }, + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + { 0x00000000908650ac, 0x000000011ca9cde0 }, + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + { 0x00000000aa5a8084, 0x00000000c8e5cd74 }, + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + { 0x0000000191bb500a, 0x0000000096c27f0c }, + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + { 0x0000000064e9bed0, 0x000000002baed926 }, + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + { 0x000000009444f302, 0x000000017c8de8d2 }, + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + { 0x000000019db07d3c, 0x00000000d43d6068 }, + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + { 0x00000001359e3e6e, 0x00000000cb2c4b26 }, + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + { 0x00000001e4f10dd2, 0x0000000145b8da26 }, + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + { 0x0000000124f5735e, 0x000000018fff4b08 }, + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + { 0x0000000124760a4c, 0x0000000150b58ed0 }, + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + { 0x000000000f1fc186, 0x00000001549f39bc }, + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + { 0x00000000150e4cc4, 0x00000000ef4d2f42 }, + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + { 0x000000002a6204e8, 0x00000001b1468572 }, + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + { 0x00000000beb1d432, 0x000000013d7403b2 }, + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + { 0x0000000135f3f1f0, 0x00000001a4681842 
}, + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + { 0x0000000074fe2232, 0x0000000167714492 }, + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + { 0x000000001ac6e2ba, 0x00000001e599099a }, + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + { 0x0000000013fca91e, 0x00000000fe128194 }, + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + { 0x0000000183f4931e, 0x0000000077e8b990 }, + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + { 0x00000000b6d9b4e4, 0x00000001a267f63a }, + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + { 0x00000000b5188656, 0x00000001945c245a }, + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + { 0x0000000027a81a84, 0x0000000149002e76 }, + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + { 0x0000000125699258, 0x00000001bb8310a4 }, + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + { 0x00000001b23de796, 0x000000019ec60bcc }, + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + { 0x00000000fe4365dc, 0x000000012d8590ae }, + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + { 0x00000000c68f497a, 0x0000000065b00684 }, + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + { 0x00000000fbf521ee, 0x000000015e5aeadc }, + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + { 0x000000015eac3378, 0x00000000b77ff2b0 }, + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + { 0x0000000134914b90, 0x0000000188da2ff6 }, + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + { 0x0000000016335cfe, 0x0000000063da929a }, + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + { 0x000000010372d10c, 0x00000001389caa80 }, + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + { 0x000000015097b908, 0x000000013db599d2 }, + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + { 0x00000001227a7572, 0x0000000122505a86 }, + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + { 0x000000009a8f75c0, 0x000000016bd72746 }, + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + { 
0x00000000682c77a2, 0x00000001c3faf1d4 }, + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + { 0x00000000231f091c, 0x00000001111c826c }, + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + { 0x000000007d4439f2, 0x00000000153e9fb2 }, + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + { 0x000000017e221efc, 0x000000002b1f7b60 }, + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + { 0x0000000167457c38, 0x00000000b1dba570 }, + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + { 0x00000000bdf081c4, 0x00000001f6397b76 }, + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + { 0x000000016286d6b0, 0x0000000156335214 }, + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + { 0x00000000c84f001c, 0x00000001d70e3986 }, + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + { 0x0000000064efe7c0, 0x000000003701a774 }, + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + { 0x000000000ac2d904, 0x00000000ac81ef72 }, + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + { 0x00000000fd226d14, 0x0000000133212464 }, + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + { 0x000000011cfd42e0, 0x00000000e4e45610 }, + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + { 0x000000016e5a5678, 0x000000000c1bd370 }, + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + { 0x00000001d888fe22, 0x00000001a7b9e7a6 }, + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + { 0x00000001af77fcd4, 0x000000007d657a10 } +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ +}; + +/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + +static const __vector unsigned long long vcrc_short_const[16] ALIGNED_(16) = { +#if BYTE_ORDER == LITTLE_ENDIAN + /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */ + { 0x99168a18ec447f11, 0xed837b2613e8221e }, + /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */ + { 0xe23e954e8fd2cd3c, 0xc8acdd8147b9ce5a }, + /* x^1696 mod p(x) , x^1728 mod p(x) , 
x^1760 mod p(x) , x^1792 mod p(x) */ + { 0x92f8befe6b1d2b53, 0xd9ad6d87d4277e25 }, + /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */ + { 0xf38a3556291ea462, 0xc10ec5e033fbca3b }, + /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */ + { 0x974ac56262b6ca4b, 0xc0b55b0e82e02e2f }, + /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */ + { 0x855712b3784d2a56, 0x71aa1df0e172334d }, + /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */ + { 0xa5abe9f80eaee722, 0xfee3053e3969324d }, + /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */ + { 0x1fa0943ddb54814c, 0xf44779b93eb2bd08 }, + /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */ + { 0xa53ff440d7bbfe6a, 0xf5449b3f00cc3374 }, + /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */ + { 0xebe7e3566325605c, 0x6f8346e1d777606e }, + /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */ + { 0xc65a272ce5b592b8, 0xe3ab4f2ac0b95347 }, + /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */ + { 0x5705a9ca4721589f, 0xaa2215ea329ecc11 }, + /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */ + { 0xe3720acb88d14467, 0x1ed8f66ed95efd26 }, + /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */ + { 0xba1aca0315141c31, 0x78ed02d5a700e96a }, + /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */ + { 0xad2a31b3ed627dae, 0xba8ccbe832b39da3 }, + /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */ + { 0x6655004fa06a2517, 0xedb88320b1e6b092 } +#else /* BYTE_ORDER == LITTLE_ENDIAN */ + /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */ + { 0xed837b2613e8221e, 0x99168a18ec447f11 }, + /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */ + { 0xc8acdd8147b9ce5a, 0xe23e954e8fd2cd3c }, + /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 
mod p(x) , x^1792 mod p(x) */ + { 0xd9ad6d87d4277e25, 0x92f8befe6b1d2b53 }, + /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */ + { 0xc10ec5e033fbca3b, 0xf38a3556291ea462 }, + /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */ + { 0xc0b55b0e82e02e2f, 0x974ac56262b6ca4b }, + /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */ + { 0x71aa1df0e172334d, 0x855712b3784d2a56 }, + /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */ + { 0xfee3053e3969324d, 0xa5abe9f80eaee722 }, + /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */ + { 0xf44779b93eb2bd08, 0x1fa0943ddb54814c }, + /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */ + { 0xf5449b3f00cc3374, 0xa53ff440d7bbfe6a }, + /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */ + { 0x6f8346e1d777606e, 0xebe7e3566325605c }, + /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */ + { 0xe3ab4f2ac0b95347, 0xc65a272ce5b592b8 }, + /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */ + { 0xaa2215ea329ecc11, 0x5705a9ca4721589f }, + /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */ + { 0x1ed8f66ed95efd26, 0xe3720acb88d14467 }, + /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */ + { 0x78ed02d5a700e96a, 0xba1aca0315141c31 }, + /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */ + { 0xba8ccbe832b39da3, 0xad2a31b3ed627dae }, + /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */ + { 0xedb88320b1e6b092, 0x6655004fa06a2517 } +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ +}; + +/* Barrett constants */ +/* 33 bit reflected Barrett constant m - (4^32)/n */ + +static const __vector unsigned long long v_Barrett_const[2] ALIGNED_(16) = { + /* x^64 div p(x) */ +#if BYTE_ORDER == LITTLE_ENDIAN + { 0x00000001f7011641, 0x0000000000000000 }, + { 0x00000001db710641, 
0x0000000000000000 } +#else /* BYTE_ORDER == LITTLE_ENDIAN */ + { 0x0000000000000000, 0x00000001f7011641 }, + { 0x0000000000000000, 0x00000001db710641 } +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ +}; diff --git a/neozip/arch/power/crc32_power8.c b/neozip/arch/power/crc32_power8.c new file mode 100644 index 0000000000..a7a2fb7435 --- /dev/null +++ b/neozip/arch/power/crc32_power8.c @@ -0,0 +1,593 @@ +/* crc32 for POWER8 using VSX instructions + * Copyright (C) 2021 IBM Corporation + * + * Author: Rogerio Alves <rogealve@br.ibm.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * This code uses gcc vector builtins instead using assembly directly. 
+ */ + +#ifdef POWER8_VSX_CRC32 + +#include "zbuild.h" +#include "zendian.h" + +#include "crc32_constants.h" +#include "crc32_braid_tbl.h" + +#include "power_intrins.h" + +#define MAX_SIZE 32768 +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) { + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} + +static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); + +Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) { + unsigned int prealign; + unsigned int tail; + + unsigned long len = (unsigned long) _len; + + crc ^= 0xffffffff; + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = (unsigned int)ALIGN_DIFF(p, VMX_ALIGN); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, ALIGN_DOWN(len, VMX_ALIGN)); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += ALIGN_DOWN(len, VMX_ALIGN); + crc = crc32_align(crc, p, tail); + } + +out: + crc ^= 0xffffffff; + + return crc; +} + +Z_INTERNAL uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_power8(crc, src, len); + memcpy(dst, src, len); + return crc; +} + +/* When we have a load-store in a single-dispatch group and address overlap + * such that forward is not allowed (load-hit-store) the group must be flushed. + * A group ending NOP prevents the flush. + */ +#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory") + +#if BYTE_ORDER == BIG_ENDIAN +#define BYTESWAP_DATA +#endif + +#ifdef BYTESWAP_DATA +#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc) +#if BYTE_ORDER == LITTLE_ENDIAN +/* Byte reverse permute constant LE. 
*/ +static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL }; +#else +static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL }; +#endif +#else +#define VEC_PERM(vr, va, vb, vc) +#endif + +static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { + + const __vector unsigned long long vzero = {0,0}; + const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL}; + + const __vector unsigned long long vmask_32bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4); + + const __vector unsigned long long vmask_64bit = + (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8); + + __vector unsigned long long vcrc; + + __vector unsigned long long vconst1, vconst2; + + /* vdata0-vdata7 will contain our data (p). */ + __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7; + + /* v0-v7 will contain our checksums */ + __vector unsigned long long v0 = {0,0}; + __vector unsigned long long v1 = {0,0}; + __vector unsigned long long v2 = {0,0}; + __vector unsigned long long v3 = {0,0}; + __vector unsigned long long v4 = {0,0}; + __vector unsigned long long v5 = {0,0}; + __vector unsigned long long v6 = {0,0}; + __vector unsigned long long v7 = {0,0}; + + + /* Vector auxiliary variables. */ + __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; + + unsigned int offset; /* Constant table offset. */ + + unsigned long i; /* Counter. */ + unsigned long chunks; + + unsigned long block_size; + int next_block = 0; + + /* Align by 128 bits. The last 128 bit block will be processed at end. */ + unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; + + vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc); + + /* Short version. 
*/ + if (len < 256) { + /* Calculate where in the constant table we need to start. */ + offset = 256 - len; + + vconst1 = vec_ld(offset, vcrc_short_const); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + + /* xor initial value */ + vdata0 = vec_xor(vdata0, vcrc); + + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw( + (__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + + for (i = 16; i < len; i += 16) { + vconst1 = vec_ld(offset + i, vcrc_short_const); + vdata0 = vec_ld(i, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vconst1, vperm_const); + vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw( + (__vector unsigned int)vdata0, (__vector unsigned int)vconst1); + v0 = vec_xor(v0, vdata0); + } + } else { + + /* Load initial values. */ + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + /* xor in initial value */ + vdata0 = vec_xor(vdata0, vcrc); + + p = (char *)p + 128; + + do { + /* Checksum in blocks of MAX_SIZE. 
*/ + block_size = length; + if (block_size > MAX_SIZE) { + block_size = MAX_SIZE; + } + + length = length - block_size; + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + offset = (MAX_SIZE/8) - (block_size/8); + /* We reduce our final 128 bytes in a separate step */ + chunks = (block_size/128)-1; + + vconst1 = vec_ld(offset, vcrc_const); + + va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0, + (__vector unsigned long long)vconst1); + va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1, + (__vector unsigned long long)vconst1); + va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2, + (__vector unsigned long long)vconst1); + va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3, + (__vector unsigned long long)vconst1); + va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4, + (__vector unsigned long long)vconst1); + va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5, + (__vector unsigned long long)vconst1); + va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6, + (__vector unsigned long long)vconst1); + va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7, + (__vector unsigned long long)vconst1); + + if (chunks > 1) { + offset += 16; + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + + 
vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + + /* + * main loop. Each iteration calculates the CRC for a 128-byte + * block. + */ + for (i = 0; i < chunks-2; i++) { + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + GROUP_ENDING_NOP; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0, + (__vector unsigned long long)vconst2); + vdata0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1, + (__vector unsigned long long)vconst2); + vdata1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(vdata1, vdata1, vdata1, vperm_const); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd((__vector unsigned long long) + vdata2, (__vector unsigned long long)vconst2); + vdata2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(vdata2, vdata2, vdata2, vperm_const); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3, + (__vector unsigned long long)vconst2); + vdata3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(vdata3, vdata3, vdata3, vperm_const); + + vconst2 = vec_ld(offset, vcrc_const); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4, + (__vector unsigned long long)vconst1); + vdata4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(vdata4, vdata4, vdata4, vperm_const); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd((__vector unsigned long 
long)vdata5, + (__vector unsigned long long)vconst1); + vdata5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(vdata5, vdata5, vdata5, vperm_const); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6, + (__vector unsigned long long)vconst1); + vdata6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(vdata6, vdata6, vdata6, vperm_const); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7, + (__vector unsigned long long)vconst1); + vdata7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(vdata7, vdata7, vdata7, vperm_const); + + p = (char *)p + 128; + } + + /* First cool down */ + vconst1 = vec_ld(offset, vcrc_const); + offset += 16; + + v0 = vec_xor(v0, va0); + va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v1 = vec_xor(v1, va1); + va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v2 = vec_xor(v2, va2); + va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v3 = vec_xor(v3, va3); + va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v4 = vec_xor(v4, va4); + va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v5 = vec_xor(v5, va5); + va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v6 = vec_xor(v6, va6); + va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6, + (__vector unsigned long long)vconst1); + GROUP_ENDING_NOP; + + v7 = vec_xor(v7, va7); + va7 = __builtin_crypto_vpmsumd((__vector unsigned long 
long)vdata7, + (__vector unsigned long long)vconst1); + }/* else */ + + /* Second cool down. */ + v0 = vec_xor(v0, va0); + v1 = vec_xor(v1, va1); + v2 = vec_xor(v2, va2); + v3 = vec_xor(v3, va3); + v4 = vec_xor(v4, va4); + v5 = vec_xor(v5, va5); + v6 = vec_xor(v6, va6); + v7 = vec_xor(v7, va7); + + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, + (__vector unsigned char)vzero, 4); + v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, + (__vector unsigned char)vzero, 4); + v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, + (__vector unsigned char)vzero, 4); + v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, + (__vector unsigned char)vzero, 4); + v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, + (__vector unsigned char)vzero, 4); + v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, + (__vector unsigned char)vzero, 4); + v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, + (__vector unsigned char)vzero, 4); + + /* xor with the last 1024 bits. 
*/ + va0 = vec_ld(0, (__vector unsigned long long*) p); + VEC_PERM(va0, va0, va0, vperm_const); + + va1 = vec_ld(16, (__vector unsigned long long*) p); + VEC_PERM(va1, va1, va1, vperm_const); + + va2 = vec_ld(32, (__vector unsigned long long*) p); + VEC_PERM(va2, va2, va2, vperm_const); + + va3 = vec_ld(48, (__vector unsigned long long*) p); + VEC_PERM(va3, va3, va3, vperm_const); + + va4 = vec_ld(64, (__vector unsigned long long*) p); + VEC_PERM(va4, va4, va4, vperm_const); + + va5 = vec_ld(80, (__vector unsigned long long*) p); + VEC_PERM(va5, va5, va5, vperm_const); + + va6 = vec_ld(96, (__vector unsigned long long*) p); + VEC_PERM(va6, va6, va6, vperm_const); + + va7 = vec_ld(112, (__vector unsigned long long*) p); + VEC_PERM(va7, va7, va7, vperm_const); + + p = (char *)p + 128; + + vdata0 = vec_xor(v0, va0); + vdata1 = vec_xor(v1, va1); + vdata2 = vec_xor(v2, va2); + vdata3 = vec_xor(v3, va3); + vdata4 = vec_xor(v4, va4); + vdata5 = vec_xor(v5, va5); + vdata6 = vec_xor(v6, va6); + vdata7 = vec_xor(v7, va7); + + /* Check if we have more blocks to process */ + next_block = 0; + if (length != 0) { + next_block = 1; + + /* zero v0-v7 */ + v0 = vec_xor(v0, v0); + v1 = vec_xor(v1, v1); + v2 = vec_xor(v2, v2); + v3 = vec_xor(v3, v3); + v4 = vec_xor(v4, v4); + v5 = vec_xor(v5, v5); + v6 = vec_xor(v6, v6); + v7 = vec_xor(v7, v7); + } + length = length + 128; + + } while (next_block); + + /* Calculate how many bytes we have left. */ + length = (len & 127); + + /* Calculate where in (short) constant table we need to start. 
*/ + offset = 128 - length; + + v0 = vec_ld(offset, vcrc_short_const); + v1 = vec_ld(offset + 16, vcrc_short_const); + v2 = vec_ld(offset + 32, vcrc_short_const); + v3 = vec_ld(offset + 48, vcrc_short_const); + v4 = vec_ld(offset + 64, vcrc_short_const); + v5 = vec_ld(offset + 80, vcrc_short_const); + v6 = vec_ld(offset + 96, vcrc_short_const); + v7 = vec_ld(offset + 112, vcrc_short_const); + + offset += 128; + + v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata0, (__vector unsigned int)v0); + v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata1, (__vector unsigned int)v1); + v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata2, (__vector unsigned int)v2); + v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata3, (__vector unsigned int)v3); + v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata4, (__vector unsigned int)v4); + v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata5, (__vector unsigned int)v5); + v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata6, (__vector unsigned int)v6); + v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata7, (__vector unsigned int)v7); + + /* Now reduce the tail (0-112 bytes). */ + for (i = 0; i < length; i+=16) { + vdata0 = vec_ld(i,(__vector unsigned long long*)p); + VEC_PERM(vdata0, vdata0, vdata0, vperm_const); + va0 = vec_ld(offset + i,vcrc_short_const); + va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw( + (__vector unsigned int)vdata0, (__vector unsigned int)va0); + v0 = vec_xor(v0, va0); + } + + /* xor all parallel chunks together. 
*/ + v0 = vec_xor(v0, v1); + v2 = vec_xor(v2, v3); + v4 = vec_xor(v4, v5); + v6 = vec_xor(v6, v7); + + v0 = vec_xor(v0, v2); + v4 = vec_xor(v4, v6); + + v0 = vec_xor(v0, v4); + } + + /* Barrett Reduction */ + vconst1 = vec_ld(0, v_Barrett_const); + vconst2 = vec_ld(16, v_Barrett_const); + + v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)v0, 8); + v0 = vec_xor(v1,v0); + + /* shift left one bit */ + __vector unsigned char vsht_splat = vec_splat_u8 (1); + v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat); + + v0 = vec_and(v0, vmask_64bit); + + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + + /* bottom 32 bits of a */ + v1 = vec_and(v0, vmask_32bit); + + /* ma */ + v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1, + (__vector unsigned long long)vconst1); + + /* bottom 32bits of ma */ + v1 = vec_and(v1, vmask_32bit); + /* qn */ + v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1, + (__vector unsigned long long)vconst2); + /* a - qn, subtraction is xor in GF(2) */ + v0 = vec_xor (v0, v1); + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + + /* shift result into top 64 bits of */ + v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, + (__vector unsigned char)vzero, 4); + +#if BYTE_ORDER == BIG_ENDIAN + return v0[0]; +#else + return v0[1]; +#endif +} + +#endif diff --git a/neozip/arch/power/power_features.c b/neozip/arch/power/power_features.c new file mode 100644 index 0000000000..148f30a974 --- /dev/null +++ b/neozip/arch/power/power_features.c @@ -0,0 +1,54 @@ +/* power_features.c - POWER feature check + * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM + * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#if defined(PPC_FEATURES) || defined(POWER_FEATURES) + +#include "zbuild.h" +#include "power_features.h" + +#ifdef HAVE_SYS_AUXV_H +# include <sys/auxv.h> +#endif +#ifdef POWER_NEED_AUXVEC_H +# include <linux/auxvec.h> +#endif +#ifdef __FreeBSD__ +# include <machine/cpu.h> +#endif + +void Z_INTERNAL power_check_features(struct power_cpu_features *features) { +#ifdef PPC_FEATURES + unsigned long hwcap; +#if defined(__FreeBSD__) || defined(__OpenBSD__) + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); +#else + hwcap = getauxval(AT_HWCAP); +#endif + + if (hwcap & PPC_FEATURE_HAS_ALTIVEC) + features->has_altivec = 1; +#endif + +#ifdef POWER_FEATURES + unsigned long hwcap2; +#if defined(__FreeBSD__) || defined(__OpenBSD__) + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); +#else + hwcap2 = getauxval(AT_HWCAP2); +#endif + +#ifdef POWER8_VSX + if (hwcap2 & PPC_FEATURE2_ARCH_2_07) + features->has_arch_2_07 = 1; +#endif +#ifdef POWER9 + if (hwcap2 & PPC_FEATURE2_ARCH_3_00) + features->has_arch_3_00 = 1; +#endif +#endif +} + +#endif diff --git a/neozip/arch/power/power_features.h b/neozip/arch/power/power_features.h new file mode 100644 index 0000000000..1ff51de5dd --- /dev/null +++ 
b/neozip/arch/power/power_features.h @@ -0,0 +1,18 @@ +/* power_features.h -- check for POWER CPU features + * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM + * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_FEATURES_H_ +#define POWER_FEATURES_H_ + +struct power_cpu_features { + int has_altivec; + int has_arch_2_07; + int has_arch_3_00; +}; + +void Z_INTERNAL power_check_features(struct power_cpu_features *features); + +#endif /* POWER_FEATURES_H_ */ diff --git a/neozip/arch/power/power_functions.h b/neozip/arch/power/power_functions.h new file mode 100644 index 0000000000..ccc7754a4c --- /dev/null +++ b/neozip/arch/power/power_functions.h @@ -0,0 +1,74 @@ +/* power_functions.h -- POWER implementations for arch-specific functions. + * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM + * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_FUNCTIONS_H_ +#define POWER_FUNCTIONS_H_ + +#include "power_natives.h" + +#ifdef PPC_VMX +uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +void slide_hash_vmx(deflate_state *s); +#endif + +#ifdef POWER8_VSX +uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len); +uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +void slide_hash_power8(deflate_state *s); +void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef POWER9 +uint32_t compare256_power9(const 
uint8_t *src0, const uint8_t *src1); +uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Power - VMX +# ifdef PPC_VMX_NATIVE +# undef native_adler32 +# define native_adler32 adler32_vmx +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_vmx +# undef native_slide_hash +# define native_slide_hash slide_hash_vmx +# endif +// Power8 - VSX +# ifdef POWER8_VSX_NATIVE +# undef native_adler32 +# define native_adler32 adler32_power8 +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_power8 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_power8 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_power8 +# undef native_slide_hash +# define native_slide_hash slide_hash_power8 +# endif +# ifdef POWER8_VSX_CRC32_NATIVE +# undef native_crc32 +# define native_crc32 crc32_power8 +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_power8 +# endif +// Power9 +# ifdef POWER9_NATIVE +# undef native_compare256 +# define native_compare256 compare256_power9 +# undef native_longest_match +# define native_longest_match longest_match_power9 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_power9 +# endif +#endif + +#endif /* POWER_FUNCTIONS_H_ */ diff --git a/neozip/arch/power/power_intrins.h b/neozip/arch/power/power_intrins.h new file mode 100644 index 0000000000..3efcfb9722 --- /dev/null +++ b/neozip/arch/power/power_intrins.h @@ -0,0 +1,61 @@ +/* Helper functions to work around issues with clang builtins + * Copyright (C) 2021 IBM Corporation + * + * Authors: + * Daniel Black <daniel@linux.vnet.ibm.com> + * Rogerio Alves <rogealve@br.ibm.com> + * Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ 
+ +#ifndef POWER_INTRINS_H +#define POWER_INTRINS_H + +#include <altivec.h> + +#if defined (__clang__) +/* + * These stubs fix clang incompatibilities with GCC builtins. + */ + +#ifndef __builtin_crypto_vpmsumw +#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb +#endif +#ifndef __builtin_crypto_vpmsumd +#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb +#endif + +#ifdef __VSX__ +static inline __vector unsigned long long __attribute__((overloadable)) +vec_ld(int __a, const __vector unsigned long long* __b) { + return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b); +} +#endif + +#endif + +/* There's no version of this that operates over unsigned and if casted, it does + * sign extension. Let's write an endian independent version and hope the compiler + * eliminates creating another zero idiom for the zero value if one exists locally */ +static inline vector unsigned short vec_unpackl(vector unsigned char a) { + vector unsigned char zero = vec_splat_u8(0); + +#if BYTE_ORDER == BIG_ENDIAN + return (vector unsigned short)vec_mergel(zero, a); +#else + return (vector unsigned short)vec_mergel(a, zero); +#endif +} + +static inline vector unsigned short vec_unpackh(vector unsigned char a) { + vector unsigned char zero = vec_splat_u8(0); + +#if BYTE_ORDER == BIG_ENDIAN + return (vector unsigned short)vec_mergeh(zero, a); +#else + return (vector unsigned short)vec_mergeh(a, zero); +#endif +} + +#endif diff --git a/neozip/arch/power/power_natives.h b/neozip/arch/power/power_natives.h new file mode 100644 index 0000000000..59ec8a8aed --- /dev/null +++ b/neozip/arch/power/power_natives.h @@ -0,0 +1,27 @@ +/* power_natives.h -- POWER compile-time feature detection macros. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef POWER_NATIVES_H_ +#define POWER_NATIVES_H_ + +#if defined(__ALTIVEC__) +# ifdef PPC_VMX +# define PPC_VMX_NATIVE +# endif +#endif +#if defined(_ARCH_PWR8) && defined(__VSX__) +# ifdef POWER8_VSX +# define POWER8_VSX_NATIVE +# endif +# ifdef POWER8_VSX_CRC32 +# define POWER8_VSX_CRC32_NATIVE +# endif +#endif +#if defined(_ARCH_PWR9) +# ifdef POWER9 +# define POWER9_NATIVE +# endif +#endif + +#endif /* POWER_NATIVES_H_ */ diff --git a/neozip/arch/power/slide_hash_power8.c b/neozip/arch/power/slide_hash_power8.c new file mode 100644 index 0000000000..d01e0acd56 --- /dev/null +++ b/neozip/arch/power/slide_hash_power8.c @@ -0,0 +1,12 @@ +/* Optimized slide_hash for POWER processors + * Copyright (C) 2019-2020 IBM Corporation + * Author: Matheus Castanho <msc@linux.ibm.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef POWER8_VSX + +#define SLIDE_PPC slide_hash_power8 +#include "slide_ppc_tpl.h" + +#endif /* POWER8_VSX */ diff --git a/neozip/arch/power/slide_hash_vmx.c b/neozip/arch/power/slide_hash_vmx.c new file mode 100644 index 0000000000..5a87ef7d9a --- /dev/null +++ b/neozip/arch/power/slide_hash_vmx.c @@ -0,0 +1,10 @@ +/* Optimized slide_hash for PowerPC processors with VMX instructions + * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifdef PPC_VMX + +#define SLIDE_PPC slide_hash_vmx +#include "slide_ppc_tpl.h" + +#endif /* PPC_VMX */ diff --git a/neozip/arch/power/slide_ppc_tpl.h b/neozip/arch/power/slide_ppc_tpl.h new file mode 100644 index 0000000000..24629b4039 --- /dev/null +++ b/neozip/arch/power/slide_ppc_tpl.h @@ -0,0 +1,44 @@ +/* Optimized slide_hash for PowerPC processors + * Copyright (C) 2017-2021 Mika T. 
Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <altivec.h> +#include "zbuild.h" +#include "deflate.h" + +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + const vector unsigned short vmx_wsize = vec_splats(wsize); + Pos *p = table; + + do { + /* Do the pointer arithmetic early to hopefully overlap the vector unit */ + Pos *q = p; + p += 32; + vector unsigned short value0, value1, value2, value3; + vector unsigned short result0, result1, result2, result3; + + value0 = vec_ld(0, q); + value1 = vec_ld(16, q); + value2 = vec_ld(32, q); + value3 = vec_ld(48, q); + result0 = vec_subs(value0, vmx_wsize); + result1 = vec_subs(value1, vmx_wsize); + result2 = vec_subs(value2, vmx_wsize); + result3 = vec_subs(value3, vmx_wsize); + vec_st(result0, 0, q); + vec_st(result1, 16, q); + vec_st(result2, 32, q); + vec_st(result3, 48, q); + + entries -= 32; + } while (entries); +} + +void Z_INTERNAL SLIDE_PPC(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} diff --git a/neozip/arch/riscv/Makefile.in b/neozip/arch/riscv/Makefile.in new file mode 100644 index 0000000000..43176eee6e --- /dev/null +++ b/neozip/arch/riscv/Makefile.in @@ -0,0 +1,72 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach +# Copyright (C) 2025 Yin Tong <yintong.ustc@bytedance.com>, ByteDance +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +SRCDIR=. +SRCTOP=../.. 
+TOPDIR=$(SRCTOP) + +RVVFLAG= +RVVZBCFLAG= +ZBCFLAG= + +all: \ + riscv_features.o riscv_features.lo \ + adler32_rvv.o adler32_rvv.lo \ + chunkset_rvv.o chunkset_rvv.lo \ + compare256_rvv.o compare256_rvv.lo \ + slide_hash_rvv.o slide_hash_rvv.lo \ + crc32_zbc.o crc32_zbc.lo + +riscv_features.o: $(SRCDIR)/riscv_features.c + $(CC) $(CFLAGS) $(RVVZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c + +riscv_features.lo: $(SRCDIR)/riscv_features.c + $(CC) $(SFLAGS) $(RVVZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c + +adler32_rvv.o: $(SRCDIR)/adler32_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c + +adler32_rvv.lo: $(SRCDIR)/adler32_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c + +chunkset_rvv.o: $(SRCDIR)/chunkset_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c + +chunkset_rvv.lo: $(SRCDIR)/chunkset_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c + +compare256_rvv.o: $(SRCDIR)/compare256_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c + +compare256_rvv.lo: $(SRCDIR)/compare256_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c + +slide_hash_rvv.o: $(SRCDIR)/slide_hash_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c + +slide_hash_rvv.lo: $(SRCDIR)/slide_hash_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c + +crc32_zbc.o: $(SRCDIR)/crc32_zbc.c + $(CC) $(CFLAGS) $(ZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c + +crc32_zbc.lo: $(SRCDIR)/crc32_zbc.c + $(CC) $(SFLAGS) $(ZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/riscv/README.md b/neozip/arch/riscv/README.md new file mode 100644 index 
0000000000..013095c373 --- /dev/null +++ b/neozip/arch/riscv/README.md @@ -0,0 +1,45 @@ +# Building RISC-V Target with CMake # + +> **Warning** +> Runtime RVV detection (using `hwcap`) requires Linux kernel 6.5 or newer. +> +> When running on older kernels, we fall back to compile-time detection; this can potentially cause crashes if RVV is enabled at compile time but not supported by the target CPU. +> Therefore, if older kernel support is needed, RVV should be disabled if the target CPU does not support it. +## Prerequisite: Build RISC-V Clang Toolchain and QEMU ## + +If you don't have a prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the sources and build them. Modify the content according to your needs (e.g., toolchain version). + +```bash +./prepare_riscv_toolchain_qemu.sh +``` + +After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`. + +`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`. +`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`. + +You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them. + +## Cross-Compile for RISC-V Target ## + +```bash +cmake -G Ninja -B ./build-riscv \ + -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \ + -D CMAKE_INSTALL_PREFIX=./build-riscv/install \ + -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + -D QEMU_PATH={QEMU_PATH} \ + .
+ +cmake --build ./build-riscv +``` + +Disable the option if there is no RVV support: +``` +-D WITH_RVV=OFF +``` + +## Run Unittests on User Mode QEMU ## + +```bash +cd ./build-riscv && ctest --verbose +``` diff --git a/neozip/arch/riscv/adler32_rvv.c b/neozip/arch/riscv/adler32_rvv.c new file mode 100644 index 0000000000..e446189302 --- /dev/null +++ b/neozip/arch/riscv/adler32_rvv.c @@ -0,0 +1,119 @@ +/* adler32_rvv.c - RVV version of adler32 + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "adler32_p.h" + +#include <riscv_vector.h> + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); + + size_t left = len; + size_t vl = __riscv_vsetvlmax_e8m1(); + vl = MIN(vl, 256); + vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint16m2_t v_buf16_accu; + + /* + * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator. + * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit + * accumulators to boost performance. + * + * The block_size is the largest multiple of vl that <= 256, because overflow would occur when + * vl > 256 (255 * 256 <= UINT16_MAX). 
+ * + * We accumulate 8-bit data into a 16-bit accumulator and then + * move the data into the 32-bit accumulator at the last iteration. + */ + size_t block_size = (256 / vl) * vl; + size_t nmax_limit = (NMAX / block_size); + size_t cnt = 0; + while (left >= block_size) { + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t subprob = block_size; + while (subprob > 0) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl); + if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + src += vl; + if (COPY) dst += vl; + subprob -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + left -= block_size; + /* do modulo once each block of NMAX size */ + if (++cnt >= nmax_limit) { + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl); + cnt = 0; + } + } + /* the left len <= 256 now, we can use 16-bit accum safely */ + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t res = left; + while (left >= vl) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl); + if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + src += vl; + if (COPY) dst += vl; + left -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl); + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + + vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl); + vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl); + vuint32m4_t v_sum32_accu = 
__riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl); + + v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl); + + vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl); + uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE; + + sum2 += (sum2_sum + adler * ((len - left) % BASE)); + + vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl); + uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE; + + adler += adler_sum; + + sum2 %= BASE; + adler %= BASE; + + /* Process tail (left < 256). */ + return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY); +} + +Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { + return adler32_copy_impl(adler, NULL, buf, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif // RISCV_RVV diff --git a/neozip/arch/riscv/chunkset_rvv.c b/neozip/arch/riscv/chunkset_rvv.c new file mode 100644 index 0000000000..cd8ed3cfd2 --- /dev/null +++ b/neozip/arch/riscv/chunkset_rvv.c @@ -0,0 +1,126 @@ +/* chunkset_rvv.c - RVV version of chunkset + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" + +#include <riscv_vector.h> + +/* + * RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * so we prefer using large size chunk and copy memory as much as possible. 
+ */ +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +#define CHUNK_MEMSET_RVV_IMPL(from, chunk, elen) \ +do { \ + size_t vl, len = sizeof(*chunk) / sizeof(uint##elen##_t); \ + uint##elen##_t val = *(uint##elen##_t*)from; \ + uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \ + do { \ + vl = __riscv_vsetvl_e##elen##m4(len); \ + vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \ + __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \ + len -= vl; chunk_p += vl; \ + } while (len > 0); \ +} while (0) + +/* We don't have a 32-byte datatype for RISC-V arch. */ +typedef struct chunk_s { + uint64_t data[4]; +} chunk_t; + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 16); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 32); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 64); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + memcpy(chunk->data, (uint8_t *)s, sizeof(*chunk)); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + memcpy(out, chunk->data, sizeof(*chunk)); +} + +#define CHUNKSIZE chunksize_rvv +#define CHUNKCOPY chunkcopy_rvv +#define CHUNKUNROLL chunkunroll_rvv +#define CHUNKMEMSET chunkmemset_rvv +#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv + +#define HAVE_CHUNKCOPY + +/* + * Assuming that the length is non-zero, and that `from` lags `out` by at least + * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h. + * + * We load/store a single chunk once in the `CHUNKCOPY`. + * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * so we prefer to copy a large amount of memory at once to make good use of the RVV advantage
+ * + * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot, + * but we still copy as much memory as possible for some conditions. + * + * case 1: out - from >= len (no overlap) + * We can use memcpy to copy `len` size once + * because the memory layout would be the same. + * + * case 2: overlap + * We copy N chunks using memcpy at once, aiming to achieve our goal: + * to copy as much memory as possible. + * + * After using a single memcpy to copy N chunks, we have to use series of + * loadchunk and storechunk to ensure the result is correct. + */ +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + size_t dist = out - from; + if (out < from || dist >= len) { + memcpy(out, from, len); + out += len; + from += len; + return out; + } + + size_t align = ((len - 1) % sizeof(chunk_t)) + 1; + memcpy(out, from, sizeof(chunk_t)); + out += align; + from += align; + len -= align; + + size_t vl = (dist / sizeof(chunk_t)) * sizeof(chunk_t); + while (len > dist) { + memcpy(out, from, vl); + out += vl; + from += vl; + len -= vl; + } + + if (len > 0) { + memcpy(out, from, len); + out += len; + } + return out; +} + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_rvv + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/riscv/compare256_rvv.c b/neozip/arch/riscv/compare256_rvv.c new file mode 100644 index 0000000000..edb18a3766 --- /dev/null +++ b/neozip/arch/riscv/compare256_rvv.c @@ -0,0 +1,48 @@ +/* compare256_rvv.c - RVV version of compare256 + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "zmemory.h" +#include "deflate.h" + +#include <riscv_vector.h> + +static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + size_t vl; + long found_diff; + do { + vl = __riscv_vsetvl_e8m4(256 - len); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl); + found_diff = __riscv_vfirst_m_b2(v_mask, vl); + if (found_diff >= 0) + return len + (uint32_t)found_diff; + src0 += vl, src1 += vl, len += vl; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) { + return compare256_rvv_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#endif // RISCV_RVV diff --git a/neozip/arch/riscv/crc32_zbc.c b/neozip/arch/riscv/crc32_zbc.c new file mode 100644 index 0000000000..cf52279b80 --- /dev/null +++ b/neozip/arch/riscv/crc32_zbc.c @@ -0,0 +1,103 @@ +/* crc32_zbc.c - RISCV Zbc version of crc32 + * Copyright (C) 2025 ByteDance. All rights reserved. 
+ * Contributed by Yin Tong <yintong.ustc@bytedance.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_CRC32_ZBC + +#include "zbuild.h" +#include "arch_functions.h" + +#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul +#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul + +#define CONSTANT_R3 0x1751997d0ULL +#define CONSTANT_R4 0x0ccaa009eULL +#define CONSTANT_R5 0x163cd6124ULL +#define MASK32 0xFFFFFFFF +#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL +#define CONSTANT_RU 0x1F7011641ULL + +static inline uint64_t clmul(uint64_t a, uint64_t b) { + uint64_t res; + __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b)); + return res; +} + +static inline uint64_t clmulh(uint64_t a, uint64_t b) { + uint64_t res; + __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b)); + return res; +} + +Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) { + const uint64_t *buf64 = (const uint64_t *)buf; + uint64_t low = buf64[0] ^ crc; + uint64_t high = buf64[1]; + + if (len < 16) + goto finish_fold; + len -= 16; + buf64 += 2; + + // process each 16-byte block + while (len >= 16) { + uint64_t t2 = clmul(CONSTANT_R4, high); + uint64_t t3 = clmulh(CONSTANT_R4, high); + + uint64_t t0_new = clmul(CONSTANT_R3, low); + uint64_t t1_new = clmulh(CONSTANT_R3, low); + + // Combine the results and XOR with new data + low = t0_new ^ t2; + high = t1_new ^ t3; + low ^= buf64[0]; + high ^= buf64[1]; + + buf64 += 2; + len -= 16; + } + +finish_fold: + // Fold the 128-bit result into 64 bits + uint64_t fold_t3 = clmulh(low, CONSTANT_R4); + uint64_t fold_t2 = clmul(low, CONSTANT_R4); + low = high ^ fold_t2; + high = fold_t3; + + // Combine the low and high parts and perform polynomial reduction + uint64_t combined = (low >> 32) | ((high & MASK32) << 32); + uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined; + + // Barrett reduction step + uint64_t 
barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32; + barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL); + uint64_t final = barrett ^ reduced_low; + + // Return the high 32 bits as the final CRC + return (uint32_t)(final >> 32); +} + +Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) { + if (len < CLMUL_MIN_LEN) { + return crc32_braid(crc, buf, len); + } + + uint64_t unaligned_length = len % CLMUL_CHUNK_LEN; + if (unaligned_length) { + crc = crc32_braid(crc, buf, unaligned_length); + buf += unaligned_length; + len -= unaligned_length; + } + + crc = crc32_clmul_impl(~crc, buf, len); + return ~crc; +} + +Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_riscv64_zbc(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/riscv/riscv_features.c b/neozip/arch/riscv/riscv_features.c new file mode 100644 index 0000000000..b23f10a699 --- /dev/null +++ b/neozip/arch/riscv/riscv_features.c @@ -0,0 +1,99 @@ +#ifdef RISCV_FEATURES + +#define _DEFAULT_SOURCE 1 /* For syscall() */ + +#include "zbuild.h" +#include "riscv_features.h" + +#include <sys/utsname.h> + +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# include <sys/auxv.h> +#endif + +#if defined(__linux__) && defined(HAVE_ASM_HWPROBE_H) +# include <asm/hwprobe.h> +# include <sys/syscall.h> /* For __NR_riscv_hwprobe */ +# include <unistd.h> /* For syscall() */ +#endif + +#define ISA_V_HWCAP (1 << ('v' - 'a')) +#define ISA_ZBC_HWCAP (1 << 29) + +static int riscv_check_features_runtime_hwprobe(struct riscv_cpu_features *features) { +#if defined(__NR_riscv_hwprobe) && defined(RISCV_HWPROBE_KEY_IMA_EXT_0) + struct riscv_hwprobe probes[] = { + {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}, + }; + int ret; + unsigned i; + + ret = syscall(__NR_riscv_hwprobe, probes, sizeof(probes) / sizeof(probes[0]), 0, NULL, 0); + + if (ret != 0) { + /* Kernel does not support hwprobe */ + return 0; + } + + 
for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) { + switch (probes[i].key) { + case RISCV_HWPROBE_KEY_IMA_EXT_0: +# ifdef RISCV_HWPROBE_IMA_V + features->has_rvv = !!(probes[i].value & RISCV_HWPROBE_IMA_V); +# endif +# ifdef RISCV_HWPROBE_EXT_ZBC + features->has_zbc = !!(probes[i].value & RISCV_HWPROBE_EXT_ZBC); +# endif + break; + } + } + + return 1; +#else + return 0; +#endif +} + +static int riscv_check_features_runtime_hwcap(struct riscv_cpu_features *features) { +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) + unsigned long hw_cap = getauxval(AT_HWCAP); + + features->has_rvv = hw_cap & ISA_V_HWCAP; + features->has_zbc = hw_cap & ISA_ZBC_HWCAP; + + return 1; +#else + return 0; +#endif +} + +static void riscv_check_features_runtime(struct riscv_cpu_features *features) { + if (riscv_check_features_runtime_hwprobe(features)) + return; + + riscv_check_features_runtime_hwcap(features); +} + +void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) { + riscv_check_features_runtime(features); +#ifdef RISCV_RVV + if (features->has_rvv) { + size_t e8m1_vec_len; + intptr_t vtype_reg_val; + // Check that a vuint8m1_t vector is at least 16 bytes and that tail + // agnostic and mask agnostic mode are supported + // + __asm__ volatile( + "vsetvli %0, zero, e8, m1, ta, ma\n\t" + "csrr %1, vtype" + : "=r"(e8m1_vec_len), "=r"(vtype_reg_val)); + + // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of + // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16 + // bytes + features->has_rvv = (vtype_reg_val >= 0 && e8m1_vec_len >= 16); + } +#endif +} + +#endif diff --git a/neozip/arch/riscv/riscv_features.h b/neozip/arch/riscv/riscv_features.h new file mode 100644 index 0000000000..42855a1b6b --- /dev/null +++ b/neozip/arch/riscv/riscv_features.h @@ -0,0 +1,19 @@ +/* riscv_features.h -- check for riscv features. + * + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
/* Feature flags filled in by riscv_check_features().  A nonzero value means
 * the extension was reported as available by the kernel (and, for RVV,
 * additionally passed the VTYPE/vector-length sanity check). */
struct riscv_cpu_features {
    int has_rvv;  /* "V" vector extension usable (vuint8m1_t is >= 16 bytes) */
    int has_zbc;  /* Zbc carry-less multiplication extension present */
};

/* Populate *features by querying the kernel (riscv_hwprobe, falling back to
 * AT_HWCAP) and verifying the vector unit configuration. */
void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
/* riscv_natives.h -- RISCV compile-time feature detection macros.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef RISCV_NATIVES_H_
#define RISCV_NATIVES_H_

/* RVV may be selected as the compile-time "native" target only when the
 * compiler itself advertises the V extension (__riscv_v) AND we are on
 * Linux, since runtime verification relies on Linux-only interfaces. */
#if defined(__riscv_v) && defined(__linux__)
# ifdef RISCV_RVV
#  define RISCV_RVV_NATIVE
# endif
#endif
/* The Zbc CRC-32 kernel only requires the compiler to target Zbc. */
#if defined(__riscv_zbc)
# ifdef RISCV_CRC32_ZBC
#  define RISCV_ZBC_NATIVE
# endif
#endif

#endif /* RISCV_NATIVES_H_ */
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "deflate.h" + +#include <riscv_vector.h> + +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + size_t vl; + while (entries > 0) { + vl = __riscv_vsetvl_e16m4(entries); + vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl); + vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl); + __riscv_vse16_v_u16m4(table, v_diff, vl); + table += vl, entries -= vl; + } +} + +Z_INTERNAL void slide_hash_rvv(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} + +#endif // RISCV_RVV diff --git a/neozip/arch/s390/Makefile.in b/neozip/arch/s390/Makefile.in new file mode 100644 index 0000000000..e994157df2 --- /dev/null +++ b/neozip/arch/s390/Makefile.in @@ -0,0 +1,48 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= +VGFMAFLAG= +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. 
+TOPDIR=$(SRCTOP) + +s390_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +s390_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390_features.c + +dfltcc_deflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_deflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_deflate.c + +dfltcc_inflate.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +dfltcc_inflate.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c + +crc32-vx.o: + $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +crc32-vx.lo: + $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/s390/README.md b/neozip/arch/s390/README.md new file mode 100644 index 0000000000..c56ffd7654 --- /dev/null +++ b/neozip/arch/s390/README.md @@ -0,0 +1,265 @@ +# Introduction + +This directory contains SystemZ deflate hardware acceleration support. +It can be enabled using the following build commands: + + $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate + $ make + +or + + $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 . + $ make + +When built like this, zlib-ng would compress using hardware on level 1, +and using software on all other levels. Decompression will always happen +in hardware. In order to enable hardware compression for levels 1-6 +(i.e. to make it used by default) one could add +`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng. + +SystemZ deflate hardware acceleration is available on [IBM z15]( +https://www.ibm.com/products/z15) and newer machines under the name [ +"Integrated Accelerator for zEnterprise Data Compression"]( +https://www.ibm.com/support/z-content-solutions/compression/). 
The +programming interface to it is a machine instruction called DEFLATE +CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles +of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both +the code and the rest of this document refer to this feature simply as +"DFLTCC". + +# Performance + +Performance figures are published [here]( +https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine +). The compression speed-up can be as high as 110x and the decompression +speed-up can be as high as 15x. + +# Limitations + +Two DFLTCC compression calls with identical inputs are not guaranteed to +produce identical outputs. Therefore care should be taken when using +hardware compression when reproducible results are desired. In +particular, zlib-ng-specific `zng_deflateSetParams` call allows setting +`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a +particular stream. + +DFLTCC does not support every single zlib-ng feature, in particular: + +* `inflate(Z_BLOCK)` and `inflate(Z_TREES)` +* `inflateMark()` +* `inflatePrime()` +* `inflateSyncPoint()` + +When used, these functions will either switch to software, or, in case +this is not possible, gracefully fail. + +# Code structure + +All SystemZ-specific code lives in `arch/s390` directory and is +integrated with the rest of zlib-ng using hook macros. + +## Hook macros + +DFLTCC takes as arguments a parameter block, an input buffer, an output +buffer, and a window. Parameter blocks are stored alongside zlib states; +buffers are forwarded from the caller; and window - which must be +4k-aligned and is always 64k large, is managed using the `PAD_WINDOW()`, +`WINDOW_PAD_SIZE`, `HINT_ALIGNED_WINDOW` and `DEFLATE_ADJUST_WINDOW_SIZE()` +and `INFLATE_ADJUST_WINDOW_SIZE()` hooks. 
+ +Software and hardware window formats do not match, therefore, +`deflateSetDictionary()`, `deflateGetDictionary()`, `inflateSetDictionary()` +and `inflateGetDictionary()` need special handling, which is triggered using +`DEFLATE_SET_DICTIONARY_HOOK()`, `DEFLATE_GET_DICTIONARY_HOOK()`, +`INFLATE_SET_DICTIONARY_HOOK()` and `INFLATE_GET_DICTIONARY_HOOK()` macros. + +`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC +parameter block using `DEFLATE_RESET_KEEP_HOOK()` and +`INFLATE_RESET_KEEP_HOOK()` macros. + +`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and +`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported +calls gracefully fail. + +`DEFLATE_PARAMS_HOOK()` implements switching between hardware and +software compression mid-stream using `deflateParams()`. Switching +normally entails flushing the current block, which might not be possible +in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook +in order to detect and gracefully handle such situations. + +The algorithm implemented in hardware has different compression ratio +than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()` +and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()` +return the correct results for the hardware implementation. + +Actual compression and decompression are handled by `DEFLATE_HOOK()` and +`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the +window on its own, calling `updatewindow()` is suppressed using +`INFLATE_NEED_UPDATEWINDOW()` macro. + +In addition to compression, DFLTCC computes CRC-32 and Adler-32 +checksums, therefore, whenever it's used, software checksumming is +suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()` +macros. + +While software always produces reproducible compression results, this +is not the case for DFLTCC. Therefore, zlib-ng users are given the +ability to specify whether or not reproducible compression results +are required. 
While it is always possible to specify this setting +before the compression begins, it is not always possible to do so in +the middle of a deflate stream - the exact conditions for that are +determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro. + +## SystemZ-specific code + +When zlib-ng is built with DFLTCC, the hooks described above are +converted to calls to functions, which are implemented in +`arch/s390/dfltcc_*` files. The functions can be grouped in three broad +categories: + +* Base DFLTCC support, e.g. wrapping the machine instruction - `dfltcc()`. +* Translating between software and hardware data formats, e.g. + `dfltcc_deflate_set_dictionary()`. +* Translating between software and hardware state machines, e.g. + `dfltcc_deflate()` and `dfltcc_inflate()`. + +The functions from the first two categories are fairly simple, however, +various quirks in both software and hardware state machines make the +functions from the third category quite complicated. + +### `dfltcc_deflate()` function + +This function is called by `deflate()` and has the following +responsibilities: + +* Checking whether DFLTCC can be used with the current stream. If this + is not the case, then it returns `0`, making `deflate()` use some + other function in order to compress in software. Otherwise it returns + `1`. +* Block management and Huffman table generation. DFLTCC ends blocks only + when explicitly instructed to do so by the software. Furthermore, + whether to use fixed or dynamic Huffman tables must also be determined + by the software. Since looking at data in order to gather statistics + would negate performance benefits, the following approach is used: the + first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed + block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into + dynamic blocks. +* Writing EOBS. 
Block Closing Control bit in the parameter block + instructs DFLTCC to write EOBS, however, certain conditions need to be + met: input data length must be non-zero or Continuation Flag must be + set. To put this in simpler terms, DFLTCC will silently refuse to + write EOBS if this is the only thing that it is asked to do. Since the + code has to be able to emit EOBS in software anyway, in order to avoid + tricky corner cases Block Closing Control is never used. Whether to + write EOBS is instead controlled by `soft_bcc` variable. +* Triggering block post-processing. Depending on flush mode, `deflate()` + must perform various additional actions when a block or a stream ends. + `dfltcc_deflate()` informs `deflate()` about this using + `block_state *result` parameter. +* Converting software state fields into hardware parameter block fields, + and vice versa. For example, `wrap` and Check Value Type or `bi_valid` + and Sub-Byte Boundary. Certain fields cannot be translated and must + persist untouched in the parameter block between calls, for example, + Continuation Flag or Continuation State Buffer. +* Handling flush modes and low-memory situations. These aspects are + quite intertwined and pervasive. The general idea here is that the + code must not do anything in software - whether explicitly by e.g. + calling `send_eobs()`, or implicitly - by returning to `deflate()` + with certain return and `*result` values, when Continuation Flag is + set. +* Ending streams. When a new block is started and flush mode is + `Z_FINISH`, Block Header Final parameter block bit is used to mark + this block as final. However, sometimes an empty final block is + needed, and, unfortunately, just like with EOBS, DFLTCC will silently + refuse to do this. The general idea of DFLTCC implementation is to + rely as much as possible on the existing code. 
Here in order to do + this, the code pretends that it does not support DFLTCC, which makes + `deflate()` call a software compression function, which writes an + empty final block. Whether this is required is controlled by + `need_empty_block` variable. +* Error handling. This is simply converting + Operation-Ending-Supplemental Code to string. Errors can only happen + due to things like memory corruption, and therefore they don't affect + the `deflate()` return code. + +### `dfltcc_inflate()` function + +This function is called by `inflate()` from the `TYPEDO` state (that is, +when all the metadata is parsed and the stream is positioned at the type +bits of deflate block header) and it's responsible for the following: + +* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`. + Unfortunately, there is no way to ask DFLTCC to stop decompressing on + block or tree boundary. +* `inflate()` decompression loop management. This is controlled using + the return value, which can be either `DFLTCC_INFLATE_BREAK` or + `DFLTCC_INFLATE_CONTINUE`. +* Converting software state fields into hardware parameter block fields, + and vice versa. For example, `whave` and History Length or `wnext` and + History Offset. +* Ending streams. This instructs `inflate()` to return `Z_STREAM_END` + and is controlled by `last` state field. +* Error handling. Like deflate, error handling comprises + Operation-Ending-Supplemental Code to string conversion. Unlike + deflate, errors may happen due to bad inputs, therefore they are + propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`. + +# Testing + +Given complexity of DFLTCC machine instruction, it is not clear whether +QEMU TCG will ever support it. At the time of writing, one has to have +access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since +DFLTCC is a non-privileged instruction, neither special VM/LPAR +configuration nor root are required. 
+ +zlib-ng CI uses an IBM-provided z15 self-hosted builder for the DFLTCC +testing. There is no official IBM Z GitHub Actions runner, so we build +one inspired by `anup-kodlekere/gaplib`. +Future updates to actions-runner might need an updated patch. The .net +version number patch has been separated into a separate file to avoid a +need for constantly changing the patch. + +## Configuring the builder. + +### Install prerequisites. +``` +sudo dnf install podman +``` + +### Create a config file, needs github personal access token. +Access token needs permissions; Repo Admin RW, Org Self-hosted runners RW. +For details, consult +https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository + +#### Create file /etc/actions-runner: +``` +REPO=<owner>/<name> +PAT_TOKEN=<github_pat_***> +``` + +#### Set permissions on /etc/actions-runner: +``` +chmod 600 /etc/actions-runner +``` + +### Add actions-runner service. +``` +sudo cp self-hosted-builder/actions-runner.service /etc/systemd/system/ +sudo systemctl daemon-reload +``` + +### Autostart actions-runner. +``` +$ sudo systemctl enable --now actions-runner +``` + +### Add auto-rebuild cronjob +``` +sudo cp self-hosted-builder/actions-runner-rebuild.sh /etc/cron.weekly/ +chmod +x /etc/cron.weekly/actions-runner-rebuild.sh +``` + +## Building / Rebuilding the container +``` +sudo /etc/cron.weekly/actions-runner-rebuild.sh +``` diff --git a/neozip/arch/s390/crc32-vx.c b/neozip/arch/s390/crc32-vx.c new file mode 100644 index 0000000000..ba00f9a370 --- /dev/null +++ b/neozip/arch/s390/crc32-vx.c @@ -0,0 +1,232 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). 
#ifdef S390_CRC32_VX

#include "zbuild.h"
#include "arch_functions.h"

#include <vecintrin.h>

/* 128-bit vector views used by the VGFM-based folding kernel. */
typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));

/* Core folding kernel: bitreflected CRC-32 over buf[0..len).
 * Precondition: len >= 64 and len is a multiple of 16 (both are guaranteed
 * by the caller, crc32_s390_vx, which peels the unaligned head and tail). */
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
    /*
     * The CRC-32 constant block contains reduction constants to fold and
     * process particular chunks of the input data stream in parallel.
     *
     * For the CRC-32 variants, the constants are precomputed according to
     * these definitions:
     *
     *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
     *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
     *      R3 = [(x128+32 mod P'(x) << 32)]' << 1
     *      R4 = [(x128-32 mod P'(x) << 32)]' << 1
     *      R5 = [(x64 mod P'(x) << 32)]' << 1
     *      R6 = [(x32 mod P'(x) << 32)]' << 1
     *
     * The bitreflected Barret reduction constant, u', is defined as
     * the bit reversal of floor(x**64 / P(x)).
     *
     * where P(x) is the polynomial in the normal domain and the P'(x) is the
     * polynomial in the reversed (bitreflected) domain.
     *
     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
     *
     *      P(x)  = 0x04C11DB7
     *      P'(x) = 0xEDB88320
     */
    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};   /* R2, R1 */
    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};   /* R4, R3 */
    const uv2di r5 = {0, 0x163CD6124};               /* R5 */
    const uv2di ru_poly = {0, 0x1F7011641};          /* u' */
    const uv2di crc_poly = {0, 0x1DB710641};         /* P'(x) << 1 */

    /*
     * Load the initial CRC value.
     *
     * The CRC value is loaded into the rightmost word of the
     * vector register and is later XORed with the LSB portion
     * of the loaded input data.
     */
    uv2di v0 = {0, 0};
    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);

    /* Load a 64-byte data chunk and XOR with CRC */
    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);

    v1 ^= v0;
    buf += 64;
    len -= 64;

    while (len >= 64) {
        /* Load the next 64-byte data chunk */
        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);

        /*
         * Perform a GF(2) multiplication of the doublewords in V1 with
         * the R1 and R2 reduction constants in V0. The intermediate result
         * is then folded (accumulated) with the next data chunk in PART1 and
         * stored in V1. Repeat this step for the register contents
         * in V2, V3, and V4 respectively.
         */
        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
     * and R4 and accumulating the next 128-bit chunk until a single 128-bit
     * value remains.
     */
    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);

    /* Consume any remaining whole 16-byte chunks one at a time. */
    while (len >= 16) {
        /* Load next data chunk */
        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);

        /* Fold next data chunk */
        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);

        buf += 16;
        len -= 16;
    }

    /*
     * Set up a vector register for byte shifts.  The shift value must
     * be loaded in bits 1-4 in byte element 7 of a vector register.
     * Shift by 8 bytes: 0x40
     * Shift by 4 bytes: 0x20
     */
    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    v9 = vec_insert((unsigned char)0x40, v9, 7);

    /*
     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
     * to move R4 into the rightmost doubleword and set the leftmost
     * doubleword to 0x1.
     */
    v0 = vec_srb(r4r3, (uv2di)v9);
    v0[0] = 1;

    /*
     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
     * multiplied by 0x1 and is then XORed with rightmost product.
     * Implicitly, the intermediate leftmost product becomes padded
     */
    v1 = (uv2di)vec_gfmsum_128(v0, v1);

    /*
     * Now do the final 32-bit fold by multiplying the rightmost word
     * in V1 with R5 and XOR the result with the remaining bits in V1.
     *
     * To achieve this by a single VGFMAG, right shift V1 by a word
     * and store the result in V2 which is then accumulated.  Use the
     * vector unpack instruction to load the rightmost half of the
     * doubleword into the rightmost doubleword element of V1; the other
     * half is loaded in the leftmost doubleword.
     * The vector register with CONST_R5 contains the R5 constant in the
     * rightmost doubleword and the leftmost doubleword is zero to ignore
     * the leftmost product of V1.
     */
    v9 = vec_insert((unsigned char)0x20, v9, 7);
    v2 = vec_srb(v1, (uv2di)v9);
    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);

    /*
     * Apply a Barret reduction to compute the final 32-bit CRC value.
     *
     * The input values to the Barret reduction are the degree-63 polynomial
     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
     * constant u.  The Barret reduction result is the CRC value of R(x) mod
     * P(x).
     *
     * The Barret reduction algorithm is defined as:
     *
     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
     *    3. C(x)  = R(x) XOR T2(x) mod x^32
     *
     * Note: The leftmost doubleword of vector register containing
     * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
     * is zero and does not contribute to the final result.
     */

    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
    v2 = vec_unpackl((uv4si)v1);
    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);

    /*
     * Compute the GF(2) product of the CRC polynomial with T1(x) in
     * V2 and XOR the intermediate result, T2(x), with the value in V1.
     * The final result is stored in word element 2 of V2.
     */
    v2 = vec_unpackl((uv4si)v2);
    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);

    return ((uv4si)v2)[2];
}

/* Minimum input size for the vector kernel, and the 16-byte alignment the
 * kernel's loads require. */
#define VX_MIN_LEN 64
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)

/* Public entry point: dispatch between the braid fallback (short or
 * unaligned spans) and the vector kernel (aligned bulk). */
uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t len) {
    size_t prealign, aligned, remaining;

    /* Not worth vectorizing: handle entirely in software. */
    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
        return crc32_braid(crc, buf, len);

    /* Consume the unaligned head so the kernel sees 16-byte-aligned loads. */
    if ((uintptr_t)buf & VX_ALIGN_MASK) {
        prealign = (size_t)ALIGN_DIFF(buf, VX_ALIGNMENT);
        len -= prealign;
        crc = crc32_braid(crc, buf, prealign);
        buf += prealign;
    }
    aligned = ALIGN_DOWN(len, VX_ALIGNMENT);
    remaining = len & VX_ALIGN_MASK;

    /* The kernel works on the bit-inverted CRC state. */
    crc = ~crc32_le_vgfm_16(~crc, buf, aligned);

    if (remaining)
        crc = crc32_braid(crc, buf + aligned, remaining);

    return crc;
}

/* CRC-32-and-copy wrapper: checksum src, then duplicate it into dst. */
Z_INTERNAL uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    crc = crc32_s390_vx(crc, src, len);
    memcpy(dst, src, len);
    return crc;
}

#endif
/*
   Parameter Block for Generate Dynamic-Huffman Table, Compress and Expand.
   Field layout mirrors the DFLTCC parameter block defined by the hardware;
   the numeric suffixes on reserved fields are their bit offsets within the
   block, so the layout can be checked against the architecture document.
 */
struct dfltcc_param_v0 {
    uint16_t pbvn;                     /* Parameter-Block-Version Number */
    uint8_t mvn;                       /* Model-Version Number */
    uint8_t ribm;                      /* Reserved for IBM use */
    uint32_t reserved32 : 31;
    uint32_t cf : 1;                   /* Continuation Flag */
    uint8_t reserved64[8];
    uint32_t nt : 1;                   /* New Task */
    uint32_t reserved129 : 1;
    uint32_t cvt : 1;                  /* Check Value Type */
    uint32_t reserved131 : 1;
    uint32_t htt : 1;                  /* Huffman-Table Type */
    uint32_t bcf : 1;                  /* Block-Continuation Flag */
    uint32_t bcc : 1;                  /* Block Closing Control */
    uint32_t bhf : 1;                  /* Block Header Final */
    uint32_t reserved136 : 1;
    uint32_t reserved137 : 1;
    uint32_t dhtgc : 1;                /* DHT Generation Control */
    uint32_t reserved139 : 5;
    uint32_t reserved144 : 5;
    uint32_t sbb : 3;                  /* Sub-Byte Boundary */
    uint8_t oesc;                      /* Operation-Ending-Supplemental Code */
    uint32_t reserved160 : 12;
    uint32_t ifs : 4;                  /* Incomplete-Function Status */
    uint16_t ifl;                      /* Incomplete-Function Length */
    uint8_t reserved192[8];
    uint8_t reserved256[8];
    uint8_t reserved320[4];
    uint16_t hl;                       /* History Length */
    uint32_t reserved368 : 1;
    uint16_t ho : 15;                  /* History Offset */
    uint32_t cv;                       /* Check Value */
    uint32_t eobs : 15;                /* End-of-block Symbol */
    uint32_t reserved431: 1;
    uint8_t eobl : 4;                  /* End-of-block Length */
    uint32_t reserved436 : 12;
    uint32_t reserved448 : 4;
    uint16_t cdhtl : 12;               /* Compressed-Dynamic-Huffman Table
                                          Length */
    uint8_t reserved464[6];
    uint8_t cdht[288];                 /* Compressed-Dynamic-Huffman Table */
    uint8_t reserved[24];
    uint8_t ribm2[8];                  /* Reserved for IBM use */
    uint8_t csb[1152];                 /* Continuation-State Buffer */
} ALIGNED_(8);
/* Deflate-side architecture state: the shared DFLTCC block plus the tuning
   knobs that decide when hardware compression is used and when new deflate
   blocks are started. */
typedef struct {
    struct dfltcc_state common;
    uint16_t level_mask;      /* Levels on which to use DFLTCC */
    uint32_t block_size;      /* New block each X bytes */
    size_t block_threshold;   /* New block after total_in > X */
    uint32_t dht_threshold;   /* New block only if avail_in >= X */
} arch_deflate_state;

/* Inflate-side architecture state: only the shared DFLTCC block. */
typedef struct {
    struct dfltcc_state common;
} arch_inflate_state;

/*
   History buffer size.
 */
#define HB_BITS 15
#define HB_SIZE (1 << HB_BITS)

/*
   Sizes of deflate block parts (worst-case bit counts used by the
   DEFLATE_BOUND_COMPLEN estimate below).
 */
#define DFLTCC_BLOCK_HEADER_BITS 3
#define DFLTCC_HLITS_COUNT_BITS 5
#define DFLTCC_HDISTS_COUNT_BITS 5
#define DFLTCC_HCLENS_COUNT_BITS 4
#define DFLTCC_MAX_HCLENS 19
#define DFLTCC_HCLEN_BITS 3
#define DFLTCC_MAX_HLITS 286
#define DFLTCC_MAX_HDISTS 30
#define DFLTCC_MAX_HLIT_HDIST_BITS 7
#define DFLTCC_MAX_SYMBOL_BITS 16
#define DFLTCC_MAX_EOBS_BITS 15
#define DFLTCC_MAX_PADDING_BITS 7

/* Worst-case compressed length, in bytes, of source_len input: dynamic
   block header + one maximum-length symbol per input byte + EOB + padding,
   rounded down to bytes by the final >> 3. */
#define DEFLATE_BOUND_COMPLEN(source_len) \
    ((DFLTCC_BLOCK_HEADER_BITS + \
      DFLTCC_HLITS_COUNT_BITS + \
      DFLTCC_HDISTS_COUNT_BITS + \
      DFLTCC_HCLENS_COUNT_BITS + \
      DFLTCC_MAX_HCLENS * DFLTCC_HCLEN_BITS + \
      (DFLTCC_MAX_HLITS + DFLTCC_MAX_HDISTS) * DFLTCC_MAX_HLIT_HDIST_BITS + \
      (source_len) * DFLTCC_MAX_SYMBOL_BITS + \
      DFLTCC_MAX_EOBS_BITS + \
      DFLTCC_MAX_PADDING_BITS) >> 3)
/* Reset the DFLTCC deflate state: clear the parameter block and reload the
   compile-time tuning parameters. Called from deflateResetKeep(). */
void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;
    arch_deflate_state *dfltcc_state = &state->arch;

    dfltcc_reset_state(&dfltcc_state->common);

    /* Initialize tuning parameters */
    dfltcc_state->level_mask = DFLTCC_LEVEL_MASK;
    dfltcc_state->block_size = DFLTCC_BLOCK_SIZE;
    dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE;
    dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE;
}

/* Decide whether DFLTCC can compress with the given settings.
   Returns 1 when the level is enabled in level_mask, the window matches the
   hardware history size, the strategy is supported, reproducible output was
   not requested, and the machine reports the GDHT/CMPR functions and
   format 0. Returns 0 otherwise (caller falls back to software). */
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
                                                 int reproducible) {
    deflate_state *state = (deflate_state *)strm->state;
    arch_deflate_state *dfltcc_state = &state->arch;

    /* Unsupported compression settings */
    if ((dfltcc_state->level_mask & (1 << level)) == 0)
        return 0;
    if (window_bits != HB_BITS)
        return 0;
    if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
        return 0;
    /* DFLTCC output is not bit-exact reproducible (see arch/s390 README). */
    if (reproducible)
        return 0;

    /* Unsupported hardware */
    if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) ||
        !is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) ||
        !is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0))
        return 0;

    return 1;
}

/* Convenience wrapper: check the stream's current parameters. */
int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;

    return dfltcc_can_deflate_with_params(strm, state->level, W_BITS(state), state->strategy, state->reproducible);
}

/* Generate a dynamic Huffman table from a sample of the input.
   Note: avail_in is passed through a local so the stream's counter is not
   consumed — GDHT only samples the data, it does not compress it. */
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &state->arch.common.param;
    size_t avail_in = strm->avail_in;

    dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
}
dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_CMPR | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->total_in += (strm->avail_in - avail_in); + strm->total_out += (strm->avail_out - avail_out); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) { + deflate_state *state = (deflate_state *)strm->state; + + send_bits(state, bi_reverse((uint16_t)(param->eobs >> (15 - param->eobl)), param->eobl), + param->eobl, state->bi_buf, state->bi_valid); + + flush_pending_inline(strm); + if (state->pending != 0) { + /* The remaining data is located in pending_out[0:pending]. If someone + * calls put_byte() - this might happen in deflate() - the byte will be + * placed into pending_buf[pending], which is incorrect. Move the + * remaining data to the beginning of pending_buf so that put_byte() is + * usable again. + */ + memmove(state->pending_buf, state->pending_out, state->pending); + state->pending_out = state->pending_buf; + } +#ifdef ZLIB_DEBUG + state->compressed_len += param->eobl; +#endif +} + +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result) { + deflate_state *state = (deflate_state *)strm->state; + arch_deflate_state *dfltcc_state = &state->arch; + struct dfltcc_param_v0 *param = &dfltcc_state->common.param; + uInt masked_avail_in; + dfltcc_cc cc; + int need_empty_block; + int soft_bcc; + int no_flush; + + if (!PREFIX(dfltcc_can_deflate)(strm)) { + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + return 0; + } + +again: + masked_avail_in = 0; + soft_bcc = 0; + no_flush = flush == Z_NO_FLUSH; + + /* No input data. 
Return, except when Continuation Flag is set, which means + * that DFLTCC has buffered some output in the parameter block and needs to + * be called again in order to flush it. + */ + if (strm->avail_in == 0 && !param->cf) { + /* A block is still open, and the hardware does not support closing + * blocks without adding data. Thus, close it manually. + */ + if (!no_flush && param->bcf) { + send_eobs(strm, param); + param->bcf = 0; + } + /* Let one of deflate_* functions write a trailing empty block. */ + if (flush == Z_FINISH) + return 0; + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + /* Trigger block post-processing if necessary. */ + *result = no_flush ? need_more : block_done; + return 1; + } + + /* There is an open non-BFINAL block, we are not going to close it just + * yet, we have compressed more than DFLTCC_BLOCK_SIZE bytes and we see + * more than DFLTCC_DHT_MIN_SAMPLE_SIZE bytes. Open a new block with a new + * DHT in order to adapt to a possibly changed input data distribution. + */ + if (param->bcf && no_flush && + strm->total_in > dfltcc_state->block_threshold && + strm->avail_in >= dfltcc_state->dht_threshold) { + if (param->cf) { + /* We need to flush the DFLTCC buffer before writing the + * End-of-block Symbol. Mask the input data and proceed as usual. + */ + masked_avail_in += strm->avail_in; + strm->avail_in = 0; + no_flush = 0; + } else { + /* DFLTCC buffer is empty, so we can manually write the + * End-of-block Symbol right away. + */ + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } + } + + /* No space for compressed data. If we proceed, dfltcc_cmpr() will return + * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still + * set BCF=1, which is wrong. Avoid complications and return early. + */ + if (strm->avail_out == 0) { + *result = need_more; + return 1; + } + + /* The caller gave us too much data. 
Pass only one block worth of + * uncompressed data to DFLTCC and mask the rest, so that on the next + * iteration we start a new block. + */ + if (no_flush && strm->avail_in > dfltcc_state->block_size) { + masked_avail_in += (strm->avail_in - dfltcc_state->block_size); + strm->avail_in = dfltcc_state->block_size; + } + + /* When we have an open non-BFINAL deflate block and caller indicates that + * the stream is ending, we need to close an open deflate block and open a + * BFINAL one. + */ + need_empty_block = flush == Z_FINISH && param->bcf && !param->bhf; + + /* Translate stream to parameter block */ + param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32; + if (!no_flush) + /* We need to close a block. Always do this in software - when there is + * no input data, the hardware will not honor BCC. */ + soft_bcc = 1; + if (flush == Z_FINISH && !param->bcf) + /* We are about to open a BFINAL block, set Block Header Final bit + * until the stream ends. + */ + param->bhf = 1; + /* DFLTCC-CMPR will write to next_out, so make sure that buffers with + * higher precedence are empty. + */ + Assert(state->pending == 0, "There must be no pending bytes"); + Assert(state->bi_valid < 8, "There must be less than 8 pending bits"); + param->sbb = (unsigned int)state->bi_valid; + if (param->sbb > 0) + *strm->next_out = (unsigned char)state->bi_buf; + /* Honor history and check value */ + param->nt = 0; + if (state->wrap == 1) + param->cv = strm->adler; + else if (state->wrap == 2) + param->cv = ZSWAP32(strm->adler); + + /* When opening a block, choose a Huffman-Table Type */ + if (!param->bcf) { + if (state->strategy == Z_FIXED || (strm->total_in == 0 && dfltcc_state->block_threshold > 0)) + param->htt = HTT_FIXED; + else { + param->htt = HTT_DYNAMIC; + dfltcc_gdht(strm); + } + } + + /* Deflate */ + do { + cc = dfltcc_cmpr(strm); + if (strm->avail_in < 4096 && masked_avail_in > 0) + /* We are about to call DFLTCC with a small input buffer, which is + * inefficient. 
Since there is masked data, there will be at least + * one more DFLTCC call, so skip the current one and make the next + * one handle more data. + */ + break; + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc); + state->bi_valid = param->sbb; + if (state->bi_valid == 0) + state->bi_buf = 0; /* Avoid accessing next_out */ + else + state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1); + if (state->wrap == 1) + strm->adler = param->cv; + else if (state->wrap == 2) + strm->adler = ZSWAP32(param->cv); + + /* Unmask the input data */ + strm->avail_in += masked_avail_in; + masked_avail_in = 0; + + /* If we encounter an error, it means there is a bug in DFLTCC call */ + Assert(cc != DFLTCC_CC_OP2_CORRUPT || param->oesc == 0, "BUG"); + + /* Update Block-Continuation Flag. It will be used to check whether to call + * GDHT the next time. + */ + if (cc == DFLTCC_CC_OK) { + if (soft_bcc) { + send_eobs(strm, param); + param->bcf = 0; + dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; + } else + param->bcf = 1; + if (flush == Z_FINISH) { + if (need_empty_block) + /* Make the current deflate() call also close the stream */ + return 0; + else { + bi_windup(state); + *result = finish_done; + } + } else { + if (flush == Z_FULL_FLUSH) + param->hl = 0; /* Clear history */ + *result = flush == Z_NO_FLUSH ? need_more : block_done; + } + } else { + param->bcf = 1; + *result = need_more; + } + if (strm->avail_in != 0 && strm->avail_out != 0) + goto again; /* deflate() must use all input or all output */ + return 1; +} + +/* + Switching between hardware and software compression. + + DFLTCC does not support all zlib settings, e.g. generation of non-compressed + blocks or alternative window sizes. When such settings are applied on the + fly with deflateParams, we need to convert between hardware and software + window formats. 
+*/ +static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + return strm->total_in > 0 || param->nt == 0 || param->hl > 0; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush) { + deflate_state *state = (deflate_state *)strm->state; + int could_deflate = PREFIX(dfltcc_can_deflate)(strm); + int can_deflate = dfltcc_can_deflate_with_params(strm, level, W_BITS(state), strategy, state->reproducible); + + if (can_deflate == could_deflate) + /* We continue to work in the same mode - no changes needed */ + return Z_OK; + + if (!dfltcc_was_deflate_used(strm)) + /* DFLTCC was not used yet - no changes needed */ + return Z_OK; + + /* For now, do not convert between window formats - simply get rid of the old data instead */ + *flush = Z_FULL_FLUSH; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might + * close the block without resetting the compression state. Detect this + * situation and return that deflation is not done. + */ + if (flush == Z_FULL_FLUSH && strm->avail_out == 0) + return 0; + + /* Return that deflation is not done if DFLTCC is used and either it + * buffered some data (Continuation Flag is set), or has not written EOBS + * yet (Block-Continuation Flag is set). + */ + return !PREFIX(dfltcc_can_deflate)(strm) || (!param->cf && !param->bcf); +} + +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible) { + deflate_state *state = (deflate_state *)strm->state; + + return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm); +} + +/* + Preloading history. 
+*/ +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + append_history(param, state->window, dictionary, dict_length); + state->strstart = 1; /* Add FDICT to zlib header */ + state->block_start = state->strstart; /* Make deflate_stored happy */ + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} + +#endif diff --git a/neozip/arch/s390/dfltcc_deflate.h b/neozip/arch/s390/dfltcc_deflate.h new file mode 100644 index 0000000000..35e2fd3f62 --- /dev/null +++ b/neozip/arch/s390/dfltcc_deflate.h @@ -0,0 +1,58 @@ +#ifndef DFLTCC_DEFLATE_H +#define DFLTCC_DEFLATE_H + +#include "deflate.h" +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_deflate_state)(PREFIX3(streamp)); +int Z_INTERNAL PREFIX(dfltcc_can_deflate)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_deflate)(PREFIX3(streamp) strm, int flush, block_state *result); +int Z_INTERNAL PREFIX(dfltcc_deflate_params)(PREFIX3(streamp) strm, int level, int strategy, int *flush); +int Z_INTERNAL PREFIX(dfltcc_deflate_done)(PREFIX3(streamp) strm, int flush); +int Z_INTERNAL PREFIX(dfltcc_can_set_reproducible)(PREFIX3(streamp) strm, int reproducible); +int Z_INTERNAL PREFIX(dfltcc_deflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_deflate_get_dictionary)(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length); + +#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ 
+ if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_set_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_deflate)((strm))) \ + return PREFIX(dfltcc_deflate_get_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define DEFLATE_RESET_KEEP_HOOK PREFIX(dfltcc_reset_deflate_state) + +#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \ + do { \ + int err; \ +\ + err = PREFIX(dfltcc_deflate_params)((strm), (level), (strategy), (hook_flush)); \ + if (err == Z_STREAM_ERROR) \ + return err; \ + } while (0) + +#define DEFLATE_DONE PREFIX(dfltcc_deflate_done) + +#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \ + do { \ + if (deflateStateCheck((strm)) || PREFIX(dfltcc_can_deflate)((strm))) \ + (complen) = DEFLATE_BOUND_COMPLEN(source_len); \ + } while (0) + +#define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_HOOK PREFIX(dfltcc_deflate) + +#define DEFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_deflate)((strm))) + +#define DEFLATE_CAN_SET_REPRODUCIBLE PREFIX(dfltcc_can_set_reproducible) + +#define DEFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE) + +#endif diff --git a/neozip/arch/s390/dfltcc_detail.h b/neozip/arch/s390/dfltcc_detail.h new file mode 100644 index 0000000000..f790735ab4 --- /dev/null +++ b/neozip/arch/s390/dfltcc_detail.h @@ -0,0 +1,274 @@ +#include "zbuild.h" +#include "zsanitizer.h" +#include <stdio.h> + +#ifdef HAVE_SYS_SDT_H +#include <sys/sdt.h> +#endif + +/* + Tuning parameters. 
+ */ +#ifndef DFLTCC_LEVEL_MASK +#define DFLTCC_LEVEL_MASK 0x2 +#endif +#ifndef DFLTCC_BLOCK_SIZE +#define DFLTCC_BLOCK_SIZE 1048576 +#endif +#ifndef DFLTCC_FIRST_FHT_BLOCK_SIZE +#define DFLTCC_FIRST_FHT_BLOCK_SIZE 4096 +#endif +#ifndef DFLTCC_DHT_MIN_SAMPLE_SIZE +#define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096 +#endif +#ifndef DFLTCC_RIBM +#define DFLTCC_RIBM 0 +#endif + +#define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1] + +#define DFLTCC_SIZEOF_QAF 32 +static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf); + +static inline int is_bit_set(const char *bits, int n) { + return bits[n / 8] & (1 << (7 - (n % 8))); +} + +static inline void clear_bit(char *bits, int n) { + bits[n / 8] &= ~(1 << (7 - (n % 8))); +} + +#define DFLTCC_FACILITY 151 + +static inline int is_dfltcc_enabled(void) { + uint64_t facilities[(DFLTCC_FACILITY / 64) + 1]; + Z_REGISTER uint8_t r0 __asm__("r0"); + + memset(facilities, 0, sizeof(facilities)); + r0 = sizeof(facilities) / sizeof(facilities[0]) - 1; + /* STFLE is supported since z9-109 and only in z/Architecture mode. When + * compiling with -m31, gcc defaults to ESA mode, however, since the kernel + * is 64-bit, it's always z/Architecture mode at runtime. 
+ */ + __asm__ volatile( +#ifndef __clang__ + ".machinemode push\n" + ".machinemode zarch\n" +#endif + "stfle %[facilities]\n" +#ifndef __clang__ + ".machinemode pop\n" +#endif + : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc"); + return is_bit_set((const char *)facilities, DFLTCC_FACILITY); +} + +#define DFLTCC_FMT0 0 + +#define CVT_CRC32 0 +#define CVT_ADLER32 1 +#define HTT_FIXED 0 +#define HTT_DYNAMIC 1 + +#define DFLTCC_SIZEOF_GDHT_V0 384 +#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536 +static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0); +static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0); + +static inline z_const char *oesc_msg(char *buf, int oesc) { + if (oesc == 0x00) + return NULL; /* Successful completion */ + else { + sprintf(buf, "Operation-Ending-Supplemental Code is 0x%.2X", oesc); + return buf; + } +} + +/* + C wrapper for the DEFLATE CONVERSION CALL instruction. + */ +typedef enum { + DFLTCC_CC_OK = 0, + DFLTCC_CC_OP1_TOO_SHORT = 1, + DFLTCC_CC_OP2_TOO_SHORT = 2, + DFLTCC_CC_OP2_CORRUPT = 2, + DFLTCC_CC_AGAIN = 3, +} dfltcc_cc; + +#define DFLTCC_QAF 0 +#define DFLTCC_GDHT 1 +#define DFLTCC_CMPR 2 +#define DFLTCC_XPND 4 +#define HBT_CIRCULAR (1 << 7) +#define DFLTCC_FN_MASK ((1 << 7) - 1) + +/* Return lengths of high (starting at param->ho) and low (starting at 0) fragments of the circular history buffer. */ +static inline void get_history_lengths(struct dfltcc_param_v0 *param, size_t *hl_high, size_t *hl_low) { + *hl_high = MIN(param->hl, HB_SIZE - param->ho); + *hl_low = param->hl - *hl_high; +} + +/* Notify instrumentation about an upcoming read/write access to the circular history buffer. 
*/ +static inline void instrument_read_write_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + instrument_read_write(hist + param->ho, hl_high); + instrument_read_write(hist, hl_low); +} + +/* Notify MSan about a completed write to the circular history buffer. */ +static inline void msan_unpoison_hist(struct dfltcc_param_v0 *param, void *hist) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + __msan_unpoison(hist + param->ho, hl_high); + __msan_unpoison(hist, hl_low); +} + +static inline dfltcc_cc dfltcc(int fn, void *param, + unsigned char **op1, size_t *len1, + z_const unsigned char **op2, size_t *len2, void *hist) { + unsigned char *t2 = op1 ? *op1 : NULL; + unsigned char *orig_t2 = t2; + size_t t3 = len1 ? *len1 : 0; + z_const unsigned char *t4 = op2 ? *op2 : NULL; + size_t t5 = len2 ? *len2 : 0; + Z_REGISTER int r0 __asm__("r0"); + Z_REGISTER void *r1 __asm__("r1"); + Z_REGISTER unsigned char *r2 __asm__("r2"); + Z_REGISTER size_t r3 __asm__("r3"); + Z_REGISTER z_const unsigned char *r4 __asm__("r4"); + Z_REGISTER size_t r5 __asm__("r5"); + int cc; + + /* Insert pre-instrumentation for DFLTCC. 
*/ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + instrument_write(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + instrument_read_write(param, DFLTCC_SIZEOF_GDHT_V0); + instrument_read(t4, t5); + break; + case DFLTCC_CMPR: + case DFLTCC_XPND: + instrument_read_write(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + instrument_read(t4, t5); + instrument_write(t2, t3); + instrument_read_write_hist(param, hist); + break; + } + + r0 = fn; r1 = param; r2 = t2; r3 = t3; r4 = t4; r5 = t5; + __asm__ volatile( +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n" +#ifdef HAVE_SYS_SDT_H + STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5)) +#endif + "ipm %[cc]\n" + : [r2] "+r" (r2) + , [r3] "+r" (r3) + , [r4] "+r" (r4) + , [r5] "+r" (r5) + , [cc] "=r" (cc) + : [r0] "r" (r0) + , [r1] "r" (r1) + , [hist] "r" (hist) +#ifdef HAVE_SYS_SDT_H + , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist) +#endif + : "cc", "memory"); + t2 = r2; t3 = r3; t4 = r4; t5 = r5; + + /* Insert post-instrumentation for DFLTCC. */ + switch (fn & DFLTCC_FN_MASK) { + case DFLTCC_QAF: + __msan_unpoison(param, DFLTCC_SIZEOF_QAF); + break; + case DFLTCC_GDHT: + __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0); + break; + case DFLTCC_CMPR: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 
0 : 1)); + msan_unpoison_hist(param, hist); + break; + case DFLTCC_XPND: + __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0); + __msan_unpoison(orig_t2, t2 - orig_t2); + msan_unpoison_hist(param, hist); + break; + } + + if (op1) + *op1 = t2; + if (len1) + *len1 = t3; + if (op2) + *op2 = t4; + if (len2) + *len2 = t5; + return (cc >> 28) & 3; +} + +static inline void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) { + /* Initialize available functions */ + if (is_dfltcc_enabled()) { + dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL); + memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af)); + } else + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + + /* Initialize parameter block */ + memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param)); + dfltcc_state->param.nt = 1; + dfltcc_state->param.ribm = DFLTCC_RIBM; +} + +static inline void dfltcc_copy_state(void *dst, const void *src, uInt size, uInt extension_size) { + memcpy(dst, src, ALIGN_UP(size, 8) + extension_size); +} + +static inline void append_history(struct dfltcc_param_v0 *param, unsigned char *history, + const unsigned char *buf, uInt count) { + size_t offset; + size_t n; + + /* Do not use more than 32K */ + if (count > HB_SIZE) { + buf += count - HB_SIZE; + count = HB_SIZE; + } + offset = (param->ho + param->hl) % HB_SIZE; + if (offset + count <= HB_SIZE) + /* Circular history buffer does not wrap - copy one chunk */ + memcpy(history + offset, buf, count); + else { + /* Circular history buffer wraps - copy two chunks */ + n = HB_SIZE - offset; + memcpy(history + offset, buf, n); + memcpy(history, buf + n, count - n); + } + n = param->hl + count; + if (n <= HB_SIZE) + /* All history fits into buffer - no need to discard anything */ + param->hl = n; + else { + /* History does not fit into buffer - discard extra bytes */ + param->ho = (param->ho + (n - HB_SIZE)) % HB_SIZE; + param->hl = HB_SIZE; + } +} + +static inline void get_history(struct 
dfltcc_param_v0 *param, const unsigned char *history, + unsigned char *buf) { + size_t hl_high, hl_low; + + get_history_lengths(param, &hl_high, &hl_low); + memcpy(buf, history + param->ho, hl_high); + memcpy(buf + hl_high, history, hl_low); +} diff --git a/neozip/arch/s390/dfltcc_inflate.c b/neozip/arch/s390/dfltcc_inflate.c new file mode 100644 index 0000000000..f6bc423c22 --- /dev/null +++ b/neozip/arch/s390/dfltcc_inflate.c @@ -0,0 +1,195 @@ +/* dfltcc_inflate.c - IBM Z DEFLATE CONVERSION CALL decompression support. */ + +/* + Use the following commands to build zlib-ng with DFLTCC decompression support: + + $ ./configure --with-dfltcc-inflate + or + + $ cmake -DWITH_DFLTCC_INFLATE=1 . + + and then + + $ make +*/ + +#ifdef S390_DFLTCC_INFLATE + +#include "zbuild.h" +#include "zutil.h" +#include "inftrees.h" +#include "inflate.h" +#include "dfltcc_inflate.h" +#include "dfltcc_detail.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + dfltcc_reset_state(&state->arch.common); +} + +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + + /* Unsupported hardware */ + return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0); +} + +static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + size_t avail_in = strm->avail_in; + size_t avail_out = strm->avail_out; + dfltcc_cc cc; + + cc = dfltcc(DFLTCC_XPND | HBT_CIRCULAR, + param, &strm->next_out, &avail_out, + &strm->next_in, &avail_in, state->window); + strm->avail_in = avail_in; + strm->avail_out = avail_out; + return cc; +} + +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) 
strm, int flush, int *ret) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + dfltcc_cc cc; + + if (flush == Z_BLOCK || flush == Z_TREES) { + /* DFLTCC does not support stopping on block boundaries */ + if (PREFIX(dfltcc_inflate_disable)(strm)) { + *ret = Z_STREAM_ERROR; + return DFLTCC_INFLATE_BREAK; + } else + return DFLTCC_INFLATE_SOFTWARE; + } + + if (state->last) { + if (state->bits != 0) { + strm->next_in++; + strm->avail_in--; + state->bits = 0; + } + state->mode = CHECK; + return DFLTCC_INFLATE_CONTINUE; + } + + if (strm->avail_in == 0 && !param->cf) + return DFLTCC_INFLATE_BREAK; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + /* Translate stream to parameter block */ + param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32; + param->sbb = state->bits; + if (param->hl) + param->nt = 0; /* Honor history for the first block */ + if (state->wrap & 4) + param->cv = state->flags ? ZSWAP32(state->check) : state->check; + + /* Inflate */ + do { + cc = dfltcc_xpnd(strm); + } while (cc == DFLTCC_CC_AGAIN); + + /* Translate parameter block to stream */ + strm->msg = oesc_msg(dfltcc_state->msg, param->oesc); + state->last = cc == DFLTCC_CC_OK; + state->bits = param->sbb; + if (state->wrap & 4) + strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv; + if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { + /* Report an error if stream is corrupted */ + state->mode = BAD; + return DFLTCC_INFLATE_CONTINUE; + } + state->mode = TYPEDO; + /* Break if operands are exhausted, otherwise continue looping */ + return (cc == DFLTCC_CC_OP1_TOO_SHORT || cc == DFLTCC_CC_OP2_TOO_SHORT) ? 
+ DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE; +} + +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + + return !state->arch.common.param.nt; +} + +/* + Rotates a circular buffer. + The implementation is based on https://cplusplus.com/reference/algorithm/rotate/ + */ +static void rotate(unsigned char *start, unsigned char *pivot, unsigned char *end) { + unsigned char *p = pivot; + unsigned char tmp; + + while (p != start) { + tmp = *start; + *start = *p; + *p = tmp; + + start++; + p++; + + if (p == end) + p = pivot; + else if (start == pivot) + pivot = p; + } +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = &state->arch.common; + struct dfltcc_param_v0 *param = &dfltcc_state->param; + + if (!PREFIX(dfltcc_can_inflate)(strm)) + return 0; + if (PREFIX(dfltcc_was_inflate_used)(strm)) + /* DFLTCC has already decompressed some data. Since there is not + * enough information to resume decompression in software, the call + * must fail. + */ + return 1; + /* DFLTCC was not used yet - decompress in software */ + memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); + /* Convert the window from the hardware to the software format */ + rotate(state->window, state->window + param->ho, state->window + HB_SIZE); + state->whave = state->wnext = MIN(param->hl, state->wsize); + return 0; +} + +/* + Preloading history. 
+*/ +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + /* if window not in use yet, initialize */ + if (state->wsize == 0) + state->wsize = 1U << state->wbits; + + append_history(param, state->window, dictionary, dict_length); + state->havedict = 1; + return Z_OK; +} + +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt *dict_length) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_param_v0 *param = &state->arch.common.param; + + if (dictionary && state->window) + get_history(param, state->window, dictionary); + if (dict_length) + *dict_length = param->hl; + return Z_OK; +} + +#endif diff --git a/neozip/arch/s390/dfltcc_inflate.h b/neozip/arch/s390/dfltcc_inflate.h new file mode 100644 index 0000000000..3623f8ed7f --- /dev/null +++ b/neozip/arch/s390/dfltcc_inflate.h @@ -0,0 +1,67 @@ +#ifndef DFLTCC_INFLATE_H +#define DFLTCC_INFLATE_H + +#include "dfltcc_common.h" + +void Z_INTERNAL PREFIX(dfltcc_reset_inflate_state)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_can_inflate)(PREFIX3(streamp) strm); +typedef enum { + DFLTCC_INFLATE_CONTINUE, + DFLTCC_INFLATE_BREAK, + DFLTCC_INFLATE_SOFTWARE, +} dfltcc_inflate_action; +dfltcc_inflate_action Z_INTERNAL PREFIX(dfltcc_inflate)(PREFIX3(streamp) strm, int flush, int *ret); +int Z_INTERNAL PREFIX(dfltcc_was_inflate_used)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_disable)(PREFIX3(streamp) strm); +int Z_INTERNAL PREFIX(dfltcc_inflate_set_dictionary)(PREFIX3(streamp) strm, + const unsigned char *dictionary, uInt dict_length); +int Z_INTERNAL PREFIX(dfltcc_inflate_get_dictionary)(PREFIX3(streamp) strm, + unsigned char *dictionary, uInt* dict_length); + +#define INFLATE_RESET_KEEP_HOOK 
PREFIX(dfltcc_reset_inflate_state) + +#define INFLATE_PRIME_HOOK(strm, bits, value) \ + do { if (PREFIX(dfltcc_inflate_disable)((strm))) return Z_STREAM_ERROR; } while (0) + +#define INFLATE_TYPEDO_HOOK(strm, flush) \ + if (PREFIX(dfltcc_can_inflate)((strm))) { \ + dfltcc_inflate_action action; \ +\ + RESTORE(); \ + action = PREFIX(dfltcc_inflate)((strm), (flush), &ret); \ + LOAD(); \ + if (action == DFLTCC_INFLATE_CONTINUE) \ + break; \ + else if (action == DFLTCC_INFLATE_BREAK) \ + goto inf_leave; \ + } + +#define INFLATE_NEED_CHECKSUM(strm) (!PREFIX(dfltcc_can_inflate)((strm))) + +#define INFLATE_NEED_UPDATEWINDOW(strm) (!PREFIX(dfltcc_can_inflate)((strm))) + +#define INFLATE_MARK_HOOK(strm) \ + do { \ + if (PREFIX(dfltcc_was_inflate_used)((strm))) return -(1L << 16); \ + } while (0) + +#define INFLATE_SYNC_POINT_HOOK(strm) \ + do { \ + if (PREFIX(dfltcc_was_inflate_used)((strm))) return Z_STREAM_ERROR; \ + } while (0) + +#define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_inflate)((strm))) \ + return PREFIX(dfltcc_inflate_set_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) \ + do { \ + if (PREFIX(dfltcc_can_inflate)((strm))) \ + return PREFIX(dfltcc_inflate_get_dictionary)((strm), (dict), (dict_len)); \ + } while (0) + +#define INFLATE_ADJUST_WINDOW_SIZE(n) MAX(n, HB_SIZE) + +#endif diff --git a/neozip/arch/s390/s390_features.c b/neozip/arch/s390/s390_features.c new file mode 100644 index 0000000000..dabb578a07 --- /dev/null +++ b/neozip/arch/s390/s390_features.c @@ -0,0 +1,18 @@ +#ifdef S390_FEATURES + +#include "zbuild.h" +#include "s390_features.h" + +#ifdef HAVE_SYS_AUXV_H +# include <sys/auxv.h> +#endif + +#ifndef HWCAP_S390_VXRS +#define HWCAP_S390_VXRS (1 << 11) +#endif + +void Z_INTERNAL s390_check_features(struct s390_cpu_features *features) { + features->has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VXRS; +} + +#endif diff --git 
a/neozip/arch/s390/s390_features.h b/neozip/arch/s390/s390_features.h new file mode 100644 index 0000000000..fb2ac14b26 --- /dev/null +++ b/neozip/arch/s390/s390_features.h @@ -0,0 +1,14 @@ +/* s390_features.h -- check for s390 features. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef S390_FEATURES_H_ +#define S390_FEATURES_H_ + +struct s390_cpu_features { + int has_vx; +}; + +void Z_INTERNAL s390_check_features(struct s390_cpu_features *features); + +#endif diff --git a/neozip/arch/s390/s390_functions.h b/neozip/arch/s390/s390_functions.h new file mode 100644 index 0000000000..30647051f4 --- /dev/null +++ b/neozip/arch/s390/s390_functions.h @@ -0,0 +1,33 @@ +/* s390_functions.h -- s390 implementations for arch-specific functions. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef S390_FUNCTIONS_H_ +#define S390_FUNCTIONS_H_ + +#include "s390_natives.h" + +#ifdef S390_CRC32_VX +uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_s390_vx(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); + +#ifdef __clang__ +# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) +# error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \ + Either disable the zlib-ng CRC32-VX optimization, or switch to another compiler/compiler version. 
+# endif +#endif + +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// S390 - CRC32 VX +# ifdef S390_CRC32_VX_NATIVE +# undef native_crc32 +# define native_crc32 crc32_s390_vx +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_s390_vx +# endif +#endif + +#endif diff --git a/neozip/arch/s390/s390_natives.h b/neozip/arch/s390/s390_natives.h new file mode 100644 index 0000000000..5da913daf5 --- /dev/null +++ b/neozip/arch/s390/s390_natives.h @@ -0,0 +1,14 @@ +/* s390_natives.h -- s390 compile-time feature detection macros. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef S390_NATIVES_H_ +#define S390_NATIVES_H_ + +#if defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__) +# ifdef S390_CRC32_VX +# define S390_CRC32_VX_NATIVE +# endif +#endif + +#endif /* S390_NATIVES_H_ */ diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner b/neozip/arch/s390/self-hosted-builder/actions-runner new file mode 100755 index 0000000000..aabc802547 --- /dev/null +++ b/neozip/arch/s390/self-hosted-builder/actions-runner @@ -0,0 +1,62 @@ +#!/bin/bash + +# +# Ephemeral runner startup script. +# +# Expects the following environment variables: +# +# - REPO=<owner> +# - PAT_TOKEN=<github_pat_***> +# + +set -e -u + +# Validate required environment variables +if [ -z "${REPO:-}" ] || [ -z "${PAT_TOKEN:-}" ]; then + echo "Error: REPO and/or PAT_TOKEN environment variables not found" + exit 1 +fi + +# Check the cached registration token. +TOKEN_FILE=registration-token.json +if [ -f $TOKEN_FILE ]; then + set +e + EXPIRES=$(jq --raw-output .expires_at "$TOKEN_FILE" 2>/dev/null) + STATUS=$? + set -e +else + STATUS=1 + EXPIRES="" +fi + +if [[ $STATUS -ne 0 || -z "$EXPIRES" || "$EXPIRES" == "null" || $(date +%s) -ge $(date -d "$EXPIRES" +%s) ]]; then + # Refresh the cached registration token. 
+    curl \
+        -sS \
+        -X POST \
+        -H "Accept: application/vnd.github+json" \
+        -H "Authorization: Bearer $PAT_TOKEN" \
+        "https://api.github.com/repos/$REPO/actions/runners/registration-token" \
+        -o "$TOKEN_FILE"
+fi
+
+# Quote the expansion: if jq fails and REG_TOKEN is empty, an unquoted
+# expansion would make `[` abort with a syntax error instead of taking
+# the error branch below. Also treat an empty token as a failure.
+REG_TOKEN=$(jq --raw-output .token "$TOKEN_FILE")
+if [ -z "$REG_TOKEN" ] || [ "$REG_TOKEN" = "null" ]; then
+    echo "Failed to get registration token"
+    exit 1
+fi
+
+# (Re-)register the runner.
+./config.sh remove --token "$REG_TOKEN" || true
+set -x
+./config.sh \
+    --url "https://github.com/$REPO" \
+    --token "$REG_TOKEN" \
+    --unattended \
+    --disableupdate \
+    --replace \
+    --labels z15 \
+    --ephemeral
+
+# Run one job.
+./run.sh
diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
new file mode 100644
index 0000000000..7fded31785
--- /dev/null
+++ b/neozip/arch/s390/self-hosted-builder/actions-runner-rebuild.sh
+#!/usr/bin/bash
+set -ex
+
+TMPDIR="$(mktemp -d)"
+
+if [ -f actions-runner.Dockerfile ]; then
+    MODE=1
+    cp actions-runner.Dockerfile actions-runner entrypoint $TMPDIR
+    cd $TMPDIR
+else
+    MODE=2
+    cd $TMPDIR
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/actions-runner
+    wget https://raw.githubusercontent.com/zlib-ng/zlib-ng/refs/heads/develop/arch/s390/self-hosted-builder/entrypoint
+fi
+
+# Stop service
+systemctl stop actions-runner || true
+
+# Delete old container
+podman container rm gaplib-actions-runner || true
+
+# Delete old image
+podman image rm localhost/zlib-ng/actions-runner || true
+
+# Prune all unused podman data
+podman system prune -f || true
+
+# Build new image
+podman build --squash -f actions-runner.Dockerfile --tag zlib-ng/actions-runner .
2>&1 | tee /var/log/actions-runner-build.log + +# Create new container +podman create --replace --name=gaplib-actions-runner --env-file=/etc/actions-runner --init \ + zlib-ng/actions-runner 2>&1 | tee -a /var/log/actions-runner-build.log + +# Start service +systemctl start actions-runner || true + +# Cleanup +podman image prune -af || true + +# Clean up tempfile +if [ "$MODE" == "2" ] ; then + cd $TMPDIR + rm actions-runner.Dockerfile + rm actions-runner + rm entrypoint + cd .. + rmdir $TMPDIR + echo "Deleted tempfiles." +fi diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile new file mode 100644 index 0000000000..7210caaebe --- /dev/null +++ b/neozip/arch/s390/self-hosted-builder/actions-runner.Dockerfile @@ -0,0 +1,47 @@ +# Self-Hosted IBM Z Github Actions Runner. + +FROM almalinux:10 + +RUN dnf update -y -q && \ + dnf install -y -q --enablerepo=crb wget git which sudo jq sed \ + cmake make automake autoconf m4 libtool ninja-build \ + python3-pip python3-devel python3-lxml \ + gcc gcc-c++ clang llvm-toolset glibc-all-langpacks langpacks-en \ + glibc-static libstdc++-static libstdc++-devel libxslt-devel libxml2-devel + +RUN dnf install -y -q dotnet-sdk-8.0 && \ + echo "Using SDK - `dotnet --version`" + +RUN cd /tmp && \ + git clone -q https://github.com/actions/runner && \ + cd runner && \ + git checkout $(git tag --sort=-v:refname | grep '^v[0-9]' | head -n1) && \ + git log -n 1 && \ + wget https://raw.githubusercontent.com/IBM/action-runner-image-pz/refs/heads/main/patches/runner-sdk8-s390x.patch -O runner-sdk8-s390x.patch && \ + git apply --whitespace=nowarn runner-sdk8-s390x.patch && \ + + sed -i'' -e /version/s/8......\"$/$8.0.100\"/ src/global.json + +RUN cd /tmp/runner/src && \ + ./dev.sh layout && \ + ./dev.sh package && \ + rm -rf /root/.dotnet /root/.nuget + +RUN useradd -c "Action Runner" -m actions-runner && \ + usermod -L actions-runner + +RUN tar -xf 
/tmp/runner/_package/*.tar.gz -C /home/actions-runner && \ + chown -R actions-runner:actions-runner /home/actions-runner + +# Cleanup +RUN rm -rf /tmp/runner /var/cache/dnf/* /tmp/runner.patch /tmp/global.json && \ + dnf clean all + +USER actions-runner + +# Scripts. +COPY --chmod=555 entrypoint /usr/bin/ +COPY --chmod=555 actions-runner /usr/bin/ +WORKDIR /home/actions-runner +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] diff --git a/neozip/arch/s390/self-hosted-builder/actions-runner.service b/neozip/arch/s390/self-hosted-builder/actions-runner.service new file mode 100644 index 0000000000..79560cde18 --- /dev/null +++ b/neozip/arch/s390/self-hosted-builder/actions-runner.service @@ -0,0 +1,18 @@ +[Unit] +Description=Podman container: Gaplib Github Actions Runner +Wants=network-online.target +After=network-online.target +StartLimitIntervalSec=1 +RequiresMountsFor=/run/user/1001/containers + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Restart=always +TimeoutStopSec=61 +ExecStart=/usr/bin/podman start gaplib-actions-runner +ExecStop=/usr/bin/podman stop -t 30 gaplib-actions-runner +ExecStopPost=/usr/bin/podman stop -t 10 gaplib-actions-runner +Type=forking + +[Install] +WantedBy=default.target diff --git a/neozip/arch/s390/self-hosted-builder/entrypoint b/neozip/arch/s390/self-hosted-builder/entrypoint new file mode 100755 index 0000000000..eb8772becf --- /dev/null +++ b/neozip/arch/s390/self-hosted-builder/entrypoint @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# Container entrypoint that waits for all spawned processes. +# + +set -e -u + +# Create a FIFO and start reading from its read end. +tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX") +trap 'rm -r "$tempdir"' EXIT +done="$tempdir/pipe" +mkfifo "$done" +cat "$done" & waiter=$! + +# Start the workload. Its descendants will inherit the FIFO's write end. +status=0 +if [ "$#" -eq 0 ]; then + bash 9>"$done" || status=$? +else + "$@" 9>"$done" || status=$? 
+fi + +# When the workload and all of its descendants exit, the FIFO's write end will +# be closed and `cat "$done"` will exit. Wait until it happens. This is needed +# in order to handle SelfUpdater, which the workload may start in background +# before exiting. +wait "$waiter" + +exit "$status" diff --git a/neozip/arch/x86/Makefile.in b/neozip/arch/x86/Makefile.in new file mode 100644 index 0000000000..f756844a9f --- /dev/null +++ b/neozip/arch/x86/Makefile.in @@ -0,0 +1,176 @@ +# Makefile for zlib +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2 +AVX512VNNIFLAG=-mavx512vnni -mbmi2 +AVX2FLAG=-mavx2 -mbmi2 +SSE2FLAG=-msse2 +SSSE3FLAG=-mssse3 +SSE41FLAG=-msse4.1 +SSE42FLAG=-msse4.2 +PCLMULFLAG=-mpclmul +VPCLMULFLAG=-mvpclmulqdq +XSAVEFLAG=-mxsave +NOLTOFLAG= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +all: \ + x86_features.o x86_features.lo \ + adler32_avx2.o adler32_avx2.lo \ + adler32_avx512.o adler32_avx512.lo \ + adler32_avx512_vnni.o adler32_avx512_vnni.lo \ + adler32_sse42.o adler32_sse42.lo \ + adler32_ssse3.o adler32_ssse3.lo \ + chunkset_avx2.o chunkset_avx2.lo \ + chunkset_avx512.o chunkset_avx512.lo \ + chunkset_sse2.o chunkset_sse2.lo \ + chunkset_ssse3.o chunkset_ssse3.lo \ + compare256_avx2.o compare256_avx2.lo \ + compare256_avx512.o compare256_avx512.lo \ + compare256_sse2.o compare256_sse2.lo \ + crc32_chorba_sse2.o crc32_chorba_sse2.lo \ + crc32_chorba_sse41.o crc32_chorba_sse41.lo \ + crc32_pclmulqdq.o crc32_pclmulqdq.lo \ + crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \ + crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \ + slide_hash_avx2.o slide_hash_avx2.lo \ + slide_hash_sse2.o slide_hash_sse2.lo + +x86_features.o: + $(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c + +x86_features.lo: + $(CC) $(SFLAGS) $(XSAVEFLAG) 
$(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c + +chunkset_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c + +chunkset_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c + +chunkset_avx512.o: + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c + +chunkset_avx512.lo: + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c + +chunkset_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_ssse3.o: + $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c + +chunkset_ssse3.lo: + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c + +compare256_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c + +compare256_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c + +compare256_avx512.o: + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c + +compare256_avx512.lo: + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c + +compare256_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c + +compare256_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c + +crc32_chorba_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c + +crc32_chorba_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse2.c + +crc32_chorba_sse41.o: + $(CC) $(CFLAGS) $(SSE41FLAG) 
$(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c + +crc32_chorba_sse41.lo: + $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_sse41.c + +crc32_pclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c + +crc32_pclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c + +crc32_vpclmulqdq_avx2.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c + +crc32_vpclmulqdq_avx2.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c + +crc32_vpclmulqdq_avx512.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c + +crc32_vpclmulqdq_avx512.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c + +slide_hash_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c + +slide_hash_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c + +slide_hash_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c + +slide_hash_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c + +adler32_avx2.o: $(SRCDIR)/adler32_avx2.c + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c + +adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c + +adler32_avx512.o: $(SRCDIR)/adler32_avx512.c + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + 
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c + $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_sse42.o: $(SRCDIR)/adler32_sse42.c + $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c + +adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c + $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/x86/adler32_avx2.c b/neozip/arch/x86/adler32_avx2.c new file mode 100644 index 0000000000..d1811b254d --- /dev/null +++ b/neozip/arch/x86/adler32_avx2.c @@ -0,0 +1,172 @@ +/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Copyright (C) 2022 Adam Stylinski + * Authors: + * Brian Bockelman <bockelman@gmail.com> + * Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX2 + +#include "zbuild.h" +#include <immintrin.h> +#include "adler32_p.h" +#include "adler32_avx2_p.h" +#include "x86_intrins.h" + +extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, 
size_t len); + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY); + } else if (len < 32) { + if (COPY) { + return adler32_copy_sse42(adler, dst, src, len); + } else { + return adler32_ssse3(adler, src, len); + } + } + + __m256i vs1, vs2, vs2_0; + + const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, + 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33); + const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m256i dot3v = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + vs2_0 = vs3; + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); + len -= k; + + while (k >= 64) { + __m256i vbuf = _mm256_loadu_si256((__m256i*)src); + __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32)); + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); + __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero); + + if (COPY) { + _mm256_storeu_si256((__m256i*)dst, vbuf); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0); + dst += 64; + } + + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts + __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + __m256i vsum2_0 = 
_mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s + vs1 = _mm256_add_epi32(vs1_sad2, vs1); + vs2 = _mm256_add_epi32(vsum2, vs2); + vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm256_add_epi32(vs2_0, vs2); + vs3 = _mm256_slli_epi32(vs3, 6); + vs2 = _mm256_add_epi32(vs3, vs2); + vs3 = _mm256_setzero_si256(); + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] ) + */ + __m256i vbuf = _mm256_loadu_si256((__m256i*)src); + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's + + if (COPY) { + _mm256_storeu_si256((__m256i*)dst, vbuf); + dst += 32; + } + + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + vs2 = _mm256_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + /* Defer the multiplication with 32 to outside of the loop */ + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + + /* The compiler is generating the following sequence for this integer modulus + * when done the scalar way, in GPRs: + + adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); + + mov $0x80078071,%edi // move magic constant into 32 bit register %edi + ... 
+ vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi + mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax + imul %rdi,%rsi // do a signed multiplication with magic constant and vector element + shr $0x2f,%rsi // shift right by 47 + imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1 + sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above + ... + // repeats for each element with vpextract instructions + + This is tricky with AVX2 for a number of reasons: + 1.) There's no 64 bit multiplication instruction, but there is a sequence to get there + 2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate + back down to 32 bit precision later (there is in AVX512) + 3.) Full width integer multiplications aren't cheap + + We can, however, do a relatively cheap sequence for horizontal sums. + Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was + previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but + that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be + performed on the maximum possible inputs before overflow + */ + + + /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy + * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant). + * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly + * what the compiler is doing to avoid integer divisions. 
*/ + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx2_p.h b/neozip/arch/x86/adler32_avx2_p.h new file mode 100644 index 0000000000..f0f8a4a887 --- /dev/null +++ b/neozip/arch/x86/adler32_avx2_p.h @@ -0,0 +1,32 @@ +/* adler32_avx2_p.h -- adler32 avx2 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_AVX2_P_H_ +#define ADLER32_AVX2_P_H_ + +#if defined(X86_AVX2) || defined(X86_AVX512VNNI) + +/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */ +static inline uint32_t hsum256(__m256i x) { + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), + _mm256_castsi256_si128(x)); + __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +static inline uint32_t partial_hsum256(__m256i x) { + /* We need a permutation vector to extract every other integer. 
The + * rest are going to be zeros */ + const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1); + __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec); + __m128i non_zero_sse = _mm256_castsi256_si128(non_zero); + __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} +#endif + +#endif diff --git a/neozip/arch/x86/adler32_avx512.c b/neozip/arch/x86/adler32_avx512.c new file mode 100644 index 0000000000..8a8e165bb9 --- /dev/null +++ b/neozip/arch/x86/adler32_avx512.c @@ -0,0 +1,102 @@ +/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512 + +#include "zbuild.h" +#include "adler32_p.h" +#include "arch_functions.h" +#include <immintrin.h> +#include "x86_intrins.h" +#include "adler32_avx512_p.h" + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 64) { + /* This handles the remaining copies, just call normal adler checksum after this */ + if (COPY && len) { + __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len)); + __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src); + _mm512_mask_storeu_epi8(dst, storemask, copy_vec); + } + + return adler32_avx2(adler, src, len); + } + + __m512i vbuf, vs1_0, vs3; + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 
56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + + while (len >= 64) { + __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + vs1_0 = vs1; + vs3 = _mm512_setzero_si512(); + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 64); + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf = _mm512_loadu_si512(src); + + if (COPY) { + _mm512_storeu_si512(dst, vbuf); + dst += 64; + } + + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vs1_sad, vs1); + vs3 = _mm512_add_epi32(vs3, vs1_0); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm512_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). 
*/ + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, NULL, src, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx512_p.h b/neozip/arch/x86/adler32_avx512_p.h new file mode 100644 index 0000000000..742269053c --- /dev/null +++ b/neozip/arch/x86/adler32_avx512_p.h @@ -0,0 +1,57 @@ +#ifndef AVX512_FUNCS_H +#define AVX512_FUNCS_H + +#include <immintrin.h> +#include <stdint.h> + +/* Written because Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsic macros */ +#if defined(_MSC_VER) && !defined(_MM_K0_REG8) +# undef _mm512_extracti64x4_epi64 +# define _mm512_extracti64x4_epi64(v1, e1) _mm512_maskz_extracti64x4_epi64(UINT8_MAX, v1, e1) +# undef _mm512_set1_epi16 +# define _mm512_set1_epi16(e1) _mm512_maskz_set1_epi16(UINT32_MAX, e1) +# undef _mm512_maddubs_epi16 +# define _mm512_maddubs_epi16(v1, v2) _mm512_maskz_maddubs_epi16(UINT32_MAX, v1, v2) +#endif + +/* Written because *_add_epi32(a) sets off ubsan */ +static inline uint32_t _mm512_reduce_add_epu32(__m512i x) { + __m256i a = _mm512_extracti64x4_epi64(x, 1); + __m256i b = _mm512_extracti64x4_epi64(x, 0); + + __m256i a_plus_b = _mm256_add_epi32(a, b); + __m128i c = _mm256_extracti128_si256(a_plus_b, 1); + __m128i d = _mm256_extracti128_si256(a_plus_b, 0); + __m128i c_plus_d = _mm_add_epi32(c, d); + + __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d); + __m128i sum2 = _mm_add_epi32(sum1, c_plus_d); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + + return _mm_cvtsi128_si32(sum4); +} + +static inline uint32_t partial_hsum(__m512i x) { + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros. 
Marking this const so the compiler stands + * a better chance of keeping this resident in a register through entire + * loop execution. We certainly have enough zmm registers (32) */ + const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, + 1, 1, 1, 1, 1, 1, 1, 1); + + __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x); + + /* From here, it's a simple 256 bit wide reduction sum */ + __m256i non_zero_avx = _mm512_castsi512_si256(non_zero); + + /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is + * pretty slow, much slower than the longer instruction sequence below */ + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1), + _mm256_castsi256_si128(non_zero_avx)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +#endif diff --git a/neozip/arch/x86/adler32_avx512_vnni.c b/neozip/arch/x86/adler32_avx512_vnni.c new file mode 100644 index 0000000000..8bebffbf88 --- /dev/null +++ b/neozip/arch/x86/adler32_avx512_vnni.c @@ -0,0 +1,205 @@ +/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream + * Based on Brian Bockelman's AVX2 version + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512VNNI + +#include "zbuild.h" +#include "adler32_p.h" +#include "arch_functions.h" +#include <immintrin.h> +#include "x86_intrins.h" +#include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" + +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 32) + return adler32_ssse3(adler, src, len); + + if (len < 64) + return adler32_avx2(adler, src, 
len); + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; + + while (len >= 64) { + vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + size_t k = ALIGN_DOWN(MIN(len, NMAX), 64); + len -= k; + __m512i vs1_0 = vs1; + __m512i vs3 = _mm512_setzero_si512(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m512i vs2_1 = _mm512_setzero_si512(); + __m512i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 128) { + vbuf1 = _mm512_loadu_si512((__m512i*)src); + + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 128) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm512_loadu_si512((__m512i*)src); + vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64)); + src += 128; + k -= 128; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. 
Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm512_add_epi32(vs3, vs1); + vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + vs2 = _mm512_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel; + } + + return adler; +} + +/* Use 256-bit vectors when copying because 512-bit variant is slower. */ +Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel_copy: + if (len < 32) { + /* This handles the remaining copies, just call normal adler checksum after this */ + __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len)); + __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); + _mm256_mask_storeu_epi8(dst, storemask, copy_vec); + + return adler32_ssse3(adler, src, len); + } + + const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + const __m256i zero = _mm256_setzero_si256(); + __m256i vs1, vs2; + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + + size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); + len -= k; + + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m256i vs2_1 = _mm256_setzero_si256(); + __m256i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 64) { + vbuf1 = 
_mm256_loadu_si256((__m256i*)src); + _mm256_storeu_si256((__m256i*)dst, vbuf1); + dst += 32; + + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for a decent amount of ILP */ + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] ) + */ + vbuf0 = _mm256_loadu_si256((__m256i*)src); + vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32)); + _mm256_storeu_si256((__m256i*)dst, vbuf0); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1); + dst += 64; + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm256_add_epi32(vs3, vs1); + vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + vs2 = _mm256_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 32). 
*/ + if (len) { + goto rem_peel_copy; + } + + return adler; +} + +#endif diff --git a/neozip/arch/x86/adler32_sse42.c b/neozip/arch/x86/adler32_sse42.c new file mode 100644 index 0000000000..c2301213f0 --- /dev/null +++ b/neozip/arch/x86/adler32_sse42.c @@ -0,0 +1,117 @@ +/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE42 + +#include "zbuild.h" +#include "adler32_p.h" +#include "adler32_ssse3_p.h" + +#include <immintrin.h> + +Z_INTERNAL uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, 1); + + __m128i vbuf, vbuf_0; + __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + v_sad_sum2, vsum2, vsum2_0; + __m128i zero = _mm_setzero_si128(); + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + + while (len >= 16) { + size_t k = ALIGN_DOWN(MIN(len, NMAX), 16); + len -= k; + + vs1 = _mm_cvtsi32_si128(adler0); + vs2 = _mm_cvtsi32_si128(adler1); + + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16)); + src += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + _mm_storeu_si128((__m128i*)dst, vbuf); + 
_mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); + dst += 32; + + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + vs2 = _mm_add_epi32(vsum2, vs2); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + src += 16; + k -= 16; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + + _mm_storeu_si128((__m128i*)dst, vbuf); + dst += 16; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = hsum(vs2) % BASE; + } + + /* If this is true, there's fewer than 16 elements remaining */ + if (len) { + goto rem_peel; + } + + return adler0 | (adler1 << 16); +} + +#endif diff --git a/neozip/arch/x86/adler32_ssse3.c b/neozip/arch/x86/adler32_ssse3.c new file mode 100644 index 0000000000..702db50251 --- /dev/null +++ b/neozip/arch/x86/adler32_ssse3.c @@ -0,0 +1,149 @@ +/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSSE3 + +#include "zbuild.h" +#include "adler32_p.h" +#include "adler32_ssse3_p.h" + +#include 
<immintrin.h> + +Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0); + + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + const __m128i zero = _mm_setzero_si128(); + + __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + vbuf_0, v_sad_sum2, vsum2, vsum2_0; + + /* If our buffer is unaligned (likely), make the determination whether + * or not there's enough of a buffer to consume to make the scalar, aligning + * additions worthwhile or if it's worth it to just eat the cost of an unaligned + * load. This is a pretty simple test, just test if len < 32 */ + size_t n = NMAX; + size_t k = 0; + + if (len < 32) { + /* Let's eat the cost of this one unaligned load so that + * we don't completely skip over the vectorization. 
Doing + * 16 bytes at a time unaligned is better than 16 + <= 15 + * sums */ + vbuf = _mm_loadu_si128((__m128i*)buf); + len -= 16; + buf += 16; + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs1_0 = vs1; + goto unaligned_jmp; + } + + size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len); + if (align_diff) { + adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 15, 0); + + buf += align_diff; + len -= align_diff; + n -= align_diff; + } + + while (len >= 16) { + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + k = ALIGN_DOWN(MIN(len, n), 16); + len -= k; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + vbuf_0 = _mm_load_si128((__m128i*)(buf + 16)); + buf += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + vs2 = _mm_add_epi32(vsum2, vs2); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + buf += 16; + k -= 16; + +unaligned_jmp: + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = 
vs1; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + /* We don't actually need to do a full horizontal sum, since psadbw is actually doing + * a partial reduction sum implicitly and only summing to integers in vector positions + * 0 and 2. This saves us some contention on the shuffle port(s) */ + adler = partial_hsum(vs1) % BASE; + sum2 = hsum(vs2) % BASE; + n = NMAX; + } + + /* Process tail (len < 16). */ + return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, 15, 0); +} + +/* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */ +Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler = adler32_ssse3(adler, src, len); + memcpy(dst, src, len); + return adler; +} +#endif diff --git a/neozip/arch/x86/adler32_ssse3_p.h b/neozip/arch/x86/adler32_ssse3_p.h new file mode 100644 index 0000000000..d7ec3fe0d5 --- /dev/null +++ b/neozip/arch/x86/adler32_ssse3_p.h @@ -0,0 +1,29 @@ +/* adler32_ssse3_p.h -- adler32 ssse3 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_SSSE3_P_H_ +#define ADLER32_SSSE3_P_H_ + +#ifdef X86_SSSE3 + +#include <immintrin.h> +#include <stdint.h> + +static inline uint32_t partial_hsum(__m128i x) { + __m128i second_int = _mm_srli_si128(x, 8); + __m128i sum = _mm_add_epi32(x, second_int); + return _mm_cvtsi128_si32(sum); +} + +static inline uint32_t hsum(__m128i x) { + __m128i sum1 = _mm_unpackhi_epi64(x, x); + __m128i sum2 = _mm_add_epi32(x, sum1); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + return _mm_cvtsi128_si32(sum4); +} +#endif + +#endif diff --git a/neozip/arch/x86/chunkset_avx2.c b/neozip/arch/x86/chunkset_avx2.c new file mode 100644 index 0000000000..3e69a7bf66 --- /dev/null +++ b/neozip/arch/x86/chunkset_avx2.c @@ -0,0 +1,129 @@ +/* chunkset_avx2.c -- AVX2 inline functions to copy small 
data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX2 + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include "arch/generic/chunk_256bit_perm_idx_lut.h" +#include <immintrin.h> +#include "x86_intrins.h" + +typedef __m256i chunk_t; +typedef __m128i halfchunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 +#define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi64x(zng_memread_8(from)); +} + +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* See explanation in chunkset_avx512.c */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else + *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm256_loadu_si256((__m256i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm256_storeu_si256((__m256i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. 
Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + + /* See note in chunkset_ssse3.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1); + __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} + +static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { + /* We zero extend mostly to appease some memory sanitizers. 
These bytes are ultimately + * unlikely to be actually written or read from */ + return _mm256_zextsi128_si256(*chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_avx2 +#define CHUNKCOPY chunkcopy_avx2 +#define CHUNKUNROLL chunkunroll_avx2 +#define CHUNKMEMSET chunkmemset_avx2 +#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_avx2 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_avx512.c b/neozip/arch/x86/chunkset_avx512.c new file mode 100644 index 0000000000..60450c653b --- /dev/null +++ b/neozip/arch/x86/chunkset_avx512.c @@ -0,0 +1,186 @@ +/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512 + +#include "zbuild.h" +#include "zmemory.h" + +#include "arch/generic/chunk_256bit_perm_idx_lut.h" +#include <immintrin.h> +#include "x86_intrins.h" + +typedef __m256i chunk_t; +typedef __m128i halfchunk_t; +typedef __mmask32 mask_t; +typedef __mmask16 halfmask_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNKMEMSET_16 +#define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK +#define HAVE_MASKED_READWRITE +#define HAVE_CHUNKCOPY +#define HAVE_HALFCHUNKCOPY + +static inline halfmask_t gen_half_mask(size_t len) { + return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len); +} + +static inline mask_t gen_mask(size_t len) { + return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len); +} + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi64x(zng_memread_8(from)); +} + +static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where + * the load is dumped to the stack with an aligned move for this memory-register + * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to + * stack doesn't exist if compiled with optimizations. 
For the sake of working + * properly in a debugger, let's take the 2 cycle penalty */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else + *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm256_loadu_si256((__m256i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm256_storeu_si256((__m256i *)out, *chunk); +} + +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + + chunk_t chunk; + size_t rem = len % sizeof(chunk_t); + + if (len < sizeof(chunk_t)) { + mask_t rem_mask = gen_mask(rem); + chunk = _mm256_maskz_loadu_epi8(rem_mask, from); + _mm256_mask_storeu_epi8(out, rem_mask, chunk); + return out + rem; + } + + loadchunk(from, &chunk); + rem = (rem == 0) ? sizeof(chunk_t) : rem; + storechunk(out, &chunk); + out += rem; + from += rem; + len -= rem; + + while (len > 0) { + loadchunk(from, &chunk); + storechunk(out, &chunk); + out += sizeof(chunk_t); + from += sizeof(chunk_t); + len -= sizeof(chunk_t); + } + + return out; +} + +/* MSVC compiler decompression bug when optimizing for size */ +#if defined(_MSC_VER) && _MSC_VER < 1943 +# pragma optimize("", off) +#endif +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + *chunk_rem = lut_rem.remval; + + /* See the AVX2 implementation for more detailed comments. 
This is that + some masked + * loads to avoid an out of bounds read on the heap */ + + if (dist < 16) { + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + halfmask_t load_mask = gen_half_mask(dist); + __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else { + halfmask_t load_mask = gen_half_mask(dist - 16); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16)); + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE); + __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} +#if defined(_MSC_VER) && _MSC_VER < 1943 +# pragma optimize("", on) +#endif + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t halfchunk2whole(halfchunk_t *chunk) { + /* We zero extend mostly to appease some memory sanitizers. 
These bytes are ultimately + * unlikely to be actually written or read from */ + return _mm256_zextsi128_si256(*chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + halfmask_t load_mask = gen_half_mask(dist); + ret_vec = _mm_maskz_loadu_epi8(load_mask, buf); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + halfchunk_t chunk; + + size_t rem = len % sizeof(halfchunk_t); + if (rem == 0) { + rem = sizeof(halfchunk_t); + } + + halfmask_t rem_mask = gen_half_mask(rem); + chunk = _mm_maskz_loadu_epi8(rem_mask, from); + _mm_mask_storeu_epi8(out, rem_mask, chunk); + + return out + rem; +} + +#define CHUNKSIZE chunksize_avx512 +#define CHUNKUNROLL chunkunroll_avx512 +#define CHUNKMEMSET chunkmemset_avx512 +#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_avx512 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_sse2.c b/neozip/arch/x86/chunkset_sse2.c new file mode 100644 index 0000000000..633ab6e64f --- /dev/null +++ b/neozip/arch/x86/chunkset_sse2.c @@ -0,0 +1,50 @@ +/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE2 + +#include "zbuild.h" +#include "zmemory.h" + +#include <immintrin.h> + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi64x(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +#define CHUNKSIZE chunksize_sse2 +#define CHUNKCOPY chunkcopy_sse2 +#define CHUNKUNROLL chunkunroll_sse2 +#define CHUNKMEMSET chunkmemset_sse2 +#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_sse2 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/chunkset_ssse3.c b/neozip/arch/x86/chunkset_ssse3.c new file mode 100644 index 0000000000..0bef7de811 --- /dev/null +++ b/neozip/arch/x86/chunkset_ssse3.c @@ -0,0 +1,72 @@ +/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSSE3 + +#include "zbuild.h" +#include "zsanitizer.h" +#include "zmemory.h" + +#include <immintrin.h> +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi16(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi32(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm_set1_epi64x(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load uninitialized that we swizzle over + * in a vector register, anyway. 
If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = lut_rem.remval; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_ssse3 +#define CHUNKMEMSET chunkmemset_ssse3 +#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3 +#define CHUNKCOPY chunkcopy_ssse3 +#define CHUNKUNROLL chunkunroll_ssse3 + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_ssse3 + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c new file mode 100644 index 0000000000..5e2b1716cf --- /dev/null +++ b/neozip/arch/x86/compare256_avx2.c @@ -0,0 +1,61 @@ +/* compare256_avx2.c -- AVX2 version of compare256 + * Copyright Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_AVX2 + +#include <immintrin.h> +#ifdef _MSC_VER +# include <nmmintrin.h> +#endif + +static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + __m256i ymm_src0, ymm_src1, ymm_cmp; + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ + unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */ + + src0 += 32, src1 += 32, len += 32; + + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); + mask = 
(unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); + + src0 += 32, src1 += 32, len += 32; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_avx512.c b/neozip/arch/x86/compare256_avx512.c new file mode 100644 index 0000000000..f3105505cb --- /dev/null +++ b/neozip/arch/x86/compare256_avx512.c @@ -0,0 +1,87 @@ +/* compare256_avx512.c -- AVX512 version of compare256 + * Copyright (C) 2025 Hans Kristian Rosbach + * Based on AVX2 implementation by Mika T. Lindqvist + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_AVX512 + +#include <immintrin.h> +#ifdef _MSC_VER +# include <nmmintrin.h> +#endif + +static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) { + __m512i zmm_src0_4, zmm_src1_4; + __m512i zmm_src0_3, zmm_src1_3; + __m512i zmm_src0_2, zmm_src1_2; + __m512i zmm_src0_1, zmm_src1_1; + __m128i xmm_src0_0, xmm_src1_0; + uint64_t mask_1, mask_2, mask_3, mask_4; + uint32_t mask_0; + + // First do a 16byte round before increasing to 64bytes, this reduces the + // penalty for the short matches, and those are usually the most common ones. + // This requires us to overlap on the last round, giving a small penalty + // on matches of 192+ bytes (Still faster than AVX2 though). 
+ + // 16 bytes + xmm_src0_0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1_0 = _mm_loadu_si128((__m128i*)src1); + mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); + if (mask_0 != 0x0000FFFF) + return zng_ctz32(~mask_0); /* Invert bits so identical = 0 */ + + // 64 bytes + zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16)); + zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16)); + mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1); + if (mask_1 != 0xFFFFFFFFFFFFFFFF) + return 16 + zng_ctz64(~mask_1); + + // 64 bytes + zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80)); + zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80)); + mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2); + if (mask_2 != 0xFFFFFFFFFFFFFFFF) + return 80 + zng_ctz64(~mask_2); + + // 64 bytes + zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144)); + zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144)); + mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3); + if (mask_3 != 0xFFFFFFFFFFFFFFFF) + return 144 + zng_ctz64(~mask_3); + + // 64 bytes (overlaps the previous 16 bytes for fast tail processing) + zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192)); + zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192)); + mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4); + if (mask_4 != 0xFFFFFFFFFFFFFFFF) + return 192 + zng_ctz64(~mask_4); + + return 256; +} + +Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx512_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_avx512 +#define COMPARE256 compare256_avx512_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx512 +#define COMPARE256 compare256_avx512_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/compare256_sse2.c b/neozip/arch/x86/compare256_sse2.c new file mode 100644 index 0000000000..cfaff82cfa --- /dev/null +++ b/neozip/arch/x86/compare256_sse2.c @@ 
-0,0 +1,86 @@ +/* compare256_sse2.c -- SSE2 version of compare256 + * Copyright Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_SSE2 + +#include <emmintrin.h> + +static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) { + __m128i xmm_src0, xmm_src1, xmm_cmp; + + /* Do the first load unaligned, then all subsequent ones we have at least + * one aligned load. Sadly aligning both loads is probably unrealistic */ + xmm_src0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return zng_ctz32(~mask); + + const uint8_t *last0 = src0 + 240; + const uint8_t *last1 = src1 + 240; + + int align_offset = ((uintptr_t)src0) & 15; + int align_adv = 16 - align_offset; + uint32_t len = align_adv; + + src0 += align_adv; + src1 += align_adv; + + for (int i = 0; i < 15; ++i) { + xmm_src0 = _mm_load_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) + return len + zng_ctz32(~mask); + + len += 16, src0 += 16, src1 += 16; + } + + if (align_offset) { + xmm_src0 = _mm_loadu_si128((__m128i*)last0); + xmm_src1 = _mm_loadu_si128((__m128i*)last1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + if (mask != 0xFFFF) + return 240 + zng_ctz32(~mask); + } + + return 256; +} + +Z_INTERNAL 
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + return compare256_sse2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#endif diff --git a/neozip/arch/x86/crc32_chorba_sse2.c b/neozip/arch/x86/crc32_chorba_sse2.c new file mode 100644 index 0000000000..66191e046a --- /dev/null +++ b/neozip/arch/x86/crc32_chorba_sse2.c @@ -0,0 +1,872 @@ +#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE) + +#include "zbuild.h" +#include "crc32_chorba_p.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include <emmintrin.h> +#include "arch/x86/x86_intrins.h" +#include "arch_functions.h" + +#define READ_NEXT(in, off, a, b) do { \ + a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ + b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \ + } while (0); + +#define NEXT_ROUND(invec, a, b, c, d) do { \ + a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \ + b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \ + c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \ + d = _mm_srli_epi64(invec, 20); \ + } while (0); + +Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) { + /* The calling function ensured that this is aligned correctly */ + const uint64_t* input = (const uint64_t*)buf; + ALIGNED_(16) uint64_t final[9] = {0}; + uint64_t next1 = ~crc; + crc = 0; + uint64_t next2 = 0; + uint64_t next3 = 0; + uint64_t next4 = 0; + uint64_t next5 = 0; + + __m128i next12 = _mm_cvtsi64_si128(next1); + __m128i next34 = _mm_setzero_si128(); + __m128i next56 = _mm_setzero_si128(); + __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4; + + size_t i = 0; + + /* 
This is weird, doing for vs while drops 10% off the exec time */ + for (; (i + 256 + 40 + 32 + 32) < len; i += 32) { + __m128i in1in2, in3in4; + + /* + uint64_t chorba1 = input[i / sizeof(uint64_t)]; + uint64_t chorba2 = input[i / sizeof(uint64_t) + 1]; + uint64_t chorba3 = input[i / sizeof(uint64_t) + 2]; + uint64_t chorba4 = input[i / sizeof(uint64_t) + 3]; + uint64_t chorba5 = input[i / sizeof(uint64_t) + 4]; + uint64_t chorba6 = input[i / sizeof(uint64_t) + 5]; + uint64_t chorba7 = input[i / sizeof(uint64_t) + 6]; + uint64_t chorba8 = input[i / sizeof(uint64_t) + 7]; + */ + + const uint64_t *input_ptr = input + (i / sizeof(uint64_t)); + const __m128i *input_ptr_128 = (__m128i*)input_ptr; + __m128i chorba12 = _mm_load_si128(input_ptr_128++); + __m128i chorba34 = _mm_load_si128(input_ptr_128++); + __m128i chorba56 = _mm_load_si128(input_ptr_128++); + __m128i chorba78 = _mm_load_si128(input_ptr_128++); + + chorba12 = _mm_xor_si128(chorba12, next12); + chorba34 = _mm_xor_si128(chorba34, next34); + chorba56 = _mm_xor_si128(chorba56, next56); + chorba78 = _mm_xor_si128(chorba78, chorba12); + __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1)); + __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12), + _mm_castsi128_pd(chorba34), 1)); + /* + chorba1 ^= next1; + chorba2 ^= next2; + chorba3 ^= next3; + chorba4 ^= next4; + chorba5 ^= next5; + chorba7 ^= chorba1; + chorba8 ^= chorba2; + */ + i += 8 * 8; + + /* 0-3 */ + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + READ_NEXT(input, i, in1in2, in3in4); + __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba34xor); + /* + in1 ^= chorba3; + in2 ^= chorba4 ^ chorba1; + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 
44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + + */ + + in3in4 = _mm_xor_si128(in3in4, ab1); + /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */ + __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)); + + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23)); + in3in4 = _mm_xor_si128(in3in4, chorba12); + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1; + in4 ^= b1 ^a2 ^ chorba6 ^ chorba3 ^ chorba2; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + + /*out1 = a3 ^ b2 ^ c1; + out2 = b3 ^ c2 ^ d1 ^ a4;*/ + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + //out3 = b4 ^ c3 ^ d2; + //out4 = c4 ^ d3; + + //out5 = d4; + + /* + next1 = out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 4-7 */ + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + READ_NEXT(input, i, in1in2, in3in4); + + in1in2 = _mm_xor_si128(in1in2, next12); + in1in2 = 
_mm_xor_si128(in1in2, chorba78); + in1in2 = _mm_xor_si128(in1in2, chorba45); + in1in2 = _mm_xor_si128(in1in2, chorba34); + + /* + in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3; + in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4; + */ + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + + in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5; + in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6; + */ + in3in4 = _mm_xor_si128(in3in4, next34); + in3in4 = _mm_xor_si128(in3in4, ab1); + in3in4 = _mm_xor_si128(in3in4, chorba56); + __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + ///* + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, cd1); + + next12 = _mm_xor_si128(next12, a4_); + next12 = _mm_xor_si128(next12, next56); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + next34 = _mm_xor_si128(b4c4, cd3); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + 
//*/ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = b3 ^ c2 ^ d1 ^ a4; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 8-11 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1; + in2 ^= next2 ^ chorba8 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + + __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128()); + __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12); + in1in2 = _mm_xor_si128(in1in2, chorba80); + in1in2 = _mm_xor_si128(in1in2, chorba78); + in1in2 = _mm_xor_si128(in1in2, next12_chorba12); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /*in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3];*/ + in3in4 = _mm_xor_si128(next34, in3in4); + in3in4 = _mm_xor_si128(in3in4, ab1); + __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, chorba34); + in3in4 = _mm_xor_si128(in3in4, a2_); + + /* + in3 ^= next3 ^ a1 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(a4_, ab3); + next12 = _mm_xor_si128(next12, cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 12-15 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + */ + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, next12); + __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12); + in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12); + __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + in1in2 = _mm_xor_si128(in1in2, chorb1_); + + + /* + in1 ^= next1 ^ chorba5 ^ chorba1; + in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1; + + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2; + */ + + in3in4 = _mm_xor_si128(next34, in3in4); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2))); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + 
c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + ///* + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + //*/ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 16-19 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1; + in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2; + */ + ///* + READ_NEXT(input, i, in1in2, in3in4); + __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12); + in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34)); + in1in2 = _mm_xor_si128(chorba1_, in1in2); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + //*/ + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + */ + ///* + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = 
_mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_)); + in3in4 = _mm_xor_si128(in3in4, next34); + //*/ + /* + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1; + */ + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + i += 32; + + /* 20-23 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1; + in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56)); + in1in2 = _mm_xor_si128(in1in2, 
_mm_xor_si128(chorba23, chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba80); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1; + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_)); + in3in4 = _mm_xor_si128(in3in4, chorba12); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, 
d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + i += 32; + + /* 24-27 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1; + in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2; + */ + + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12)); + in1in2 = _mm_xor_si128(in1in2, chorba80); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /*in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_)); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + i += 32; + + /* 28-31 */ + /* + in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1]; + in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5; + in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6; + */ + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78)); + in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56)); + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7; + in4 ^= next4 ^ a2 ^ b1 ^ chorba8; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1)); + in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80)); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + out1 = a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + 
out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + */ + + /* + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + a4_ = _mm_unpacklo_epi64(next56, ab4); + next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); + b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + next12 = _mm_xor_si128(next12, b2c2); + next34 = _mm_xor_si128(b4c4, cd3); + next34 = _mm_xor_si128(next34, d2_); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + } + + for (; (i + 40 + 32) < len; i += 32) { + __m128i in1in2, in3in4; + + /*in1 = input[i / sizeof(uint64_t)]; + in2 = input[i / sizeof(uint64_t) + 1];*/ + //READ_NEXT_UNALIGNED(input, i, in1in2, in3in4); + READ_NEXT(input, i, in1in2, in3in4); + in1in2 = _mm_xor_si128(in1in2, next12); + + /* + in1 ^=next1; + in2 ^=next2; + */ + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + /* + a1 = (in1 << 17) ^ (in1 << 55); + a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19); + a3 = (in1 >> 45) ^ (in1 << 44); + a4 = (in1 >> 20); + + b1 = (in2 << 17) ^ (in2 << 55); + b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19); + b3 = (in2 >> 45) ^ (in2 << 44); + b4 = (in2 >> 20); + */ + + /* + in3 = input[i / sizeof(uint64_t) + 2]; + in4 = input[i / sizeof(uint64_t) + 3]; + in3 ^= next3 ^ a1; + in4 ^= next4 ^ a2 ^ b1; + + c1 = (in3 << 17) ^ (in3 << 55); + c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19); + c3 = (in3 >> 45) ^ (in3 << 44); + c4 = (in3 >> 20); + + d1 = (in4 << 17) ^ (in4 << 55); + d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19); + d3 = (in4 >> 45) ^ (in4 << 44); + d4 = (in4 >> 20); + */ + + __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2); + __m128i ab1_next34 = _mm_xor_si128(next34, ab1); + in3in4 = _mm_xor_si128(in3in4, ab1_next34); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + /* + + out1 = 
a3 ^ b2 ^ c1; + out2 = a4 ^ b3 ^ c2 ^ d1; + out3 = b4 ^ c3 ^ d2; + out4 = c4 ^ d3; + out5 = d4; + + next1 = next5 ^ out1; + next2 = out2; + next3 = out3; + next4 = out4; + next5 = out5; + */ + + __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); + __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); + __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); + next12 = _mm_xor_si128(next12, next56); + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); + } + + next1 = _mm_cvtsi128_si64(next12); + next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12)); + next3 = _mm_cvtsi128_si64(next34); + next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34)); + next5 = _mm_cvtsi128_si64(next56); + + /* Skip the call to memcpy */ + size_t copy_len = len - i; + __m128i *final128 = (__m128i*)final; + __m128i *input128 = (__m128i*)(input + i/ sizeof(uint64_t)); + while (copy_len >= 64) { + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + _mm_store_si128(final128++, _mm_load_si128(input128++)); + copy_len -= 64; + } + + while (copy_len >= 16) { + _mm_store_si128(final128++, _mm_load_si128(input128++)); + copy_len -= 16; + } + + uint8_t *src_bytes = (uint8_t*)input128; + uint8_t *dst_bytes = (uint8_t*)final128; + while (copy_len--) { + *dst_bytes++ = *src_bytes++; + } + + final[0] ^= next1; + final[1] ^= next2; + final[2] ^= next3; + final[3] ^= next4; + final[4] ^= next5; + + /* We perform the same loop that braid_internal is doing but we'll skip + * the function call for this tiny tail */ + uint8_t *final_bytes = 
(uint8_t*)final; + size_t rem = len - i; + + while (rem--) { + crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8); + } + + return ~crc; +} + +Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) { + uintptr_t align_diff = ALIGN_DIFF(buf, 16); + if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT) + return crc32_braid(crc, buf, len); + + if (align_diff) { + crc = crc32_braid(crc, buf, align_diff); + len -= align_diff; + buf += align_diff; + } +#if !defined(WITHOUT_CHORBA) + if (len > CHORBA_LARGE_THRESHOLD) + return crc32_chorba_118960_nondestructive(crc, buf, len); +#endif + return chorba_small_nondestructive_sse2(crc, buf, len); +} + +Z_INTERNAL uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_chorba_sse2(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/x86/crc32_chorba_sse41.c b/neozip/arch/x86/crc32_chorba_sse41.c new file mode 100644 index 0000000000..6ef9612440 --- /dev/null +++ b/neozip/arch/x86/crc32_chorba_sse41.c @@ -0,0 +1,332 @@ +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) + +#include "zbuild.h" +#include "crc32_chorba_p.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include <emmintrin.h> +#include <smmintrin.h> +#include "arch/x86/x86_intrins.h" +#include "arch_functions.h" + +#define READ_NEXT(in, off, a, b) do { \ + a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ + b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \ + } while (0); + +#define NEXT_ROUND(invec, a, b, c, d) do { \ + a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \ + b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \ + c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \ + d = _mm_srli_epi64(invec, 20); \ + } while (0); + +#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, 
out3, out4, shift) do { \ + out0 = _mm_slli_si128(in0, shift); \ + out1 = _mm_alignr_epi8(in1, in0, shift); \ + out2 = _mm_alignr_epi8(in2, in1, shift); \ + out3 = _mm_alignr_epi8(in3, in2, shift); \ + out4 = _mm_srli_si128(in3, shift); \ + } while (0) + +#define STORE4(out0, out1, out2, out3, out) do { \ + _mm_store_si128(out++, out0); \ + _mm_store_si128(out++, out1); \ + _mm_store_si128(out++, out2); \ + _mm_store_si128(out++, out3); \ + } while (0) + +#define READ4(out0, out1, out2, out3, in) do { \ + out0 = _mm_load_si128(in++); \ + out1 = _mm_load_si128(in++); \ + out2 = _mm_load_si128(in++); \ + out3 = _mm_load_si128(in++); \ + } while (0) + +/* This is intentionally shifted one down to compensate for the deferred store from + * the last iteration */ +#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \ + out0 = _mm_xor_si128(in[1], xor0); \ + out1 = _mm_xor_si128(in[2], xor1); \ + out2 = _mm_xor_si128(in[3], xor2); \ + out3 = _mm_xor_si128(in[4], xor3); \ + } while (0) + +Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint8_t *buf, size_t len) { + /* The calling function ensured that this is aligned correctly */ + const uint64_t* input = (const uint64_t*)buf; + ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)]; + __m128i *bitbuffer_v = (__m128i*)bitbuffer; + const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer; + __m128i z = _mm_setzero_si128(); + + __m128i *bitbuf128 = &bitbuffer_v[64]; + __m128i *bitbuf144 = &bitbuffer_v[72]; + __m128i *bitbuf182 = &bitbuffer_v[91]; + __m128i *bitbuf210 = &bitbuffer_v[105]; + __m128i *bitbuf300 = &bitbuffer_v[150]; + __m128i *bitbuf0 = bitbuf128; + __m128i *inptr = (__m128i*)input; + + /* We only need to zero out the bytes between the 128'th value and the 144th + * that are actually read */ + __m128i *z_cursor = bitbuf128; + for (size_t i = 0; i < 2; ++i) { + STORE4(z, z, z, z, z_cursor); + } + + /* We only need to zero out the bytes 
between the 144'th value and the 182nd that + * are actually read */ + z_cursor = bitbuf144 + 8; + for (size_t i = 0; i < 11; ++i) { + _mm_store_si128(z_cursor++, z); + } + + /* We only need to zero out the bytes between the 182nd value and the 210th that + * are actually read. */ + z_cursor = bitbuf182; + for (size_t i = 0; i < 4; ++i) { + STORE4(z, z, z, z, z_cursor); + } + + /* We need to mix this in */ + __m128i init_crc = _mm_cvtsi64_si128(~crc); + crc = 0; + + size_t i = 0; + + /* Previous iteration runs carried over */ + __m128i buf144 = z; + __m128i buf182 = z; + __m128i buf210 = z; + + for (; i + 300*8+64 < len && i < 22 * 8; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + + READ4(in12, in34, in56, in78, inptr); + + if (i == 0) { + in12 = _mm_xor_si128(in12, init_crc); + } + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a = _mm_xor_si128(buf144, in_1); + + STORE4(a, in23, in45, in67, bitbuf144); + buf144 = in8_; + + __m128i e = _mm_xor_si128(buf182, in_1); + STORE4(e, in23, in45, in67, bitbuf182); + buf182 = in8_; + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len && i < 32 * 8; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + READ4(in12, in34, in56, in78, inptr); + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a = _mm_xor_si128(buf144, in_1); + + STORE4(a, in23, in45, in67, bitbuf144); + buf144 = in8_; + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len && i < 84 * 8; i += 64) { + __m128i 
in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + READ4(in12, in34, in56, in78, inptr); + + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a, b, c, d; + a = _mm_xor_si128(buf144, in_1); + READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144); + STORE4(a, b, c, d, bitbuf144); + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i m = _mm_xor_si128(buf210, in_1); + STORE4(m, in23, in45, in67, bitbuf210); + buf210 = in8_; + + STORE4(in12, in34, in56, in78, bitbuf300); + } + + for (; i + 300*8+64 < len; i += 64) { + __m128i in12, in34, in56, in78, + in_1, in23, in45, in67, in8_; + + if (i < 128 * 8) { + READ4(in12, in34, in56, in78, inptr); + } else { + in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++)); + } + + // [0, 145, 183, 211] + + /* Pre Penryn CPUs the unpack should be faster */ + REALIGN_CHORBA(in12, in34, in56, in78, + in_1, in23, in45, in67, in8_, 8); + + __m128i a, b, c, d; + a = _mm_xor_si128(buf144, in_1); + READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144); + STORE4(a, b, c, d, bitbuf144); + + __m128i e, f, g, h; + e = _mm_xor_si128(buf182, in_1); + READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182); + STORE4(e, f, g, h, bitbuf182); + + __m128i n, o, p; + __m128i m = _mm_xor_si128(buf210, in_1); + + /* Couldn't tell you why but despite knowing that this is always false, + * removing this branch with GCC makes things significantly slower. 
Some + * loop bodies must be being joined or something */ + if (i < 84 * 8) { + n = in23; + o = in45; + p = in67; + buf210 = in8_; + } else { + READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210); + } + + STORE4(m, n, o, p, bitbuf210); + STORE4(in12, in34, in56, in78, bitbuf300); + } + + /* Second half of stores bubbled out */ + _mm_store_si128(bitbuf144, buf144); + _mm_store_si128(bitbuf182, buf182); + _mm_store_si128(bitbuf210, buf210); + + /* We also have to zero out the tail */ + size_t left_to_z = len - (300*8 + i); + __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8); + while (left_to_z >= 64) { + STORE4(z, z, z, z, bitbuf_tail); + left_to_z -= 64; + } + + while (left_to_z >= 16) { + _mm_store_si128(bitbuf_tail++, z); + left_to_z -= 16; + } + + uint8_t *tail_bytes = (uint8_t*)bitbuf_tail; + while (left_to_z--) { + *tail_bytes++ = 0; + } + + ALIGNED_(16) uint64_t final[9] = {0}; + __m128i next12, next34, next56; + next12 = z; + next34 = z; + next56 = z; + + for (; (i + 72 < len); i += 32) { + __m128i in1in2, in3in4; + __m128i in1in2_, in3in4_; + __m128i ab1, ab2, ab3, ab4; + __m128i cd1, cd2, cd3, cd4; + + READ_NEXT(input, i, in1in2, in3in4); + READ_NEXT(bitbuffer, i, in1in2_, in3in4_); + + in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12); + in3in4 = _mm_xor_si128(in3in4, in3in4_); + + NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4); + + __m128i a2_ = _mm_slli_si128(ab2, 8); + __m128i ab1_next34 = _mm_xor_si128(next34, ab1); + in3in4 = _mm_xor_si128(in3in4, ab1_next34); + in3in4 = _mm_xor_si128(a2_, in3in4); + NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4); + + __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8); + __m128i a4_ = _mm_slli_si128(ab4, 8); + a4_ = _mm_xor_si128(b2c2, a4_); + next12 = _mm_xor_si128(ab3, a4_); + next12 = _mm_xor_si128(next12, cd1); + + __m128i d2_ = _mm_srli_si128(cd2, 8); + __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8); + next12 = _mm_xor_si128(next12, next56); + next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_)); + 
next56 = _mm_srli_si128(cd4, 8); + } + + memcpy(final, input+(i / sizeof(uint64_t)), len-i); + __m128i *final128 = (__m128i*)final; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12)); + ++final128; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34)); + ++final128; + _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56)); + + uint8_t *final_bytes = (uint8_t*)final; + + /* Byte-at-a-time table CRC over the remaining tail, mixing in the bitbuffer residue. */ + for (size_t j = 0; j < (len-i); j++) { + crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8); + } + return ~crc; +} + +/* Dispatcher: CRC the unaligned prefix with crc32_braid to reach 16-byte alignment, then pick a Chorba variant by the remaining length (small / 32768-medium / 118960-large thresholds). */ +Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) { + uintptr_t align_diff = ALIGN_DIFF(buf, 16); + if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT) + return crc32_braid(crc, buf, len); + + if (align_diff) { + crc = crc32_braid(crc, buf, align_diff); + len -= align_diff; + buf += align_diff; + } +#if !defined(WITHOUT_CHORBA) + if (len > CHORBA_LARGE_THRESHOLD) + return crc32_chorba_118960_nondestructive(crc, buf, len); +#endif + if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD) + return crc32_chorba_32768_nondestructive_sse41(crc, buf, len); + return chorba_small_nondestructive_sse2(crc, buf, len); +} + +/* Fused CRC-and-copy: computes the CRC of src, then memcpy's src to dst (no interleaving in this variant). */ +Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_chorba_sse41(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/x86/crc32_pclmulqdq.c b/neozip/arch/x86/crc32_pclmulqdq.c new file mode 100644 index 0000000000..c8be1b43ba --- /dev/null +++ b/neozip/arch/x86/crc32_pclmulqdq.c @@ -0,0 +1,31 @@ +/* + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ + * instruction. + * + * A white paper describing this algorithm can be found at: + * doc/crc-pclmulqdq.pdf + * + * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * Jim Guilford <james.guilford@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Erdinc Ozturk <erdinc.ozturk@intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_PCLMULQDQ_CRC + +#include "crc32_pclmulqdq_tpl.h" + +/* CRC-only entry point: COPY=0 lets the shared template compile out all the copy stores. */ +Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) { + return crc32_copy_impl(crc, NULL, buf, len, 0); +} + +/* Fused CRC+copy entry point: COPY=1 makes the template store each folded chunk to dst as it is read. */ +Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + return crc32_copy_impl(crc, dst, src, len, 1); +} +#endif diff --git a/neozip/arch/x86/crc32_pclmulqdq_tpl.h b/neozip/arch/x86/crc32_pclmulqdq_tpl.h new file mode 100644 index 0000000000..e4ea546afd --- /dev/null +++ b/neozip/arch/x86/crc32_pclmulqdq_tpl.h @@ -0,0 +1,708 @@ +/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding + * approach with the PCLMULQDQ and VPCMULQDQ instructions. + * + * A white paper describing this algorithm can be found at: + * doc/crc-pclmulqdq.pdf + * + * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support) + * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * Jim Guilford <james.guilford@intel.com> + * Vinodh Gopal <vinodh.gopal@intel.com> + * Erdinc Ozturk <erdinc.ozturk@intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +#include <immintrin.h> +#include <wmmintrin.h> +#include <smmintrin.h> // _mm_extract_epi32 + +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" +#include "crc32_p.h" +#include "x86_intrins.h" + +/* 512-bit VPCLMULQDQ path requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) +# if defined(_MSC_VER) && _MSC_VER < 1920 + /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */ +# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96) +# define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) +# define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) +# else +# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96) +# define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm) +# define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm) +# endif +# ifdef __AVX512VL__ +# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96) +# endif +#endif +/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */ +#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) +# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c) +#endif + +#ifndef z128_xor3_epi64 +# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c) +#endif + +static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + + *xmm_crc0 
= *xmm_crc1; + *xmm_crc1 = *xmm_crc2; + *xmm_crc2 = *xmm_crc3; + *xmm_crc3 = _mm_xor_si128(x_low, x_high); +} + +static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + + *xmm_crc0 = *xmm_crc2; + *xmm_crc1 = *xmm_crc3; + *xmm_crc2 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc3 = _mm_xor_si128(x_low1, x_high1); +} + +static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + + *xmm_crc0 = *xmm_crc3; + *xmm_crc1 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc2 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc3 = _mm_xor_si128(x_low2, x_high2); +} + +static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) { + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 
0x01); + __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); + + *xmm_crc0 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc1 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc2 = _mm_xor_si128(x_low2, x_high2); + *xmm_crc3 = _mm_xor_si128(x_low3, x_high3); +} + +static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85); + __m128i x_low0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10); + __m128i x_low1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10); + __m128i x_low2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10); + __m128i x_low3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01); + __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10); + + *xmm_crc0 = _mm_xor_si128(x_low0, x_high0); + *xmm_crc1 = _mm_xor_si128(x_low1, x_high1); + *xmm_crc2 = _mm_xor_si128(x_low2, x_high2); + *xmm_crc3 = _mm_xor_si128(x_low3, x_high3); +} + +/* 512-bit fold function requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) +static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3, + const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) { + __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01); + __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10); + __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01); + __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10); + __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01); + __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10); + __m512i z_low3 = 
_mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01); + __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10); + + *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0); + *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1); + *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2); + *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3); +} +#endif +/* 256-bit fold function for VPCLMULQDQ without AVX-512 */ +#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) +static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3, + const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) { + __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01); + __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10); + __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01); + __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10); + __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01); + __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10); + __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01); + __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10); + + *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0); + *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1); + *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2); + *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3); +} +#endif + +Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { + size_t copy_len = len; + if (len >= 16) { + /* Calculate 16-byte alignment offset */ + uintptr_t align_diff = ALIGN_DIFF(src, 16); + + /* If total length is less than (alignment bytes + 16), use the faster small method. 
+ * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ + copy_len = len < align_diff + 16 ? len : align_diff; + } + + if (copy_len > 0) { + crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); + src += copy_len; + len -= copy_len; + if (COPY) { + dst += copy_len; + } + } + + if (len == 0) + return crc; + + const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); + __m128i xmm_crc1 = _mm_setzero_si128(); + __m128i xmm_crc2 = _mm_setzero_si128(); + __m128i xmm_crc3 = _mm_setzero_si128(); + + if (crc != 0) { + // Process the first 16 bytes and handle initial CRC + len -= 16; + xmm_t0 = _mm_load_si128((__m128i *)src); + src += 16; + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + dst += 16; + } + xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); + } + +/* 512-bit VPCLMULQDQ path requires AVX-512F */ +#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) + if (len >= 256) { + len -= 256; + + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; + __m512i z_low0, z_high0; + const __m512i zmm_fold4 = _mm512_set4_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m512i zmm_fold16 = _mm512_set4_epi32( + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); + + zmm_crc0 = _mm512_loadu_si512((__m512i *)src); + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); + src += 256; + if (COPY) { + _mm512_storeu_si512((__m512i *)dst, zmm_crc0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); + dst += 256; + } + + // Fold existing xmm state into first 64 bytes + 
zmm_t0 = _mm512_castsi128_si512(xmm_crc0); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2); + zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3); + + z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0); + + while (len >= 256) { + len -= 256; + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); + src += 256; + + fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16); + if (COPY) { + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); + dst += 256; + } + } + + // zmm_crc[0,1,2,3] -> zmm_crc0 + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1); + + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2); + + z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3); + + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] + xmm_crc0 = z512_extracti64x2(zmm_crc0, 0); + xmm_crc1 = z512_extracti64x2(zmm_crc0, 1); + xmm_crc2 = z512_extracti64x2(zmm_crc0, 2); + xmm_crc3 = z512_extracti64x2(zmm_crc0, 3); + } +/* 256-bit VPCLMULQDQ path */ +#elif defined(X86_VPCLMULQDQ) + if (len >= 128) { + len -= 128; + + __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3; + __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3; + __m256i y_low0, 
y_high0; + const __m256i ymm_fold4 = _mm256_set_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596, + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m256i ymm_fold8 = _mm256_set_epi32( + 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880, + 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880); + + ymm_crc0 = _mm256_loadu_si256((__m256i *)src); + ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1); + ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2); + ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3); + src += 128; + if (COPY) { + _mm256_storeu_si256((__m256i *)dst, ymm_crc0); + _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1); + _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2); + _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3); + dst += 128; + } + + // Fold existing xmm state into first 32 bytes + ymm_t0 = _mm256_castsi128_si256(xmm_crc0); + ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1); + + y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); + y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); + ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0); + + ymm_t0 = _mm256_castsi128_si256(xmm_crc2); + ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1); + + y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); + y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); + ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0); + + while (len >= 128) { + len -= 128; + ymm_t0 = _mm256_loadu_si256((__m256i *)src); + ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1); + ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2); + ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3); + src += 128; + + fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8); + if (COPY) { + _mm256_storeu_si256((__m256i *)dst, ymm_t0); + _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1); + _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2); + _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3); + dst += 128; + } + } + + // Extract 8 x 
128-bit lanes from 4 x 256-bit registers + __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0); + __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1); + __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1); + __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1); + __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2); + __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1); + __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3); + __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1); + + // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4) + __m128i x_low, x_high; + x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10); + xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4); + + x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10); + xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5); + + x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10); + xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6); + + x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01); + x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10); + xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7); + } +#else + /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 + * We interleave the PCLMUL-base folds with 8x scaled generator + * polynomial copies; we read 8x QWORDS and then XOR them into + * the stream at the following offsets: 6, 9, 10, 16, 20, 22, + * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper + * as "generator_64_bits_unrolled_8" */ +#ifndef __AVX512VL__ + if (!COPY) { +#endif + while (len >= 512 + 64 + 16*8) { + __m128i chorba8 = _mm_load_si128((__m128i *)src); + __m128i chorba7 = _mm_load_si128((__m128i *)src + 1); + __m128i chorba6 = _mm_load_si128((__m128i *)src + 2); + __m128i chorba5 = _mm_load_si128((__m128i *)src + 3); + __m128i chorba4 = _mm_load_si128((__m128i 
*)src + 4); + __m128i chorba3 = _mm_load_si128((__m128i *)src + 5); + __m128i chorba2 = _mm_load_si128((__m128i *)src + 6); + __m128i chorba1 = _mm_load_si128((__m128i *)src + 7); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, chorba8); + _mm_storeu_si128((__m128i *)dst + 1, chorba7); + _mm_storeu_si128((__m128i *)dst + 2, chorba6); + _mm_storeu_si128((__m128i *)dst + 3, chorba5); + _mm_storeu_si128((__m128i *)dst + 4, chorba4); + _mm_storeu_si128((__m128i *)dst + 5, chorba3); + _mm_storeu_si128((__m128i *)dst + 6, chorba2); + _mm_storeu_si128((__m128i *)dst + 7, chorba1); + dst += 16*8; + } + + chorba2 = _mm_xor_si128(chorba2, chorba8); + chorba1 = _mm_xor_si128(chorba1, chorba7); + src += 16*8; + len -= 16*8; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + + fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0); + xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2); + xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 4); + xmm_t1 = _mm_load_si128((__m128i *)src + 5); + xmm_t2 = _mm_load_si128((__m128i *)src + 6); + xmm_t3 = _mm_load_si128((__m128i *)src + 7); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = 
z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 8); + xmm_t1 = _mm_load_si128((__m128i *)src + 9); + xmm_t2 = _mm_load_si128((__m128i *)src + 10); + xmm_t3 = _mm_load_si128((__m128i *)src + 11); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0); + xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1); + xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2); + xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 12); + xmm_t1 = _mm_load_si128((__m128i *)src + 13); + xmm_t2 = _mm_load_si128((__m128i *)src + 14); + xmm_t3 = _mm_load_si128((__m128i *)src + 15); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3); + + xmm_t0 = 
_mm_load_si128((__m128i *)src + 16); + xmm_t1 = _mm_load_si128((__m128i *)src + 17); + xmm_t2 = _mm_load_si128((__m128i *)src + 18); + xmm_t3 = _mm_load_si128((__m128i *)src + 19); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 20); + xmm_t1 = _mm_load_si128((__m128i *)src + 21); + xmm_t2 = _mm_load_si128((__m128i *)src + 22); + xmm_t3 = _mm_load_si128((__m128i *)src + 23); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), 
chorba7, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 24); + xmm_t1 = _mm_load_si128((__m128i *)src + 25); + xmm_t2 = _mm_load_si128((__m128i *)src + 26); + xmm_t3 = _mm_load_si128((__m128i *)src + 27); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1); + xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2); + xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3); + + xmm_t0 = _mm_load_si128((__m128i *)src + 28); + xmm_t1 = _mm_load_si128((__m128i *)src + 29); + xmm_t2 = _mm_load_si128((__m128i *)src + 30); + xmm_t3 = _mm_load_si128((__m128i *)src + 31); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0); + xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1); + xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2); + xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3); + + len -= 512; + src += 512; + } +#ifndef __AVX512VL__ + } +#endif + +#endif /* X86_VPCLMULQDQ */ + + while (len >= 64) { + len -= 64; + xmm_t0 = 
_mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + src += 64; + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; + } + + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); + } + + /* + * len = num bytes left - 64 + */ + if (len >= 48) { + len -= 48; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + src += 48; + + fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + dst += 48; + } + + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); + } else if (len >= 32) { + len -= 32; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + src += 32; + + fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + dst += 32; + } + + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + } else if (len >= 16) { + len -= 16; + xmm_t0 = _mm_load_si128((__m128i *)src); + src += 16; + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); + if (COPY) { + _mm_storeu_si128((__m128i *)dst, xmm_t0); + dst += 16; + } + + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + } + + const 
__m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); + const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); + + /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */ + __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); + __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); + xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); + + __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); + __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); + xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); + + __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); + __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); + xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); + + /* Fold remaining bytes into the 128-bit state */ + if (len) { + const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); + const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + /* Create masks to shift bytes for partial input */ + __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); + __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); + + /* Shift out bytes from crc3 to make space for new data */ + __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); + xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); + + /* Insert the partial input into crc3 */ +#if defined(__AVX512BW__) && defined(__AVX512VL__) + __mmask16 k = (1 << len) - 1; + __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); + if (COPY) { + _mm_mask_storeu_epi8(dst, k, xmm_crc_part); + } +#else + __m128i xmm_crc_part = _mm_setzero_si128(); + memcpy(&xmm_crc_part, src, len); + if (COPY) { + memcpy(dst, src, len); + } +#endif + __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); + xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); + + /* Fold the bytes that were shifted out back into crc3 */ + __m128i ovf_low = 
_mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
        __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
        xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
    }

    /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
    __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
    __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);

    /* Blend mask 0xcf zeroes every 16-bit lane except lanes 4-5 (bits 64..95),
     * isolating the partially reduced remainder before folding it back in. */
    x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
    x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);

    __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
    __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);

    /* The final remainder ends up in 32-bit lane 2 of the last product. */
    crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));

    /* CRC-32 requires a final bit inversion of the accumulated value. */
    return ~crc;
}
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx2.c b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c new file mode 100644 index 0000000000..1cdef13b09 --- /dev/null +++ b/neozip/arch/x86/crc32_vpclmulqdq_avx2.c @@ -0,0 +1,17 @@
/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_VPCLMULQDQ_AVX2

/* Instantiate the shared PCLMULQDQ template with the VPCLMULQDQ path enabled;
 * crc32_copy_impl() is defined by the included template header. */
#define X86_VPCLMULQDQ
#include "crc32_pclmulqdq_tpl.h"

/* CRC-only entry point: dst is NULL and the copy flag is 0. */
Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
    return crc32_copy_impl(crc, NULL, buf, len, 0);
}

/* Fused CRC + copy entry point: copies src to dst while checksumming. */
Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    return crc32_copy_impl(crc, dst, src, len, 1);
}
#endif
diff --git a/neozip/arch/x86/crc32_vpclmulqdq_avx512.c b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c new file mode 100644 index 0000000000..a95a448f49 --- /dev/null +++ b/neozip/arch/x86/crc32_vpclmulqdq_avx512.c @@ -0,0 +1,17 @@
/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_VPCLMULQDQ_AVX512

/* Same template instantiation as the AVX2 variant; the difference in code
 * generation comes from the compiler flags used for this translation unit. */
#define X86_VPCLMULQDQ
#include "crc32_pclmulqdq_tpl.h"

/* CRC-only entry point: dst is NULL and the copy flag is 0. */
Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
    return crc32_copy_impl(crc, NULL, buf, len, 0);
}

/* Fused CRC + copy entry point: copies src to dst while checksumming. */
Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    return crc32_copy_impl(crc, dst, src, len, 1);
}
#endif
diff --git a/neozip/arch/x86/slide_hash_avx2.c b/neozip/arch/x86/slide_hash_avx2.c new file mode 100644 index 0000000000..241ea305e3 --- /dev/null +++ b/neozip/arch/x86/slide_hash_avx2.c @@ -0,0 +1,48 @@
/*
 * AVX2 optimized hash slide, based on Intel's slide_sse implementation
 *
 * Copyright (C) 2017 Intel Corporation
 * Authors:
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Jim Kukunas <james.t.kukunas@linux.intel.com>
 *   Mika T. Lindqvist <postmaster@raasu.org>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX2

#include "zbuild.h"
#include "deflate.h"

#include <immintrin.h>

/* Subtract wsize from every table entry with unsigned saturation (entries
 * older than the window clamp to 0), 32 entries per iteration, walking from
 * the top of the table downward. The do/while structure assumes entries is a
 * positive multiple of 32; aligned 256-bit loads require the table base to be
 * 32-byte aligned — presumably guaranteed by the project allocator (verify). */
static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
    table += entries;
    table -= 32;

    do {
        __m256i value1, value2, result1, result2;

        value1 = _mm256_load_si256((__m256i *)table);
        value2 = _mm256_load_si256((__m256i *)(table+16));
        result1 = _mm256_subs_epu16(value1, wsize);
        result2 = _mm256_subs_epu16(value2, wsize);
        _mm256_store_si256((__m256i *)table, result1);
        _mm256_store_si256((__m256i *)(table+16), result2);

        table -= 32;
        entries -= 32;
    } while (entries > 0);
}

/* Slide both deflate hash tables (head and prev) after the window moves. */
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
    uint16_t wsize = (uint16_t)s->w_size;
    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);

    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
    slide_hash_chain(s->prev, wsize,
ymm_wsize); +} + +#endif diff --git a/neozip/arch/x86/slide_hash_sse2.c b/neozip/arch/x86/slide_hash_sse2.c new file mode 100644 index 0000000000..4aa8df5ee8 --- /dev/null +++ b/neozip/arch/x86/slide_hash_sse2.c @@ -0,0 +1,68 @@ +/* + * SSE optimized hash slide + * + * Copyright (C) 2017 Intel Corporation + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_SSE2 + +#include "zbuild.h" +#include "deflate.h" + +#include <immintrin.h> +#include <assert.h> + +static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0, + uint32_t entries1, const __m128i wsize) { + uint32_t entries; + Pos *table; + __m128i value0, value1, result0, result1; + + int on_chain = 0; + +next_chain: + table = (on_chain) ? table1 : table0; + entries = (on_chain) ? entries1 : entries0; + + table += entries; + table -= 16; + + /* ZALLOC allocates this pointer unless the user chose a custom allocator. 
+ * Our alloc function is aligned to 64 byte boundaries */ + do { + value0 = _mm_load_si128((__m128i *)table); + value1 = _mm_load_si128((__m128i *)(table + 8)); + result0 = _mm_subs_epu16(value0, wsize); + result1 = _mm_subs_epu16(value1, wsize); + _mm_store_si128((__m128i *)table, result0); + _mm_store_si128((__m128i *)(table + 8), result1); + + table -= 16; + entries -= 16; + } while (entries > 0); + + ++on_chain; + if (on_chain > 1) { + return; + } else { + goto next_chain; + } +} + +Z_INTERNAL void slide_hash_sse2(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + const __m128i xmm_wsize = _mm_set1_epi16((short)wsize); + + assert(((uintptr_t)s->head & 15) == 0); + assert(((uintptr_t)s->prev & 15) == 0); + + slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize); +} + +#endif diff --git a/neozip/arch/x86/x86_features.c b/neozip/arch/x86/x86_features.c new file mode 100644 index 0000000000..5eba18bf8a --- /dev/null +++ b/neozip/arch/x86/x86_features.c @@ -0,0 +1,128 @@ +/* x86_features.c - x86 feature check + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. 
 * Author:
 *  Jim Kukunas
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_FEATURES

#include "zbuild.h"
#include "x86_features.h"

#if defined(HAVE_CPUID_MS)
# include <intrin.h>
#elif defined(HAVE_CPUID_GNU)
// Newer versions of GCC and clang come with cpuid.h
# include <cpuid.h>
# ifdef X86_HAVE_XSAVE_INTRIN
#  if __GNUC__ == 8
#   include <xsaveintrin.h>
#  else
#   include <immintrin.h>
#  endif
# endif
#endif

/* Execute CPUID leaf `info` (subleaf 0), writing the four result registers.
 * The fallback path reports all-zero, which disables the SIMD fast paths. */
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#if defined(HAVE_CPUID_MS)
    unsigned int registers[4];
    __cpuid((int *)registers, info);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#elif defined(HAVE_CPUID_GNU)
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid(info, *eax, *ebx, *ecx, *edx);
#else
    /* When using this fallback, the faster SSE/AVX code is disabled */
    *eax = *ebx = *ecx = *edx = 0;
#endif
}

/* Execute CPUID leaf `info` with explicit subleaf `subinfo` (needed for
 * leaf 7, whose extended-feature bits live in subleaf 0). */
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#if defined(HAVE_CPUID_MS)
    unsigned int registers[4];
    __cpuidex((int *)registers, info, subinfo);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#elif defined(HAVE_CPUID_GNU)
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#else
    /* When using this fallback, the faster SSE/AVX code is disabled */
    *eax = *ebx = *ecx = *edx = 0;
#endif
}

/* Read extended control register `xcr` (XGETBV); caller must have verified
 * OSXSAVE before using the result. The raw .byte sequence encodes XGETBV for
 * assemblers that predate the mnemonic. */
static inline uint64_t xgetbv(unsigned int xcr) {
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
    return _xgetbv(xcr);
#elif defined(__GNUC__)
    uint32_t eax, edx;
    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
    return (uint64_t)(edx) << 32 | eax;
#else
    /* When using this fallback, some of the faster code is disabled */
    return 0;
#endif
}

/* Populate `features` by probing CPUID leaves 1 and 7 plus XCR0. AVX/AVX512
 * flags are only reported when the OS also saves the wider register state. */
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;        /* EDX bit 26 */
    features->has_ssse3 = ecx & 0x200;           /* ECX bit 9 */
    features->has_sse41 = ecx & 0x80000;         /* ECX bit 19 */
    features->has_sse42 = ecx & 0x100000;        /* ECX bit 20 */
    features->has_pclmulqdq = ecx & 0x2;         /* ECX bit 1 */

    if (ecx & 0x08000000) {  /* ECX bit 27: OSXSAVE — XGETBV is usable */
        uint64_t xfeature = xgetbv(0);

        /* XCR0: 0x06 = XMM+YMM state; 0xe6 additionally covers opmask and
         * ZMM hi/full state. */
        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
    }

    if (maxbasic >= 7) {
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check BMI2 bit
        features->has_bmi2 = ebx & 0x100;

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;
            features->has_vpclmulqdq = ecx & 0x400;
        }

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512f = ebx & 0x00010000;
            if (features->has_avx512f) {
                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
                // AVX512(DQ,BW,VL).
                features->has_avx512dq = ebx & 0x00020000;
                features->has_avx512bw = ebx & 0x40000000;
                features->has_avx512vl = ebx & 0x80000000;
            }
            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
                && features->has_avx512vl && features->has_bmi2;
            features->has_avx512vnni = ecx & 0x800;
        }
    }
}

#endif
diff --git a/neozip/arch/x86/x86_features.h b/neozip/arch/x86/x86_features.h new file mode 100644 index 0000000000..2118b8e87a --- /dev/null +++ b/neozip/arch/x86/x86_features.h @@ -0,0 +1,30 @@
/* x86_features.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_

/* One boolean flag per detectable x86 capability; filled in at runtime by
 * x86_check_features(). */
struct x86_cpu_features {
    int has_avx2;
    int has_avx512f;
    int has_avx512dq;
    int has_avx512bw;
    int has_avx512vl;
    int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
    int has_avx512vnni;
    int has_bmi2;
    int has_sse2;
    int has_ssse3;
    int has_sse41;
    int has_sse42;
    int has_pclmulqdq;
    int has_vpclmulqdq;
    int has_os_save_ymm;   // OS saves YMM state (XCR0 check)
    int has_os_save_zmm;   // OS saves ZMM/opmask state (XCR0 check)
};

void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);

#endif /* X86_FEATURES_H_ */
diff --git a/neozip/arch/x86/x86_functions.h b/neozip/arch/x86/x86_functions.h new file mode 100644 index 0000000000..881c6efe23 --- /dev/null +++ b/neozip/arch/x86/x86_functions.h @@ -0,0 +1,196 @@
/* x86_functions.h -- x86 implementations for arch-specific functions.
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_

#include "x86_natives.h"

/* So great news, your compiler is broken and causes stack smashing. Rather than
 * notching out its compilation we'll just remove the assignment in the functable.
 * Further context:
 * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
#if defined(_MSC_VER) && defined(ARCH_32BIT) && _MSC_VER >= 1920 && _MSC_VER <= 1929
#define NO_CHORBA_SSE
#endif

/* Forward declarations of each SIMD-specialized routine; each group is only
 * compiled in when the matching X86_* build flag is defined. */
#ifdef X86_SSE2
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_sse2(deflate_state *const s, uint32_t cur_match);
void slide_hash_sse2(deflate_state *s);

# if !defined(WITHOUT_CHORBA_SSE)
    uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len);
    uint32_t crc32_copy_chorba_sse2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
    uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len);
# endif
#endif

#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif

#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
    uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len);
    uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_SSE42
uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_avx2(deflate_state *const s, uint32_t cur_match);
void slide_hash_avx2(deflate_state *s);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
uint32_t longest_match_slow_avx512(deflate_state *const s, uint32_t cur_match);
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_AVX2
uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_AVX512
uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif

/* With runtime CPU detection disabled, alias the generic native_* names to
 * the best compile-time implementation. Later (stronger-ISA) sections
 * re-#define earlier choices, so ordering here is significant. */
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
# ifdef X86_SSE2_NATIVE
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_sse2
# undef native_compare256
# define native_compare256 compare256_sse2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_sse2
# undef native_longest_match
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# if !defined(WITHOUT_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse2
# undef native_crc32_copy
# define native_crc32_copy crc32_copy_chorba_sse2
# endif
# undef native_slide_hash
# define native_slide_hash slide_hash_sse2
# endif
// X86 - SSSE3
# ifdef X86_SSSE3_NATIVE
# undef native_adler32
# define native_adler32 adler32_ssse3
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_ssse3
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_ssse3
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.1
# if defined(X86_SSE41_NATIVE) && !defined(WITHOUT_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse41
# undef native_crc32_copy
# define native_crc32_copy crc32_copy_chorba_sse41
# endif
// X86 - SSE4.2
# ifdef X86_SSE42_NATIVE
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_sse42
# endif
// X86 - PCLMUL
# ifdef X86_PCLMULQDQ_NATIVE
# undef native_crc32
# define native_crc32 crc32_pclmulqdq
# undef native_crc32_copy
# define native_crc32_copy crc32_copy_pclmulqdq
# endif
// X86 - AVX2
# ifdef X86_AVX2_NATIVE
# undef native_adler32
# define native_adler32 adler32_avx2
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_avx2
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx2
# undef native_compare256
# define native_compare256 compare256_avx2
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx2
# undef native_longest_match
# define native_longest_match longest_match_avx2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_avx2
# undef native_slide_hash
# define native_slide_hash slide_hash_avx2
# endif
// X86 - AVX512 (F,DQ,BW,Vl)
# ifdef X86_AVX512_NATIVE
# undef native_adler32
# define native_adler32 adler32_avx512
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_avx512
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_avx512
# undef native_compare256
# define native_compare256 compare256_avx512
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_avx512
# undef native_longest_match
# define native_longest_match longest_match_avx512
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_avx512
// X86 - AVX512 (VNNI)
/* NOTE(review): the VNNI overrides are nested inside X86_AVX512_NATIVE, so
 * they only take effect when the AVX512 base set is also native — presumably
 * intentional (VNNI implies the base set); confirm against build flags. */
# ifdef X86_AVX512VNNI_NATIVE
# undef native_adler32
# define native_adler32 adler32_avx512_vnni
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_avx512_vnni
# endif
# endif
// X86 - VPCLMULQDQ
# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
# undef native_crc32
# define native_crc32 crc32_vpclmulqdq_avx512
# undef native_crc32_copy
# define native_crc32_copy crc32_copy_vpclmulqdq_avx512
# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
# undef native_crc32
# define native_crc32 crc32_vpclmulqdq_avx2
# undef native_crc32_copy
# define native_crc32_copy crc32_copy_vpclmulqdq_avx2
# endif
#endif

#endif /* X86_FUNCTIONS_H_ */
diff --git a/neozip/arch/x86/x86_intrins.h b/neozip/arch/x86/x86_intrins.h new file mode 100644 index 0000000000..1d1df5eb11 --- /dev/null +++ b/neozip/arch/x86/x86_intrins.h @@ -0,0 +1,126 @@
#ifndef X86_INTRINS_H
#define X86_INTRINS_H

#ifdef __SSE2__
#include <emmintrin.h>
#endif

/* Unfortunately GCC didn't support these things until version 10.
 * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
 */
#ifdef __AVX2__
#include <immintrin.h>

/* Polyfill for _mm256_zextsi128_si256 on old GCC / AppleClang: a plain
 * vmovdqa of an XMM register zeroes the upper YMM bits, giving true
 * zero-extension. */
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 10) \
    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm256_castsi128_si256(r);
}

#ifdef __AVX512F__
/* Same trick for the 512-bit zero-extension. */
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test

#endif // __AVX2__

/* GCC <9 is missing some AVX512 intrinsics.
 */
#ifdef __AVX512F__
#if (!defined(__clang__) && !defined(__NVCOMPILER) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>

/* Pack four chars into one int, c0 in the most-significant byte. */
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
                              ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))

/* Byte-wise 512-bit set, emulated on top of _mm512_set_epi32. */
static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
                                      char __q59, char __q58, char __q57, char __q56,
                                      char __q55, char __q54, char __q53, char __q52,
                                      char __q51, char __q50, char __q49, char __q48,
                                      char __q47, char __q46, char __q45, char __q44,
                                      char __q43, char __q42, char __q41, char __q40,
                                      char __q39, char __q38, char __q37, char __q36,
                                      char __q35, char __q34, char __q33, char __q32,
                                      char __q31, char __q30, char __q29, char __q28,
                                      char __q27, char __q26, char __q25, char __q24,
                                      char __q23, char __q22, char __q21, char __q20,
                                      char __q19, char __q18, char __q17, char __q16,
                                      char __q15, char __q14, char __q13, char __q12,
                                      char __q11, char __q10, char __q09, char __q08,
                                      char __q07, char __q06, char __q05, char __q04,
                                      char __q03, char __q02, char __q01, char __q00) {
    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}

#undef PACK

#endif // gcc version test
#endif // __AVX512F__

/* Missing zero-extension AVX and AVX512 intrinsics.
 * Fixed in Microsoft Visual Studio 2017 version 15.7
 * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
 */
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__

#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914

/* Visual C++ toolchains before v142 have constant overflow in AVX512 intrinsics */
#if defined(_MSC_VER) && defined(__AVX512F__) && !defined(_MM_K0_REG8)
# undef _mm512_extracti32x4_epi32
# define _mm512_extracti32x4_epi32(v1, e1) _mm512_maskz_extracti32x4_epi32(UINT8_MAX, v1, e1)
#endif

#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
/* For whatever reason this intrinsic is 64 bit only with MSVC?
 * While we don't have 64 bit GPRs, it should at least be able to move it to stack
 * or shuffle it over 2 registers */
#ifdef ARCH_32BIT
/* So, while we can't move directly to a GPR, hopefully this move to
 * a stack resident variable doesn't equate to something awful */
static inline int64_t _mm_cvtsi128_si64(__m128i a) {
    union { __m128i v; int64_t i; } u;
    u.v = a;
    return u.i;
}

static inline __m128i _mm_cvtsi64_si128(int64_t a) {
    return _mm_set_epi64x(0, a);
}
#endif
#endif

/* Same 32-bit gap for GCC targeting 32-bit x86: provide union-based
 * type-punned replacements. */
#if defined(__GNUC__) && defined(ARCH_X86) && defined(ARCH_32BIT) && !defined(__clang__)
static inline int64_t _mm_cvtsi128_si64(__m128i a) {
    union { __m128i v; int64_t i; } u;
    u.v = a;
    return u.i;
}
#define _mm_cvtsi64_si128(a) _mm_set_epi64x(0, a)
#endif

#endif // include guard X86_INTRINS_H
diff --git a/neozip/arch/x86/x86_natives.h b/neozip/arch/x86/x86_natives.h new file mode 100644 index 0000000000..a39b7a51f0 --- /dev/null +++ b/neozip/arch/x86/x86_natives.h @@ -0,0 +1,57 @@
/* x86_natives.h -- x86 compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef X86_NATIVES_H_ +#define X86_NATIVES_H_ + +#if defined(__SSE2__) || (defined(ARCH_X86) && defined(ARCH_64BIT)) +# ifdef X86_SSE2 +# define X86_SSE2_NATIVE +# endif +#endif +#if defined(__SSSE3__) +# ifdef X86_SSSE3 +# define X86_SSSE3_NATIVE +# endif +#endif +#if defined(__SSE4_1__) +# ifdef X86_SSE41 +# define X86_SSE41_NATIVE +# endif +#endif +#if defined(__SSE4_2__) +# ifdef X86_SSE42 +# define X86_SSE42_NATIVE +# endif +#endif +#if defined(__PCLMUL__) +# ifdef X86_PCLMULQDQ_CRC +# define X86_PCLMULQDQ_NATIVE +# endif +#endif +#if defined(__AVX2__) +# ifdef X86_AVX2 +# define X86_AVX2_NATIVE +# endif +#endif +#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# ifdef X86_AVX512 +# define X86_AVX512_NATIVE +# endif +#endif +#if defined(__AVX512VNNI__) +# ifdef X86_AVX512VNNI +# define X86_AVX512VNNI_NATIVE +# endif +#endif +#if defined(__VPCLMULQDQ__) +# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE) +# define X86_VPCLMULQDQ_AVX2_NATIVE +# endif +# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE) +# define X86_VPCLMULQDQ_AVX512_NATIVE +# endif +#endif + +#endif /* X86_NATIVES_H_ */ |
