summaryrefslogtreecommitdiff
path: root/neozip/arch/power
diff options
context:
space:
mode:
Diffstat (limited to 'neozip/arch/power')
-rw-r--r--neozip/arch/power/Makefile.in93
-rw-r--r--neozip/arch/power/adler32_power8.c160
-rw-r--r--neozip/arch/power/adler32_vmx.c168
-rw-r--r--neozip/arch/power/chunkset_power8.c50
-rw-r--r--neozip/arch/power/compare256_power9.c68
-rw-r--r--neozip/arch/power/crc32_constants.h1123
-rw-r--r--neozip/arch/power/crc32_power8.c593
-rw-r--r--neozip/arch/power/power_features.c54
-rw-r--r--neozip/arch/power/power_features.h18
-rw-r--r--neozip/arch/power/power_functions.h74
-rw-r--r--neozip/arch/power/power_intrins.h61
-rw-r--r--neozip/arch/power/power_natives.h27
-rw-r--r--neozip/arch/power/slide_hash_power8.c12
-rw-r--r--neozip/arch/power/slide_hash_vmx.c10
-rw-r--r--neozip/arch/power/slide_ppc_tpl.h44
15 files changed, 2555 insertions, 0 deletions
diff --git a/neozip/arch/power/Makefile.in b/neozip/arch/power/Makefile.in
new file mode 100644
index 0000000000..e2bec5e510
--- /dev/null
+++ b/neozip/arch/power/Makefile.in
@@ -0,0 +1,93 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
+PPCFLAGS=-maltivec
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: power_features.o \
+ power_features.lo \
+ adler32_power8.o \
+ adler32_power8.lo \
+ adler32_vmx.o \
+ adler32_vmx.lo \
+ chunkset_power8.o \
+ chunkset_power8.lo \
+ compare256_power9.o \
+ compare256_power9.lo \
+ crc32_power8.o \
+ crc32_power8.lo \
+ slide_hash_power8.o \
+ slide_hash_power8.lo \
+ slide_hash_vmx.o \
+ slide_hash_vmx.lo
+
+power_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+power_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+adler32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_vmx.o:
+ $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+ $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+chunkset_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+compare256_power9.o:
+ $(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+ $(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+crc32_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+slide_hash_power8.o:
+ $(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+ $(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_vmx.o:
+ $(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+ $(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/power/adler32_power8.c b/neozip/arch/power/adler32_power8.c
new file mode 100644
index 0000000000..39b3cf399c
--- /dev/null
+++ b/neozip/arch/power/adler32_power8.c
@@ -0,0 +1,160 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
+ * iteration n) is the initial value of adler - at start _0 is 1 unless
+ * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
+ * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
+ * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
+ * after iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
+ * N-1*c[1] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N-bit at time we can do this at once.
+ *
+ * Since VSX can support 16-bit vector instructions, we can process
+ * 16-bit at time using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+/* Vector across sum unsigned int (saturate). */
+static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+ __b = vec_sld(__a, __a, 8);
+ __b = vec_add(__b, __a);
+ __a = vec_sld(__b, __b, 4);
+ __a = vec_add(__a, __b);
+
+ return __a;
+}
+
+Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
+ uint32_t s1 = adler & 0xffff;
+ uint32_t s2 = (adler >> 16) & 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(s1, NULL, buf, 1, s2, 1, 1, 0);
+
+ /* This is faster than VSX code for len < 64. */
+ if (len < 64)
+ return adler32_copy_tail(s1, NULL, buf, len, s2, 1, 63, 0);
+
+ /* Use POWER VSX instructions for len >= 64. */
+ const vector unsigned int v_zeros = { 0 };
+ const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+ 6, 5, 4, 3, 2, 1};
+ const vector unsigned char vsh = vec_splat_u8(4);
+ const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+ vector unsigned int vs1 = { 0 };
+ vector unsigned int vs2 = { 0 };
+ vector unsigned int vs1_save = { 0 };
+ vector unsigned int vsum1, vsum2;
+ vector unsigned char vbuf;
+ int n;
+
+ vs1[0] = s1;
+ vs2[0] = s2;
+
+ /* Do length bigger than NMAX in blocks of NMAX size. */
+ while (len >= NMAX) {
+ len -= NMAX;
+ n = NMAX / 16;
+ do {
+ vbuf = vec_xl(0, (unsigned char *) buf);
+ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+ /* sum(i=1 to 16) buf[i]*(16-i+1). */
+ vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+ /* Save vs1. */
+ vs1_save = vec_add(vs1_save, vs1);
+ /* Accumulate the sums. */
+ vs1 = vec_add(vsum1, vs1);
+ vs2 = vec_add(vsum2, vs2);
+
+ buf += 16;
+ } while (--n);
+ /* Once each block of NMAX size. */
+ vs1 = vec_sumsu(vs1, vsum1);
+ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+ vs2 = vec_add(vs1_save, vs2);
+ vs2 = vec_sumsu(vs2, vsum2);
+
+ /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
+ vs1[0] = vs1[0] % BASE;
+ /* vs2[0] = s2_i + 16*s1_save +
+ sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
+ vs2[0] = vs2[0] % BASE;
+
+ vs1 = vec_and(vs1, vmask);
+ vs2 = vec_and(vs2, vmask);
+ vs1_save = v_zeros;
+ }
+
+ /* len is less than NMAX one modulo is needed. */
+ if (len >= 16) {
+ while (len >= 16) {
+ len -= 16;
+
+ vbuf = vec_xl(0, (unsigned char *) buf);
+
+ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
+ /* sum(i=1 to 16) buf[i]*(16-i+1). */
+ vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+ /* Save vs1. */
+ vs1_save = vec_add(vs1_save, vs1);
+ /* Accumulate the sums. */
+ vs1 = vec_add(vsum1, vs1);
+ vs2 = vec_add(vsum2, vs2);
+
+ buf += 16;
+ }
+ /* Since the size will be always less than NMAX we do this once. */
+ vs1 = vec_sumsu(vs1, vsum1);
+ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
+ vs2 = vec_add(vs1_save, vs2);
+ vs2 = vec_sumsu(vs2, vsum2);
+ }
+ /* Copy result back to s1, s2 (mod 65521). */
+ s1 = vs1[0] % BASE;
+ s2 = vs2[0] % BASE;
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(s1, NULL, buf, len, s2, len != 0, 15, 0);
+}
+
+Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
+ return adler32_impl(adler, buf, len);
+}
+
+/* VSX/VMX stores can have higher latency than optimized memcpy on POWER8+ */
+Z_INTERNAL uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len) {
+ adler = adler32_impl(adler, buf, len);
+ memcpy(dst, buf, len);
+ return adler;
+}
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/adler32_vmx.c b/neozip/arch/power/adler32_vmx.c
new file mode 100644
index 0000000000..5171bab35b
--- /dev/null
+++ b/neozip/arch/power/adler32_vmx.c
@@ -0,0 +1,168 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+#define vmx_zero() (vec_splat_u32(0))
+
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ /* Different taps for the separable components of sums */
+ const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+ const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+ const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+ const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+ /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+ * a 2 element vector from a single load + a subsequent shift is just barely faster
+ * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+ const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+ const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+ vector unsigned int adacc, s2acc;
+ vector unsigned int pair_vec = vec_ld(0, s);
+ adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+#if BYTE_ORDER == LITTLE_ENDIAN
+ s2acc = vec_sro(pair_vec, shift_vec);
+#else
+ s2acc = vec_slo(pair_vec, shift_vec);
+#endif
+
+ vector unsigned int zero = vmx_zero();
+ vector unsigned int s3acc = zero;
+ vector unsigned int s3acc_0 = zero;
+ vector unsigned int adacc_prev = adacc;
+ vector unsigned int adacc_prev_0 = zero;
+
+ vector unsigned int s2acc_0 = zero;
+ vector unsigned int s2acc_1 = zero;
+ vector unsigned int s2acc_2 = zero;
+
+ /* Maintain a running sum of a second half, this might help use break yet another
+ * data dependency bubble in the sum */
+ vector unsigned int adacc_0 = zero;
+
+ int num_iter = len / 4;
+ int rem = len & 3;
+
+ for (int i = 0; i < num_iter; ++i) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ vector unsigned char d1 = vec_ld(16, buf);
+ vector unsigned char d2 = vec_ld(32, buf);
+ vector unsigned char d3 = vec_ld(48, buf);
+
+ /* The core operation of the loop, basically
+ * what is being unrolled below */
+ adacc = vec_sum4s(d0, adacc);
+ s3acc = vec_add(s3acc, adacc_prev);
+ s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+ s2acc = vec_msum(t0, d0, s2acc);
+
+ /* interleave dependent sums in here */
+ adacc_0 = vec_sum4s(d1, adacc_0);
+ s2acc_0 = vec_msum(t1, d1, s2acc_0);
+ adacc = vec_sum4s(d2, adacc);
+ s2acc_1 = vec_msum(t2, d2, s2acc_1);
+ s2acc_2 = vec_msum(t3, d3, s2acc_2);
+ adacc_0 = vec_sum4s(d3, adacc_0);
+
+ adacc_prev = adacc;
+ adacc_prev_0 = adacc_0;
+ buf += 64;
+ }
+
+ adacc = vec_add(adacc, adacc_0);
+ s3acc = vec_add(s3acc, s3acc_0);
+ s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+ if (rem) {
+ adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+ adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+ while (rem--) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ adacc = vec_sum4s(d0, adacc);
+ s3acc = vec_add(s3acc, adacc_prev);
+ s2acc = vec_msum(t3, d0, s2acc);
+ adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+ buf += 16;
+ }
+ }
+
+
+ /* Sum up independent second sums */
+ s2acc = vec_add(s2acc, s2acc_0);
+ s2acc_2 = vec_add(s2acc_1, s2acc_2);
+ s2acc = vec_add(s2acc, s2acc_2);
+
+ s2acc = vec_add(s2acc, s3acc);
+
+ adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+ s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+ adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+ s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+ vec_ste(adacc, 0, s);
+ vec_ste(s2acc, 0, s+1);
+}
+
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+ /* Split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+ uint32_t pair[4] ALIGNED_(16);
+ pair[0] = adler;
+ pair[1] = sum2;
+ pair[2] = 0;
+ pair[3] = 0;
+
+ // Align buffer
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ size_t n = NMAX;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ vmx_accum32(pair, buf, n / 16);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n / 16) * 16;
+ buf += k;
+ len -= k;
+ n = NMAX;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
+}
+
+/* VMX stores can have higher latency than optimized memcpy */
+Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_vmx(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+}
+#endif
diff --git a/neozip/arch/power/chunkset_power8.c b/neozip/arch/power/chunkset_power8.c
new file mode 100644
index 0000000000..f9855e677e
--- /dev/null
+++ b/neozip/arch/power/chunkset_power8.c
@@ -0,0 +1,50 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#include <altivec.h>
+
+typedef vector unsigned char chunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = (vector unsigned char)vec_splats(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = (vector unsigned char)vec_splats(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = (vector unsigned char)vec_splats((unsigned long long)zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = vec_xl(0, s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ vec_xst(*chunk, 0, out);
+}
+
+#define CHUNKSIZE chunksize_power8
+#define CHUNKCOPY chunkcopy_power8
+#define CHUNKUNROLL chunkunroll_power8
+#define CHUNKMEMSET chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_power8
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/compare256_power9.c b/neozip/arch/power/compare256_power9.c
new file mode 100644
index 0000000000..99c3b0b6d1
--- /dev/null
+++ b/neozip/arch/power/compare256_power9.c
@@ -0,0 +1,68 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "zendian.h"
+
+#include <altivec.h>
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
+#if BYTE_ORDER == LITTLE_ENDIAN
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#else
+# define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+#else
+# define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
+#endif
+
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0, cmplen;
+
+ do {
+ vector unsigned char vsrc0, vsrc1, vc;
+
+ vsrc0 = *((vector unsigned char *)src0);
+ vsrc1 = *((vector unsigned char *)src1);
+
+ /* Compare 16 bytes at a time. Each byte of vc will be either
+ * all ones or all zeroes, depending on the result of the comparison. */
+ vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+ /* Since the index of matching bytes will contain only zeroes
+ * on vc (since we used cmpne), counting the number of consecutive
+ * bytes where LSB == 0 is the same as counting the length of the match. */
+ zng_vec_vctzlsbb(vc, cmplen);
+ if (cmplen != 16)
+ return len + cmplen;
+
+ src0 += 16, src1 += 16, len += 16;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_power9_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_power9
+#define COMPARE256 compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/neozip/arch/power/crc32_constants.h b/neozip/arch/power/crc32_constants.h
new file mode 100644
index 0000000000..8c8f2153b6
--- /dev/null
+++ b/neozip/arch/power/crc32_constants.h
@@ -0,0 +1,1123 @@
+/* Constants table used by crc32_power8.c
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * This file was automatically generated, DO NOT EDIT IT MANUALLY.
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zendian.h"
+#include "zbuild.h"
+
+/* Reduce 262144 kbits to 1024 bits */
+static const __vector unsigned long long vcrc_const[255] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x0000000099ea94a8, 0x00000001651797d2 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x00000000945a8420, 0x0000000021e0d56c },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x0000000030762706, 0x000000000f95ecaa },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001a52fc582, 0x00000001ebd224ac },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x00000001a4a7167a, 0x000000000ccb97ca },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x000000000c18249a, 0x00000001006ec8a8 },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x00000000a924ae7c, 0x000000014f58f196 },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001e12ccc12, 0x00000001a7192ca6 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x00000000a0b9d4ac, 0x000000019a64bab2 },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000095e8ddfe, 0x0000000014f4ed2e },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x00000000233fddc4, 0x000000011092b6a2 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000001b4529b62, 0x00000000c8a1629c },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x00000001a7fa0e64, 0x000000017bf32e8e },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001b5334592, 0x00000001f8cc6582 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000011f8ee1b4, 0x000000008631ddf0 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000006252e632, 0x000000007e5a76d0 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x00000000ab973e84, 0x000000002b09b31c },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x000000007734f5ec, 0x00000001b2df1f84 },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x000000007c547798, 0x00000001d6f56afc },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x000000007ec40210, 0x00000001b9b5e70c },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x00000001ab1695a8, 0x0000000034b626d2 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x0000000090494bba, 0x000000014c53479a },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001123fb816, 0x00000001a6d179a4 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x00000001e188c74c, 0x000000015abd16b4 },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000001c2d3451c, 0x00000000018f9852 },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x00000000f55cf1ca, 0x000000001fb3084a },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000001a0531540, 0x00000000c53dfb04 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x0000000132cd7ebc, 0x00000000e10c9ad6 },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000073ab7f36, 0x0000000025aa994a },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x0000000041aed1c2, 0x00000000fa3a74c4 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000136c53800, 0x0000000033eb3f40 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x0000000126835a30, 0x000000017193f296 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x000000006241b502, 0x0000000043f6c86a },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x00000000d5196ad4, 0x000000016b513ec6 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x000000009cfa769a, 0x00000000c8f25b4e },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000000920e5df4, 0x00000001a45048ec },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x0000000169dc310e, 0x000000000c441004 },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x0000000009fc331c, 0x000000000e17cad6 },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x000000010d94a81e, 0x00000001253ae964 },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x0000000027a20ab2, 0x00000001d7c88ebc },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x0000000114f87504, 0x00000001e7ca913a },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x000000004b076d96, 0x0000000033ed078a },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000da4d1e74, 0x00000000e1839c78 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x000000001b81f672, 0x00000001322b267e },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x000000009367c988, 0x00000000638231b6 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001717214ca, 0x00000001ee7f16f4 },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x000000009f47d820, 0x0000000117d9924a },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x000000010d9a47d2, 0x00000000e1a9e0c4 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000000a696c58c, 0x00000001403731dc },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x000000002aa28ec6, 0x00000001a5ea9682 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x00000001fe18fd9a, 0x0000000101c5c578 },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x000000019d4fc1ae, 0x00000000dddf6494 },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000001ba0e3dea, 0x00000000f1c3db28 },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x0000000074b59a5e, 0x000000013112fb9c },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000f2b5ea98, 0x00000000b680b906 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x0000000187132676, 0x000000001a282932 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x000000010a8c6ad4, 0x0000000089406e7e },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001e21dfe70, 0x00000001def6be8c },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x00000001da0050e4, 0x0000000075258728 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x00000000772172ae, 0x000000019536090a },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000e47724aa, 0x00000000f2455bfc },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000003cd63ac4, 0x000000018c40baf4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x00000001bf47d352, 0x000000004cd390d4 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x000000018dc1d708, 0x00000001e4ece95a },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000002d4620a4, 0x000000001a3ee918 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x0000000058fd1740, 0x000000007c652fb8 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x00000000dadd9bfc, 0x000000011c67842c },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000001ea2140be, 0x00000000254f759c },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000009de128ba, 0x000000007ece94ca },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x000000013ac3aa8e, 0x0000000038f258c2 },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x0000000099980562, 0x00000001cdf17b00 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x00000001c1579c86, 0x000000011f882c16 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000068dbbf94, 0x0000000100093fc8 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x000000004509fb04, 0x00000001cd684f16 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x00000001202f6398, 0x000000004bc6a70a },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000013aea243e, 0x000000004fc7e8e4 },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x00000001b4052ae6, 0x0000000130103f1c },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x00000001cd2a0ae8, 0x0000000111b0024c },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x00000001fe4aa8b4, 0x000000010b3079da },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x00000001d1559a42, 0x000000010192bcc2 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x00000001f3e05ecc, 0x0000000074838d50 },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x0000000104ddd2cc, 0x000000001b20f520 },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x000000015393153c, 0x0000000050c3590a },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x0000000057e942c6, 0x00000000b41cac8e },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000012c633850, 0x000000000c72cc78 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x00000000ebcaae4c, 0x0000000030cdb032 },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013ee532a6, 0x000000013e09fc32 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x00000001bf0cbc7e, 0x000000001ed624d2 },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000d50b7a5a, 0x00000000781aee1a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x0000000002fca6e8, 0x00000001c4d8348c },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x000000007af40044, 0x0000000057a40336 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000016178744, 0x0000000085544940 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000014c177458, 0x000000019cd21e80 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000011b6ddf04, 0x000000013eb95bc0 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001f3e29ccc, 0x00000001dfc9fdfc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x0000000135ae7562, 0x00000000cd028bc2 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000190ef812c, 0x0000000090db8c44 },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x0000000067a2c786, 0x000000010010a4ce },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x0000000048b9496c, 0x00000001c8f4c72c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000015a422de6, 0x000000001c26170c },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000001ef0e3640, 0x00000000e3fccf68 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000001006d2d26, 0x00000000d513ed24 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000001170d56d6, 0x00000000141beada },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x00000000a5fb613c, 0x000000011071aea0 },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x0000000040bbf7fc, 0x000000012e19080a },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x000000016ac3a5b2, 0x0000000100ecf826 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x00000000abf16230, 0x0000000069b09412 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x00000001ebe23fac, 0x0000000122297bac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x000000008b6a0894, 0x00000000e9e4b068 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x00000001288ea478, 0x000000004b38651a },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x000000016619c442, 0x00000001468360e2 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x0000000086230038, 0x00000000121c2408 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x000000017746a756, 0x00000000da7e7d08 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x0000000191b8f8f8, 0x00000001058d7652 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000008e167708, 0x000000014a098a90 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000148b22d54, 0x0000000020dbe72e },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x0000000044ba2c3c, 0x000000011e7323e8 },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000b54d2b52, 0x00000000d5d4bf94 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000005a4fd8a, 0x0000000199d8746c },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x0000000139f9fc46, 0x00000000ce9ca8a0 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x000000015a1fa824, 0x00000000136edece },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000000a61ae4c, 0x000000019b92a068 },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000145e9113e, 0x0000000071d62206 },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x000000006a348448, 0x00000000dfc50158 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x000000004d80a08c, 0x00000001517626bc },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x000000014b6837a0, 0x0000000148d1e4fa },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x000000016896a7fc, 0x0000000094d8266e },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x000000014f187140, 0x00000000606c5e34 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019581b9da, 0x000000019766beaa },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001091bc984, 0x00000001d80c506c },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001067223c, 0x000000001e73837c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x00000001ab16ea02, 0x0000000064d587de },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x000000013c4598a8, 0x00000000f4a507b0 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x00000000b3735430, 0x0000000040e342fc },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001bb3fc0c0, 0x00000001d5ad9c3a },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x00000001570ae19c, 0x0000000094a691a4 },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001ea910712, 0x00000001271ecdfa },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x0000000167127128, 0x000000009e54475a },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x0000000019e790a2, 0x00000000c9c099ee },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000003788f710, 0x000000009a2f736c },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000001682a160e, 0x00000000bb9f4996 },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x000000007f0ebd2e, 0x00000001db688050 },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x000000002b032080, 0x00000000e9b10af4 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x00000000cfd1664a, 0x000000012d4545e4 },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x00000000aa1181c2, 0x000000000361139c },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000000ddd08002, 0x00000001a5a1a3a8 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x00000000e8dd0446, 0x000000006844e0b0 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000001bbd94a00, 0x00000000c3762f28 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000000ab6cd180, 0x00000001d26287a2 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x0000000031803ce2, 0x00000001f6f0bba8 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x0000000024f40b0c, 0x000000002ffabd62 },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000001ba1d9834, 0x00000000fb4516b8 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x0000000104de61aa, 0x000000018cfa961c },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x0000000113e40d46, 0x000000019e588d52 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001415598a0, 0x00000001180f0bbc },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000bf6c8c90, 0x00000000e1d9177a },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x00000001788b0504, 0x0000000105abc27c },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x0000000038385d02, 0x00000000972e4a58 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x00000001b6c83844, 0x0000000183499a5e },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x0000000051061a8a, 0x00000001c96a8cca },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x000000017351388a, 0x00000001a1a5b60c },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x0000000132928f92, 0x00000000e4b6ac9c },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000000e6b4f48a, 0x00000001807e7f5a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x0000000039d15e90, 0x000000017a7e3bc8 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000312d6074, 0x00000000d73975da },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017bbb2cc4, 0x000000017375d038 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x000000016ded3e18, 0x00000000193680bc },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000f1638b16, 0x00000000999b06f6 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001d38b9ecc, 0x00000001f685d2b8 },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x000000018b8d09dc, 0x00000001f4ecbed2 },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000e7bc27d2, 0x00000000ba16f1a0 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x00000000275e1e96, 0x0000000115aceac4 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000000e2e3031e, 0x00000001aeff6292 },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x00000001041c84d8, 0x000000009640124c },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x00000000706ce672, 0x0000000114f41f02 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000015d5070da, 0x000000009c5f3586 },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x0000000038f9493a, 0x00000001878275fa },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000a3348a76, 0x00000000ddc42ce8 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x00000001ad0aab92, 0x0000000181d2c73a },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x000000019e85f712, 0x0000000141c9320a },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000005a871e76, 0x000000015235719a },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x000000017249c662, 0x00000000be27d804 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000003a084712, 0x000000006242d45a },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x00000000ed438478, 0x000000009a53638e },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000000abac34cc, 0x00000001001ecfb6 },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000005f35ef3e, 0x000000016d7c2d64 },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x0000000047d6608c, 0x00000001d0ce46c0 },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x000000002d01470e, 0x0000000124c907b4 },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000158bbc7b0, 0x0000000018a555ca },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x00000000c0a23e8e, 0x000000006b0980bc },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x00000001ebd85c88, 0x000000008bbba964 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x000000019ee20bb2, 0x00000001070a5a1e },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x00000001acabf2d6, 0x000000002204322a },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000001b7963d56, 0x00000000a27524d0 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x000000017bffa1fe, 0x0000000020b1e4ba },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x000000001f15333e, 0x0000000032cc27fc },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x000000018593129e, 0x0000000044dd22b8 },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x000000019cb32602, 0x00000000dffc9e0a },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x0000000142b05cc8, 0x00000001b7a0ed14 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000001be49e7a4, 0x00000000c7842488 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x0000000108f69d6c, 0x00000001c02a4fee },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000006c0971f0, 0x000000003c273778 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x000000005b16467a, 0x00000001d63f8894 },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x00000001551a628e, 0x000000006be557d6 },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000019e42ea92, 0x000000006a7806ea },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000012fa83ff2, 0x000000016155aa0c },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x000000011ca9cde0, 0x00000000908650ac },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000c8e5cd74, 0x00000000aa5a8084 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000096c27f0c, 0x0000000191bb500a },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x000000002baed926, 0x0000000064e9bed0 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000017c8de8d2, 0x000000009444f302 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x00000000d43d6068, 0x000000019db07d3c },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000000cb2c4b26, 0x00000001359e3e6e },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x0000000145b8da26, 0x00000001e4f10dd2 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x000000018fff4b08, 0x0000000124f5735e },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000150b58ed0, 0x0000000124760a4c },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x00000001549f39bc, 0x000000000f1fc186 },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000ef4d2f42, 0x00000000150e4cc4 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x00000001b1468572, 0x000000002a6204e8 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x000000013d7403b2, 0x00000000beb1d432 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x00000001a4681842, 0x0000000135f3f1f0 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000167714492, 0x0000000074fe2232 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x00000001e599099a, 0x000000001ac6e2ba },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x00000000fe128194, 0x0000000013fca91e },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000077e8b990, 0x0000000183f4931e },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000001a267f63a, 0x00000000b6d9b4e4 },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000001945c245a, 0x00000000b5188656 },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000149002e76, 0x0000000027a81a84 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x00000001bb8310a4, 0x0000000125699258 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x000000019ec60bcc, 0x00000001b23de796 },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x000000012d8590ae, 0x00000000fe4365dc },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x0000000065b00684, 0x00000000c68f497a },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x000000015e5aeadc, 0x00000000fbf521ee },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x00000000b77ff2b0, 0x000000015eac3378 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000188da2ff6, 0x0000000134914b90 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000063da929a, 0x0000000016335cfe },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x00000001389caa80, 0x000000010372d10c },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000013db599d2, 0x000000015097b908 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x0000000122505a86, 0x00000001227a7572 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000016bd72746, 0x000000009a8f75c0 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000001c3faf1d4, 0x00000000682c77a2 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000001111c826c, 0x00000000231f091c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x00000000153e9fb2, 0x000000007d4439f2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000002b1f7b60, 0x000000017e221efc },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x00000000b1dba570, 0x0000000167457c38 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000001f6397b76, 0x00000000bdf081c4 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x0000000156335214, 0x000000016286d6b0 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000001d70e3986, 0x00000000c84f001c },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x000000003701a774, 0x0000000064efe7c0 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x00000000ac81ef72, 0x000000000ac2d904 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x0000000133212464, 0x00000000fd226d14 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x00000000e4e45610, 0x000000011cfd42e0 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000000c1bd370, 0x000000016e5a5678 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001a7b9e7a6, 0x00000001d888fe22 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x000000007d657a10, 0x00000001af77fcd4 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ { 0x00000001651797d2, 0x0000000099ea94a8 },
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ { 0x0000000021e0d56c, 0x00000000945a8420 },
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ { 0x000000000f95ecaa, 0x0000000030762706 },
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ { 0x00000001ebd224ac, 0x00000001a52fc582 },
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ { 0x000000000ccb97ca, 0x00000001a4a7167a },
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ { 0x00000001006ec8a8, 0x000000000c18249a },
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ { 0x000000014f58f196, 0x00000000a924ae7c },
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ { 0x00000001a7192ca6, 0x00000001e12ccc12 },
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ { 0x000000019a64bab2, 0x00000000a0b9d4ac },
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ { 0x0000000014f4ed2e, 0x0000000095e8ddfe },
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ { 0x000000011092b6a2, 0x00000000233fddc4 },
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ { 0x00000000c8a1629c, 0x00000001b4529b62 },
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ { 0x000000017bf32e8e, 0x00000001a7fa0e64 },
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ { 0x00000001f8cc6582, 0x00000001b5334592 },
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ { 0x000000008631ddf0, 0x000000011f8ee1b4 },
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ { 0x000000007e5a76d0, 0x000000006252e632 },
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ { 0x000000002b09b31c, 0x00000000ab973e84 },
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ { 0x00000001b2df1f84, 0x000000007734f5ec },
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ { 0x00000001d6f56afc, 0x000000007c547798 },
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ { 0x00000001b9b5e70c, 0x000000007ec40210 },
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ { 0x0000000034b626d2, 0x00000001ab1695a8 },
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ { 0x000000014c53479a, 0x0000000090494bba },
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ { 0x00000001a6d179a4, 0x00000001123fb816 },
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ { 0x000000015abd16b4, 0x00000001e188c74c },
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ { 0x00000000018f9852, 0x00000001c2d3451c },
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ { 0x000000001fb3084a, 0x00000000f55cf1ca },
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ { 0x00000000c53dfb04, 0x00000001a0531540 },
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ { 0x00000000e10c9ad6, 0x0000000132cd7ebc },
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ { 0x0000000025aa994a, 0x0000000073ab7f36 },
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ { 0x00000000fa3a74c4, 0x0000000041aed1c2 },
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ { 0x0000000033eb3f40, 0x0000000136c53800 },
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ { 0x000000017193f296, 0x0000000126835a30 },
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ { 0x0000000043f6c86a, 0x000000006241b502 },
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ { 0x000000016b513ec6, 0x00000000d5196ad4 },
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ { 0x00000000c8f25b4e, 0x000000009cfa769a },
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ { 0x00000001a45048ec, 0x00000000920e5df4 },
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ { 0x000000000c441004, 0x0000000169dc310e },
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ { 0x000000000e17cad6, 0x0000000009fc331c },
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ { 0x00000001253ae964, 0x000000010d94a81e },
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ { 0x00000001d7c88ebc, 0x0000000027a20ab2 },
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ { 0x00000001e7ca913a, 0x0000000114f87504 },
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ { 0x0000000033ed078a, 0x000000004b076d96 },
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ { 0x00000000e1839c78, 0x00000000da4d1e74 },
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ { 0x00000001322b267e, 0x000000001b81f672 },
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ { 0x00000000638231b6, 0x000000009367c988 },
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ { 0x00000001ee7f16f4, 0x00000001717214ca },
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ { 0x0000000117d9924a, 0x000000009f47d820 },
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ { 0x00000000e1a9e0c4, 0x000000010d9a47d2 },
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ { 0x00000001403731dc, 0x00000000a696c58c },
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ { 0x00000001a5ea9682, 0x000000002aa28ec6 },
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ { 0x0000000101c5c578, 0x00000001fe18fd9a },
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ { 0x00000000dddf6494, 0x000000019d4fc1ae },
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ { 0x00000000f1c3db28, 0x00000001ba0e3dea },
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ { 0x000000013112fb9c, 0x0000000074b59a5e },
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ { 0x00000000b680b906, 0x00000000f2b5ea98 },
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ { 0x000000001a282932, 0x0000000187132676 },
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ { 0x0000000089406e7e, 0x000000010a8c6ad4 },
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ { 0x00000001def6be8c, 0x00000001e21dfe70 },
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ { 0x0000000075258728, 0x00000001da0050e4 },
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ { 0x000000019536090a, 0x00000000772172ae },
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ { 0x00000000f2455bfc, 0x00000000e47724aa },
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ { 0x000000018c40baf4, 0x000000003cd63ac4 },
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ { 0x000000004cd390d4, 0x00000001bf47d352 },
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ { 0x00000001e4ece95a, 0x000000018dc1d708 },
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ { 0x000000001a3ee918, 0x000000002d4620a4 },
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ { 0x000000007c652fb8, 0x0000000058fd1740 },
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ { 0x000000011c67842c, 0x00000000dadd9bfc },
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ { 0x00000000254f759c, 0x00000001ea2140be },
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ { 0x000000007ece94ca, 0x000000009de128ba },
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ { 0x0000000038f258c2, 0x000000013ac3aa8e },
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ { 0x00000001cdf17b00, 0x0000000099980562 },
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ { 0x000000011f882c16, 0x00000001c1579c86 },
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ { 0x0000000100093fc8, 0x0000000068dbbf94 },
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ { 0x00000001cd684f16, 0x000000004509fb04 },
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ { 0x000000004bc6a70a, 0x00000001202f6398 },
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ { 0x000000004fc7e8e4, 0x000000013aea243e },
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ { 0x0000000130103f1c, 0x00000001b4052ae6 },
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ { 0x0000000111b0024c, 0x00000001cd2a0ae8 },
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ { 0x000000010b3079da, 0x00000001fe4aa8b4 },
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ { 0x000000010192bcc2, 0x00000001d1559a42 },
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ { 0x0000000074838d50, 0x00000001f3e05ecc },
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ { 0x000000001b20f520, 0x0000000104ddd2cc },
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ { 0x0000000050c3590a, 0x000000015393153c },
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ { 0x00000000b41cac8e, 0x0000000057e942c6 },
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ { 0x000000000c72cc78, 0x000000012c633850 },
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ { 0x0000000030cdb032, 0x00000000ebcaae4c },
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ { 0x000000013e09fc32, 0x000000013ee532a6 },
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ { 0x000000001ed624d2, 0x00000001bf0cbc7e },
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ { 0x00000000781aee1a, 0x00000000d50b7a5a },
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ { 0x00000001c4d8348c, 0x0000000002fca6e8 },
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ { 0x0000000057a40336, 0x000000007af40044 },
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ { 0x0000000085544940, 0x0000000016178744 },
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ { 0x000000019cd21e80, 0x000000014c177458 },
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ { 0x000000013eb95bc0, 0x000000011b6ddf04 },
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ { 0x00000001dfc9fdfc, 0x00000001f3e29ccc },
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ { 0x00000000cd028bc2, 0x0000000135ae7562 },
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ { 0x0000000090db8c44, 0x0000000190ef812c },
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ { 0x000000010010a4ce, 0x0000000067a2c786 },
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ { 0x00000001c8f4c72c, 0x0000000048b9496c },
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ { 0x000000001c26170c, 0x000000015a422de6 },
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ { 0x00000000e3fccf68, 0x00000001ef0e3640 },
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ { 0x00000000d513ed24, 0x00000001006d2d26 },
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ { 0x00000000141beada, 0x00000001170d56d6 },
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ { 0x000000011071aea0, 0x00000000a5fb613c },
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ { 0x000000012e19080a, 0x0000000040bbf7fc },
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ { 0x0000000100ecf826, 0x000000016ac3a5b2 },
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ { 0x0000000069b09412, 0x00000000abf16230 },
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ { 0x0000000122297bac, 0x00000001ebe23fac },
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ { 0x00000000e9e4b068, 0x000000008b6a0894 },
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ { 0x000000004b38651a, 0x00000001288ea478 },
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ { 0x00000001468360e2, 0x000000016619c442 },
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ { 0x00000000121c2408, 0x0000000086230038 },
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ { 0x00000000da7e7d08, 0x000000017746a756 },
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ { 0x00000001058d7652, 0x0000000191b8f8f8 },
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ { 0x000000014a098a90, 0x000000008e167708 },
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ { 0x0000000020dbe72e, 0x0000000148b22d54 },
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ { 0x000000011e7323e8, 0x0000000044ba2c3c },
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ { 0x00000000d5d4bf94, 0x00000000b54d2b52 },
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ { 0x0000000199d8746c, 0x0000000005a4fd8a },
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ { 0x00000000ce9ca8a0, 0x0000000139f9fc46 },
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ { 0x00000000136edece, 0x000000015a1fa824 },
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ { 0x000000019b92a068, 0x000000000a61ae4c },
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ { 0x0000000071d62206, 0x0000000145e9113e },
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ { 0x00000000dfc50158, 0x000000006a348448 },
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ { 0x00000001517626bc, 0x000000004d80a08c },
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ { 0x0000000148d1e4fa, 0x000000014b6837a0 },
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ { 0x0000000094d8266e, 0x000000016896a7fc },
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ { 0x00000000606c5e34, 0x000000014f187140 },
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ { 0x000000019766beaa, 0x000000019581b9da },
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ { 0x00000001d80c506c, 0x00000001091bc984 },
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ { 0x000000001e73837c, 0x000000001067223c },
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ { 0x0000000064d587de, 0x00000001ab16ea02 },
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ { 0x00000000f4a507b0, 0x000000013c4598a8 },
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ { 0x0000000040e342fc, 0x00000000b3735430 },
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ { 0x00000001d5ad9c3a, 0x00000001bb3fc0c0 },
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ { 0x0000000094a691a4, 0x00000001570ae19c },
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ { 0x00000001271ecdfa, 0x00000001ea910712 },
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ { 0x000000009e54475a, 0x0000000167127128 },
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ { 0x00000000c9c099ee, 0x0000000019e790a2 },
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ { 0x000000009a2f736c, 0x000000003788f710 },
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ { 0x00000000bb9f4996, 0x00000001682a160e },
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ { 0x00000001db688050, 0x000000007f0ebd2e },
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ { 0x00000000e9b10af4, 0x000000002b032080 },
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ { 0x000000012d4545e4, 0x00000000cfd1664a },
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ { 0x000000000361139c, 0x00000000aa1181c2 },
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ { 0x00000001a5a1a3a8, 0x00000000ddd08002 },
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ { 0x000000006844e0b0, 0x00000000e8dd0446 },
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ { 0x00000000c3762f28, 0x00000001bbd94a00 },
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ { 0x00000001d26287a2, 0x00000000ab6cd180 },
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ { 0x00000001f6f0bba8, 0x0000000031803ce2 },
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ { 0x000000002ffabd62, 0x0000000024f40b0c },
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ { 0x00000000fb4516b8, 0x00000001ba1d9834 },
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ { 0x000000018cfa961c, 0x0000000104de61aa },
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ { 0x000000019e588d52, 0x0000000113e40d46 },
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ { 0x00000001180f0bbc, 0x00000001415598a0 },
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ { 0x00000000e1d9177a, 0x00000000bf6c8c90 },
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ { 0x0000000105abc27c, 0x00000001788b0504 },
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ { 0x00000000972e4a58, 0x0000000038385d02 },
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ { 0x0000000183499a5e, 0x00000001b6c83844 },
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ { 0x00000001c96a8cca, 0x0000000051061a8a },
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ { 0x00000001a1a5b60c, 0x000000017351388a },
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ { 0x00000000e4b6ac9c, 0x0000000132928f92 },
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ { 0x00000001807e7f5a, 0x00000000e6b4f48a },
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ { 0x000000017a7e3bc8, 0x0000000039d15e90 },
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ { 0x00000000d73975da, 0x00000000312d6074 },
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ { 0x000000017375d038, 0x000000017bbb2cc4 },
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ { 0x00000000193680bc, 0x000000016ded3e18 },
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ { 0x00000000999b06f6, 0x00000000f1638b16 },
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ { 0x00000001f685d2b8, 0x00000001d38b9ecc },
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ { 0x00000001f4ecbed2, 0x000000018b8d09dc },
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ { 0x00000000ba16f1a0, 0x00000000e7bc27d2 },
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ { 0x0000000115aceac4, 0x00000000275e1e96 },
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ { 0x00000001aeff6292, 0x00000000e2e3031e },
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ { 0x000000009640124c, 0x00000001041c84d8 },
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ { 0x0000000114f41f02, 0x00000000706ce672 },
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ { 0x000000009c5f3586, 0x000000015d5070da },
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ { 0x00000001878275fa, 0x0000000038f9493a },
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ { 0x00000000ddc42ce8, 0x00000000a3348a76 },
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ { 0x0000000181d2c73a, 0x00000001ad0aab92 },
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ { 0x0000000141c9320a, 0x000000019e85f712 },
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ { 0x000000015235719a, 0x000000005a871e76 },
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ { 0x00000000be27d804, 0x000000017249c662 },
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ { 0x000000006242d45a, 0x000000003a084712 },
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ { 0x000000009a53638e, 0x00000000ed438478 },
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ { 0x00000001001ecfb6, 0x00000000abac34cc },
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ { 0x000000016d7c2d64, 0x000000005f35ef3e },
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ { 0x00000001d0ce46c0, 0x0000000047d6608c },
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ { 0x0000000124c907b4, 0x000000002d01470e },
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ { 0x0000000018a555ca, 0x0000000158bbc7b0 },
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ { 0x000000006b0980bc, 0x00000000c0a23e8e },
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ { 0x000000008bbba964, 0x00000001ebd85c88 },
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ { 0x00000001070a5a1e, 0x000000019ee20bb2 },
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ { 0x000000002204322a, 0x00000001acabf2d6 },
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ { 0x00000000a27524d0, 0x00000001b7963d56 },
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ { 0x0000000020b1e4ba, 0x000000017bffa1fe },
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ { 0x0000000032cc27fc, 0x000000001f15333e },
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ { 0x0000000044dd22b8, 0x000000018593129e },
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ { 0x00000000dffc9e0a, 0x000000019cb32602 },
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ { 0x00000001b7a0ed14, 0x0000000142b05cc8 },
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ { 0x00000000c7842488, 0x00000001be49e7a4 },
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ { 0x00000001c02a4fee, 0x0000000108f69d6c },
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ { 0x000000003c273778, 0x000000006c0971f0 },
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ { 0x00000001d63f8894, 0x000000005b16467a },
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ { 0x000000006be557d6, 0x00000001551a628e },
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ { 0x000000006a7806ea, 0x000000019e42ea92 },
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ { 0x000000016155aa0c, 0x000000012fa83ff2 },
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ { 0x00000000908650ac, 0x000000011ca9cde0 },
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ { 0x00000000aa5a8084, 0x00000000c8e5cd74 },
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ { 0x0000000191bb500a, 0x0000000096c27f0c },
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ { 0x0000000064e9bed0, 0x000000002baed926 },
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ { 0x000000009444f302, 0x000000017c8de8d2 },
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ { 0x000000019db07d3c, 0x00000000d43d6068 },
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ { 0x00000001359e3e6e, 0x00000000cb2c4b26 },
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ { 0x00000001e4f10dd2, 0x0000000145b8da26 },
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ { 0x0000000124f5735e, 0x000000018fff4b08 },
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ { 0x0000000124760a4c, 0x0000000150b58ed0 },
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ { 0x000000000f1fc186, 0x00000001549f39bc },
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ { 0x00000000150e4cc4, 0x00000000ef4d2f42 },
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ { 0x000000002a6204e8, 0x00000001b1468572 },
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ { 0x00000000beb1d432, 0x000000013d7403b2 },
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ { 0x0000000135f3f1f0, 0x00000001a4681842 },
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ { 0x0000000074fe2232, 0x0000000167714492 },
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ { 0x000000001ac6e2ba, 0x00000001e599099a },
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ { 0x0000000013fca91e, 0x00000000fe128194 },
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ { 0x0000000183f4931e, 0x0000000077e8b990 },
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ { 0x00000000b6d9b4e4, 0x00000001a267f63a },
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ { 0x00000000b5188656, 0x00000001945c245a },
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ { 0x0000000027a81a84, 0x0000000149002e76 },
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ { 0x0000000125699258, 0x00000001bb8310a4 },
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ { 0x00000001b23de796, 0x000000019ec60bcc },
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ { 0x00000000fe4365dc, 0x000000012d8590ae },
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ { 0x00000000c68f497a, 0x0000000065b00684 },
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ { 0x00000000fbf521ee, 0x000000015e5aeadc },
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ { 0x000000015eac3378, 0x00000000b77ff2b0 },
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ { 0x0000000134914b90, 0x0000000188da2ff6 },
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ { 0x0000000016335cfe, 0x0000000063da929a },
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ { 0x000000010372d10c, 0x00000001389caa80 },
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ { 0x000000015097b908, 0x000000013db599d2 },
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ { 0x00000001227a7572, 0x0000000122505a86 },
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ { 0x000000009a8f75c0, 0x000000016bd72746 },
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ { 0x00000000682c77a2, 0x00000001c3faf1d4 },
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ { 0x00000000231f091c, 0x00000001111c826c },
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ { 0x000000007d4439f2, 0x00000000153e9fb2 },
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ { 0x000000017e221efc, 0x000000002b1f7b60 },
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ { 0x0000000167457c38, 0x00000000b1dba570 },
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ { 0x00000000bdf081c4, 0x00000001f6397b76 },
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ { 0x000000016286d6b0, 0x0000000156335214 },
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ { 0x00000000c84f001c, 0x00000001d70e3986 },
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ { 0x0000000064efe7c0, 0x000000003701a774 },
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ { 0x000000000ac2d904, 0x00000000ac81ef72 },
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ { 0x00000000fd226d14, 0x0000000133212464 },
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ { 0x000000011cfd42e0, 0x00000000e4e45610 },
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ { 0x000000016e5a5678, 0x000000000c1bd370 },
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ { 0x00000001d888fe22, 0x00000001a7b9e7a6 },
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ { 0x00000001af77fcd4, 0x000000007d657a10 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+
+static const __vector unsigned long long vcrc_short_const[16] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0x99168a18ec447f11, 0xed837b2613e8221e },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xe23e954e8fd2cd3c, 0xc8acdd8147b9ce5a },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0x92f8befe6b1d2b53, 0xd9ad6d87d4277e25 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xf38a3556291ea462, 0xc10ec5e033fbca3b },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0x974ac56262b6ca4b, 0xc0b55b0e82e02e2f },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x855712b3784d2a56, 0x71aa1df0e172334d },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xa5abe9f80eaee722, 0xfee3053e3969324d },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0x1fa0943ddb54814c, 0xf44779b93eb2bd08 },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xa53ff440d7bbfe6a, 0xf5449b3f00cc3374 },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0xebe7e3566325605c, 0x6f8346e1d777606e },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xc65a272ce5b592b8, 0xe3ab4f2ac0b95347 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0x5705a9ca4721589f, 0xaa2215ea329ecc11 },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0xe3720acb88d14467, 0x1ed8f66ed95efd26 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0xba1aca0315141c31, 0x78ed02d5a700e96a },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xad2a31b3ed627dae, 0xba8ccbe832b39da3 },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0x6655004fa06a2517, 0xedb88320b1e6b092 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x) */
+ { 0xed837b2613e8221e, 0x99168a18ec447f11 },
+ /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x) */
+ { 0xc8acdd8147b9ce5a, 0xe23e954e8fd2cd3c },
+ /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x) */
+ { 0xd9ad6d87d4277e25, 0x92f8befe6b1d2b53 },
+ /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x) */
+ { 0xc10ec5e033fbca3b, 0xf38a3556291ea462 },
+ /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x) */
+ { 0xc0b55b0e82e02e2f, 0x974ac56262b6ca4b },
+ /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x) */
+ { 0x71aa1df0e172334d, 0x855712b3784d2a56 },
+ /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x) */
+ { 0xfee3053e3969324d, 0xa5abe9f80eaee722 },
+ /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x) */
+ { 0xf44779b93eb2bd08, 0x1fa0943ddb54814c },
+ /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x) */
+ { 0xf5449b3f00cc3374, 0xa53ff440d7bbfe6a },
+ /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x) */
+ { 0x6f8346e1d777606e, 0xebe7e3566325605c },
+ /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x) */
+ { 0xe3ab4f2ac0b95347, 0xc65a272ce5b592b8 },
+ /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x) */
+ { 0xaa2215ea329ecc11, 0x5705a9ca4721589f },
+ /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x) */
+ { 0x1ed8f66ed95efd26, 0xe3720acb88d14467 },
+ /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x) */
+ { 0x78ed02d5a700e96a, 0xba1aca0315141c31 },
+ /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x) */
+ { 0xba8ccbe832b39da3, 0xad2a31b3ed627dae },
+ /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x) */
+ { 0xedb88320b1e6b092, 0x6655004fa06a2517 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Barrett constants */
+/* 33 bit reflected Barrett constant m - (4^32)/n */
+
+static const __vector unsigned long long v_Barrett_const[2] ALIGNED_(16) = {
+ /* x^64 div p(x) */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ { 0x00000001f7011641, 0x0000000000000000 },
+ { 0x00000001db710641, 0x0000000000000000 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+ { 0x0000000000000000, 0x00000001f7011641 },
+ { 0x0000000000000000, 0x00000001db710641 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
diff --git a/neozip/arch/power/crc32_power8.c b/neozip/arch/power/crc32_power8.c
new file mode 100644
index 0000000000..a7a2fb7435
--- /dev/null
+++ b/neozip/arch/power/crc32_power8.c
@@ -0,0 +1,593 @@
+/* crc32 for POWER8 using VSX instructions
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Author: Rogerio Alves <rogealve@br.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ */
+
+#ifdef POWER8_VSX_CRC32
+
+#include "zbuild.h"
+#include "zendian.h"
+
+#include "crc32_constants.h"
+#include "crc32_braid_tbl.h"
+
+#include "power_intrins.h"
+
+#define MAX_SIZE 32768
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
+ while (len--)
+ crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+ return crc;
+}
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
+
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
+ unsigned int prealign;
+ unsigned int tail;
+
+ unsigned long len = (unsigned long) _len;
+
+ crc ^= 0xffffffff;
+
+ if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+ crc = crc32_align(crc, p, len);
+ goto out;
+ }
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = (unsigned int)ALIGN_DIFF(p, VMX_ALIGN);
+ crc = crc32_align(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ crc = __crc32_vpmsum(crc, p, ALIGN_DOWN(len, VMX_ALIGN));
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += ALIGN_DOWN(len, VMX_ALIGN);
+ crc = crc32_align(crc, p, tail);
+ }
+
+out:
+ crc ^= 0xffffffff;
+
+ return crc;
+}
+
+Z_INTERNAL uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_power8(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that forward is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define BYTESWAP_DATA
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
+#else
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc)
+#endif
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
+
+ const __vector unsigned long long vzero = {0,0};
+ const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+ const __vector unsigned long long vmask_32bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+
+ const __vector unsigned long long vmask_64bit =
+ (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+ __vector unsigned long long vcrc;
+
+ __vector unsigned long long vconst1, vconst2;
+
+ /* vdata0-vdata7 will contain our data (p). */
+ __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+ /* v0-v7 will contain our checksums */
+ __vector unsigned long long v0 = {0,0};
+ __vector unsigned long long v1 = {0,0};
+ __vector unsigned long long v2 = {0,0};
+ __vector unsigned long long v3 = {0,0};
+ __vector unsigned long long v4 = {0,0};
+ __vector unsigned long long v5 = {0,0};
+ __vector unsigned long long v6 = {0,0};
+ __vector unsigned long long v7 = {0,0};
+
+
+ /* Vector auxiliary variables. */
+ __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+ unsigned int offset; /* Constant table offset. */
+
+ unsigned long i; /* Counter. */
+ unsigned long chunks;
+
+ unsigned long block_size;
+ int next_block = 0;
+
+ /* Align by 128 bits. The last 128 bit block will be processed at end. */
+ unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+
+ vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
+
+ /* Short version. */
+ if (len < 256) {
+ /* Calculate where in the constant table we need to start. */
+ offset = 256 - len;
+
+ vconst1 = vec_ld(offset, vcrc_short_const);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+ /* xor initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+
+ for (i = 16; i < len; i += 16) {
+ vconst1 = vec_ld(offset + i, vcrc_short_const);
+ vdata0 = vec_ld(i, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+ vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+ v0 = vec_xor(v0, vdata0);
+ }
+ } else {
+
+ /* Load initial values. */
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ /* xor in initial value */
+ vdata0 = vec_xor(vdata0, vcrc);
+
+ p = (char *)p + 128;
+
+ do {
+ /* Checksum in blocks of MAX_SIZE. */
+ block_size = length;
+ if (block_size > MAX_SIZE) {
+ block_size = MAX_SIZE;
+ }
+
+ length = length - block_size;
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ offset = (MAX_SIZE/8) - (block_size/8);
+ /* We reduce our final 128 bytes in a separate step */
+ chunks = (block_size/128)-1;
+
+ vconst1 = vec_ld(offset, vcrc_const);
+
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+
+ if (chunks > 1) {
+ offset += 16;
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+
+ /*
+ * main loop. Each iteration calculates the CRC for a 128-byte
+ * block.
+ */
+ for (i = 0; i < chunks-2; i++) {
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+ GROUP_ENDING_NOP;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst2);
+ vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst2);
+ vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
+ vdata2, (__vector unsigned long long)vconst2);
+ vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst2);
+ vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+ vconst2 = vec_ld(offset, vcrc_const);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+ p = (char *)p + 128;
+ }
+
+ /* First cool down */
+ vconst1 = vec_ld(offset, vcrc_const);
+ offset += 16;
+
+ v0 = vec_xor(v0, va0);
+ va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v1 = vec_xor(v1, va1);
+ va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v2 = vec_xor(v2, va2);
+ va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v3 = vec_xor(v3, va3);
+ va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v4 = vec_xor(v4, va4);
+ va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v5 = vec_xor(v5, va5);
+ va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v6 = vec_xor(v6, va6);
+ va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+ (__vector unsigned long long)vconst1);
+ GROUP_ENDING_NOP;
+
+ v7 = vec_xor(v7, va7);
+ va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+ (__vector unsigned long long)vconst1);
+ }/* else */
+
+ /* Second cool down. */
+ v0 = vec_xor(v0, va0);
+ v1 = vec_xor(v1, va1);
+ v2 = vec_xor(v2, va2);
+ v3 = vec_xor(v3, va3);
+ v4 = vec_xor(v4, va4);
+ v5 = vec_xor(v5, va5);
+ v6 = vec_xor(v6, va6);
+ v7 = vec_xor(v7, va7);
+
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
+ (__vector unsigned char)vzero, 4);
+ v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
+ (__vector unsigned char)vzero, 4);
+ v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
+ (__vector unsigned char)vzero, 4);
+ v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
+ (__vector unsigned char)vzero, 4);
+ v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
+ (__vector unsigned char)vzero, 4);
+ v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
+ (__vector unsigned char)vzero, 4);
+ v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
+ (__vector unsigned char)vzero, 4);
+
+ /* xor with the last 1024 bits. */
+ va0 = vec_ld(0, (__vector unsigned long long*) p);
+ VEC_PERM(va0, va0, va0, vperm_const);
+
+ va1 = vec_ld(16, (__vector unsigned long long*) p);
+ VEC_PERM(va1, va1, va1, vperm_const);
+
+ va2 = vec_ld(32, (__vector unsigned long long*) p);
+ VEC_PERM(va2, va2, va2, vperm_const);
+
+ va3 = vec_ld(48, (__vector unsigned long long*) p);
+ VEC_PERM(va3, va3, va3, vperm_const);
+
+ va4 = vec_ld(64, (__vector unsigned long long*) p);
+ VEC_PERM(va4, va4, va4, vperm_const);
+
+ va5 = vec_ld(80, (__vector unsigned long long*) p);
+ VEC_PERM(va5, va5, va5, vperm_const);
+
+ va6 = vec_ld(96, (__vector unsigned long long*) p);
+ VEC_PERM(va6, va6, va6, vperm_const);
+
+ va7 = vec_ld(112, (__vector unsigned long long*) p);
+ VEC_PERM(va7, va7, va7, vperm_const);
+
+ p = (char *)p + 128;
+
+ vdata0 = vec_xor(v0, va0);
+ vdata1 = vec_xor(v1, va1);
+ vdata2 = vec_xor(v2, va2);
+ vdata3 = vec_xor(v3, va3);
+ vdata4 = vec_xor(v4, va4);
+ vdata5 = vec_xor(v5, va5);
+ vdata6 = vec_xor(v6, va6);
+ vdata7 = vec_xor(v7, va7);
+
+ /* Check if we have more blocks to process */
+ next_block = 0;
+ if (length != 0) {
+ next_block = 1;
+
+ /* zero v0-v7 */
+ v0 = vec_xor(v0, v0);
+ v1 = vec_xor(v1, v1);
+ v2 = vec_xor(v2, v2);
+ v3 = vec_xor(v3, v3);
+ v4 = vec_xor(v4, v4);
+ v5 = vec_xor(v5, v5);
+ v6 = vec_xor(v6, v6);
+ v7 = vec_xor(v7, v7);
+ }
+ length = length + 128;
+
+ } while (next_block);
+
+ /* Calculate how many bytes we have left. */
+ length = (len & 127);
+
+ /* Calculate where in (short) constant table we need to start. */
+ offset = 128 - length;
+
+ v0 = vec_ld(offset, vcrc_short_const);
+ v1 = vec_ld(offset + 16, vcrc_short_const);
+ v2 = vec_ld(offset + 32, vcrc_short_const);
+ v3 = vec_ld(offset + 48, vcrc_short_const);
+ v4 = vec_ld(offset + 64, vcrc_short_const);
+ v5 = vec_ld(offset + 80, vcrc_short_const);
+ v6 = vec_ld(offset + 96, vcrc_short_const);
+ v7 = vec_ld(offset + 112, vcrc_short_const);
+
+ offset += 128;
+
+ v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+ v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+ v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+ v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+ v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+ v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+ v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+ v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+ /* Now reduce the tail (0-112 bytes). */
+ for (i = 0; i < length; i+=16) {
+ vdata0 = vec_ld(i,(__vector unsigned long long*)p);
+ VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+ va0 = vec_ld(offset + i,vcrc_short_const);
+ va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+ (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+ v0 = vec_xor(v0, va0);
+ }
+
+ /* xor all parallel chunks together. */
+ v0 = vec_xor(v0, v1);
+ v2 = vec_xor(v2, v3);
+ v4 = vec_xor(v4, v5);
+ v6 = vec_xor(v6, v7);
+
+ v0 = vec_xor(v0, v2);
+ v4 = vec_xor(v4, v6);
+
+ v0 = vec_xor(v0, v4);
+ }
+
+ /* Barrett Reduction */
+ vconst1 = vec_ld(0, v_Barrett_const);
+ vconst2 = vec_ld(16, v_Barrett_const);
+
+ v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)v0, 8);
+ v0 = vec_xor(v1,v0);
+
+ /* shift left one bit */
+ __vector unsigned char vsht_splat = vec_splat_u8 (1);
+ v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+
+ v0 = vec_and(v0, vmask_64bit);
+
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+
+ /* bottom 32 bits of a */
+ v1 = vec_and(v0, vmask_32bit);
+
+ /* ma */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst1);
+
+ /* bottom 32bits of ma */
+ v1 = vec_and(v1, vmask_32bit);
+ /* qn */
+ v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+ (__vector unsigned long long)vconst2);
+ /* a - qn, subtraction is xor in GF(2) */
+ v0 = vec_xor (v0, v1);
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+
+ /* shift result into top 64 bits of */
+ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+ (__vector unsigned char)vzero, 4);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return v0[0];
+#else
+ return v0[1];
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.c b/neozip/arch/power/power_features.c
new file mode 100644
index 0000000000..148f30a974
--- /dev/null
+++ b/neozip/arch/power/power_features.c
@@ -0,0 +1,54 @@
+/* power_features.c - POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021-2024 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(PPC_FEATURES) || defined(POWER_FEATURES)
+
+#include "zbuild.h"
+#include "power_features.h"
+
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#endif
+#ifdef POWER_NEED_AUXVEC_H
+# include <linux/auxvec.h>
+#endif
+#ifdef __FreeBSD__
+# include <machine/cpu.h>
+#endif
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
+#ifdef PPC_FEATURES
+ unsigned long hwcap;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+ hwcap = getauxval(AT_HWCAP);
+#endif
+
+ if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ features->has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
+ unsigned long hwcap2;
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
+ elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
+ hwcap2 = getauxval(AT_HWCAP2);
+#endif
+
+#ifdef POWER8_VSX
+ if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ features->has_arch_2_07 = 1;
+#endif
+#ifdef POWER9
+ if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ features->has_arch_3_00 = 1;
+#endif
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_features.h b/neozip/arch/power/power_features.h
new file mode 100644
index 0000000000..1ff51de5dd
--- /dev/null
+++ b/neozip/arch/power/power_features.h
@@ -0,0 +1,18 @@
+/* power_features.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FEATURES_H_
+#define POWER_FEATURES_H_
+
+struct power_cpu_features {
+ int has_altivec;
+ int has_arch_2_07;
+ int has_arch_3_00;
+};
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features);
+
+#endif /* POWER_FEATURES_H_ */
diff --git a/neozip/arch/power/power_functions.h b/neozip/arch/power/power_functions.h
new file mode 100644
index 0000000000..ccc7754a4c
--- /dev/null
+++ b/neozip/arch/power/power_functions.h
@@ -0,0 +1,74 @@
+/* power_functions.h -- POWER implementations for arch-specific functions.
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_FUNCTIONS_H_
+#define POWER_FUNCTIONS_H_
+
+#include "power_natives.h"
+
+#ifdef PPC_VMX
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_vmx(deflate_state *s);
+#endif
+
+#ifdef POWER8_VSX
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+void slide_hash_power8(deflate_state *s);
+void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef POWER9
+uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+uint32_t longest_match_power9(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Power - VMX
+# ifdef PPC_VMX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_vmx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_vmx
+# undef native_slide_hash
+# define native_slide_hash slide_hash_vmx
+# endif
+// Power8 - VSX
+# ifdef POWER8_VSX_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_power8
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_power8
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_power8
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_power8
+# undef native_slide_hash
+# define native_slide_hash slide_hash_power8
+# endif
+# ifdef POWER8_VSX_CRC32_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_power8
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_power8
+# endif
+// Power9
+# ifdef POWER9_NATIVE
+# undef native_compare256
+# define native_compare256 compare256_power9
+# undef native_longest_match
+# define native_longest_match longest_match_power9
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_power9
+# endif
+#endif
+
+#endif /* POWER_FUNCTIONS_H_ */
diff --git a/neozip/arch/power/power_intrins.h b/neozip/arch/power/power_intrins.h
new file mode 100644
index 0000000000..3efcfb9722
--- /dev/null
+++ b/neozip/arch/power/power_intrins.h
@@ -0,0 +1,61 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ * Daniel Black <daniel@linux.vnet.ibm.com>
+ * Rogerio Alves <rogealve@br.ibm.com>
+ * Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_INTRINS_H
+#define POWER_INTRINS_H
+
+#include <altivec.h>
+
+#if defined (__clang__)
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
+#endif
+#ifndef __builtin_crypto_vpmsumd
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
+#endif
+
+#ifdef __VSX__
+static inline __vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b) {
+ return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+#endif
+
+#endif
+
+/* There's no version of this that operates over unsigned and if casted, it does
+ * sign extension. Let's write an endian independent version and hope the compiler
+ * eliminates creating another zero idiom for the zero value if one exists locally */
+static inline vector unsigned short vec_unpackl(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergel(zero, a);
+#else
+ return (vector unsigned short)vec_mergel(a, zero);
+#endif
+}
+
+static inline vector unsigned short vec_unpackh(vector unsigned char a) {
+ vector unsigned char zero = vec_splat_u8(0);
+
+#if BYTE_ORDER == BIG_ENDIAN
+ return (vector unsigned short)vec_mergeh(zero, a);
+#else
+ return (vector unsigned short)vec_mergeh(a, zero);
+#endif
+}
+
+#endif
diff --git a/neozip/arch/power/power_natives.h b/neozip/arch/power/power_natives.h
new file mode 100644
index 0000000000..59ec8a8aed
--- /dev/null
+++ b/neozip/arch/power/power_natives.h
@@ -0,0 +1,27 @@
+/* power_natives.h -- POWER compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_NATIVES_H_
+#define POWER_NATIVES_H_
+
+#if defined(__ALTIVEC__)
+# ifdef PPC_VMX
+# define PPC_VMX_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR8) && defined(__VSX__)
+# ifdef POWER8_VSX
+# define POWER8_VSX_NATIVE
+# endif
+# ifdef POWER8_VSX_CRC32
+# define POWER8_VSX_CRC32_NATIVE
+# endif
+#endif
+#if defined(_ARCH_PWR9)
+# ifdef POWER9
+# define POWER9_NATIVE
+# endif
+#endif
+
+#endif /* POWER_NATIVES_H_ */
diff --git a/neozip/arch/power/slide_hash_power8.c b/neozip/arch/power/slide_hash_power8.c
new file mode 100644
index 0000000000..d01e0acd56
--- /dev/null
+++ b/neozip/arch/power/slide_hash_power8.c
@@ -0,0 +1,12 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#define SLIDE_PPC slide_hash_power8
+#include "slide_ppc_tpl.h"
+
+#endif /* POWER8_VSX */
diff --git a/neozip/arch/power/slide_hash_vmx.c b/neozip/arch/power/slide_hash_vmx.c
new file mode 100644
index 0000000000..5a87ef7d9a
--- /dev/null
+++ b/neozip/arch/power/slide_hash_vmx.c
@@ -0,0 +1,10 @@
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX
+
+#define SLIDE_PPC slide_hash_vmx
+#include "slide_ppc_tpl.h"
+
+#endif /* PPC_VMX */
diff --git a/neozip/arch/power/slide_ppc_tpl.h b/neozip/arch/power/slide_ppc_tpl.h
new file mode 100644
index 0000000000..24629b4039
--- /dev/null
+++ b/neozip/arch/power/slide_ppc_tpl.h
@@ -0,0 +1,44 @@
+/* Optimized slide_hash for PowerPC processors
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ const vector unsigned short vmx_wsize = vec_splats(wsize);
+ Pos *p = table;
+
+ do {
+ /* Do the pointer arithmetic early to hopefully overlap the vector unit */
+ Pos *q = p;
+ p += 32;
+ vector unsigned short value0, value1, value2, value3;
+ vector unsigned short result0, result1, result2, result3;
+
+ value0 = vec_ld(0, q);
+ value1 = vec_ld(16, q);
+ value2 = vec_ld(32, q);
+ value3 = vec_ld(48, q);
+ result0 = vec_subs(value0, vmx_wsize);
+ result1 = vec_subs(value1, vmx_wsize);
+ result2 = vec_subs(value2, vmx_wsize);
+ result3 = vec_subs(value3, vmx_wsize);
+ vec_st(result0, 0, q);
+ vec_st(result1, 16, q);
+ vec_st(result2, 32, q);
+ vec_st(result3, 48, q);
+
+ entries -= 32;
+ } while (entries);
+}
+
+void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}