12 files changed, 1981 insertions, 0 deletions
diff --git a/neozip/arch/generic/Makefile.in b/neozip/arch/generic/Makefile.in
new file mode 100644
index 0000000000..1d9cc4df5b
--- /dev/null
+++ b/neozip/arch/generic/Makefile.in
@@ -0,0 +1,68 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+# Paths: SRCDIR is this directory; SRCTOP (aliased as TOPDIR) is two levels up, where the shared headers named below live.
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+# Default goal: build both a .o and a .lo object for each of the six generic sources listed here.
+all: \
+ adler32_c.o adler32_c.lo \
+ chunkset_c.o chunkset_c.lo \
+ compare256_c.o compare256_c.lo \
+ crc32_braid_c.o crc32_braid_c.lo \
+ crc32_chorba_c.o crc32_chorba_c.lo \
+ slide_hash_c.o slide_hash_c.lo
+
+# Each source is compiled twice from the same prerequisites: NAME.o with $(CFLAGS) and NAME.lo with $(SFLAGS).
+adler32_c.o: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+
+adler32_c.lo: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_c.c
+# chunkset_c.c includes zmemory.h directly, so it is a prerequisite alongside the two template headers.
+chunkset_c.o: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+
+chunkset_c.lo: $(SRCDIR)/chunkset_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
+# NOTE(review): compare256_c.c also includes match_tpl.h, which is not a prerequisite here - confirm its path and add it.
+compare256_c.o: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+
+compare256_c.lo: $(SRCDIR)/compare256_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
+# NOTE(review): crc32_braid_c.c also includes crc32_p.h, which is not a prerequisite here - confirm its path and add it.
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+# Lists zendian.h and zmemory.h, which the source includes. NOTE(review): crc32_chorba_p.h / generic_functions.h unlisted - confirm paths.
+crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+
+crc32_chorba_c.lo: $(SRCDIR)/crc32_chorba_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/zendian.h $(SRCTOP)/zmemory.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
+# slide_hash_c objects: rebuilt when the source, zbuild.h or deflate.h changes.
+slide_hash_c.o: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+
+slide_hash_c.lo: $(SRCDIR)/slide_hash_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c
+# These goals are commands, not files: declare them phony so a stray file named e.g. "clean" cannot mask them.
+.PHONY: all mostlyclean clean distclean
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/neozip/arch/generic/adler32_c.c b/neozip/arch/generic/adler32_c.c
new file mode 100644
index 0000000000..84c946f452
--- /dev/null
+++ b/neozip/arch/generic/adler32_c.c
@@ -0,0 +1,55 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+/* Generic C Adler-32: fold the len bytes at buf into the running checksum passed in adler. */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;             /* unrolled steps left in the current NMAX block */
+
+    /* split Adler-32 into component sums: high 16 bits go to sum2, low 16 bits stay in adler */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+    /* do length NMAX blocks -- each sum needs just one modulo operation per block */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8;           /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            ADLER_DO16(adler, sum2, buf);          /* 16 sums unrolled */
+            buf += 16;
+#else
+            ADLER_DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo); the tail helper's result is returned as-is */
+    return adler32_copy_tail(adler, NULL, buf, len, sum2, len != 0, NMAX - 1, 0);
+}
+/* Checksum src through the function table's adler32 entry, then copy the same len bytes into dst. */
+Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    const uint32_t updated = FUNCTABLE_CALL(adler32)(adler, src, len);
+    memcpy(dst, src, len);
+    return updated;
+}
diff --git a/neozip/arch/generic/chunk_128bit_perm_idx_lut.h b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
new file mode 100644
index 0000000000..6e5098bf26
--- /dev/null
+++ b/neozip/arch/generic/chunk_128bit_perm_idx_lut.h
@@ -0,0 +1,26 @@
+/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_
+#define CHUNK_128BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+/* Indexed by dist - 3: idx is the byte offset of that dist's 32-byte row in permute_table, remval is 16 % dist. */
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_256bit_perm_idx_lut.h b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
new file mode 100644
index 0000000000..796a7df120
--- /dev/null
+++ b/neozip/arch/generic/chunk_256bit_perm_idx_lut.h
@@ -0,0 +1,47 @@
+/* chunk_256bit_perm_idx_lut.h - shared AVX512/AVX2/LASX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CHUNK_256BIT_PERM_IDX_LUT_H_
+#define CHUNK_256BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
+/* Direct lookup indexed by dist - 3 (dist is never 0 - 2); the don't-care rows keep the indexing dense. Each idx is a
+ * byte offset into permute_table: 32-byte rows up to dist 15, then 16-byte rows starting at 11 * 32 for dist 17+. */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6,  9}, /* 23 */
+    {11 * 32 + 16 * 7,  8}, /* 24 */
+    {11 * 32 + 16 * 8,  7}, /* 25 */
+    {11 * 32 + 16 * 9,  6}, /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+/* Values equal 16 % dist for dist 3..15, in order (presumably indexed by dist - 3 like perm_idx_lut - confirm at the use site). */
+static const uint16_t half_rem_vals[13] = {
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/neozip/arch/generic/chunk_permute_table.h b/neozip/arch/generic/chunk_permute_table.h
new file mode 100644
index 0000000000..bad66ccc77
--- /dev/null
+++ b/neozip/arch/generic/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* One index pattern per copy distance other than 1, 2, 4, 8, 16 & 32: dists 3-15 use 32-byte rows, 17-31 use 16-byte rows */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {  /* initializers cover 11 * 32 + 15 * 16 bytes; the rest is zero-filled */
+    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
+     * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
+     * this is what we're dealt.
+     */
+
+    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+/* Lookup entry: idx is a byte offset into permute_table, remval a per-distance remainder value stored next to it. */
+typedef struct lut_rem_pair_s {
+    uint16_t idx;
+    uint16_t remval;
+} lut_rem_pair;
+
+#endif
diff --git a/neozip/arch/generic/chunkset_c.c b/neozip/arch/generic/chunkset_c.c
new file mode 100644
index 0000000000..ff9b1cb5fb
--- /dev/null
+++ b/neozip/arch/generic/chunkset_c.c
@@ -0,0 +1,40 @@
+/* chunkset.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+/* The generic chunk is one 64-bit word. NOTE(review): the HAVE_* flags are presumably tested by chunkset_tpl.h - confirm. */
+typedef uint64_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+/* Fill the 64-bit chunk with the 32-bit value read from `from`, placed in both the low and the high half. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    const uint32_t word = zng_memread_4(from);
+    *chunk = ((chunk_t)word << 32) | word;
+}
+/* Load the 8 bytes at `from` into the chunk via zng_memread_8. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = zng_memread_8(from);
+}
+/* Read one chunk from s via zng_memread_8 and store it in *chunk. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = zng_memread_8(s);
+}
+/* Write the value held in *chunk to out via zng_memwrite_8. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    zng_memwrite_8(out, *chunk);
+}
+/* Bind the generic template names to their _c suffixed symbols before pulling in the template headers. */
+#define CHUNKSIZE        chunksize_c
+#define CHUNKCOPY        chunkcopy_c
+#define CHUNKUNROLL      chunkunroll_c
+#define CHUNKMEMSET      chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_c
+
+#include "inffast_tpl.h"
diff --git a/neozip/arch/generic/compare256_c.c b/neozip/arch/generic/compare256_c.c
new file mode 100644
index 0000000000..6934a55565
--- /dev/null
+++ b/neozip/arch/generic/compare256_c.c
@@ -0,0 +1,88 @@
+/* compare256.c -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+/* Byte-wise comparison for hardware without unaligned loads.
+ *
+ * Walks src0 and src1 one byte at a time, eight bytes per loop pass, and
+ * returns the number of leading bytes that are equal: the index of the first
+ * mismatch, or 256 when all 256 bytes match. Reading stops at the first
+ * difference, so later bytes are never touched.
+ */
+static inline uint32_t compare256_8_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len;
+
+    for (len = 0; len < 256; len += 8) {
+        const uint8_t *a = src0 + len;
+        const uint8_t *b = src1 + len;
+
+        /* Kept unrolled: report the first differing byte within this group of eight. */
+        if (a[0] != b[0]) return len;
+        if (a[1] != b[1]) return len + 1;
+        if (a[2] != b[2]) return len + 2;
+        if (a[3] != b[3]) return len + 3;
+        if (a[4] != b[4]) return len + 4;
+        if (a[5] != b[5]) return len + 5;
+        if (a[6] != b[6]) return len + 6;
+        if (a[7] != b[7]) return len + 7;
+    }
+    return 256;
+}
+
+/* 64-bit word comparison for hardware with unaligned loads.
+ *
+ * Two 8-byte words are XORed per loop pass; the first non-zero XOR marks the
+ * mismatching word, and its trailing-zero count (taken after Z_U64_TO_LE)
+ * divided by 8 is the byte offset inside that word. Returns the number of
+ * leading equal bytes, or 256 when nothing differs.
+ */
+static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    while (len < 256) {
+        uint64_t diff = zng_memread_8(src0 + len) ^ zng_memread_8(src1 + len);
+        if (diff)
+            return len + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+
+        diff = zng_memread_8(src0 + len + 8) ^ zng_memread_8(src1 + len + 8);
+        if (diff)
+            return len + 8 + zng_ctz64(Z_U64_TO_LE(diff)) / 8;
+        len += 16;
+    }
+    return 256;
+}
+/* Pick the implementation behind compare256_c: byte-wise when OPTIMAL_CMP is 8, otherwise the 64-bit version. */
+#if OPTIMAL_CMP == 8
+#  define COMPARE256 compare256_8_static
+#else
+#  define COMPARE256 compare256_64_static
+#endif
+/* With WITH_ALL_FALLBACKS defined, also emit non-static wrappers so both variants are callable by name. */
+#ifdef WITH_ALL_FALLBACKS
+Z_INTERNAL uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_8_static(src0, src1);
+}
+
+Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_64_static(src0, src1);
+}
+#endif
+/* Generic entry point: forwards to whichever static variant the COMPARE256 macro names. */
+Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
+    return COMPARE256(src0, src1);
+}
+
+/* First inclusion of match_tpl.h: generates longest_match_c on top of the COMPARE256 chosen in this file. */
+#define LONGEST_MATCH       longest_match_c
+#include "match_tpl.h"
+
+/* Second inclusion with LONGEST_MATCH_SLOW set: generates longest_match_slow_c. NOTE(review): relies on match_tpl.h to #undef LONGEST_MATCH - confirm. */
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_c
+#include "match_tpl.h"
diff --git a/neozip/arch/generic/compare256_p.h b/neozip/arch/generic/compare256_p.h
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/neozip/arch/generic/compare256_p.h
diff --git a/neozip/arch/generic/crc32_braid_c.c b/neozip/arch/generic/crc32_braid_c.c
new file mode 100644
index 0000000000..bda4a249bb
--- /dev/null
+++ b/neozip/arch/generic/crc32_braid_c.c
@@ -0,0 +1,213 @@
+/* crc32_braid.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_p.h"
+
+/*
+  A CRC of a message is computed on BRAID_N braids of words in the message, where
+  each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
+  three running sparse CRCs are calculated respectively on each braid, at these
+  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
+  This is done starting at a word boundary, and continues until as many blocks of
+  BRAID_N * BRAID_W bytes as are available have been processed. The results are
+  combined into a single CRC at the end. For this code, BRAID_N must be in the
+  range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
+  if desired by adding more #if blocks, extending the patterns apparent in the code.
+  In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N
+  value is increased.
+
+  BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
+  on a given processor. The choices for BRAID_N and BRAID_W below were based on
+  testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
+  POWER9, and MIPS64 Octeon II processors.
+  The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
+  The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
+  They were all tested with either gcc or clang, all using the -O3 optimization
+  level. Your mileage may vary.
+*/
+
+/* ========================================================================= */
+#ifdef BRAID_W
+/*
+  Return the CRC of the BRAID_W bytes in the z_word_t data, taking the
+  least-significant byte of the word as the first byte of data, without any pre
+  or post conditioning. This is used to combine the CRCs of each braid.
+ */
+#  if BYTE_ORDER == LITTLE_ENDIAN
+static uint32_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data >> 8) ^ crc_table[data & 0xff];   /* low byte indexes crc_table, then is shifted out */
+    return (uint32_t)data;
+}
+#  elif BYTE_ORDER == BIG_ENDIAN
+static z_word_t crc_word(z_word_t data) {              /* this variant returns the full z_word_t, not uint32_t */
+    int k;
+    for (k = 0; k < BRAID_W; k++)
+        data = (data << 8) ^
+            crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];   /* top byte indexes crc_big_table */
+    return data;
+}
+#  endif /* BYTE_ORDER */
+#endif /* BRAID_W */
+
+/* Compute the CRC-32 of the len bytes at buf, continuing from crc; the bulk is braided BRAID_N words at a time. */
+Z_INTERNAL uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len) {
+    crc = ~crc;   /* work on the inverted value; the ~ on the final return undoes it */
+
+#ifdef BRAID_W
+    /* If provided enough bytes, do a braided CRC calculation. */
+    if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
+        size_t blks;
+        z_word_t const *words;
+        int k;
+
+        /* Compute the CRC up to a z_word_t boundary. */
+        size_t align_diff = (size_t)MIN(ALIGN_DIFF(buf, BRAID_W), len);
+        if (align_diff) {
+            crc = crc32_copy_small(crc, NULL, buf, align_diff, BRAID_W - 1, 0);
+            len -= align_diff;
+            buf += align_diff;
+        }
+
+        /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */
+        blks = len / (BRAID_N * BRAID_W);
+        len -= blks * BRAID_N * BRAID_W;
+        words = (z_word_t const *)buf;
+
+        z_word_t crc0, word0, comb;
+#if BRAID_N > 1
+        z_word_t crc1, word1;
+#if BRAID_N > 2
+        z_word_t crc2, word2;
+#if BRAID_N > 3
+        z_word_t crc3, word3;
+#if BRAID_N > 4
+        z_word_t crc4, word4;
+#if BRAID_N > 5
+        z_word_t crc5, word5;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Initialize the CRC for each braid: only braid 0 carries the incoming CRC, the others start at 0. */
+        crc0 = Z_WORD_FROM_LE(crc);
+#if BRAID_N > 1
+        crc1 = 0;
+#if BRAID_N > 2
+        crc2 = 0;
+#if BRAID_N > 3
+        crc3 = 0;
+#if BRAID_N > 4
+        crc4 = 0;
+#if BRAID_N > 5
+        crc5 = 0;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
+        while (--blks) {   /* NOTE(review): needs blks >= 1; holds if ALIGN_DIFF(buf, BRAID_W) <= BRAID_W - 1 - confirm */
+            /* Load the word for each braid into registers. */
+            word0 = crc0 ^ words[0];
+#if BRAID_N > 1
+            word1 = crc1 ^ words[1];
+#if BRAID_N > 2
+            word2 = crc2 ^ words[2];
+#if BRAID_N > 3
+            word3 = crc3 ^ words[3];
+#if BRAID_N > 4
+            word4 = crc4 ^ words[4];
+#if BRAID_N > 5
+            word5 = crc5 ^ words[5];
+#endif
+#endif
+#endif
+#endif
+#endif
+            words += BRAID_N;
+
+            /* Compute and update the CRC for each word. The loop should get unrolled. */
+            crc0 = BRAID_TABLE[0][word0 & 0xff];
+#if BRAID_N > 1
+            crc1 = BRAID_TABLE[0][word1 & 0xff];
+#if BRAID_N > 2
+            crc2 = BRAID_TABLE[0][word2 & 0xff];
+#if BRAID_N > 3
+            crc3 = BRAID_TABLE[0][word3 & 0xff];
+#if BRAID_N > 4
+            crc4 = BRAID_TABLE[0][word4 & 0xff];
+#if BRAID_N > 5
+            crc5 = BRAID_TABLE[0][word5 & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            for (k = 1; k < BRAID_W; k++) {   /* byte k of each word is looked up in BRAID_TABLE[k] */
+                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
+#if BRAID_N > 1
+                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
+#if BRAID_N > 2
+                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
+#if BRAID_N > 3
+                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
+#if BRAID_N > 4
+                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
+#if BRAID_N > 5
+                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            }
+        }
+
+        /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
+        comb = crc_word(crc0 ^ words[0]);
+#if BRAID_N > 1
+        comb = crc_word(crc1 ^ words[1] ^ comb);
+#if BRAID_N > 2
+        comb = crc_word(crc2 ^ words[2] ^ comb);
+#if BRAID_N > 3
+        comb = crc_word(crc3 ^ words[3] ^ comb);
+#if BRAID_N > 4
+        comb = crc_word(crc4 ^ words[4] ^ comb);
+#if BRAID_N > 5
+        comb = crc_word(crc5 ^ words[5] ^ comb);
+#endif
+#endif
+#endif
+#endif
+#endif
+        words += BRAID_N;
+        Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
+        crc = (uint32_t)Z_WORD_FROM_LE(comb);
+
+        /* Update the pointer to the remaining bytes to process. */
+        buf = (const unsigned char *)words;
+    }
+
+#endif /* BRAID_W */
+
+    /* Complete the computation of the CRC on any remaining bytes, then invert the result back. */
+    return ~crc32_copy_small(crc, NULL, buf, len, (BRAID_N * BRAID_W) - 1, 0);
+}
+/* Run crc32_braid over src first, then duplicate the same len bytes into dst; returns the updated CRC. */
+Z_INTERNAL uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    const uint32_t updated = crc32_braid(crc, src, len);
+    memcpy(dst, src, len);
+    return updated;
+}
diff --git a/neozip/arch/generic/crc32_chorba_c.c b/neozip/arch/generic/crc32_chorba_c.c
new file mode 100644
index 0000000000..693972da11
--- /dev/null
+++ b/neozip/arch/generic/crc32_chorba_c.c
@@ -0,0 +1,1275 @@
+#include "zbuild.h"
+#include "zendian.h"
+#if defined(__EMSCRIPTEN__)
+#  include "zutil_p.h"
+#endif
+#include "zmemory.h"
+#include "crc32_chorba_p.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "generic_functions.h"
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+#define bitbuffer_size_bytes (16 * 1024 * sizeof(chorba_word_t))              /* 16 * 1024 chorba words, in bytes */
+#define bitbuffer_size_zwords (bitbuffer_size_bytes / sizeof(chorba_word_t)) /* same buffer counted in chorba_word_t */
+#define bitbuffer_size_qwords (bitbuffer_size_bytes / sizeof(uint64_t))      /* same buffer counted in uint64_t */
+/* uint64a_t: a uint64_t marked may_alias when HAVE_MAY_ALIAS is set and CHORBA_W is not 8; plain uint64_t otherwise. */
+#if defined(HAVE_MAY_ALIAS) && CHORBA_W != 8
+    typedef uint64_t __attribute__ ((__may_alias__)) uint64a_t;
+#else
+    typedef uint64_t uint64a_t;
+#endif
+
+/**
+ * Implements the Chorba algorithm for CRC32 computation (https://arxiv.org/abs/2412.16398).
+ *
+ * This implementation processes data in three phases:
+ * 1. Initial pass: Zeros out bitbuffer
+ * 2. Intermediate pass: Processes half the values
+ * 3. Main pass: Processes remaining data
+ *
+ * @param crc Initial CRC value
+ * @param buf Input data buffer
+ * @param len Length of input data
+ * @return Computed CRC32 value
+ *
+ * @note Requires minimum input size of 118960 + 512 bytes
+ * @note Uses a temporary buffer of 16 * 1024 chorba_word_t entries (128KB when chorba_word_t is 8 bytes)
+ */
+Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+#if defined(__EMSCRIPTEN__)
+    chorba_word_t *bitbuffer = (chorba_word_t*)zng_alloc(bitbuffer_size_bytes);
+#else
+    ALIGNED_(16) chorba_word_t bitbuffer[bitbuffer_size_zwords];
+#endif
+    const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
+    uint64a_t *bitbuffer_qwords = (uint64a_t*)bitbuffer;
+    /* The calling function ensured that this is aligned correctly */
+    const chorba_word_t* input = (const chorba_word_t*)buf;
+    const uint64a_t* input_qwords = (const uint64a_t*)buf;
+
+    size_t i = 0;
+
+    chorba_word_t next1 = CHORBA_WORD_FROM_LE(~crc);
+
+    chorba_word_t next2 = 0;
+    chorba_word_t next3 = 0;
+    chorba_word_t next4 = 0;
+    chorba_word_t next5 = 0;
+    chorba_word_t next6 = 0;
+    chorba_word_t next7 = 0;
+    chorba_word_t next8 = 0;
+    chorba_word_t next9 = 0;
+    chorba_word_t next10 = 0;
+    chorba_word_t next11 = 0;
+    chorba_word_t next12 = 0;
+    chorba_word_t next13 = 0;
+    chorba_word_t next14 = 0;
+    chorba_word_t next15 = 0;
+    chorba_word_t next16 = 0;
+    chorba_word_t next17 = 0;
+    chorba_word_t next18 = 0;
+    chorba_word_t next19 = 0;
+    chorba_word_t next20 = 0;
+    chorba_word_t next21 = 0;
+    chorba_word_t next22 = 0;
+    crc = 0;
+
+    // do a first pass to zero out bitbuffer
+    for (; i < (14848 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+        chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+        chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+        chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+        chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+        int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+        int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+        in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+        in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+        in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+        in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+        in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+        in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+        in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+        in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+        in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+        in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+        in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+        in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+        in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+        in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+        in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+        in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+        in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+        in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+        in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+        in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+        in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+        in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+        in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16;
+        in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17;
+        in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18;
+        in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19;
+        in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20;
+        in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21;
+        in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22;
+        in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23;
+        in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24;
+        in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25;
+
+        next1 = in11 ^ in22 ^ in26;
+        next2 = in12 ^ in23 ^ in27;
+        next3 = in13 ^ in24 ^ in28;
+        next4 = in14 ^ in25 ^ in29;
+        next5 = in15 ^ in26 ^ in30;
+        next6 = in16 ^ in27 ^ in31;
+        next7 = in17 ^ in28 ^ in32;
+        next8 = in18 ^ in29;
+        next9 = in19 ^ in30;
+        next10 = in20 ^ in31;
+        next11 = in21 ^ in32;
+        next12 = in22;
+        next13 = in23;
+        next14 = in24;
+        next15 = in25;
+        next16 = in26;
+        next17 = in27;
+        next18 = in28;
+        next19 = in29;
+        next20 = in30;
+        next21 = in31;
+        next22 = in32;
+
+        bitbuffer[out_offset1 + 22] = in1;
+        bitbuffer[out_offset1 + 23] = in2;
+        bitbuffer[out_offset1 + 24] = in3;
+        bitbuffer[out_offset1 + 25] = in4;
+        bitbuffer[out_offset1 + 26] = in5;
+        bitbuffer[out_offset1 + 27] = in6;
+        bitbuffer[out_offset1 + 28] = in7;
+        bitbuffer[out_offset1 + 29] = in8;
+        bitbuffer[out_offset1 + 30] = in9;
+        bitbuffer[out_offset1 + 31] = in10;
+        bitbuffer[out_offset2 + 0] = in11;
+        bitbuffer[out_offset2 + 1] = in12;
+        bitbuffer[out_offset2 + 2] = in13;
+        bitbuffer[out_offset2 + 3] = in14;
+        bitbuffer[out_offset2 + 4] = in15;
+        bitbuffer[out_offset2 + 5] = in16;
+        bitbuffer[out_offset2 + 6] = in17;
+        bitbuffer[out_offset2 + 7] = in18;
+        bitbuffer[out_offset2 + 8] = in19;
+        bitbuffer[out_offset2 + 9] = in20;
+        bitbuffer[out_offset2 + 10] = in21;
+        bitbuffer[out_offset2 + 11] = in22;
+        bitbuffer[out_offset2 + 12] = in23;
+        bitbuffer[out_offset2 + 13] = in24;
+        bitbuffer[out_offset2 + 14] = in25;
+        bitbuffer[out_offset2 + 15] = in26;
+        bitbuffer[out_offset2 + 16] = in27;
+        bitbuffer[out_offset2 + 17] = in28;
+        bitbuffer[out_offset2 + 18] = in29;
+        bitbuffer[out_offset2 + 19] = in30;
+        bitbuffer[out_offset2 + 20] = in31;
+        bitbuffer[out_offset2 + 21] = in32;
+    }
+
+    // one intermediate pass where we pull half the values
+    for (; i < (14880 * sizeof(chorba_word_t)); i += (32 * sizeof(chorba_word_t))) {
+        chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+        chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+        chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+        chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+        int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+        int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+        int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+        in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1;
+        in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2;
+        in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3;
+        in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4;
+        in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5;
+        in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6;
+        in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7;
+        in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1;
+        in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2;
+        in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3;
+        in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4;
+        in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5;
+        in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6;
+        in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7;
+        in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8;
+        in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9;
+        in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10;
+        in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11;
+        in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12;
+        in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13;
+        in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14;
+        in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15;
+        in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+        in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+        in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+        in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+        in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+        in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+        in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+        in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+        in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+        in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+        next1 = in11 ^ in22 ^ in26;
+        next2 = in12 ^ in23 ^ in27;
+        next3 = in13 ^ in24 ^ in28;
+        next4 = in14 ^ in25 ^ in29;
+        next5 = in15 ^ in26 ^ in30;
+        next6 = in16 ^ in27 ^ in31;
+        next7 = in17 ^ in28 ^ in32;
+        next8 = in18 ^ in29;
+        next9 = in19 ^ in30;
+        next10 = in20 ^ in31;
+        next11 = in21 ^ in32;
+        next12 = in22;
+        next13 = in23;
+        next14 = in24;
+        next15 = in25;
+        next16 = in26;
+        next17 = in27;
+        next18 = in28;
+        next19 = in29;
+        next20 = in30;
+        next21 = in31;
+        next22 = in32;
+
+        bitbuffer[out_offset1 + 22] = in1;
+        bitbuffer[out_offset1 + 23] = in2;
+        bitbuffer[out_offset1 + 24] = in3;
+        bitbuffer[out_offset1 + 25] = in4;
+        bitbuffer[out_offset1 + 26] = in5;
+        bitbuffer[out_offset1 + 27] = in6;
+        bitbuffer[out_offset1 + 28] = in7;
+        bitbuffer[out_offset1 + 29] = in8;
+        bitbuffer[out_offset1 + 30] = in9;
+        bitbuffer[out_offset1 + 31] = in10;
+        bitbuffer[out_offset2 + 0] = in11;
+        bitbuffer[out_offset2 + 1] = in12;
+        bitbuffer[out_offset2 + 2] = in13;
+        bitbuffer[out_offset2 + 3] = in14;
+        bitbuffer[out_offset2 + 4] = in15;
+        bitbuffer[out_offset2 + 5] = in16;
+        bitbuffer[out_offset2 + 6] = in17;
+        bitbuffer[out_offset2 + 7] = in18;
+        bitbuffer[out_offset2 + 8] = in19;
+        bitbuffer[out_offset2 + 9] = in20;
+        bitbuffer[out_offset2 + 10] = in21;
+        bitbuffer[out_offset2 + 11] = in22;
+        bitbuffer[out_offset2 + 12] = in23;
+        bitbuffer[out_offset2 + 13] = in24;
+        bitbuffer[out_offset2 + 14] = in25;
+        bitbuffer[out_offset2 + 15] = in26;
+        bitbuffer[out_offset2 + 16] = in27;
+        bitbuffer[out_offset2 + 17] = in28;
+        bitbuffer[out_offset2 + 18] = in29;
+        bitbuffer[out_offset2 + 19] = in30;
+        bitbuffer[out_offset2 + 20] = in31;
+        bitbuffer[out_offset2 + 21] = in32;
+    }
+
+    for (; (i + (14870 + 64) * sizeof(chorba_word_t)) < len; i += (32 * sizeof(chorba_word_t))) {
+        chorba_word_t in1, in2, in3, in4, in5, in6, in7, in8;
+        chorba_word_t in9, in10, in11, in12, in13, in14, in15, in16;
+        chorba_word_t in17, in18, in19, in20, in21, in22, in23, in24;
+        chorba_word_t in25, in26, in27, in28, in29, in30, in31, in32;
+        int in_offset = (i / sizeof(chorba_word_t)) % bitbuffer_size_zwords;
+        int out_offset1 = ((i / sizeof(chorba_word_t)) + 14848) % bitbuffer_size_zwords;
+        int out_offset2 = ((i / sizeof(chorba_word_t)) + 14880) % bitbuffer_size_zwords;
+
+        in1 = input[i / sizeof(chorba_word_t) + 0] ^ next1 ^ bitbuffer[in_offset + 0];
+        in2 = input[i / sizeof(chorba_word_t) + 1] ^ next2 ^ bitbuffer[in_offset + 1];
+        in3 = input[i / sizeof(chorba_word_t) + 2] ^ next3 ^ bitbuffer[in_offset + 2];
+        in4 = input[i / sizeof(chorba_word_t) + 3] ^ next4 ^ bitbuffer[in_offset + 3];
+        in5 = input[i / sizeof(chorba_word_t) + 4] ^ next5 ^ bitbuffer[in_offset + 4];
+        in6 = input[i / sizeof(chorba_word_t) + 5] ^ next6 ^ bitbuffer[in_offset + 5];
+        in7 = input[i / sizeof(chorba_word_t) + 6] ^ next7 ^ bitbuffer[in_offset + 6];
+        in8 = input[i / sizeof(chorba_word_t) + 7] ^ next8 ^ in1 ^ bitbuffer[in_offset + 7];
+        in9 = input[i / sizeof(chorba_word_t) + 8] ^ next9 ^ in2 ^ bitbuffer[in_offset + 8];
+        in10 = input[i / sizeof(chorba_word_t) + 9] ^ next10 ^ in3 ^ bitbuffer[in_offset + 9];
+        in11 = input[i / sizeof(chorba_word_t) + 10] ^ next11 ^ in4 ^ bitbuffer[in_offset + 10];
+        in12 = input[i / sizeof(chorba_word_t) + 11] ^ next12 ^ in1 ^ in5 ^ bitbuffer[in_offset + 11];
+        in13 = input[i / sizeof(chorba_word_t) + 12] ^ next13 ^ in2 ^ in6 ^ bitbuffer[in_offset + 12];
+        in14 = input[i / sizeof(chorba_word_t) + 13] ^ next14 ^ in3 ^ in7 ^ bitbuffer[in_offset + 13];
+        in15 = input[i / sizeof(chorba_word_t) + 14] ^ next15 ^ in4 ^ in8 ^ bitbuffer[in_offset + 14];
+        in16 = input[i / sizeof(chorba_word_t) + 15] ^ next16 ^ in5 ^ in9 ^ bitbuffer[in_offset + 15];
+        in17 = input[i / sizeof(chorba_word_t) + 16] ^ next17 ^ in6 ^ in10 ^ bitbuffer[in_offset + 16];
+        in18 = input[i / sizeof(chorba_word_t) + 17] ^ next18 ^ in7 ^ in11 ^ bitbuffer[in_offset + 17];
+        in19 = input[i / sizeof(chorba_word_t) + 18] ^ next19 ^ in8 ^ in12 ^ bitbuffer[in_offset + 18];
+        in20 = input[i / sizeof(chorba_word_t) + 19] ^ next20 ^ in9 ^ in13 ^ bitbuffer[in_offset + 19];
+        in21 = input[i / sizeof(chorba_word_t) + 20] ^ next21 ^ in10 ^ in14 ^ bitbuffer[in_offset + 20];
+        in22 = input[i / sizeof(chorba_word_t) + 21] ^ next22 ^ in11 ^ in15 ^ bitbuffer[in_offset + 21];
+        in23 = input[i / sizeof(chorba_word_t) + 22] ^ in1 ^ in12 ^ in16 ^ bitbuffer[in_offset + 22];
+        in24 = input[i / sizeof(chorba_word_t) + 23] ^ in2 ^ in13 ^ in17 ^ bitbuffer[in_offset + 23];
+        in25 = input[i / sizeof(chorba_word_t) + 24] ^ in3 ^ in14 ^ in18 ^ bitbuffer[in_offset + 24];
+        in26 = input[i / sizeof(chorba_word_t) + 25] ^ in4 ^ in15 ^ in19 ^ bitbuffer[in_offset + 25];
+        in27 = input[i / sizeof(chorba_word_t) + 26] ^ in5 ^ in16 ^ in20 ^ bitbuffer[in_offset + 26];
+        in28 = input[i / sizeof(chorba_word_t) + 27] ^ in6 ^ in17 ^ in21 ^ bitbuffer[in_offset + 27];
+        in29 = input[i / sizeof(chorba_word_t) + 28] ^ in7 ^ in18 ^ in22 ^ bitbuffer[in_offset + 28];
+        in30 = input[i / sizeof(chorba_word_t) + 29] ^ in8 ^ in19 ^ in23 ^ bitbuffer[in_offset + 29];
+        in31 = input[i / sizeof(chorba_word_t) + 30] ^ in9 ^ in20 ^ in24 ^ bitbuffer[in_offset + 30];
+        in32 = input[i / sizeof(chorba_word_t) + 31] ^ in10 ^ in21 ^ in25 ^ bitbuffer[in_offset + 31];
+
+        next1 = in11 ^ in22 ^ in26;
+        next2 = in12 ^ in23 ^ in27;
+        next3 = in13 ^ in24 ^ in28;
+        next4 = in14 ^ in25 ^ in29;
+        next5 = in15 ^ in26 ^ in30;
+        next6 = in16 ^ in27 ^ in31;
+        next7 = in17 ^ in28 ^ in32;
+        next8 = in18 ^ in29;
+        next9 = in19 ^ in30;
+        next10 = in20 ^ in31;
+        next11 = in21 ^ in32;
+        next12 = in22;
+        next13 = in23;
+        next14 = in24;
+        next15 = in25;
+        next16 = in26;
+        next17 = in27;
+        next18 = in28;
+        next19 = in29;
+        next20 = in30;
+        next21 = in31;
+        next22 = in32;
+
+        bitbuffer[out_offset1 + 22] = in1;
+        bitbuffer[out_offset1 + 23] = in2;
+        bitbuffer[out_offset1 + 24] = in3;
+        bitbuffer[out_offset1 + 25] = in4;
+        bitbuffer[out_offset1 + 26] = in5;
+        bitbuffer[out_offset1 + 27] = in6;
+        bitbuffer[out_offset1 + 28] = in7;
+        bitbuffer[out_offset1 + 29] = in8;
+        bitbuffer[out_offset1 + 30] = in9;
+        bitbuffer[out_offset1 + 31] = in10;
+        bitbuffer[out_offset2 + 0] = in11;
+        bitbuffer[out_offset2 + 1] = in12;
+        bitbuffer[out_offset2 + 2] = in13;
+        bitbuffer[out_offset2 + 3] = in14;
+        bitbuffer[out_offset2 + 4] = in15;
+        bitbuffer[out_offset2 + 5] = in16;
+        bitbuffer[out_offset2 + 6] = in17;
+        bitbuffer[out_offset2 + 7] = in18;
+        bitbuffer[out_offset2 + 8] = in19;
+        bitbuffer[out_offset2 + 9] = in20;
+        bitbuffer[out_offset2 + 10] = in21;
+        bitbuffer[out_offset2 + 11] = in22;
+        bitbuffer[out_offset2 + 12] = in23;
+        bitbuffer[out_offset2 + 13] = in24;
+        bitbuffer[out_offset2 + 14] = in25;
+        bitbuffer[out_offset2 + 15] = in26;
+        bitbuffer[out_offset2 + 16] = in27;
+        bitbuffer[out_offset2 + 17] = in28;
+        bitbuffer[out_offset2 + 18] = in29;
+        bitbuffer[out_offset2 + 19] = in30;
+        bitbuffer[out_offset2 + 20] = in31;
+        bitbuffer[out_offset2 + 21] = in32;
+    }
+
+    bitbuffer[(i / sizeof(chorba_word_t) + 0) % bitbuffer_size_zwords] ^= next1;
+    bitbuffer[(i / sizeof(chorba_word_t) + 1) % bitbuffer_size_zwords] ^= next2;
+    bitbuffer[(i / sizeof(chorba_word_t) + 2) % bitbuffer_size_zwords] ^= next3;
+    bitbuffer[(i / sizeof(chorba_word_t) + 3) % bitbuffer_size_zwords] ^= next4;
+    bitbuffer[(i / sizeof(chorba_word_t) + 4) % bitbuffer_size_zwords] ^= next5;
+    bitbuffer[(i / sizeof(chorba_word_t) + 5) % bitbuffer_size_zwords] ^= next6;
+    bitbuffer[(i / sizeof(chorba_word_t) + 6) % bitbuffer_size_zwords] ^= next7;
+    bitbuffer[(i / sizeof(chorba_word_t) + 7) % bitbuffer_size_zwords] ^= next8;
+    bitbuffer[(i / sizeof(chorba_word_t) + 8) % bitbuffer_size_zwords] ^= next9;
+    bitbuffer[(i / sizeof(chorba_word_t) + 9) % bitbuffer_size_zwords] ^= next10;
+    bitbuffer[(i / sizeof(chorba_word_t) + 10) % bitbuffer_size_zwords] ^= next11;
+    bitbuffer[(i / sizeof(chorba_word_t) + 11) % bitbuffer_size_zwords] ^= next12;
+    bitbuffer[(i / sizeof(chorba_word_t) + 12) % bitbuffer_size_zwords] ^= next13;
+    bitbuffer[(i / sizeof(chorba_word_t) + 13) % bitbuffer_size_zwords] ^= next14;
+    bitbuffer[(i / sizeof(chorba_word_t) + 14) % bitbuffer_size_zwords] ^= next15;
+    bitbuffer[(i / sizeof(chorba_word_t) + 15) % bitbuffer_size_zwords] ^= next16;
+    bitbuffer[(i / sizeof(chorba_word_t) + 16) % bitbuffer_size_zwords] ^= next17;
+    bitbuffer[(i / sizeof(chorba_word_t) + 17) % bitbuffer_size_zwords] ^= next18;
+    bitbuffer[(i / sizeof(chorba_word_t) + 18) % bitbuffer_size_zwords] ^= next19;
+    bitbuffer[(i / sizeof(chorba_word_t) + 19) % bitbuffer_size_zwords] ^= next20;
+    bitbuffer[(i / sizeof(chorba_word_t) + 20) % bitbuffer_size_zwords] ^= next21;
+    bitbuffer[(i / sizeof(chorba_word_t) + 21) % bitbuffer_size_zwords] ^= next22;
+
+    for (int j = 14870; j < 14870 + 64; j++) {
+        bitbuffer[(j + (i / sizeof(chorba_word_t))) % bitbuffer_size_zwords] = 0;
+    }
+
+    uint64_t next1_64 = 0;
+    uint64_t next2_64 = 0;
+    uint64_t next3_64 = 0;
+    uint64_t next4_64 = 0;
+    uint64_t next5_64 = 0;
+    uint64_t final[9] = {0};
+
+    for (; (i + 72 < len); i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = input_qwords[i / sizeof(uint64_t)] ^ bitbuffer_qwords[(i / sizeof(uint64_t)) % bitbuffer_size_qwords];
+        in2 = input_qwords[i / sizeof(uint64_t) + 1] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 1) % bitbuffer_size_qwords];
+        in1 = Z_U64_FROM_LE(in1) ^ next1_64;
+        in2 = Z_U64_FROM_LE(in2) ^ next2_64;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = input_qwords[i / sizeof(uint64_t) + 2] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 2) % bitbuffer_size_qwords];
+        in4 = input_qwords[i / sizeof(uint64_t) + 3] ^ bitbuffer_qwords[(i / sizeof(uint64_t) + 3) % bitbuffer_size_qwords];
+        in3 = Z_U64_FROM_LE(in3) ^ next3_64 ^ a1;
+        in4 = Z_U64_FROM_LE(in4) ^ next4_64 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1_64 = next5_64 ^ out1;
+        next2_64 = out2;
+        next3_64 = out3;
+        next4_64 = out4;
+        next5_64 = out5;
+
+    }
+
+    memcpy(final, input_qwords + (i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1_64);
+    final[1] ^= Z_U64_TO_LE(next2_64);
+    final[2] ^= Z_U64_TO_LE(next3_64);
+    final[3] ^= Z_U64_TO_LE(next4_64);
+    final[4] ^= Z_U64_TO_LE(next5_64);
+
+    uint8_t *final_bytes = (uint8_t*)final;
+
+    for (size_t j = 0; j < (len-i); j++) {
+        crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i) % bitbuffer_size_bytes]) & 0xff] ^ (crc >> 8);
+    }
+
+#if defined(__EMSCRIPTEN__)
+    zng_free(bitbuffer);
+#endif
+    return ~crc;
+}
+
+#  if CHORBA_W == 8
+/*
+ * CRC-32 over `len` bytes at `buf`, seeded with `crc`, using the Chorba
+ * algorithm from https://arxiv.org/abs/2412.16398.
+ *
+ * The input is only read; all intermediate state lives in a 32768-byte
+ * scratch buffer on the stack. That buffer is indexed directly by input
+ * position (no wrap-around), so every index used below stays in range only
+ * while len <= 32768.
+ * NOTE(review): presumably the caller guarantees that bound -- confirm.
+ *
+ * Returns the bitwise complement of the final table-driven CRC state.
+ */
+Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t *words = (const uint64_t *)buf;
+    uint64_t bitbuffer[32768 / sizeof(uint64_t)];
+    const uint8_t *bitbuffer_bytes = (const uint8_t *)bitbuffer;
+    uint64_t carry[5] = {0};      /* fold state handed from one 32-byte step to the next */
+    uint64_t tail_words[9] = {0}; /* room for the at most 72 bytes left for the byte loop */
+    const uint8_t *tail_bytes = (const uint8_t *)tail_words;
+    size_t pos = 0;               /* byte offset into the input */
+
+    /* Start from an all-zero scratch buffer holding the inverted seed in word 0. */
+    memset(bitbuffer, 0, sizeof(bitbuffer));
+    bitbuffer[0] = Z_U64_TO_LE(~crc);
+    crc = 0;
+
+    /*
+     * Scatter pass, 64 bytes (8 words) per step: each input word is combined
+     * with the scratch word at the same position, then XORed into the scratch
+     * slots 145, 183 and 211 words ahead and stored 300 words ahead.
+     * Offsets: [0, 145, 183, 211]
+     * The slots written are always at least 145 words past the slots read in
+     * the same step, so handling the 8 words one after another is equivalent
+     * to reading all 8 first.
+     */
+    while (pos + 300 * 8 + 64 < len) {
+        const size_t base = pos / sizeof(uint64_t);
+
+        for (size_t k = 0; k < 8; k++) {
+            const uint64_t mixed = words[base + k] ^ bitbuffer[base + k];
+
+            bitbuffer[base + k + 145] ^= mixed;
+            bitbuffer[base + k + 183] ^= mixed;
+            bitbuffer[base + k + 211] ^= mixed;
+            bitbuffer[base + k + 300] = mixed;
+        }
+
+        pos += 64;
+    }
+
+    /*
+     * Fold pass, 32 bytes (4 words) per step. window[0..3] are the words being
+     * consumed; window[4..8] collects what they push forward and becomes the
+     * carry for the next step. A word at slot k contributes to slots k+2..k+5,
+     * so slots 2 and 3 are complete by the time they are consumed themselves.
+     */
+    for (; pos + 72 < len; pos += 32) {
+        const size_t base = pos / sizeof(uint64_t);
+        uint64_t window[9];
+
+        for (size_t k = 0; k < 4; k++) {
+            const uint64_t raw = words[base + k] ^ bitbuffer[base + k];
+
+            window[k] = Z_U64_FROM_LE(raw) ^ carry[k];
+        }
+        /* The fifth carry word lands one slot past the four consumed words. */
+        window[4] = carry[4];
+        window[5] = 0;
+        window[6] = 0;
+        window[7] = 0;
+        window[8] = 0;
+
+        for (size_t k = 0; k < 4; k++) {
+            const uint64_t x = window[k];
+
+            window[k + 2] ^= (x << 17) ^ (x << 55);
+            window[k + 3] ^= (x >> 47) ^ (x >> 9) ^ (x << 19);
+            window[k + 4] ^= (x >> 45) ^ (x << 44);
+            window[k + 5] ^= (x >> 20);
+        }
+
+        for (size_t k = 0; k < 5; k++) {
+            carry[k] = window[4 + k];
+        }
+    }
+
+    /* Copy the remaining bytes aside and apply the pending carry to the copy. */
+    const size_t remaining = len - pos;
+
+    memcpy(tail_words, words + (pos / sizeof(uint64_t)), remaining);
+    for (size_t k = 0; k < 5; k++) {
+        const uint64_t pending = carry[k];
+
+        tail_words[k] ^= Z_U64_TO_LE(pending);
+    }
+
+    /* Byte-at-a-time table CRC over the tail, mixing in the matching scratch bytes. */
+    for (size_t j = 0; j < remaining; j++) {
+        const uint8_t mixed = (uint8_t)(tail_bytes[j] ^ bitbuffer_bytes[pos + j]);
+
+        crc = crc_table[(crc ^ mixed) & 0xff] ^ (crc >> 8);
+    }
+
+    return ~crc;
+}
+
+/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* The calling function ensured that this is aligned correctly */
+    const uint64_t* input = (const uint64_t*)buf;
+    uint64_t final[9] = {0};
+    uint64_t next1 = ~crc;
+    crc = 0;
+    uint64_t next2 = 0;
+    uint64_t next3 = 0;
+    uint64_t next4 = 0;
+    uint64_t next5 = 0;
+
+    size_t i = 0;
+
+    /* This is weird, doing for vs while drops 10% off the exec time */
+    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        uint64_t chorba1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        uint64_t chorba2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+        uint64_t chorba3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3;
+        uint64_t chorba4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4;
+        uint64_t chorba5 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 4]) ^ next5;
+        uint64_t chorba6 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 5]);
+        uint64_t chorba7 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 6]) ^ chorba1;
+        uint64_t chorba8 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 7]) ^ chorba2;
+
+        i += 8 * 8;
+
+        /* 0-3 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ chorba4 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ a1 ^ chorba5 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ a2 ^ b1 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 4-7 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba4 ^ chorba3;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba4;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba6 ^ chorba5;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba7 ^ chorba6;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 8-11 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 12-15 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba2 ^ chorba1;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 16-19 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 20-23 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 24-27 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+
+        i += 32;
+
+        /* 28-31 */
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1 ^ chorba7 ^ chorba6 ^ chorba5;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2 ^ chorba8 ^ chorba7 ^ chorba6;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1 ^ chorba8 ^ chorba7;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1 ^ chorba8;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    for (; (i + 40 + 32) < len; i += 32) {
+        uint64_t in1;
+        uint64_t in2;
+        uint64_t in3;
+        uint64_t in4;
+        uint64_t a1, a2, a3, a4;
+        uint64_t b1, b2, b3, b4;
+        uint64_t c1, c2, c3, c4;
+        uint64_t d1, d2, d3, d4;
+
+        uint64_t out1;
+        uint64_t out2;
+        uint64_t out3;
+        uint64_t out4;
+        uint64_t out5;
+
+        in1 = Z_U64_FROM_LE(input[i / sizeof(uint64_t)]) ^ next1;
+        in2 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 1]) ^ next2;
+
+        a1 = (in1 << 17) ^ (in1 << 55);
+        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
+        a3 = (in1 >> 45) ^ (in1 << 44);
+        a4 = (in1 >> 20);
+
+        b1 = (in2 << 17) ^ (in2 << 55);
+        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
+        b3 = (in2 >> 45) ^ (in2 << 44);
+        b4 = (in2 >> 20);
+
+        in3 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 2]) ^ next3 ^ a1;
+        in4 = Z_U64_FROM_LE(input[i / sizeof(uint64_t) + 3]) ^ next4 ^ a2 ^ b1;
+
+        c1 = (in3 << 17) ^ (in3 << 55);
+        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
+        c3 = (in3 >> 45) ^ (in3 << 44);
+        c4 = (in3 >> 20);
+
+        d1 = (in4 << 17) ^ (in4 << 55);
+        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
+        d3 = (in4 >> 45) ^ (in4 << 44);
+        d4 = (in4 >> 20);
+
+        out1 = a3 ^ b2 ^ c1;
+        out2 = a4 ^ b3 ^ c2 ^ d1;
+        out3 = b4 ^ c3 ^ d2;
+        out4 = c4 ^ d3;
+        out5 = d4;
+
+        next1 = next5 ^ out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+    }
+
+    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
+    final[0] ^= Z_U64_TO_LE(next1);
+    final[1] ^= Z_U64_TO_LE(next2);
+    final[2] ^= Z_U64_TO_LE(next3);
+    final[3] ^= Z_U64_TO_LE(next4);
+    final[4] ^= Z_U64_TO_LE(next5);
+
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+
+#else // CHORBA_W == 8
+
+Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit(uint32_t crc, const uint8_t *buf, size_t len) {
+    /* Fold buf ten 32-bit words at a time into ten carry words, then finish the tail with crc32_braid. */
+    const uint32_t* input = (const uint32_t*)buf; /* The calling function ensured that this is aligned correctly */
+    uint32_t final[20] = {0}; /* zeroed 20-word scratch buffer for the tail handed to crc32_braid */
+    /* next1..next10 are XORed into the ten words of the following pass; next1 starts as the inverted crc. */
+    uint32_t next1 = ~crc;
+    crc = 0; /* the incoming crc now travels in next1 */
+    uint32_t next2 = 0;
+    uint32_t next3 = 0;
+    uint32_t next4 = 0;
+    uint32_t next5 = 0;
+    uint32_t next6 = 0;
+    uint32_t next7 = 0;
+    uint32_t next8 = 0;
+    uint32_t next9 = 0;
+    uint32_t next10 = 0;
+    /* Each pass consumes 40 bytes and only runs while more than 80 bytes remain. */
+    size_t i = 0;
+    for (; i + 80 < len; i += 40) {
+        uint32_t in1;
+        uint32_t in2;
+        uint32_t in3;
+        uint32_t in4;
+        uint32_t in5;
+        uint32_t in6;
+        uint32_t in7;
+        uint32_t in8;
+        uint32_t in9;
+        uint32_t in10;
+        /* For word x, xN is the term XORed into the word N+3 positions later (there is no x5). */
+        uint32_t a1, a2, a3, a4, a6, a7;
+        uint32_t b1, b2, b3, b4, b6, b7;
+        uint32_t c1, c2, c3, c4, c6, c7;
+        uint32_t d1, d2, d3, d4, d6, d7;
+        uint32_t e1, e2, e3, e4, e6, e7;
+        uint32_t f1, f2, f3, f4, f6, f7;
+        uint32_t g1, g2, g3, g4, g6, g7;
+        uint32_t h1, h2, h3, h4, h6, h7;
+        uint32_t i1, i2, i3, i4, i6, i7;
+        uint32_t j1, j2, j3, j4, j6, j7;
+
+        uint32_t out1;
+        uint32_t out2;
+        uint32_t out3;
+        uint32_t out4;
+        uint32_t out5;
+        uint32_t out6;
+        uint32_t out7;
+        uint32_t out8;
+        uint32_t out9;
+        uint32_t out10;
+        /* Words 1-4 take only the carries from the previous pass. */
+        in1 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 0]) ^ next1;
+        in2 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 1]) ^ next2;
+        in3 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 2]) ^ next3;
+        in4 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 3]) ^ next4;
+
+        a1 = (in1 << 17);
+        a2 = (in1 >> 15) ^ (in1 << 23);
+        a3 = (in1 >> 9) ^ (in1 << 19);
+        a4 = (in1 >> 13);
+        a6 = (in1 << 12);
+        a7 = (in1 >> 20);
+
+        b1 = (in2 << 17);
+        b2 = (in2 >> 15) ^ (in2 << 23);
+        b3 = (in2 >> 9) ^ (in2 << 19);
+        b4 = (in2 >> 13);
+        b6 = (in2 << 12);
+        b7 = (in2 >> 20);
+
+        c1 = (in3 << 17);
+        c2 = (in3 >> 15) ^ (in3 << 23);
+        c3 = (in3 >> 9) ^ (in3 << 19);
+        c4 = (in3 >> 13);
+        c6 = (in3 << 12);
+        c7 = (in3 >> 20);
+
+        d1 = (in4 << 17);
+        d2 = (in4 >> 15) ^ (in4 << 23);
+        d3 = (in4 >> 9) ^ (in4 << 19);
+        d4 = (in4 >> 13);
+        d6 = (in4 << 12);
+        d7 = (in4 >> 20);
+        /* Words 5-8 additionally take terms from words 1-4 of this pass. */
+        in5 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 4]) ^ next5 ^ a1;
+        in6 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 5]) ^ next6 ^ a2 ^ b1;
+        in7 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 6]) ^ next7 ^ a3 ^ b2 ^ c1;
+        in8 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 7]) ^ next8 ^ a4 ^ b3 ^ c2 ^ d1;
+
+        e1 = (in5 << 17);
+        e2 = (in5 >> 15) ^ (in5 << 23);
+        e3 = (in5 >> 9) ^ (in5 << 19);
+        e4 = (in5 >> 13);
+        e6 = (in5 << 12);
+        e7 = (in5 >> 20);
+
+        f1 = (in6 << 17);
+        f2 = (in6 >> 15) ^ (in6 << 23);
+        f3 = (in6 >> 9) ^ (in6 << 19);
+        f4 = (in6 >> 13);
+        f6 = (in6 << 12);
+        f7 = (in6 >> 20);
+
+        g1 = (in7 << 17);
+        g2 = (in7 >> 15) ^ (in7 << 23);
+        g3 = (in7 >> 9) ^ (in7 << 19);
+        g4 = (in7 >> 13);
+        g6 = (in7 << 12);
+        g7 = (in7 >> 20);
+
+        h1 = (in8 << 17);
+        h2 = (in8 >> 15) ^ (in8 << 23);
+        h3 = (in8 >> 9) ^ (in8 << 19);
+        h4 = (in8 >> 13);
+        h6 = (in8 << 12);
+        h7 = (in8 >> 20);
+        /* Words 9-10 take terms from words 1-6 of this pass. */
+        in9 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 8]) ^ next9 ^ b4 ^ c3 ^ d2 ^ e1;
+        in10 = Z_U32_FROM_LE(input[i/sizeof(uint32_t) + 9]) ^ next10 ^ a6 ^ c4 ^ d3 ^ e2 ^ f1;
+
+        i1 = (in9 << 17);
+        i2 = (in9 >> 15) ^ (in9 << 23);
+        i3 = (in9 >> 9) ^ (in9 << 19);
+        i4 = (in9 >> 13);
+        i6 = (in9 << 12);
+        i7 = (in9 >> 20);
+
+        j1 = (in10 << 17);
+        j2 = (in10 >> 15) ^ (in10 << 23);
+        j3 = (in10 >> 9) ^ (in10 << 19);
+        j4 = (in10 >> 13);
+        j6 = (in10 << 12);
+        j7 = (in10 >> 20);
+        /* Terms that land beyond word 10 are collected as the carries for the next pass. */
+        out1 = a7 ^ b6 ^ d4 ^ e3 ^ f2 ^ g1;
+        out2 = b7 ^ c6 ^ e4 ^ f3 ^ g2 ^ h1;
+        out3 = c7 ^ d6 ^ f4 ^ g3 ^ h2 ^ i1;
+        out4 = d7 ^ e6 ^ g4 ^ h3 ^ i2 ^ j1;
+        out5 = e7 ^ f6 ^ h4 ^ i3 ^ j2;
+        out6 = f7 ^ g6 ^ i4 ^ j3;
+        out7 = g7 ^ h6 ^ j4;
+        out8 = h7 ^ i6;
+        out9 = i7 ^ j6;
+        out10 = j7;
+
+        next1 = out1;
+        next2 = out2;
+        next3 = out3;
+        next4 = out4;
+        next5 = out5;
+        next6 = out6;
+        next7 = out7;
+        next8 = out8;
+        next9 = out9;
+        next10 = out10;
+
+    }
+    /* The loop exits with at most 80 bytes left; copy them and XOR in the ten pending carries. */
+    memcpy(final, input+(i/sizeof(uint32_t)), len-i);
+    final[0] ^= Z_U32_TO_LE(next1);
+    final[1] ^= Z_U32_TO_LE(next2);
+    final[2] ^= Z_U32_TO_LE(next3);
+    final[3] ^= Z_U32_TO_LE(next4);
+    final[4] ^= Z_U32_TO_LE(next5);
+    final[5] ^= Z_U32_TO_LE(next6);
+    final[6] ^= Z_U32_TO_LE(next7);
+    final[7] ^= Z_U32_TO_LE(next8);
+    final[8] ^= Z_U32_TO_LE(next9);
+    final[9] ^= Z_U32_TO_LE(next10);
+    /* crc was zeroed above, so ~crc here is 0xFFFFFFFF. */
+    return crc32_braid(~crc, (uint8_t*)final, len-i);
+}
+#endif // CHORBA_W == 8
+
+Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) {
+    const uintptr_t head = ALIGN_DIFF(buf, 8);  /* leading bytes kept away from the Chorba kernels */
+    /* Inputs of at most head + CHORBA_SMALL_THRESHOLD bytes go to braid in a single call. */
+    if (len <= head + CHORBA_SMALL_THRESHOLD)
+        return crc32_braid(crc, buf, len);
+    if (head != 0) {  /* checksum the leading bytes with braid, then step past them */
+        crc = crc32_braid(crc, buf, head);
+        buf += head;
+        len -= head;
+    }
+    if (len > CHORBA_LARGE_THRESHOLD)
+        return crc32_chorba_118960_nondestructive(crc, buf, len);
+#if CHORBA_W == 8
+    if (len <= CHORBA_MEDIUM_LOWER_THRESHOLD || len > CHORBA_MEDIUM_UPPER_THRESHOLD)
+        return crc32_chorba_small_nondestructive(crc, buf, len);
+    return crc32_chorba_32768_nondestructive(crc, buf, len);
+#else
+    return crc32_chorba_small_nondestructive_32bit(crc, buf, len);
+#endif
+}
+
+uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    const uint32_t updated = crc32_chorba(crc, src, len);  /* checksum of src, continued from crc */
+    memcpy(dst, src, len);                                 /* then copy the same len bytes to dst */
+    return updated;
+}
diff --git a/neozip/arch/generic/generic_functions.h b/neozip/arch/generic/generic_functions.h
new file mode 100644
index 0000000000..c150a2f010
--- /dev/null
+++ b/neozip/arch/generic/generic_functions.h
@@ -0,0 +1,64 @@
+/* generic_functions.h -- generic C implementations for arch-specific functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef GENERIC_FUNCTIONS_H_
+#define GENERIC_FUNCTIONS_H_
+
+#include "zendian.h"
+#include "deflate.h"
+/* Function-pointer types matching the signatures of the functions declared below. */
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+typedef uint32_t (*adler32_copy_func)(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);
+typedef uint32_t (*crc32_copy_func)(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+typedef void     (*slide_hash_func)(deflate_state *s);
+
+/* Prototypes of the generic C implementations. */
+uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left);
+
+#ifdef WITH_ALL_FALLBACKS
+uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
+uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
+#endif /* WITH_ALL_FALLBACKS */
+uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_braid(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+/* The Chorba CRC32 entry points are only declared when WITHOUT_CHORBA is not defined. */
+#ifndef WITHOUT_CHORBA
+  uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
+  uint32_t crc32_copy_chorba(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif /* !WITHOUT_CHORBA */
+
+void     inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+
+uint32_t longest_match_c(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_c(deflate_state *const s, uint32_t cur_match);
+
+void     slide_hash_c(deflate_state *s);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// Runtime CPU detection disabled: bind each native_* name directly to a generic C function
+#  define native_adler32 adler32_c
+#  define native_adler32_copy adler32_copy_c
+#  define native_chunkmemset_safe chunkmemset_safe_c
+#ifndef WITHOUT_CHORBA
+#  define native_crc32 crc32_chorba
+#  define native_crc32_copy crc32_copy_chorba
+#else
+#  define native_crc32 crc32_braid
+#  define native_crc32_copy crc32_copy_braid
+#endif /* !WITHOUT_CHORBA */
+#  define native_inflate_fast inflate_fast_c
+#  define native_slide_hash slide_hash_c
+#  define native_longest_match longest_match_c
+#  define native_longest_match_slow longest_match_slow_c
+#  define native_compare256 compare256_c
+#endif /* DISABLE_RUNTIME_CPU_DETECTION */
+
+#endif /* GENERIC_FUNCTIONS_H_ */
diff --git a/neozip/arch/generic/slide_hash_c.c b/neozip/arch/generic/slide_hash_c.c
new file mode 100644
index 0000000000..8345b9e36b
--- /dev/null
+++ b/neozip/arch/generic/slide_hash_c.c
@@ -0,0 +1,52 @@
+/* slide_hash_c.c -- slide hash table C implementation
+ *
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+/* ===========================================================================
+ * Slide the hash table when sliding the window down (could be avoided with 32
+ * bit values at the expense of memory usage). We slide even when level == 0 to
+ * keep the hash table consistent if we switch back to level > 0 later.
+ */
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    /* Subtract wsize from each of the first `entries` slots of table, storing 0
+     * in any slot whose value is smaller than wsize. */
+#ifdef NOT_TWEAK_COMPILER
+    /* Reference form: walk from the last slot back to the first. The do/while
+     * shape requires entries to be non-zero.
+     */
+    Pos *slot = table + entries;
+    do {
+        const unsigned old_pos = *--slot;
+        /* A slot that is on no hash chain holds garbage here, but that
+         * value is never used afterwards.
+         */
+        *slot = (Pos)(old_pos >= wsize ? old_pos - wsize : 0);
+    } while (--entries);
+#else
+    /* Vectorizer-friendly form. When this variant was written, gcc (4.8.*) could
+     * not vectorize the backward loop with a saturating subtraction on x86-64, so:
+     *    o. the pointer advances forward, and
+     *    o. every value stays in type "Pos" rather than 'unsigned int',
+     *       which avoids an unnecessary zero-extension.
+     */
+    const Pos shift = (Pos)wsize;
+    Pos *slot = table;
+    unsigned int n;
+    for (n = 0; n < entries; n++) {
+        const Pos old_pos = *slot;
+        *slot++ = (Pos)(old_pos >= shift ? old_pos - shift : 0);
+    }
+#endif /* NOT_TWEAK_COMPILER */
+}
+
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+    /* Slide both tables by the window size, taken as a 16-bit value: head has HASH_SIZE slots, prev has one per window position. */
+    const uint16_t window = (uint16_t)s->w_size;
+    slide_hash_c_chain(s->head, HASH_SIZE, window);
+    slide_hash_c_chain(s->prev, window, window);
+}