summaryrefslogtreecommitdiff
path: root/neozip/arch/riscv
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/riscv
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/riscv')
-rw-r--r--neozip/arch/riscv/Makefile.in72
-rw-r--r--neozip/arch/riscv/README.md45
-rw-r--r--neozip/arch/riscv/adler32_rvv.c119
-rw-r--r--neozip/arch/riscv/chunkset_rvv.c126
-rw-r--r--neozip/arch/riscv/compare256_rvv.c48
-rw-r--r--neozip/arch/riscv/crc32_zbc.c103
-rw-r--r--neozip/arch/riscv/riscv_features.c99
-rw-r--r--neozip/arch/riscv/riscv_features.h19
-rw-r--r--neozip/arch/riscv/riscv_functions.h60
-rw-r--r--neozip/arch/riscv/riscv_natives.h19
-rw-r--r--neozip/arch/riscv/slide_hash_rvv.c33
11 files changed, 743 insertions, 0 deletions
diff --git a/neozip/arch/riscv/Makefile.in b/neozip/arch/riscv/Makefile.in
new file mode 100644
index 0000000000..43176eee6e
--- /dev/null
+++ b/neozip/arch/riscv/Makefile.in
@@ -0,0 +1,72 @@
+# Makefile for zlib-ng
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# Copyright (C) 2024 Hans Kristian Rosbach
+# Copyright (C) 2025 Yin Tong <yintong.ustc@bytedance.com>, ByteDance
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+RVVFLAG=
+RVVZBCFLAG=
+ZBCFLAG=
+
+all: \
+ riscv_features.o riscv_features.lo \
+ adler32_rvv.o adler32_rvv.lo \
+ chunkset_rvv.o chunkset_rvv.lo \
+ compare256_rvv.o compare256_rvv.lo \
+ slide_hash_rvv.o slide_hash_rvv.lo \
+ crc32_zbc.o crc32_zbc.lo
+
+riscv_features.o: $(SRCDIR)/riscv_features.c
+ $(CC) $(CFLAGS) $(RVVZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+riscv_features.lo: $(SRCDIR)/riscv_features.c
+ $(CC) $(SFLAGS) $(RVVZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c
+
+adler32_rvv.o: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+adler32_rvv.lo: $(SRCDIR)/adler32_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c
+
+chunkset_rvv.o: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+chunkset_rvv.lo: $(SRCDIR)/chunkset_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c
+
+compare256_rvv.o: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+compare256_rvv.lo: $(SRCDIR)/compare256_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c
+
+slide_hash_rvv.o: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+slide_hash_rvv.lo: $(SRCDIR)/slide_hash_rvv.c
+ $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c
+
+crc32_zbc.o: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(CFLAGS) $(ZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+crc32_zbc.lo: $(SRCDIR)/crc32_zbc.c
+ $(CC) $(SFLAGS) $(ZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c
+
+mostlyclean: clean
+clean:
+ rm -f *.o *.lo *~
+ rm -rf objs
+ rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+ rm -f Makefile
diff --git a/neozip/arch/riscv/README.md b/neozip/arch/riscv/README.md
new file mode 100644
index 0000000000..013095c373
--- /dev/null
+++ b/neozip/arch/riscv/README.md
@@ -0,0 +1,45 @@
+# Building RISC-V Target with CMake #
+
+> **Warning**
+> Runtime RVV detection (using `hwcap`) requires Linux kernel 6.5 or newer.
+>
+> When running on older kernels, we fall back to compile-time detection; this can potentially cause crashes if RVV is enabled at compile time but not supported by the target CPU.
+> Therefore, if older kernel support is needed, RVV should be disabled when the target CPU does not support it.
+## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
+
+If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
+
+```bash
+./prepare_riscv_toolchain_qemu.sh
+```
+
+After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
+`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
+
+You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
+
+## Cross-Compile for RISC-V Target ##
+
+```bash
+cmake -G Ninja -B ./build-riscv \
+ -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
+ -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
+ -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+ -D QEMU_PATH={QEMU_PATH} \
+ .
+
+cmake --build ./build-riscv
+```
+
+Disable the option if there is no RVV support:
+```
+-D WITH_RVV=OFF
+```
+
+## Run Unittests on User Mode QEMU ##
+
+```bash
+cd ./build-riscv && ctest --verbose
+```
diff --git a/neozip/arch/riscv/adler32_rvv.c b/neozip/arch/riscv/adler32_rvv.c
new file mode 100644
index 0000000000..e446189302
--- /dev/null
+++ b/neozip/arch/riscv/adler32_rvv.c
@@ -0,0 +1,119 @@
+/* adler32_rvv.c - RVV version of adler32
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "adler32_p.h"
+
+#include <riscv_vector.h>
+
+Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY);
+
+ size_t left = len;
+ size_t vl = __riscv_vsetvlmax_e8m1();
+ vl = MIN(vl, 256);
+ vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint16m2_t v_buf16_accu;
+
+ /*
+ * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+ * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+ * accumulators to boost performance.
+ *
+ * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
+ * vl > 256 (255 * 256 <= UINT16_MAX).
+ *
+ * We accumulate 8-bit data into a 16-bit accumulator and then
+ * move the data into the 32-bit accumulator at the last iteration.
+ */
+ size_t block_size = (256 / vl) * vl;
+ size_t nmax_limit = (NMAX / block_size);
+ size_t cnt = 0;
+ while (left >= block_size) {
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t subprob = block_size;
+ while (subprob > 0) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ subprob -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+ left -= block_size;
+ /* do modulo once each block of NMAX size */
+ if (++cnt >= nmax_limit) {
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
+ cnt = 0;
+ }
+ }
+ /* the left len <= 256 now, we can use 16-bit accum safely */
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t res = left;
+ while (left >= vl) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+ if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ src += vl;
+ if (COPY) dst += vl;
+ left -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+ vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+ vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+ vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+ v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+ vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+ uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;
+
+ sum2 += (sum2_sum + adler * ((len - left) % BASE));
+
+ vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+ uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE;
+
+ adler += adler_sum;
+
+ sum2 %= BASE;
+ adler %= BASE;
+
+ /* Process tail (left < 256). */
+ return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY);
+}
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+ return adler32_copy_impl(adler, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ return adler32_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/chunkset_rvv.c b/neozip/arch/riscv/chunkset_rvv.c
new file mode 100644
index 0000000000..cd8ed3cfd2
--- /dev/null
+++ b/neozip/arch/riscv/chunkset_rvv.c
@@ -0,0 +1,126 @@
+/* chunkset_rvv.c - RVV version of chunkset
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+
+#include <riscv_vector.h>
+
+/*
+ * RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
+ * so we prefer using a large chunk size and copying as much memory as possible.
+ */
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+#define CHUNK_MEMSET_RVV_IMPL(from, chunk, elen) \
+do { \
+ size_t vl, len = sizeof(*chunk) / sizeof(uint##elen##_t); \
+ uint##elen##_t val = *(uint##elen##_t*)from; \
+ uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
+ do { \
+ vl = __riscv_vsetvl_e##elen##m4(len); \
+ vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
+ __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
+ len -= vl; chunk_p += vl; \
+ } while (len > 0); \
+} while (0)
+
+/* We don't have a 32-byte datatype for RISC-V arch. */
+typedef struct chunk_s {
+ uint64_t data[4];
+} chunk_t;
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(from, chunk, 16);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(from, chunk, 32);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(from, chunk, 64);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ memcpy(chunk->data, (uint8_t *)s, sizeof(*chunk));
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ memcpy(out, chunk->data, sizeof(*chunk));
+}
+
+#define CHUNKSIZE chunksize_rvv
+#define CHUNKCOPY chunkcopy_rvv
+#define CHUNKUNROLL chunkunroll_rvv
+#define CHUNKMEMSET chunkmemset_rvv
+#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
+
+#define HAVE_CHUNKCOPY
+
+/*
+ * Assuming that the length is non-zero, and that `from` lags `out` by at least
+ * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
+ *
+ * We load/store a single chunk once in the `CHUNKCOPY`.
+ * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
+ * so we prefer to copy a large amount of memory at once to make good use of the RVV advantage.
+ *
+ * To stay aligned with the other platforms, we didn't modify the `CHUNKCOPY` method much,
+ * but we still copy as much memory as possible under some conditions.
+ *
+ * case 1: out - from >= len (no overlap)
+ * We can use memcpy to copy `len` size once
+ * because the memory layout would be the same.
+ *
+ * case 2: overlap
+ * We copy N chunks using memcpy at once, aiming to achieve our goal:
+ * to copy as much memory as possible.
+ *
+ * After using a single memcpy to copy N chunks, we have to use series of
+ * loadchunk and storechunk to ensure the result is correct.
+ */
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "chunkcopy should never have a length 0");
+ size_t dist = out - from;
+ if (out < from || dist >= len) {
+ memcpy(out, from, len);
+ out += len;
+ from += len;
+ return out;
+ }
+
+ size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+ memcpy(out, from, sizeof(chunk_t));
+ out += align;
+ from += align;
+ len -= align;
+
+ size_t vl = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
+ while (len > dist) {
+ memcpy(out, from, vl);
+ out += vl;
+ from += vl;
+ len -= vl;
+ }
+
+ if (len > 0) {
+ memcpy(out, from, len);
+ out += len;
+ }
+ return out;
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_rvv
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/neozip/arch/riscv/compare256_rvv.c b/neozip/arch/riscv/compare256_rvv.c
new file mode 100644
index 0000000000..edb18a3766
--- /dev/null
+++ b/neozip/arch/riscv/compare256_rvv.c
@@ -0,0 +1,48 @@
+/* compare256_rvv.c - RVV version of compare256
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+ size_t vl;
+ long found_diff;
+ do {
+ vl = __riscv_vsetvl_e8m4(256 - len);
+ vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+ vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+ vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
+ found_diff = __riscv_vfirst_m_b2(v_mask, vl);
+ if (found_diff >= 0)
+ return len + (uint32_t)found_diff;
+ src0 += vl, src1 += vl, len += vl;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_rvv_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_rvv
+#define COMPARE256 compare256_rvv_static
+
+#include "match_tpl.h"
+
+#endif // RISCV_RVV
diff --git a/neozip/arch/riscv/crc32_zbc.c b/neozip/arch/riscv/crc32_zbc.c
new file mode 100644
index 0000000000..cf52279b80
--- /dev/null
+++ b/neozip/arch/riscv/crc32_zbc.c
@@ -0,0 +1,103 @@
+/* crc32_zbc.c - RISCV Zbc version of crc32
+ * Copyright (C) 2025 ByteDance. All rights reserved.
+ * Contributed by Yin Tong <yintong.ustc@bytedance.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_CRC32_ZBC
+
+#include "zbuild.h"
+#include "arch_functions.h"
+
+#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul
+#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul
+
+#define CONSTANT_R3 0x1751997d0ULL
+#define CONSTANT_R4 0x0ccaa009eULL
+#define CONSTANT_R5 0x163cd6124ULL
+#define MASK32 0xFFFFFFFF
+#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL
+#define CONSTANT_RU 0x1F7011641ULL
+
+static inline uint64_t clmul(uint64_t a, uint64_t b) {
+ uint64_t res;
+ __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+ return res;
+}
+
+static inline uint64_t clmulh(uint64_t a, uint64_t b) {
+ uint64_t res;
+ __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
+ return res;
+}
+
+Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) {
+ const uint64_t *buf64 = (const uint64_t *)buf;
+ uint64_t low = buf64[0] ^ crc;
+ uint64_t high = buf64[1];
+
+ if (len < 16)
+ goto finish_fold;
+ len -= 16;
+ buf64 += 2;
+
+ // process each 16-byte block
+ while (len >= 16) {
+ uint64_t t2 = clmul(CONSTANT_R4, high);
+ uint64_t t3 = clmulh(CONSTANT_R4, high);
+
+ uint64_t t0_new = clmul(CONSTANT_R3, low);
+ uint64_t t1_new = clmulh(CONSTANT_R3, low);
+
+ // Combine the results and XOR with new data
+ low = t0_new ^ t2;
+ high = t1_new ^ t3;
+ low ^= buf64[0];
+ high ^= buf64[1];
+
+ buf64 += 2;
+ len -= 16;
+ }
+
+finish_fold:
+ // Fold the 128-bit result into 64 bits
+ uint64_t fold_t3 = clmulh(low, CONSTANT_R4);
+ uint64_t fold_t2 = clmul(low, CONSTANT_R4);
+ low = high ^ fold_t2;
+ high = fold_t3;
+
+ // Combine the low and high parts and perform polynomial reduction
+ uint64_t combined = (low >> 32) | ((high & MASK32) << 32);
+ uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined;
+
+ // Barrett reduction step
+ uint64_t barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32;
+ barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL);
+ uint64_t final = barrett ^ reduced_low;
+
+ // Return the high 32 bits as the final CRC
+ return (uint32_t)(final >> 32);
+}
+
+Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) {
+ if (len < CLMUL_MIN_LEN) {
+ return crc32_braid(crc, buf, len);
+ }
+
+ uint64_t unaligned_length = len % CLMUL_CHUNK_LEN;
+ if (unaligned_length) {
+ crc = crc32_braid(crc, buf, unaligned_length);
+ buf += unaligned_length;
+ len -= unaligned_length;
+ }
+
+ crc = crc32_clmul_impl(~crc, buf, len);
+ return ~crc;
+}
+
+Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ crc = crc32_riscv64_zbc(crc, src, len);
+ memcpy(dst, src, len);
+ return crc;
+}
+#endif
diff --git a/neozip/arch/riscv/riscv_features.c b/neozip/arch/riscv/riscv_features.c
new file mode 100644
index 0000000000..b23f10a699
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.c
@@ -0,0 +1,99 @@
+#ifdef RISCV_FEATURES
+
+#define _DEFAULT_SOURCE 1 /* For syscall() */
+
+#include "zbuild.h"
+#include "riscv_features.h"
+
+#include <sys/utsname.h>
+
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+# include <sys/auxv.h>
+#endif
+
+#if defined(__linux__) && defined(HAVE_ASM_HWPROBE_H)
+# include <asm/hwprobe.h>
+# include <sys/syscall.h> /* For __NR_riscv_hwprobe */
+# include <unistd.h> /* For syscall() */
+#endif
+
+#define ISA_V_HWCAP (1 << ('v' - 'a'))
+#define ISA_ZBC_HWCAP (1 << 29)
+
+static int riscv_check_features_runtime_hwprobe(struct riscv_cpu_features *features) {
+#if defined(__NR_riscv_hwprobe) && defined(RISCV_HWPROBE_KEY_IMA_EXT_0)
+ struct riscv_hwprobe probes[] = {
+ {RISCV_HWPROBE_KEY_IMA_EXT_0, 0},
+ };
+ int ret;
+ unsigned i;
+
+ ret = syscall(__NR_riscv_hwprobe, probes, sizeof(probes) / sizeof(probes[0]), 0, NULL, 0);
+
+ if (ret != 0) {
+ /* Kernel does not support hwprobe */
+ return 0;
+ }
+
+ for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
+ switch (probes[i].key) {
+ case RISCV_HWPROBE_KEY_IMA_EXT_0:
+# ifdef RISCV_HWPROBE_IMA_V
+ features->has_rvv = !!(probes[i].value & RISCV_HWPROBE_IMA_V);
+# endif
+# ifdef RISCV_HWPROBE_EXT_ZBC
+ features->has_zbc = !!(probes[i].value & RISCV_HWPROBE_EXT_ZBC);
+# endif
+ break;
+ }
+ }
+
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+static int riscv_check_features_runtime_hwcap(struct riscv_cpu_features *features) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+
+ features->has_rvv = hw_cap & ISA_V_HWCAP;
+ features->has_zbc = hw_cap & ISA_ZBC_HWCAP;
+
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+static void riscv_check_features_runtime(struct riscv_cpu_features *features) {
+ if (riscv_check_features_runtime_hwprobe(features))
+ return;
+
+ riscv_check_features_runtime_hwcap(features);
+}
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
+ riscv_check_features_runtime(features);
+#ifdef RISCV_RVV
+ if (features->has_rvv) {
+ size_t e8m1_vec_len;
+ intptr_t vtype_reg_val;
+ // Check that a vuint8m1_t vector is at least 16 bytes and that tail
+ // agnostic and mask agnostic mode are supported
+ //
+ __asm__ volatile(
+ "vsetvli %0, zero, e8, m1, ta, ma\n\t"
+ "csrr %1, vtype"
+ : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));
+
+ // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
+ // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
+ // bytes
+ features->has_rvv = (vtype_reg_val >= 0 && e8m1_vec_len >= 16);
+ }
+#endif
+}
+
+#endif
diff --git a/neozip/arch/riscv/riscv_features.h b/neozip/arch/riscv/riscv_features.h
new file mode 100644
index 0000000000..42855a1b6b
--- /dev/null
+++ b/neozip/arch/riscv/riscv_features.h
@@ -0,0 +1,19 @@
+/* riscv_features.h -- check for riscv features.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FEATURES_H_
+#define RISCV_FEATURES_H_
+
+struct riscv_cpu_features {
+ int has_rvv;
+ int has_zbc;
+};
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
+
+#endif /* RISCV_FEATURES_H_ */
diff --git a/neozip/arch/riscv/riscv_functions.h b/neozip/arch/riscv/riscv_functions.h
new file mode 100644
index 0000000000..89120ffabf
--- /dev/null
+++ b/neozip/arch/riscv/riscv_functions.h
@@ -0,0 +1,60 @@
+/* riscv_functions.h -- RISCV implementations for arch-specific functions.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_FUNCTIONS_H_
+#define RISCV_FUNCTIONS_H_
+
+#include "riscv_natives.h"
+
+#ifdef RISCV_RVV
+uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, size_t len, size_t left);
+uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+
+uint32_t longest_match_rvv(deflate_state *const s, uint32_t cur_match);
+uint32_t longest_match_slow_rvv(deflate_state *const s, uint32_t cur_match);
+void slide_hash_rvv(deflate_state *s);
+void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+#ifdef RISCV_CRC32_ZBC
+uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+// RISCV - RVV
+# ifdef RISCV_RVV_NATIVE
+# undef native_adler32
+# define native_adler32 adler32_rvv
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_rvv
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_rvv
+# undef native_compare256
+# define native_compare256 compare256_rvv
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_rvv
+# undef native_longest_match
+# define native_longest_match longest_match_rvv
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_rvv
+# undef native_slide_hash
+# define native_slide_hash slide_hash_rvv
+# endif
+// RISCV - CRC32
+# ifdef RISCV_ZBC_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_riscv64_zbc
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_riscv64_zbc
+# endif
+#endif
+
+#endif /* RISCV_FUNCTIONS_H_ */
diff --git a/neozip/arch/riscv/riscv_natives.h b/neozip/arch/riscv/riscv_natives.h
new file mode 100644
index 0000000000..38d7aba648
--- /dev/null
+++ b/neozip/arch/riscv/riscv_natives.h
@@ -0,0 +1,19 @@
+/* riscv_natives.h -- RISCV compile-time feature detection macros.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_NATIVES_H_
+#define RISCV_NATIVES_H_
+
+#if defined(__riscv_v) && defined(__linux__)
+# ifdef RISCV_RVV
+# define RISCV_RVV_NATIVE
+# endif
+#endif
+#if defined(__riscv_zbc)
+# ifdef RISCV_CRC32_ZBC
+# define RISCV_ZBC_NATIVE
+# endif
+#endif
+
+#endif /* RISCV_NATIVES_H_ */
diff --git a/neozip/arch/riscv/slide_hash_rvv.c b/neozip/arch/riscv/slide_hash_rvv.c
new file mode 100644
index 0000000000..e794c38204
--- /dev/null
+++ b/neozip/arch/riscv/slide_hash_rvv.c
@@ -0,0 +1,33 @@
+/* slide_hash_rvv.c - RVV version of slide_hash
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <riscv_vector.h>
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ size_t vl;
+ while (entries > 0) {
+ vl = __riscv_vsetvl_e16m4(entries);
+ vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
+ vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl);
+ __riscv_vse16_v_u16m4(table, v_diff, vl);
+ table += vl, entries -= vl;
+ }
+}
+
+Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif // RISCV_RVV