diff options
| author | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
|---|---|---|
| committer | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
| commit | 7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch) | |
| tree | b43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/riscv | |
| parent | b1e34e861b5d732afe828d58aad2c638135061fd (diff) | |
| parent | c2712b8a345191f6ed79558c089777df94590087 (diff) | |
| download | Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip | |
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip
git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd
git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/riscv')
| -rw-r--r-- | neozip/arch/riscv/Makefile.in | 72 | ||||
| -rw-r--r-- | neozip/arch/riscv/README.md | 45 | ||||
| -rw-r--r-- | neozip/arch/riscv/adler32_rvv.c | 119 | ||||
| -rw-r--r-- | neozip/arch/riscv/chunkset_rvv.c | 126 | ||||
| -rw-r--r-- | neozip/arch/riscv/compare256_rvv.c | 48 | ||||
| -rw-r--r-- | neozip/arch/riscv/crc32_zbc.c | 103 | ||||
| -rw-r--r-- | neozip/arch/riscv/riscv_features.c | 99 | ||||
| -rw-r--r-- | neozip/arch/riscv/riscv_features.h | 19 | ||||
| -rw-r--r-- | neozip/arch/riscv/riscv_functions.h | 60 | ||||
| -rw-r--r-- | neozip/arch/riscv/riscv_natives.h | 19 | ||||
| -rw-r--r-- | neozip/arch/riscv/slide_hash_rvv.c | 33 |
11 files changed, 743 insertions, 0 deletions
diff --git a/neozip/arch/riscv/Makefile.in b/neozip/arch/riscv/Makefile.in new file mode 100644 index 0000000000..43176eee6e --- /dev/null +++ b/neozip/arch/riscv/Makefile.in @@ -0,0 +1,72 @@ +# Makefile for zlib-ng +# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler +# Copyright (C) 2024 Hans Kristian Rosbach +# Copyright (C) 2025 Yin Tong <yintong.ustc@bytedance.com>, ByteDance +# For conditions of distribution and use, see copyright notice in zlib.h + +CC= +CFLAGS= +SFLAGS= +INCLUDES= +SUFFIX= + +SRCDIR=. +SRCTOP=../.. +TOPDIR=$(SRCTOP) + +RVVFLAG= +RVVZBCFLAG= +ZBCFLAG= + +all: \ + riscv_features.o riscv_features.lo \ + adler32_rvv.o adler32_rvv.lo \ + chunkset_rvv.o chunkset_rvv.lo \ + compare256_rvv.o compare256_rvv.lo \ + slide_hash_rvv.o slide_hash_rvv.lo \ + crc32_zbc.o crc32_zbc.lo + +riscv_features.o: $(SRCDIR)/riscv_features.c + $(CC) $(CFLAGS) $(RVVZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c + +riscv_features.lo: $(SRCDIR)/riscv_features.c + $(CC) $(SFLAGS) $(RVVZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/riscv_features.c + +adler32_rvv.o: $(SRCDIR)/adler32_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c + +adler32_rvv.lo: $(SRCDIR)/adler32_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_rvv.c + +chunkset_rvv.o: $(SRCDIR)/chunkset_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c + +chunkset_rvv.lo: $(SRCDIR)/chunkset_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_rvv.c + +compare256_rvv.o: $(SRCDIR)/compare256_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c + +compare256_rvv.lo: $(SRCDIR)/compare256_rvv.c + $(CC) $(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_rvv.c + +slide_hash_rvv.o: $(SRCDIR)/slide_hash_rvv.c + $(CC) $(CFLAGS) $(RVVFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c + +slide_hash_rvv.lo: $(SRCDIR)/slide_hash_rvv.c + $(CC) 
$(SFLAGS) $(RVVFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_rvv.c + +crc32_zbc.o: $(SRCDIR)/crc32_zbc.c + $(CC) $(CFLAGS) $(ZBCFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c + +crc32_zbc.lo: $(SRCDIR)/crc32_zbc.c + $(CC) $(SFLAGS) $(ZBCFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_zbc.c + +mostlyclean: clean +clean: + rm -f *.o *.lo *~ + rm -rf objs + rm -f *.gcda *.gcno *.gcov + +distclean: clean + rm -f Makefile diff --git a/neozip/arch/riscv/README.md b/neozip/arch/riscv/README.md new file mode 100644 index 0000000000..013095c373 --- /dev/null +++ b/neozip/arch/riscv/README.md @@ -0,0 +1,45 @@ +# Building RISC-V Target with Cmake # + +> **Warning** +> Runtime rvv detection (using `hwcap`) requires linux kernel 6.5 or newer. +> +> When running on older kernels, we fall back to compile-time detection; this can potentially cause crashes if rvv is enabled at compile time but not supported by the target cpu. +> Therefore, if older kernel support is needed, rvv should be disabled if the target cpu does not support it. +## Prerequisite: Build RISC-V Clang Toolchain and QEMU ## + +If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version). + +```bash +./prepare_riscv_toolchain_qemu.sh +``` + +After running the script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`. + +`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`. +`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`. + +You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them. 
+ +## Cross-Compile for RISC-V Target ## + +```bash +cmake -G Ninja -B ./build-riscv \ + -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \ + -D CMAKE_INSTALL_PREFIX=./build-riscv/install \ + -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + -D QEMU_PATH={QEMU_PATH} \ + . + +cmake --build ./build-riscv +``` + +Disable the option if there is no RVV support: +``` +-D WITH_RVV=OFF +``` + +## Run Unittests on User Mode QEMU ## + +```bash +cd ./build-riscv && ctest --verbose +``` diff --git a/neozip/arch/riscv/adler32_rvv.c b/neozip/arch/riscv/adler32_rvv.c new file mode 100644 index 0000000000..e446189302 --- /dev/null +++ b/neozip/arch/riscv/adler32_rvv.c @@ -0,0 +1,119 @@ +/* adler32_rvv.c - RVV version of adler32 + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "adler32_p.h" + +#include <riscv_vector.h> + +Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) { + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (UNLIKELY(len == 1)) + return adler32_copy_tail(adler, dst, src, 1, sum2, 1, 1, COPY); + + /* in case short lengths are provided, keep it somewhat fast */ + if (UNLIKELY(len < 16)) + return adler32_copy_tail(adler, dst, src, len, sum2, 1, 15, COPY); + + size_t left = len; + size_t vl = __riscv_vsetvlmax_e8m1(); + vl = MIN(vl, 256); + vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint16m2_t v_buf16_accu; + + /* + * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator. + * However, adding 8-bit data into a 32-bit accumulator isn't efficient. 
We use 16-bit & 32-bit + * accumulators to boost performance. + * + * The block_size is the largest multiple of vl that <= 256, because overflow would occur when + * vl > 256 (255 * 256 <= UINT16_MAX). + * + * We accumulate 8-bit data into a 16-bit accumulator and then + * move the data into the 32-bit accumulator at the last iteration. + */ + size_t block_size = (256 / vl) * vl; + size_t nmax_limit = (NMAX / block_size); + size_t cnt = 0; + while (left >= block_size) { + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t subprob = block_size; + while (subprob > 0) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl); + if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + src += vl; + if (COPY) dst += vl; + subprob -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + left -= block_size; + /* do modulo once each block of NMAX size */ + if (++cnt >= nmax_limit) { + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl); + cnt = 0; + } + } + /* the left len <= 256 now, we can use 16-bit accum safely */ + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t res = left; + while (left >= vl) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl); + if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + src += vl; + if (COPY) dst += vl; + left -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl); + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = 
__riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + + vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl); + vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl); + vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl); + + v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl); + + vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl); + uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE; + + sum2 += (sum2_sum + adler * ((len - left) % BASE)); + + vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl); + uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE; + + adler += adler_sum; + + sum2 %= BASE; + adler %= BASE; + + /* Process tail (left < 256). */ + return adler32_copy_tail(adler, dst, src, left, sum2, left != 0, 255, COPY); +} + +Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) { + return adler32_copy_impl(adler, NULL, buf, len, 0); +} + +Z_INTERNAL uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + return adler32_copy_impl(adler, dst, src, len, 1); +} + +#endif // RISCV_RVV diff --git a/neozip/arch/riscv/chunkset_rvv.c b/neozip/arch/riscv/chunkset_rvv.c new file mode 100644 index 0000000000..cd8ed3cfd2 --- /dev/null +++ b/neozip/arch/riscv/chunkset_rvv.c @@ -0,0 +1,126 @@ +/* chunkset_rvv.c - RVV version of chunkset + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" + +#include <riscv_vector.h> + +/* + * RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * so we prefer using large size chunk and copy memory as much as possible. + */ +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +#define CHUNK_MEMSET_RVV_IMPL(from, chunk, elen) \ +do { \ + size_t vl, len = sizeof(*chunk) / sizeof(uint##elen##_t); \ + uint##elen##_t val = *(uint##elen##_t*)from; \ + uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \ + do { \ + vl = __riscv_vsetvl_e##elen##m4(len); \ + vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \ + __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \ + len -= vl; chunk_p += vl; \ + } while (len > 0); \ +} while (0) + +/* We don't have a 32-byte datatype for RISC-V arch. */ +typedef struct chunk_s { + uint64_t data[4]; +} chunk_t; + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 16); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 32); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + CHUNK_MEMSET_RVV_IMPL(from, chunk, 64); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + memcpy(chunk->data, (uint8_t *)s, sizeof(*chunk)); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + memcpy(out, chunk->data, sizeof(*chunk)); +} + +#define CHUNKSIZE chunksize_rvv +#define CHUNKCOPY chunkcopy_rvv +#define CHUNKUNROLL chunkunroll_rvv +#define CHUNKMEMSET chunkmemset_rvv +#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv + +#define HAVE_CHUNKCOPY + +/* + * Assuming that the length is non-zero, and that `from` lags `out` by at least + * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h. 
+ * + * We load/store a single chunk once in the `CHUNKCOPY`. + * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * such that we prefer to copy a large memory size at once to make good use of the RVV advance. + * + * To be aligned to the other platforms, we didn't modify the `CHUNKCOPY` method a lot, + * but we still copy as much memory as possible for some conditions. + * + * case 1: out - from >= len (no overlap) + * We can use memcpy to copy `len` size once + * because the memory layout would be the same. + * + * case 2: overlap + * We copy N chunks using memcpy at once, aiming to achieve our goal: + * to copy as much memory as possible. + * + * After using a single memcpy to copy N chunks, we have to use a series of + * loadchunk and storechunk to ensure the result is correct. + */ +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + size_t dist = out - from; + if (out < from || dist >= len) { + memcpy(out, from, len); + out += len; + from += len; + return out; + } + + size_t align = ((len - 1) % sizeof(chunk_t)) + 1; + memcpy(out, from, sizeof(chunk_t)); + out += align; + from += align; + len -= align; + + size_t vl = (dist / sizeof(chunk_t)) * sizeof(chunk_t); + while (len > dist) { + memcpy(out, from, vl); + out += vl; + from += vl; + len -= vl; + } + + if (len > 0) { + memcpy(out, from, len); + out += len; + } + return out; +} + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_rvv + +#include "inffast_tpl.h" + +#endif diff --git a/neozip/arch/riscv/compare256_rvv.c b/neozip/arch/riscv/compare256_rvv.c new file mode 100644 index 0000000000..edb18a3766 --- /dev/null +++ b/neozip/arch/riscv/compare256_rvv.c @@ -0,0 +1,48 @@ +/* compare256_rvv.c - RVV version of compare256 + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "zmemory.h" +#include "deflate.h" + +#include <riscv_vector.h> + +static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + size_t vl; + long found_diff; + do { + vl = __riscv_vsetvl_e8m4(256 - len); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl); + found_diff = __riscv_vfirst_m_b2(v_mask, vl); + if (found_diff >= 0) + return len + (uint32_t)found_diff; + src0 += vl, src1 += vl, len += vl; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) { + return compare256_rvv_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#endif // RISCV_RVV diff --git a/neozip/arch/riscv/crc32_zbc.c b/neozip/arch/riscv/crc32_zbc.c new file mode 100644 index 0000000000..cf52279b80 --- /dev/null +++ b/neozip/arch/riscv/crc32_zbc.c @@ -0,0 +1,103 @@ +/* crc32_zbc.c - RISCV Zbc version of crc32 + * Copyright (C) 2025 ByteDance. All rights reserved. 
+ * Contributed by Yin Tong <yintong.ustc@bytedance.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_CRC32_ZBC + +#include "zbuild.h" +#include "arch_functions.h" + +#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul +#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul + +#define CONSTANT_R3 0x1751997d0ULL +#define CONSTANT_R4 0x0ccaa009eULL +#define CONSTANT_R5 0x163cd6124ULL +#define MASK32 0xFFFFFFFF +#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL +#define CONSTANT_RU 0x1F7011641ULL + +static inline uint64_t clmul(uint64_t a, uint64_t b) { + uint64_t res; + __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b)); + return res; +} + +static inline uint64_t clmulh(uint64_t a, uint64_t b) { + uint64_t res; + __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b)); + return res; +} + +Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) { + const uint64_t *buf64 = (const uint64_t *)buf; + uint64_t low = buf64[0] ^ crc; + uint64_t high = buf64[1]; + + if (len < 16) + goto finish_fold; + len -= 16; + buf64 += 2; + + // process each 16-byte block + while (len >= 16) { + uint64_t t2 = clmul(CONSTANT_R4, high); + uint64_t t3 = clmulh(CONSTANT_R4, high); + + uint64_t t0_new = clmul(CONSTANT_R3, low); + uint64_t t1_new = clmulh(CONSTANT_R3, low); + + // Combine the results and XOR with new data + low = t0_new ^ t2; + high = t1_new ^ t3; + low ^= buf64[0]; + high ^= buf64[1]; + + buf64 += 2; + len -= 16; + } + +finish_fold: + // Fold the 128-bit result into 64 bits + uint64_t fold_t3 = clmulh(low, CONSTANT_R4); + uint64_t fold_t2 = clmul(low, CONSTANT_R4); + low = high ^ fold_t2; + high = fold_t3; + + // Combine the low and high parts and perform polynomial reduction + uint64_t combined = (low >> 32) | ((high & MASK32) << 32); + uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined; + + // Barrett reduction step + uint64_t 
barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32; + barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL); + uint64_t final = barrett ^ reduced_low; + + // Return the high 32 bits as the final CRC + return (uint32_t)(final >> 32); +} + +Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) { + if (len < CLMUL_MIN_LEN) { + return crc32_braid(crc, buf, len); + } + + uint64_t unaligned_length = len % CLMUL_CHUNK_LEN; + if (unaligned_length) { + crc = crc32_braid(crc, buf, unaligned_length); + buf += unaligned_length; + len -= unaligned_length; + } + + crc = crc32_clmul_impl(~crc, buf, len); + return ~crc; +} + +Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc = crc32_riscv64_zbc(crc, src, len); + memcpy(dst, src, len); + return crc; +} +#endif diff --git a/neozip/arch/riscv/riscv_features.c b/neozip/arch/riscv/riscv_features.c new file mode 100644 index 0000000000..b23f10a699 --- /dev/null +++ b/neozip/arch/riscv/riscv_features.c @@ -0,0 +1,99 @@ +#ifdef RISCV_FEATURES + +#define _DEFAULT_SOURCE 1 /* For syscall() */ + +#include "zbuild.h" +#include "riscv_features.h" + +#include <sys/utsname.h> + +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) +# include <sys/auxv.h> +#endif + +#if defined(__linux__) && defined(HAVE_ASM_HWPROBE_H) +# include <asm/hwprobe.h> +# include <sys/syscall.h> /* For __NR_riscv_hwprobe */ +# include <unistd.h> /* For syscall() */ +#endif + +#define ISA_V_HWCAP (1 << ('v' - 'a')) +#define ISA_ZBC_HWCAP (1 << 29) + +static int riscv_check_features_runtime_hwprobe(struct riscv_cpu_features *features) { +#if defined(__NR_riscv_hwprobe) && defined(RISCV_HWPROBE_KEY_IMA_EXT_0) + struct riscv_hwprobe probes[] = { + {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}, + }; + int ret; + unsigned i; + + ret = syscall(__NR_riscv_hwprobe, probes, sizeof(probes) / sizeof(probes[0]), 0, NULL, 0); + + if (ret != 0) { + /* Kernel does not support hwprobe */ + return 0; + } + + 
for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) { + switch (probes[i].key) { + case RISCV_HWPROBE_KEY_IMA_EXT_0: +# ifdef RISCV_HWPROBE_IMA_V + features->has_rvv = !!(probes[i].value & RISCV_HWPROBE_IMA_V); +# endif +# ifdef RISCV_HWPROBE_EXT_ZBC + features->has_zbc = !!(probes[i].value & RISCV_HWPROBE_EXT_ZBC); +# endif + break; + } + } + + return 1; +#else + return 0; +#endif +} + +static int riscv_check_features_runtime_hwcap(struct riscv_cpu_features *features) { +#if defined(__linux__) && defined(HAVE_SYS_AUXV_H) + unsigned long hw_cap = getauxval(AT_HWCAP); + + features->has_rvv = hw_cap & ISA_V_HWCAP; + features->has_zbc = hw_cap & ISA_ZBC_HWCAP; + + return 1; +#else + return 0; +#endif +} + +static void riscv_check_features_runtime(struct riscv_cpu_features *features) { + if (riscv_check_features_runtime_hwprobe(features)) + return; + + riscv_check_features_runtime_hwcap(features); +} + +void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) { + riscv_check_features_runtime(features); +#ifdef RISCV_RVV + if (features->has_rvv) { + size_t e8m1_vec_len; + intptr_t vtype_reg_val; + // Check that a vuint8m1_t vector is at least 16 bytes and that tail + // agnostic and mask agnostic mode are supported + // + __asm__ volatile( + "vsetvli %0, zero, e8, m1, ta, ma\n\t" + "csrr %1, vtype" + : "=r"(e8m1_vec_len), "=r"(vtype_reg_val)); + + // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of + // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16 + // bytes + features->has_rvv = (vtype_reg_val >= 0 && e8m1_vec_len >= 16); + } +#endif +} + +#endif diff --git a/neozip/arch/riscv/riscv_features.h b/neozip/arch/riscv/riscv_features.h new file mode 100644 index 0000000000..42855a1b6b --- /dev/null +++ b/neozip/arch/riscv/riscv_features.h @@ -0,0 +1,19 @@ +/* riscv_features.h -- check for riscv features. + * + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_FEATURES_H_ +#define RISCV_FEATURES_H_ + +struct riscv_cpu_features { + int has_rvv; + int has_zbc; +}; + +void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features); + +#endif /* RISCV_FEATURES_H_ */ diff --git a/neozip/arch/riscv/riscv_functions.h b/neozip/arch/riscv/riscv_functions.h new file mode 100644 index 0000000000..89120ffabf --- /dev/null +++ b/neozip/arch/riscv/riscv_functions.h @@ -0,0 +1,60 @@ +/* riscv_functions.h -- RISCV implementations for arch-specific functions. + * + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang <alex.chiang@sifive.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_FUNCTIONS_H_ +#define RISCV_FUNCTIONS_H_ + +#include "riscv_natives.h" + +#ifdef RISCV_RVV +uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); +uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, size_t len, size_t left); +uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); + +uint32_t longest_match_rvv(deflate_state *const s, uint32_t cur_match); +uint32_t longest_match_slow_rvv(deflate_state *const s, uint32_t cur_match); +void slide_hash_rvv(deflate_state *s); +void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); +#endif + +#ifdef RISCV_CRC32_ZBC +uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len); +uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// RISCV - RVV +# ifdef RISCV_RVV_NATIVE +# undef native_adler32 +# define native_adler32 adler32_rvv +# undef native_adler32_copy +# define native_adler32_copy adler32_copy_rvv +# undef 
native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_rvv +# undef native_compare256 +# define native_compare256 compare256_rvv +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_rvv +# undef native_longest_match +# define native_longest_match longest_match_rvv +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_rvv +# undef native_slide_hash +# define native_slide_hash slide_hash_rvv +# endif +// RISCV - CRC32 +# ifdef RISCV_ZBC_NATIVE +# undef native_crc32 +# define native_crc32 crc32_riscv64_zbc +# undef native_crc32_copy +# define native_crc32_copy crc32_copy_riscv64_zbc +# endif +#endif + +#endif /* RISCV_FUNCTIONS_H_ */ diff --git a/neozip/arch/riscv/riscv_natives.h b/neozip/arch/riscv/riscv_natives.h new file mode 100644 index 0000000000..38d7aba648 --- /dev/null +++ b/neozip/arch/riscv/riscv_natives.h @@ -0,0 +1,19 @@ +/* riscv_natives.h -- RISCV compile-time feature detection macros. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef RISCV_NATIVES_H_ +#define RISCV_NATIVES_H_ + +#if defined(__riscv_v) && defined(__linux__) +# ifdef RISCV_RVV +# define RISCV_RVV_NATIVE +# endif +#endif +#if defined(__riscv_zbc) +# ifdef RISCV_CRC32_ZBC +# define RISCV_ZBC_NATIVE +# endif +#endif + +#endif /* RISCV_NATIVES_H_ */ diff --git a/neozip/arch/riscv/slide_hash_rvv.c b/neozip/arch/riscv/slide_hash_rvv.c new file mode 100644 index 0000000000..e794c38204 --- /dev/null +++ b/neozip/arch/riscv/slide_hash_rvv.c @@ -0,0 +1,33 @@ +/* slide_hash_rvv.c - RVV version of slide_hash + * Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * Contributed by Alex Chiang <alex.chiang@sifive.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "zbuild.h" +#include "deflate.h" + +#include <riscv_vector.h> + +static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) { + size_t vl; + while (entries > 0) { + vl = __riscv_vsetvl_e16m4(entries); + vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl); + vuint16m4_t v_diff = __riscv_vssubu_vx_u16m4(v_tab, wsize, vl); + __riscv_vse16_v_u16m4(table, v_diff, vl); + table += vl, entries -= vl; + } +} + +Z_INTERNAL void slide_hash_rvv(deflate_state *s) { + Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t"); + uint16_t wsize = (uint16_t)s->w_size; + + slide_hash_chain(s->head, HASH_SIZE, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} + +#endif // RISCV_RVV |
