summaryrefslogtreecommitdiff
path: root/neozip/test/benchmarks
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/test/benchmarks
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/test/benchmarks')
-rw-r--r--neozip/test/benchmarks/CMakeLists.txt126
-rw-r--r--neozip/test/benchmarks/README.md63
-rw-r--r--neozip/test/benchmarks/benchmark_adler32.cc121
-rw-r--r--neozip/test/benchmarks/benchmark_adler32_copy.cc176
-rw-r--r--neozip/test/benchmarks/benchmark_compare256.cc106
-rw-r--r--neozip/test/benchmarks/benchmark_compare256_rle.cc72
-rw-r--r--neozip/test/benchmarks/benchmark_compress.cc75
-rw-r--r--neozip/test/benchmarks/benchmark_crc32.cc125
-rw-r--r--neozip/test/benchmarks/benchmark_crc32_copy.cc177
-rw-r--r--neozip/test/benchmarks/benchmark_deflate.cc147
-rw-r--r--neozip/test/benchmarks/benchmark_inflate.cc169
-rw-r--r--neozip/test/benchmarks/benchmark_insert_string.cc164
-rw-r--r--neozip/test/benchmarks/benchmark_main.cc32
-rw-r--r--neozip/test/benchmarks/benchmark_png_decode.cc126
-rw-r--r--neozip/test/benchmarks/benchmark_png_encode.cc54
-rw-r--r--neozip/test/benchmarks/benchmark_png_shared.h146
-rw-r--r--neozip/test/benchmarks/benchmark_slidehash.cc116
-rw-r--r--neozip/test/benchmarks/benchmark_uncompress.cc97
18 files changed, 2092 insertions, 0 deletions
diff --git a/neozip/test/benchmarks/CMakeLists.txt b/neozip/test/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000..df6f5a7e69
--- /dev/null
+++ b/neozip/test/benchmarks/CMakeLists.txt
@@ -0,0 +1,126 @@
+cmake_minimum_required(VERSION 3.14...4.2.1)
+
+include(FetchContent)
+
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 11)
+endif()
+if(NOT DEFINED CMAKE_CXX_STANDARD_REQUIRED)
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+if(NOT DEFINED CMAKE_CXX_EXTENSIONS)
+ set(CMAKE_CXX_EXTENSIONS ON)
+endif()
+
+# Search for Google benchmark package
+find_package(benchmark QUIET)
+if(NOT benchmark_FOUND)
+ # Fetch google benchmark source code from official repository
+ set(BENCHMARK_ENABLE_TESTING OFF)
+
+ # Allow specifying alternative Google benchmark repository
+ if(NOT DEFINED GBENCHMARK_REPOSITORY)
+ set(GBENCHMARK_REPOSITORY https://github.com/google/benchmark.git)
+ endif()
+ if(NOT DEFINED GBENCHMARK_TAG)
+ set(GBENCHMARK_TAG v1.9.4)
+ endif()
+
+ FetchContent_Declare(benchmark
+ GIT_REPOSITORY ${GBENCHMARK_REPOSITORY}
+ GIT_TAG ${GBENCHMARK_TAG}
+ ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL})
+
+ ZNG_FetchContent_MakeAvailable(benchmark)
+endif()
+
+# Public API benchmarks
+set(BENCH_PUBLIC_SRCS
+ benchmark_compress.cc
+ benchmark_inflate.cc
+ benchmark_uncompress.cc
+ benchmark_main.cc
+ )
+
+# Internal benchmarks
+set(BENCH_INTERNAL_SRCS
+ benchmark_adler32.cc
+ benchmark_adler32_copy.cc
+ benchmark_compare256.cc
+ benchmark_compare256_rle.cc
+ benchmark_crc32.cc
+ benchmark_crc32_copy.cc
+ benchmark_deflate.cc
+ benchmark_insert_string.cc
+ benchmark_slidehash.cc
+ )
+
+add_executable(benchmark_zlib ${BENCH_PUBLIC_SRCS})
+
+# No -D prefix needed: target_compile_definitions strips/adds it itself
+target_compile_definitions(benchmark_zlib PRIVATE BENCHMARK_STATIC_DEFINE)
+target_include_directories(benchmark_zlib PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${PROJECT_BINARY_DIR}
+
+target_link_libraries(benchmark_zlib benchmark::benchmark)
+if(ZLIB_LIBRARY)
+ target_link_libraries(benchmark_zlib ${ZLIB_LIBRARY})
+else()
+ target_sources(benchmark_zlib PRIVATE ${BENCH_INTERNAL_SRCS})
+ target_link_libraries(benchmark_zlib zlib-ng-static)
+endif()
+
+if(WIN32)
+ target_link_libraries(benchmark_zlib shlwapi)
+endif()
+
+add_test(NAME benchmark_zlib
+ COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib> "--benchmark_min_time=0")
+
+if(WITH_BENCHMARK_APPS)
+ option(BUILD_ALT_BENCH "Link against alternative zlib implementation" OFF)
+
+ # Search for libpng package
+ find_package(PNG QUIET)
+
+ if(NOT PNG_FOUND)
+ FetchContent_Declare(PNG
+ GIT_REPOSITORY https://github.com/glennrp/libpng.git
+ ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL})
+
+ ZNG_FetchContent_MakeAvailable(PNG)
+ set(PNG_INCLUDE_DIR ${png_SOURCE_DIR})
+ endif()
+
+ set(BENCH_APP_SRCS
+ benchmark_png_encode.cc
+ benchmark_png_decode.cc
+ benchmark_main.cc
+ )
+
+ add_executable(benchmark_zlib_apps ${BENCH_APP_SRCS})
+
+  if(BUILD_ALT_BENCH) # option() above always defines it, so test its value, not DEFINED
+    set(ZLIB_ALT_LIB "libz.a" CACHE FILEPATH "Optional alternative zlib implementation (defaults to stock zlib)")
+    add_executable(benchmark_zlib_apps_alt ${BENCH_APP_SRCS})
+    target_link_libraries(benchmark_zlib_apps_alt libpng.a ${ZLIB_ALT_LIB} benchmark::benchmark)
+    target_compile_definitions(benchmark_zlib_apps_alt PRIVATE BUILD_ALT=1)
+    target_include_directories(benchmark_zlib_apps_alt PRIVATE
+      ${PROJECT_SOURCE_DIR}
+      ${PROJECT_BINARY_DIR}
+      ${PNG_INCLUDE_DIR}
+      ${benchmark_SOURCE_DIR}/benchmark/include)
+  endif()
+
+ target_include_directories(benchmark_zlib_apps PRIVATE
+ ${PROJECT_SOURCE_DIR}
+ ${PROJECT_BINARY_DIR}
+ ${PNG_INCLUDE_DIR}
+ ${benchmark_SOURCE_DIR}/benchmark/include)
+
+ # We need the static png library if we're statically linking to zlib,
+ # otherwise it will resolve these things in the system provided dynamic
+ # libraries (likely linked to stock zlib)
+ target_link_libraries(benchmark_zlib_apps libpng.a zlib-ng-static benchmark::benchmark)
+endif()
diff --git a/neozip/test/benchmarks/README.md b/neozip/test/benchmarks/README.md
new file mode 100644
index 0000000000..08ccea233e
--- /dev/null
+++ b/neozip/test/benchmarks/README.md
@@ -0,0 +1,63 @@
+## Benchmarks
+These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
+
+*Repetitions*
+
+To increase the number of times each benchmark iteration is run use:
+
+```
+--benchmark_repetitions=20
+```
+
+*Filters*
+
+To filter out which benchmarks are performed use:
+
+```
+--benchmark_filter="adler32*"
+```
+
+There are two different benchmarks, micro and macro.
+
+### Benchmark benchmark_zlib
+These are microbenchmarks intended to test lower level subfunctions of the library.
+
+Benchmarks include implementations of:
+ - Adler32
+ - CRC
+ - 256 byte comparisons
+ - SIMD accelerated "slide hash" routine
+
+By default these benchmarks report things on the nanosecond scale and are small enough
+to measure very minute differences.
+
+*Alternative zlib library*
+
+To benchmark against an alternative zlib-compatible library, use the `ZLIB_LIBRARY`
+CMake argument. When set, only the public API benchmarks are built:
+
+```sh
+cmake -S . -B build-alt \
+ -DZLIB_COMPAT=ON \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DBUILD_TESTING=ON \
+ -DWITH_BENCHMARKS=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_RUNTIME_CPU_DETECTION=OFF \
+ -DZLIB_LIBRARY=/path/to/libz.a
+```
+
+### Benchmark benchmark_zlib_apps
+These benchmarks measure applications of zlib as a whole. Currently the only examples
+are PNG encoding and decoding. The PNG encode and decode tests leverage procedurally
+generated and highly compressible image data.
+
+Additionally, a test called `png_decode_realistic` will decode any RGB 8 BPP encoded
+set of PNGs in the working directory under a directory named "test_pngs" with files named
+{0..1}.png. If these images do not exist, the test will report an error and the benchmark
+will move on to the next set of benchmarks.
+
+*benchmark_zlib_apps_alt*
+
+The user can compile a comparison benchmark application linking to any zlib-compatible
+implementation of their choosing.
diff --git a/neozip/test/benchmarks/benchmark_adler32.cc b/neozip/test/benchmarks/benchmark_adler32.cc
new file mode 100644
index 0000000000..5ee9102e23
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_adler32.cc
@@ -0,0 +1,121 @@
+/* benchmark_adler32.cc -- benchmark adler32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
+
+class adler32: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark Adler32, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ }
+};
+
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+ BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);
+
+BENCHMARK_ADLER32(c, adler32_c, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_ADLER32(native, native_adler32, 1);
+#else
+
+#ifdef ARM_NEON
+BENCHMARK_ADLER32(neon, adler32_neon, test_cpu_features.arm.has_neon);
+#endif
+
+#ifdef PPC_VMX
+BENCHMARK_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+
+#ifdef RISCV_RVV
+BENCHMARK_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef X86_AVX512
+BENCHMARK_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef X86_AVX512VNNI
+BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+#endif
+
+#ifdef LOONGARCH_LSX
+BENCHMARK_ADLER32(lsx, adler32_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_ADLER32(lasx, adler32_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_adler32_copy.cc b/neozip/test/benchmarks/benchmark_adler32_copy.cc
new file mode 100644
index 0000000000..6d913b1d19
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_adler32_copy.cc
@@ -0,0 +1,176 @@
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
+
+class adler32_copy: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+ uint8_t *dstbuf;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL || dstbuf == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ zng_free_aligned(dstbuf);
+ }
+};
+
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy + the plain (non-copy) hash function as a baseline;
+// the MEMCPY macros call their argument as hashfunc(init_sum, buf, len), so they must
+// receive hashfunc, not the 4-argument copyfunc.
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
+#else
+
+#ifdef ARM_NEON
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
+#endif
+
+#ifdef PPC_VMX
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+
+#ifdef RISCV_RVV
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
+#ifdef X86_SSE42
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2); // hashfunc is adler32_avx2 (adler32_avx does not exist)
+#endif
+#ifdef X86_AVX512
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef X86_AVX512VNNI
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+#endif
+
+#ifdef LOONGARCH_LSX
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_compare256.cc b/neozip/test/benchmarks/benchmark_compare256.cc
new file mode 100644
index 0000000000..2d8352879d
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compare256.cc
@@ -0,0 +1,106 @@
+/* benchmark_compare256.cc -- benchmark compare256 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define MAX_COMPARE_SIZE (256 + 64)
+
+class compare256: public benchmark::Fixture {
+private:
+ uint8_t *str1;
+ uint8_t *str2;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ if (str1 == NULL || str2 == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ memset(str1, 'a', MAX_COMPARE_SIZE);
+ memset(str2, 'a', MAX_COMPARE_SIZE);
+ }
+
+ // Benchmark compare256, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, compare256_func compare256) {
+ int misalign = 0;
+ int32_t match_len = (int32_t)state.range(0) - 1;
+ uint32_t len = 0;
+
+ for (auto _ : state) {
+ str2[match_len + misalign] = 0; // Set new match limit
+
+ len = compare256((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign);
+
+ str2[match_len + misalign] = 'a'; // Reset match limit
+
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign++;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(len);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(str1);
+ free(str2);
+ }
+};
+
+#define BENCHMARK_COMPARE256(name, comparefunc, support_flag) \
+ BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, comparefunc); \
+ } \
+ BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_COMPARE256(native, native_compare256, 1);
+#else
+
+#ifdef WITH_ALL_FALLBACKS
+BENCHMARK_COMPARE256(8, compare256_8, 1);
+BENCHMARK_COMPARE256(64, compare256_64, 1);
+#endif
+
+#ifdef X86_SSE2
+BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef X86_AVX512
+BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef ARM_NEON
+BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
+#endif
+#ifdef POWER9
+BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00);
+#endif
+#ifdef RISCV_RVV
+BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_compare256_rle.cc b/neozip/test/benchmarks/benchmark_compare256_rle.cc
new file mode 100644
index 0000000000..db5adacc19
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compare256_rle.cc
@@ -0,0 +1,72 @@
+/* benchmark_compare256_rle.cc -- benchmark compare256_rle variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "compare256_rle.h"
+}
+
+#define MAX_COMPARE_SIZE (256 + 64)
+
+class compare256_rle: public benchmark::Fixture {
+private:
+ uint8_t *str1;
+ uint8_t *str2;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ if (str1 == NULL || str2 == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ memset(str1, 'a', MAX_COMPARE_SIZE);
+ memset(str2, 'a', MAX_COMPARE_SIZE);
+ }
+
+ // Benchmark compare256_rle, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, compare256_rle_func compare256_rle) {
+ int misalign = 0;
+ int32_t match_len = (int32_t)state.range(0) - 1;
+ uint32_t len = 0;
+
+ for (auto _ : state) {
+ str2[match_len + misalign] = 0; // Set new match limit
+
+ len = compare256_rle((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign);
+
+ str2[match_len + misalign] = 'a'; // Reset match limit
+
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign++;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(len);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(str1);
+ free(str2);
+ }
+};
+
+#define BENCHMARK_COMPARE256_RLE(name, comparefunc, support_flag) \
+    BENCHMARK_DEFINE_F(compare256_rle, name)(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, comparefunc); \
+    } \
+    BENCHMARK_REGISTER_F(compare256_rle, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
+
+BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
+BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
diff --git a/neozip/test/benchmarks/benchmark_compress.cc b/neozip/test/benchmarks/benchmark_compress.cc
new file mode 100644
index 0000000000..df042f7153
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compress.cc
@@ -0,0 +1,75 @@
+/* benchmark_compress.cc -- benchmark compress()
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (64 * 1024)
+
+class compress_bench: public benchmark::Fixture {
+private:
+ uint8_t *inbuff;
+ uint8_t *outbuff;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+ if (outbuff == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ // Initialize input buffer with highly compressible data, interspersed
+ // with small amounts of random data and 3-byte matches.
+ inbuff = gen_compressible_data(MAX_SIZE);
+ if (inbuff == NULL) {
+ free(outbuff);
+ outbuff = NULL;
+ state.SkipWithError("gen_compressible_data() failed");
+ return;
+ }
+ }
+
+ void Bench(benchmark::State& state) {
+ int err = 0;
+
+ for (auto _ : state) {
+ z_uintmax_t compressed_size = MAX_SIZE + 16;
+ err = PREFIX(compress)(outbuff, &compressed_size, inbuff, (size_t)state.range(0));
+ if (err != Z_OK) {
+ fprintf(stderr, "compress() failed with error %d\n", err);
+ abort();
+ }
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(err);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(inbuff);
+ free(outbuff);
+ }
+};
+
+#define BENCHMARK_COMPRESS(name) \
+ BENCHMARK_DEFINE_F(compress_bench, name)(benchmark::State& state) { \
+ Bench(state); \
+ } \
+ BENCHMARK_REGISTER_F(compress_bench, name)->Arg(1)->Arg(16)->Arg(48)->Arg(256)->Arg(1<<10)->Arg(4<<10)->Arg(16<<10)->Arg(64<<10);
+
+BENCHMARK_COMPRESS(compress_bench);
diff --git a/neozip/test/benchmarks/benchmark_crc32.cc b/neozip/test/benchmarks/benchmark_crc32.cc
new file mode 100644
index 0000000000..1e2cf88590
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_crc32.cc
@@ -0,0 +1,125 @@
+/* benchmark_crc32.cc -- benchmark crc32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
+
+class crc32: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark CRC32, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ }
+};
+
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+ BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
+BENCHMARK_CRC32(braid, crc32_braid, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_CRC32(native, native_crc32, 1);
+#else
+
+#ifndef WITHOUT_CHORBA
+BENCHMARK_CRC32(chorba_c, crc32_chorba, 1);
+#endif
+#ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
+ BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+# endif
+# ifdef X86_SSE41
+ BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
+# endif
+#endif
+#ifdef ARM_CRC32
+BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32);
+#endif
+#ifdef ARM_PMULL_EOR3
+BENCHMARK_CRC32(armv8_pmull_eor3, crc32_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3);
+#endif
+#ifdef RISCV_CRC32_ZBC
+BENCHMARK_CRC32(riscv, crc32_riscv64_zbc, test_cpu_features.riscv.has_zbc);
+#endif
+#ifdef POWER8_VSX_CRC32
+BENCHMARK_CRC32(power8, crc32_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+#ifdef S390_CRC32_VX
+BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx);
+#endif
+#ifdef X86_PCLMULQDQ_CRC
+BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX2
+BENCHMARK_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+BENCHMARK_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef LOONGARCH_CRC
+BENCHMARK_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_crc32_copy.cc b/neozip/test/benchmarks/benchmark_crc32_copy.cc
new file mode 100644
index 0000000000..eafa5aee44
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_crc32_copy.cc
@@ -0,0 +1,177 @@
+/* benchmark_crc32_copy.cc -- benchmark for crc32 implementations with copying
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
+
+// Fixture for benchmarking crc32 "hash while copying" implementations, which
+// checksum the source buffer and copy it to the destination in a single pass.
+class crc32_copy: public benchmark::Fixture {
+protected:
+    uint32_t *testdata;  // 64-byte aligned source buffer of random words
+    uint8_t *dstbuf;     // 64-byte aligned copy destination
+
+public:
+    // Allocate and randomize both buffers; skips the benchmark on allocation failure.
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
+        }
+    }
+
+    // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results.
+    // DO_ALIGNED steps the offset by 16 bytes per iteration; otherwise it steps
+    // by 1 byte so every misalignment in [0, 63] gets exercised.
+    void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
+        int misalign = 0;
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
+
+            // Prevent the result from being optimized away
+            benchmark::DoNotOptimize(hash);
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
+    }
+};
+
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+// NOTE: the MEMCPY variants wrap the plain 3-argument hash function in a
+// memcpy lambda, so they must receive 'hashfunc'; passing 'copyfunc' (the
+// 4-argument copy variant) does not compile under HASH_BASELINE.
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+// Base test
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+ // Native
+ BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
+#else
+ // Optimized functions
+# ifndef WITHOUT_CHORBA
+ BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
+# endif
+# ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
+ BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+# endif
+# ifdef X86_SSE41
+ BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+# endif
+# endif
+# ifdef ARM_CRC32
+ BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+# endif
+# ifdef ARM_PMULL_EOR3
+ BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+# endif
+# ifdef LOONGARCH_CRC
+ BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+# endif
+# ifdef POWER8_VSX_CRC32
+ BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+# endif
+# ifdef RISCV_CRC32_ZBC
+    // Hash-baseline argument must be the real symbol crc32_riscv64_zbc (as used
+    // in benchmark_crc32.cc); 'crc32_riscv' does not exist and broke HASH_BASELINE builds.
+    BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+# endif
+# ifdef S390_CRC32_VX
+ BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+# endif
+# ifdef X86_PCLMULQDQ_CRC
+ BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+# endif
+# ifdef X86_VPCLMULQDQ_AVX2
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_deflate.cc b/neozip/test/benchmarks/benchmark_deflate.cc
new file mode 100644
index 0000000000..f60e2589d1
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_deflate.cc
@@ -0,0 +1,147 @@
+/* benchmark_deflate.cc -- benchmark deflate() with various levels and raw mode
+ * Copyright (C) 2026 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+
+/* Parameterized deflate benchmark: Args(size, level) */
+class deflate_bench: public benchmark::Fixture {
+private:
+ uint8_t *inbuff = nullptr;
+ uint8_t *outbuff = nullptr;
+ z_uintmax_t outbuff_size = 0;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ outbuff_size = PREFIX(deflateBound)(NULL, MAX_SIZE);
+ outbuff = (uint8_t *)malloc(outbuff_size);
+ if (outbuff == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ inbuff = gen_compressible_data(MAX_SIZE);
+ if (inbuff == NULL) {
+ free(outbuff);
+ outbuff = NULL;
+ state.SkipWithError("gen_compressible_data() failed");
+ return;
+ }
+ }
+
+ void Bench(benchmark::State& state, int window_bits, int strategy = Z_DEFAULT_STRATEGY) {
+ int err;
+ size_t size = (size_t)state.range(0);
+ int level = (int)state.range(1);
+
+ PREFIX3(stream) strm;
+ strm.zalloc = NULL;
+ strm.zfree = NULL;
+ strm.opaque = NULL;
+ strm.total_in = 0;
+ strm.total_out = 0;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+
+ err = PREFIX(deflateInit2)(&strm, level, Z_DEFLATED, window_bits, MAX_MEM_LEVEL, strategy);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateInit2 did not return Z_OK");
+ return;
+ }
+
+ for (auto _ : state) {
+ err = PREFIX(deflateReset)(&strm);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateReset did not return Z_OK");
+ PREFIX(deflateEnd)(&strm);
+ return;
+ }
+
+ strm.avail_in = (uint32_t)size;
+ strm.next_in = (z_const uint8_t *)inbuff;
+ strm.next_out = outbuff;
+ strm.avail_out = (uint32_t)outbuff_size;
+
+ err = PREFIX(deflate)(&strm, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ state.SkipWithError("deflate did not return Z_STREAM_END");
+ PREFIX(deflateEnd)(&strm);
+ return;
+ }
+ }
+
+ err = PREFIX(deflateEnd)(&strm);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateEnd did not return Z_OK");
+ return;
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(inbuff);
+ free(outbuff);
+ }
+};
+
+#define BENCHMARK_DEFLATE_ARGS \
+ ->Args({1024, 1})->Args({1024, 3})->Args({1024, 6})->Args({1024, 9}) \
+ ->Args({16384, 1})->Args({16384, 3})->Args({16384, 6})->Args({16384, 9}) \
+ ->Args({131072, 1})->Args({131072, 3})->Args({131072, 6})->Args({131072, 9}) \
+ ->Args({1048576, 1})->Args({1048576, 3})->Args({1048576, 6})->Args({1048576, 9})
+
+/* Parameterized deflate with zlib wrapping (includes adler32 checksum) */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_level)(benchmark::State& state) {
+ Bench(state, MAX_WBITS);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_level) BENCHMARK_DEFLATE_ARGS;
+
+/* Parameterized raw deflate without checksum */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_nocrc)(benchmark::State& state) {
+ Bench(state, -MAX_WBITS);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_nocrc) BENCHMARK_DEFLATE_ARGS;
+
+/* Strategy benchmarks use fewer size/level combos to keep test count reasonable */
+#define BENCHMARK_DEFLATE_STRATEGY_ARGS \
+ ->Args({1024, 1})->Args({1024, 6})->Args({1024, 9}) \
+ ->Args({1048576, 1})->Args({1048576, 6})->Args({1048576, 9})
+
+/* Parameterized deflate with filtered strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_filtered)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_FILTERED);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_filtered) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with Huffman-only strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_huffman)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_HUFFMAN_ONLY);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_huffman) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with RLE strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_rle)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_RLE);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_rle) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with fixed Huffman codes */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_fixed)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_FIXED);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_fixed) BENCHMARK_DEFLATE_STRATEGY_ARGS;
diff --git a/neozip/test/benchmarks/benchmark_inflate.cc b/neozip/test/benchmarks/benchmark_inflate.cc
new file mode 100644
index 0000000000..ac6ef7229f
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_inflate.cc
@@ -0,0 +1,169 @@
+/* benchmark_inflate.cc -- benchmark inflate() without crc32/adler32
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+#define NUM_TESTS 6
+
+class inflate_bench: public benchmark::Fixture {
+private:
+    // All buffers start NULL/zero so TearDown() can safely free() them even
+    // when SetUp() bails out before every allocation has happened.
+    uint8_t *inbuff = nullptr;
+    uint8_t *outbuff = nullptr;
+    uint8_t *compressed_buff[NUM_TESTS] = {nullptr};
+    z_uintmax_t compressed_sizes[NUM_TESTS] = {0};
+    uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024};
+
+public:
+    // Generate compressible test data and pre-compress it once per test size,
+    // so the timed loop in Bench() measures inflate() only.
+    void SetUp(::benchmark::State& state) {
+        int err;
+        outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+        if (outbuff == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        // Initialize input buffer with highly compressible data, interspersed
+        // with small amounts of random data and 3-byte matches.
+        inbuff = gen_compressible_data(MAX_SIZE);
+        if (inbuff == NULL) {
+            free(outbuff);
+            outbuff = NULL;
+            state.SkipWithError("gen_compressible_data() failed");
+            return;
+        }
+
+        // Initialize Deflate state (raw deflate: windowBits -15, no header/checksum)
+        PREFIX3(stream) strm;
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        strm.total_in = 0;
+        strm.total_out = 0;
+        strm.next_out = NULL;
+        strm.avail_out = 0;
+
+        err = PREFIX(deflateInit2)(&strm, Z_BEST_COMPRESSION, Z_DEFLATED, -15, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (err != Z_OK) {
+            state.SkipWithError("deflateInit2 did not return Z_OK");
+            return;
+        }
+
+        // Compress data into different buffers
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            compressed_buff[i] = (uint8_t *)malloc(sizes[i] + 64);
+            if (compressed_buff[i] == NULL) {
+                state.SkipWithError("malloc failed");
+                PREFIX(deflateEnd)(&strm);     // don't leak the deflate state
+                return;
+            }
+
+            strm.avail_in = sizes[i];                  // Size of the input buffer
+            strm.next_in = (z_const uint8_t *)inbuff;  // Pointer to the input buffer
+            strm.next_out = compressed_buff[i];        // Pointer to the output buffer
+            strm.avail_out = sizes[i] + 64;            // Maximum size of the output buffer
+
+            err = PREFIX(deflate)(&strm, Z_FINISH);    // Perform compression
+            if (err != Z_STREAM_END ) {
+                state.SkipWithError("deflate did not return Z_STREAM_END");
+                PREFIX(deflateEnd)(&strm);
+                return;
+            }
+
+            compressed_sizes[i] = strm.total_out;      // Total compressed size
+
+            err = PREFIX(deflateReset)(&strm);         // Reset Deflate state
+            if (err != Z_OK) {
+                state.SkipWithError("deflateReset did not return Z_OK");
+                PREFIX(deflateEnd)(&strm);     // don't leak the deflate state
+                return;
+            }
+        }
+
+        err = PREFIX(deflateEnd)(&strm);               // Clean up the deflate stream
+        if (err != Z_OK) {
+            state.SkipWithError("deflateEnd did not return Z_OK");
+            return;
+        }
+    }
+
+    void Bench(benchmark::State& state) {
+        int err;
+        int index = 0;
+        // Map the benchmark argument back to its precompressed buffer; guard
+        // against an unregistered size running off the end of the array.
+        while (index < NUM_TESTS && sizes[index] != (uint32_t)state.range(0)) ++index;
+        if (index == NUM_TESTS) {
+            state.SkipWithError("benchmark size was not precompressed in SetUp");
+            return;
+        }
+
+        // Initialize the inflate stream
+        PREFIX3(stream) strm;
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        strm.next_in = NULL;
+        strm.avail_in = 0;
+
+        err = PREFIX(inflateInit2)(&strm, -15); // Initialize the inflate state, no crc/adler
+        if (err != Z_OK) {
+            state.SkipWithError("inflateInit did not return Z_OK");
+            return;
+        }
+
+        for (auto _ : state) {
+            // Perform reset, avoids benchmarking inflateInit and inflateEnd
+            err = PREFIX(inflateReset)(&strm);
+            if (err != Z_OK) {
+                state.SkipWithError("inflateReset did not return Z_OK");
+                PREFIX(inflateEnd)(&strm);     // don't leak the inflate state
+                return;
+            }
+
+            strm.avail_in = (uint32_t)compressed_sizes[index]; // Size of the input
+            strm.next_in = compressed_buff[index];             // Pointer to the compressed data
+            strm.avail_out = MAX_SIZE;                         // Max size for output
+            strm.next_out = outbuff;                           // Output buffer
+
+            // Perform decompression
+            err = PREFIX(inflate)(&strm, Z_FINISH);
+            if (err != Z_STREAM_END) {
+                state.SkipWithError("inflate did not return Z_STREAM_END");
+                PREFIX(inflateEnd)(&strm);
+                return;
+            }
+        }
+
+        // Finalize the inflation process
+        err = PREFIX(inflateEnd)(&strm);
+        if (err != Z_OK) {
+            state.SkipWithError("inflateEnd did not return Z_OK");
+            return;
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        // free(NULL) is a no-op, so this is safe even after a failed SetUp()
+        free(inbuff);
+        free(outbuff);
+
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            free(compressed_buff[i]);
+        }
+    }
+};
+
+#define BENCHMARK_INFLATE(name) \
+ BENCHMARK_DEFINE_F(inflate_bench, name)(benchmark::State& state) { \
+ Bench(state); \
+ } \
+ BENCHMARK_REGISTER_F(inflate_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10);
+
+BENCHMARK_INFLATE(inflate_nocrc);
diff --git a/neozip/test/benchmarks/benchmark_insert_string.cc b/neozip/test/benchmarks/benchmark_insert_string.cc
new file mode 100644
index 0000000000..fafba3c4cd
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_insert_string.cc
@@ -0,0 +1,164 @@
+/* benchmark_insert_string.cc -- benchmark insert_string variants
+ * Copyright (C) 2025 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <limits.h>
+#include <cstring>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "deflate.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+# include "insert_string_p.h"
+}
+
+#define MAX_WSIZE 32768
+#define TEST_WINDOW_SIZE (MAX_WSIZE * 2)
+
+typedef uint32_t (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
+
+// Base class with common setup/teardown for both insert_string benchmarks
+class insert_string_base: public benchmark::Fixture {
+protected:
+ deflate_state *s;
+
+public:
+ void SetUp(const ::benchmark::State&) {
+ s = (deflate_state*)zng_alloc_aligned(sizeof(deflate_state), 64);
+ memset(s, 0, sizeof(deflate_state));
+
+ // Set up window parameters
+ s->w_size = MAX_WSIZE;
+ s->window_size = TEST_WINDOW_SIZE;
+
+ // Allocate window
+ s->window = (uint8_t*)zng_alloc_aligned(TEST_WINDOW_SIZE, 64);
+
+ // Allocate hash tables
+ s->head = (Pos*)zng_alloc_aligned(HASH_SIZE * sizeof(Pos), 64);
+ s->prev = (Pos*)zng_alloc_aligned(MAX_WSIZE * sizeof(Pos), 64);
+
+ // Initialize hash tables
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+
+ // Initialize rolling hash state for rolling variant
+ s->ins_h = 0;
+
+ // Fill window with deterministic data patterns
+ for (size_t i = 0; i < TEST_WINDOW_SIZE; i++) {
+ // Create patterns that will exercise the hash function well
+ s->window[i] = (uint8_t)((i * 17 + (i >> 4) * 31 + (i >> 8) * 13) & 0xFF);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(s->window);
+ zng_free_aligned(s->head);
+ zng_free_aligned(s->prev);
+ zng_free_aligned(s);
+ }
+};
+
+class insert_string_bench: public insert_string_base {
+public:
+ void Bench(benchmark::State& state, insert_string_cb insert_func) {
+ uint32_t str_pos = (uint32_t)state.range(0); // Starting position
+ uint32_t count = (uint32_t)state.range(1); // Number of strings to insert
+
+ // Ensure we don't go beyond window bounds
+ if (str_pos + count >= TEST_WINDOW_SIZE - 4) {
+ state.SkipWithError("Parameters exceed window size");
+ return;
+ }
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ // Reset hash tables to ensure consistent starting state
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+ s->ins_h = 0;
+
+ state.ResumeTiming();
+
+ // Benchmark the insert_string function
+ insert_func(s, str_pos, count);
+ }
+ }
+};
+
+#define BENCHMARK_INSERT_STRING(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(insert_string_bench, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("Function " #name " not supported"); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(insert_string_bench, name) \
+ ->Args({100, 3}) /* Most common case */ \
+ ->Args({100, 4}) \
+ ->Args({100, 5}) \
+ ->Args({100, 7}) \
+ ->Args({100, 14}) /* Mid-range cluster */ \
+ ->Args({100, 32}) /* Transition point */ \
+ ->Args({100, 127}) /* Large cluster around powers of 2 */ \
+ ->Args({100, 255}) /* Near maximum observed values */ \
+ ->Unit(benchmark::kNanosecond);
+
+// Benchmark the standard integer hash variant
+BENCHMARK_INSERT_STRING(integer_hash, ::insert_string, 1);
+
+// Benchmark the rolling hash variant
+BENCHMARK_INSERT_STRING(rolling_hash, ::insert_string_roll, 1);
+
+// Additional benchmark class for quick_insert_string functions
+class quick_insert_string_bench: public insert_string_base {
+public:
+ void Bench(benchmark::State& state, quick_insert_string_cb quick_insert_func) {
+ uint32_t start_pos = (uint32_t)state.range(0); // Starting position
+ uint32_t count = (uint32_t)state.range(1); // Number of insertions
+
+ if (start_pos + count >= TEST_WINDOW_SIZE - 4) {
+ state.SkipWithError("Parameters exceed window size");
+ return;
+ }
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ // Reset hash tables
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+ s->ins_h = 0;
+
+ state.ResumeTiming();
+
+ // Benchmark quick_insert_string (single insertions)
+ for (uint32_t i = 0; i < count; i++) {
+ uint32_t result = quick_insert_func(s, start_pos + i);
+ benchmark::DoNotOptimize(result);
+ }
+ }
+ }
+};
+
+#define BENCHMARK_QUICK_INSERT_STRING(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(quick_insert_string_bench, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("Function " #name " not supported"); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(quick_insert_string_bench, name) \
+ ->Args({100, 1}) /* Single insertion (baseline) */ \
+ ->Args({100, 100}) /* 100 insertions (measure amortized cost) */ \
+ ->Args({16000, 100}) /* 100 insertions at mid window (different hash distribution) */ \
+ ->Unit(benchmark::kNanosecond);
+
+BENCHMARK_QUICK_INSERT_STRING(quick_integer_hash, ::quick_insert_string, 1);
+BENCHMARK_QUICK_INSERT_STRING(quick_rolling_hash, ::quick_insert_string_roll, 1);
diff --git a/neozip/test/benchmarks/benchmark_main.cc b/neozip/test/benchmarks/benchmark_main.cc
new file mode 100644
index 0000000000..f3c227bdf7
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_main.cc
@@ -0,0 +1,32 @@
+/* benchmark_main.cc -- benchmark suite main entry point
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+
+#include <benchmark/benchmark.h>
+
+#ifndef BUILD_ALT
+extern "C" {
+# include "zbuild.h"
+# include "../test_cpu_features.h"
+
+# ifndef DISABLE_RUNTIME_CPU_DETECTION
+ struct cpu_features test_cpu_features;
+# endif
+}
+#endif
+
+int main(int argc, char** argv) {
+#ifndef BUILD_ALT
+# ifndef DISABLE_RUNTIME_CPU_DETECTION
+    // Populate the global test_cpu_features once so individual benchmarks can
+    // gate their SIMD variants on the detected CPU capabilities.
+    cpu_check_features(&test_cpu_features);
+# endif
+#endif
+
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+
+    return EXIT_SUCCESS;
+}
diff --git a/neozip/test/benchmarks/benchmark_png_decode.cc b/neozip/test/benchmarks/benchmark_png_decode.cc
new file mode 100644
index 0000000000..ce7c8f9304
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_decode.cc
@@ -0,0 +1,126 @@
+#include <stdio.h>
+#include <benchmark/benchmark.h>
+#include "benchmark_png_shared.h"
+#include <assert.h>
+
+class png_decode: public benchmark::Fixture {
+protected:
+ png_dat inpng[10];
+
+ /* Backing this on the heap is a more realistic benchmark */
+ uint8_t *output_img_buf = NULL;
+
+public:
+ /* Let's make the vanilla version have something extremely compressible */
+ virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+ init_compressible(img_bytes, width*height);
+ }
+
+ void SetUp(const ::benchmark::State&) {
+ output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ assert(output_img_buf != NULL);
+ init_img(output_img_buf, IMWIDTH, IMHEIGHT);
+
+ /* First we need to author the png bytes to be decoded */
+ for (int i = 0; i < 10; ++i) {
+ inpng[i] = {NULL, 0, 0};
+ encode_png(output_img_buf, &inpng[i], i, IMWIDTH, IMHEIGHT);
+ }
+ }
+
+ /* State in this circumstance will convey the compression level */
+ void Bench(benchmark::State &state) {
+ for (auto _ : state) {
+ int compress_lvl = state.range(0);
+ png_parse_dat in = { inpng[compress_lvl].buf };
+ uint32_t width, height;
+ decode_png(&in, (png_bytepp)&output_img_buf, IMWIDTH * IMHEIGHT * 3, width, height);
+ }
+ }
+
+ void TearDown(const ::benchmark::State &) {
+ free(output_img_buf);
+ for (int i = 0; i < 10; ++i) {
+ free(inpng[i].buf);
+ }
+ }
+};
+
+class png_decode_realistic: public png_decode {
+private:
+    bool test_files_found = false;
+
+public:
+    /* Load pre-made PNGs from test_pngs/ (one per compression level 0-9)
+     * instead of synthesizing them, for a more realistic decode workload. */
+    void SetUp(const ::benchmark::State &) {
+        /* Mark every slot empty up front so the inherited TearDown() never
+         * frees an uninitialized pointer when test files are missing. */
+        for (size_t i = 0; i < 10; ++i) {
+            inpng[i] = { NULL, 0, 0 };
+        }
+
+        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        assert(output_img_buf != NULL);
+
+        /* Let's take all the images at different compression levels and jam their bytes into buffers */
+        char test_fname[25];
+        FILE *files[10];
+
+        /* Set all to NULL (was memset(files, 0, sizeof(FILE*)), which only cleared one slot) */
+        memset(files, 0, sizeof(files));
+
+        for (size_t i = 0; i < 10; ++i) {
+            sprintf(test_fname, "test_pngs/%1lu.png", (unsigned long)i);
+            FILE *in_img = fopen(test_fname, "r");
+            if (in_img == NULL) {
+                /* Missing file: close whatever we already opened and bail out */
+                for (size_t j = 0; j < i; ++j) {
+                    if (files[j])
+                        fclose(files[j]);
+                }
+
+                return;
+            }
+            files[i] = in_img;
+        }
+
+        test_files_found = true;
+        /* Now that we've established we have all the png files, let's read all of their bytes into buffers */
+        for (size_t i = 0; i < 10; ++i) {
+            FILE *in_file = files[i];
+            fseek(in_file, 0, SEEK_END);
+            size_t num_bytes = ftell(in_file);
+            rewind(in_file);
+
+            uint8_t *raw_file = (uint8_t*)malloc(num_bytes);
+            if (raw_file == NULL)
+                abort();
+
+            inpng[i].buf = raw_file;
+            inpng[i].len = num_bytes;
+            inpng[i].buf_rem = 0;
+
+            size_t bytes_read = fread(raw_file, 1, num_bytes, in_file);
+            if (bytes_read != num_bytes) {
+                fprintf(stderr, "couldn't read all of the bytes for file test_pngs/%lu.png", (unsigned long)i);
+                abort();
+            }
+
+            fclose(in_file);
+        }
+    }
+
+    void Bench(benchmark::State &state) {
+        if (!test_files_found) {
+            state.SkipWithError("Test imagery in test_pngs not found");
+        }
+
+        png_decode::Bench(state);
+    }
+};
+
+BENCHMARK_DEFINE_F(png_decode, png_decode)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode, png_decode)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_DEFINE_F(png_decode_realistic, png_decode_realistic)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode_realistic, png_decode_realistic)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/neozip/test/benchmarks/benchmark_png_encode.cc b/neozip/test/benchmarks/benchmark_png_encode.cc
new file mode 100644
index 0000000000..d5e25cbc9d
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_encode.cc
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+#include "benchmark_png_shared.h"
+
+#define IMWIDTH 1024
+#define IMHEIGHT 1024
+
+class png_encode: public benchmark::Fixture {
+private:
+ png_dat outpng;
+
+ /* Backing this on the heap is a more realistic benchmark */
+ uint8_t *input_img_buf = NULL;
+
+public:
+ /* Let's make the vanilla version have something extremely compressible */
+ virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+ init_compressible(img_bytes, width * height);
+ }
+
+ void SetUp(const ::benchmark::State&) {
+ input_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ outpng.buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ /* Using malloc rather than zng_alloc so that we can call realloc.
+ * IMWIDTH * IMHEIGHT is likely to be more than enough bytes, though,
+ * given that a simple run length encoding already pretty much can
+ * reduce to this */
+ outpng.len = 0;
+ outpng.buf_rem = IMWIDTH * IMHEIGHT * 3;
+ assert(input_img_buf != NULL);
+ assert(outpng.buf != NULL);
+ init_img(input_img_buf, IMWIDTH, IMHEIGHT);
+ }
+
+ /* State in this circumstance will convey the compression level */
+ void Bench(benchmark::State &state) {
+ for (auto _ : state) {
+ encode_png((png_bytep)input_img_buf, &outpng, state.range(0), IMWIDTH, IMHEIGHT);
+ outpng.buf_rem = outpng.len;
+ outpng.len = 0;
+ }
+ }
+
+ void TearDown(const ::benchmark::State &) {
+ free(input_img_buf);
+ free(outpng.buf);
+ }
+};
+
+BENCHMARK_DEFINE_F(png_encode, encode_compressible)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_encode, encode_compressible)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/neozip/test/benchmarks/benchmark_png_shared.h b/neozip/test/benchmarks/benchmark_png_shared.h
new file mode 100644
index 0000000000..bde679e7d3
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_shared.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define IMWIDTH 1024
+#define IMHEIGHT 1024
+
+extern "C" {
+# include <png.h>
+}
+
+typedef struct _png_dat {
+ uint8_t *buf;
+ int64_t len;
+ size_t buf_rem;
+} png_dat;
+
+typedef struct _png_parse_dat {
+ uint8_t *cur_pos;
+} png_parse_dat;
+
+/* Write a customized write callback so that we write back to an in-memory buffer.
+ * This allows the testing to not involve disk IO.
+ * png_dat bookkeeping: 'len' is the number of bytes written so far, 'buf_rem'
+ * is the unused capacity remaining past those bytes. */
+static void png_write_cb(png_structp pngp, png_bytep data, png_size_t len) {
+    png_dat *dat = (png_dat*)png_get_io_ptr(pngp);
+    size_t curSize = dat->len + len;  // total bytes stored after this write
+
+    /* realloc double the requested buffer size to prevent excessive reallocs */
+    if (dat->buf_rem < len) {
+        dat->buf = (uint8_t*)realloc(dat->buf, dat->len + dat->buf_rem + 2 * len);
+
+        if (!dat->buf) {
+            /* Pretty unlikely but we'll put it here just in case */
+            fprintf(stderr, "realloc failed, exiting\n");
+            exit(1);
+        }
+
+        dat->buf_rem += 2 * len;
+    }
+
+    memcpy(dat->buf + dat->len, data, len);
+    dat->len = curSize;
+    dat->buf_rem -= len;
+}
+
+static void init_compressible(png_bytep buf, size_t num_pix) {
+ /* It doesn't actually matter what we make this, but for
+ * the sake of a reasonable test image, let's make this
+ * be a stripe of R, G, & B, with no alpha channel */
+ int32_t i = 0;
+ int32_t red_stop = num_pix / 3;
+ int32_t blue_stop = 2 * num_pix / 3;
+ int32_t green_stop = num_pix;
+
+ for (int32_t x = 0; i < red_stop; x += 3, ++i) {
+ buf[x] = 255;
+ buf[x + 1] = 0;
+ buf[x + 2] = 0;
+ }
+
+ for (int32_t x = 3 * i; i < blue_stop; x+= 3, ++i) {
+ buf[x] = 0;
+ buf[x + 1] = 255;
+ buf[x + 2] = 0;
+ }
+
+ for (int32_t x = 3 * i; i < green_stop; x += 3, ++i) {
+ buf[x] = 0;
+ buf[x + 1] = 0;
+ buf[x + 2] = 255;
+ }
+}
+
+/* Encode a width x height 8-bit RGB image from 'buf' into the in-memory
+ * png_dat 'outpng' at the given zlib compression level.
+ * Fix: row setup and png_set_IHDR now use the width/height parameters instead
+ * of the hard-coded IMWIDTH/IMHEIGHT, so images of any size encode correctly
+ * (the old code overran png_row_ptrs whenever height < IMHEIGHT). */
+static inline void encode_png(png_bytep buf, png_dat *outpng, int32_t comp_level, uint32_t width, uint32_t height) {
+    png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    /* Most of this error handling is _likely_ not necessary. Likewise it's likely
+     * a lot of this stuff can be done in the setup function to avoid measuring this
+     * fixed setup time, but for now we'll do it here */
+    if (!png) abort();
+
+    png_infop info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_write_fn(png, outpng, png_write_cb, NULL);
+    png_bytep *png_row_ptrs = new png_bytep[height];
+    for (uint32_t i = 0; i < height; ++i) {
+        png_row_ptrs[i] = (png_bytep)&buf[(size_t)3 * i * width];
+    }
+
+    png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
+                 PNG_FILTER_TYPE_DEFAULT);
+
+    png_write_info(png, info);
+    png_set_compression_level(png, comp_level);
+    png_set_filter(png, 0, PNG_FILTER_NONE);
+    png_write_image(png, (png_bytepp)png_row_ptrs);
+    png_write_end(png, NULL);
+    png_destroy_write_struct(&png, &info);
+    delete[] png_row_ptrs;
+}
+
+static void read_from_pngdat(png_structp png, png_bytep out, png_size_t bytes_to_read) {
+ png_parse_dat *io = (png_parse_dat*)png_get_io_ptr(png);
+ memcpy(out, io->cur_pos, bytes_to_read);
+ io->cur_pos += bytes_to_read;
+}
+
+/* Decode an 8-bit RGB PNG from 'dat' into *out_bytes, growing the caller's
+ * buffer via realloc when the decoded image exceeds in_size. Returns the
+ * decoded image size in bytes; reports dimensions through width/height. */
+static inline int decode_png(png_parse_dat *dat, png_bytepp out_bytes, size_t in_size, uint32_t &width, uint32_t &height) {
+    png_structp png = NULL;
+    png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    if (!png) abort();
+    png_infop info = NULL;
+    info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_read_fn(png, dat, read_from_pngdat);
+    png_read_info(png, info);
+
+    int bit_depth = 0, color_type = -1;
+    png_get_IHDR(png, info, &width, &height, &bit_depth, &color_type, NULL, NULL, NULL);
+
+    size_t im_size = width * height * bit_depth/8 * 3;
+    if (color_type != PNG_COLOR_TYPE_RGB) {
+        fprintf(stderr, "expected an 8 bpp RGB image\n");
+        abort();
+    }
+
+    if (im_size > in_size) {
+        /* Check the realloc result so a failure doesn't overwrite the caller's
+         * pointer with NULL (which leaked the old buffer and crashed below). */
+        png_bytep grown = (png_bytep)realloc(*out_bytes, im_size);
+        if (grown == NULL) {
+            fprintf(stderr, "realloc failed\n");
+            abort();
+        }
+        *out_bytes = grown;
+    }
+
+    png_bytep *out_rows = new png_bytep[height];
+    for (size_t i = 0; i < height; ++i)
+        out_rows[i] = *out_bytes + (width*i*3);
+
+    png_read_rows(png, out_rows, NULL, height);
+    png_destroy_read_struct(&png, &info, NULL);
+    delete[] out_rows;
+
+    return (int)im_size;
+}
diff --git a/neozip/test/benchmarks/benchmark_slidehash.cc b/neozip/test/benchmarks/benchmark_slidehash.cc
new file mode 100644
index 0000000000..e74c06e873
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_slidehash.cc
@@ -0,0 +1,116 @@
+/* benchmark_slidehash.cc -- benchmark slide_hash variants
+ * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <limits.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# include "deflate.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS 32768
+
+/* Fixture that builds a minimal deflate_state (just head/prev hash tables
+ * filled with pseudorandom values) so individual slide_hash implementations
+ * can be timed in isolation. */
+class slide_hash: public benchmark::Fixture {
+private:
+    uint16_t *l0;        // becomes s->head (HASH_SIZE entries)
+    uint16_t *l1;        // becomes s->prev (MAX_RANDOM_INTS entries)
+    deflate_state *s_g;  // minimal state wired to the two tables above
+
+public:
+    /**
+     * @brief Prepare the benchmark fixture by allocating and initializing working data.
+     *
+     * Allocates two 64-byte-aligned arrays of `uint16_t` (one of size HASH_SIZE, one of size MAX_RANDOM_INTS),
+     * fills them with pseudorandom `uint16_t` values, allocates a `deflate_state` structure, and sets
+     * its `head` and `prev` pointers to the allocated arrays.
+     *
+     * @param state Benchmark-provided state object from Google Benchmark (supplied by the framework).
+     */
+    void SetUp(const ::benchmark::State&) {
+        l0 = (uint16_t *)zng_alloc_aligned(HASH_SIZE * sizeof(uint16_t), 64);
+
+        for (uint32_t i = 0; i < HASH_SIZE; i++) {
+            l0[i] = (uint16_t)rand();
+        }
+
+        l1 = (uint16_t *)zng_alloc_aligned(MAX_RANDOM_INTS * sizeof(uint16_t), 64);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            l1[i] = (uint16_t)rand();
+        }
+
+        // NOTE(review): allocations are not checked; a failure here crashes
+        // the benchmark, which is tolerated in this test harness.
+        deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
+        s->head = l0;
+        s->prev = l1;
+        s_g = s;
+    }
+
+    /**
+     * @brief Time one slide_hash implementation.
+     *
+     * Sets the window size from the benchmark argument, then invokes the
+     * given function pointer once per iteration of the timed loop.
+     *
+     * @param state      Google Benchmark state (supplies range(0) = w_size).
+     * @param slide_hash Implementation under test.
+     */
+    void Bench(benchmark::State& state, slide_hash_func slide_hash) {
+        s_g->w_size = (uint32_t)state.range(0);
+
+        for (auto _ : state) {
+            slide_hash(s_g);
+            benchmark::DoNotOptimize(s_g);
+        }
+    }
+
+    /* Release the hash tables and the deflate_state allocated in SetUp(). */
+    void TearDown(const ::benchmark::State&) {
+        zng_free_aligned(l0);
+        zng_free_aligned(l1);
+        free(s_g);
+    }
+};
+
+/* Define and register one fixture benchmark per slide_hash implementation.
+ * When the CPU lacks the required feature the run is marked skipped; per
+ * Google Benchmark's documented SkipWithError() semantics the
+ * `for (auto _ : state)` loop in Bench() then executes zero iterations,
+ * so the unsupported function pointer is never actually called. */
+#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(512, MAX_RANDOM_INTS);
+
+/* Register the benchmarks.  The portable C implementation is built on
+ * non-x86-64 targets or when all fallbacks are explicitly requested; each
+ * SIMD variant below is guarded both at compile time (feature macro) and at
+ * run time (test_cpu_features flag passed to BENCHMARK_SLIDEHASH). */
+#if defined(WITH_ALL_FALLBACKS) || !(defined(__x86_64__) || defined(_M_X64))
+BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+/* Runtime detection disabled: only the statically selected implementation
+ * is available, and it is assumed supported. */
+BENCHMARK_SLIDEHASH(native, native_slide_hash, 1);
+#else
+
+#ifdef ARM_SIMD
+BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd);
+#endif
+#ifdef ARM_NEON
+BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_SLIDEHASH(power8, slide_hash_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+#ifdef PPC_VMX
+BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef RISCV_RVV
+BENCHMARK_SLIDEHASH(rvv, slide_hash_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+#ifdef X86_SSE2
+BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif \ No newline at end of file
diff --git a/neozip/test/benchmarks/benchmark_uncompress.cc b/neozip/test/benchmarks/benchmark_uncompress.cc
new file mode 100644
index 0000000000..6a82c05d01
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_uncompress.cc
@@ -0,0 +1,97 @@
+/* benchmark_uncompress.cc -- benchmark uncompress()
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+#define NUM_TESTS 6
+
+/* Fixture that pre-compresses several buffer sizes once, so the timed loop
+ * measures PREFIX(uncompress)() only. */
+class uncompress_bench: public benchmark::Fixture {
+private:
+    /* NULL-initialized so TearDown() can free them safely even when SetUp()
+     * bailed out early via SkipWithError() (previously these were left
+     * uninitialized and freeing them was undefined behavior). */
+    uint8_t *inbuff = NULL;
+    uint8_t *outbuff = NULL;
+    uint8_t *compressed_buff[NUM_TESTS] = {NULL};
+    z_uintmax_t compressed_sizes[NUM_TESTS] = {0};
+    /* Uncompressed payload sizes; must match the Arg() list used when the
+     * benchmark is registered below. */
+    uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024};
+
+public:
+    /* Generate MAX_SIZE bytes of compressible input and pre-compress one
+     * buffer per test size.  On any failure the benchmark is skipped;
+     * TearDown() handles partial allocation. */
+    void SetUp(::benchmark::State& state) {
+        outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+        if (outbuff == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        // Initialize input buffer with highly compressible data, interspersed
+        // with small amounts of random data and 3-byte matches.
+        inbuff = gen_compressible_data(MAX_SIZE);
+        if (inbuff == NULL) {
+            state.SkipWithError("gen_compressible_data() failed");
+            return;
+        }
+
+        // Compress data into different buffers
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            compressed_buff[i] = (uint8_t *)zng_alloc(sizes[i] + 64);
+            if (compressed_buff[i] == NULL) {
+                // Explicit check instead of assert(): stays active in
+                // NDEBUG (release) builds.
+                state.SkipWithError("zng_alloc failed");
+                return;
+            }
+
+            z_uintmax_t compressed_size = sizes[i] + 64;
+            int err = PREFIX(compress2)(compressed_buff[i], &compressed_size, inbuff, sizes[i], Z_BEST_COMPRESSION);
+            if (err != Z_OK) {
+                fprintf(stderr, "compress() failed with error %d\n", err);
+                abort();
+            }
+            compressed_sizes[i] = compressed_size;
+        }
+    }
+
+    /* Repeatedly uncompress the pre-compressed buffer whose original size
+     * equals state.range(0). */
+    void Bench(benchmark::State& state) {
+        // Resolve the benchmark argument to a test index once, outside the
+        // timed loop, so the linear search is not part of the measurement.
+        int index = 0;
+        while (sizes[index] != (uint32_t)state.range(0)) ++index;
+
+        for (auto _ : state) {
+            z_uintmax_t out_size = MAX_SIZE;
+            int err = PREFIX(uncompress)(outbuff, &out_size, compressed_buff[index], compressed_sizes[index]);
+            if (err != Z_OK) {
+                fprintf(stderr, "uncompress() failed with error %d\n", err);
+                abort();
+            }
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        free(inbuff);
+        free(outbuff);
+
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            // NOTE(review): guarded in case zng_free() is not NULL-safe.
+            if (compressed_buff[i] != NULL)
+                zng_free(compressed_buff[i]);
+        }
+    }
+};
+
+/* Define and register the uncompress benchmark over the same payload sizes
+ * that SetUp() pre-compressed (1 B .. 1 MiB). */
+#define BENCHMARK_UNCOMPRESS(name) \
+    BENCHMARK_DEFINE_F(uncompress_bench, name)(benchmark::State& state) { \
+        Bench(state); \
+    } \
+    BENCHMARK_REGISTER_F(uncompress_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10);
+
+BENCHMARK_UNCOMPRESS(uncompress_bench);