summaryrefslogtreecommitdiff
path: root/neozip/test/benchmarks
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/test/benchmarks
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/test/benchmarks')
-rw-r--r--neozip/test/benchmarks/CMakeLists.txt126
-rw-r--r--neozip/test/benchmarks/README.md63
-rw-r--r--neozip/test/benchmarks/benchmark_adler32.cc121
-rw-r--r--neozip/test/benchmarks/benchmark_adler32_copy.cc176
-rw-r--r--neozip/test/benchmarks/benchmark_compare256.cc106
-rw-r--r--neozip/test/benchmarks/benchmark_compare256_rle.cc72
-rw-r--r--neozip/test/benchmarks/benchmark_compress.cc75
-rw-r--r--neozip/test/benchmarks/benchmark_crc32.cc125
-rw-r--r--neozip/test/benchmarks/benchmark_crc32_copy.cc177
-rw-r--r--neozip/test/benchmarks/benchmark_deflate.cc147
-rw-r--r--neozip/test/benchmarks/benchmark_inflate.cc169
-rw-r--r--neozip/test/benchmarks/benchmark_insert_string.cc164
-rw-r--r--neozip/test/benchmarks/benchmark_main.cc32
-rw-r--r--neozip/test/benchmarks/benchmark_png_decode.cc126
-rw-r--r--neozip/test/benchmarks/benchmark_png_encode.cc54
-rw-r--r--neozip/test/benchmarks/benchmark_png_shared.h146
-rw-r--r--neozip/test/benchmarks/benchmark_slidehash.cc116
-rw-r--r--neozip/test/benchmarks/benchmark_uncompress.cc97
18 files changed, 2092 insertions, 0 deletions
diff --git a/neozip/test/benchmarks/CMakeLists.txt b/neozip/test/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000..df6f5a7e69
--- /dev/null
+++ b/neozip/test/benchmarks/CMakeLists.txt
@@ -0,0 +1,126 @@
+cmake_minimum_required(VERSION 3.14...4.2.1)
+
+include(FetchContent)
+
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 11)
+endif()
+if(NOT DEFINED CMAKE_CXX_STANDARD_REQUIRED)
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+if(NOT DEFINED CMAKE_CXX_EXTENSIONS)
+ set(CMAKE_CXX_EXTENSIONS ON)
+endif()
+
+# Search for Google benchmark package
+find_package(benchmark QUIET)
+if(NOT benchmark_FOUND)
+ # Fetch google benchmark source code from official repository
+ set(BENCHMARK_ENABLE_TESTING OFF)
+
+ # Allow specifying alternative Google benchmark repository
+ if(NOT DEFINED GBENCHMARK_REPOSITORY)
+ set(GBENCHMARK_REPOSITORY https://github.com/google/benchmark.git)
+ endif()
+ if(NOT DEFINED GBENCHMARK_TAG)
+ set(GBENCHMARK_TAG v1.9.4)
+ endif()
+
+ FetchContent_Declare(benchmark
+ GIT_REPOSITORY ${GBENCHMARK_REPOSITORY}
+ GIT_TAG ${GBENCHMARK_TAG}
+ ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL})
+
+ ZNG_FetchContent_MakeAvailable(benchmark)
+endif()
+
+# Public API benchmarks
+set(BENCH_PUBLIC_SRCS
+ benchmark_compress.cc
+ benchmark_inflate.cc
+ benchmark_uncompress.cc
+ benchmark_main.cc
+ )
+
+# Internal benchmarks
+set(BENCH_INTERNAL_SRCS
+ benchmark_adler32.cc
+ benchmark_adler32_copy.cc
+ benchmark_compare256.cc
+ benchmark_compare256_rle.cc
+ benchmark_crc32.cc
+ benchmark_crc32_copy.cc
+ benchmark_deflate.cc
+ benchmark_insert_string.cc
+ benchmark_slidehash.cc
+ )
+
+add_executable(benchmark_zlib ${BENCH_PUBLIC_SRCS})
+
+# No -D prefix needed: target_compile_definitions strips/adds it itself
+target_compile_definitions(benchmark_zlib PRIVATE BENCHMARK_STATIC_DEFINE)
+target_include_directories(benchmark_zlib PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${PROJECT_BINARY_DIR}
+
+target_link_libraries(benchmark_zlib benchmark::benchmark)
+if(ZLIB_LIBRARY)
+ target_link_libraries(benchmark_zlib ${ZLIB_LIBRARY})
+else()
+ target_sources(benchmark_zlib PRIVATE ${BENCH_INTERNAL_SRCS})
+ target_link_libraries(benchmark_zlib zlib-ng-static)
+endif()
+
+if(WIN32)
+ target_link_libraries(benchmark_zlib shlwapi)
+endif()
+
+add_test(NAME benchmark_zlib
+ COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib> "--benchmark_min_time=0")
+
+if(WITH_BENCHMARK_APPS)
+ option(BUILD_ALT_BENCH "Link against alternative zlib implementation" OFF)
+
+ # Search for libpng package
+ find_package(PNG QUIET)
+
+ if(NOT PNG_FOUND)
+ FetchContent_Declare(PNG
+ GIT_REPOSITORY https://github.com/glennrp/libpng.git
+ ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL})
+
+ ZNG_FetchContent_MakeAvailable(PNG)
+ set(PNG_INCLUDE_DIR ${png_SOURCE_DIR})
+ endif()
+
+ set(BENCH_APP_SRCS
+ benchmark_png_encode.cc
+ benchmark_png_decode.cc
+ benchmark_main.cc
+ )
+
+ add_executable(benchmark_zlib_apps ${BENCH_APP_SRCS})
+
+  if(BUILD_ALT_BENCH) # option() above always defines it, so test its value, not DEFINED
+    set(ZLIB_ALT_LIB "libz.a" CACHE FILEPATH "Optional alternative zlib implementation (defaults to stock zlib)")
+    add_executable(benchmark_zlib_apps_alt ${BENCH_APP_SRCS})
+    target_link_libraries(benchmark_zlib_apps_alt libpng.a ${ZLIB_ALT_LIB} benchmark::benchmark)
+    target_compile_definitions(benchmark_zlib_apps_alt PRIVATE BUILD_ALT=1)
+    target_include_directories(benchmark_zlib_apps_alt PRIVATE
+      ${PROJECT_SOURCE_DIR}
+      ${PROJECT_BINARY_DIR}
+      ${PNG_INCLUDE_DIR}
+      ${benchmark_SOURCE_DIR}/benchmark/include)
+  endif()
+
+ target_include_directories(benchmark_zlib_apps PRIVATE
+ ${PROJECT_SOURCE_DIR}
+ ${PROJECT_BINARY_DIR}
+ ${PNG_INCLUDE_DIR}
+ ${benchmark_SOURCE_DIR}/benchmark/include)
+
+ # We need the static png library if we're statically linking to zlib,
+ # otherwise it will resolve these things in the system provided dynamic
+ # libraries (likely linked to stock zlib)
+ target_link_libraries(benchmark_zlib_apps libpng.a zlib-ng-static benchmark::benchmark)
+endif()
diff --git a/neozip/test/benchmarks/README.md b/neozip/test/benchmarks/README.md
new file mode 100644
index 0000000000..08ccea233e
--- /dev/null
+++ b/neozip/test/benchmarks/README.md
@@ -0,0 +1,63 @@
+## Benchmarks
+These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
+
+*Repetitions*
+
+To increase the number of times each benchmark iteration is run use:
+
+```
+--benchmark_repetitions=20
+```
+
+*Filters*
+
+To filter out which benchmarks are performed use:
+
+```
+--benchmark_filter="adler32*"
+```
+
+There are two different benchmarks, micro and macro.
+
+### Benchmark benchmark_zlib
+These are microbenchmarks intended to test lower level subfunctions of the library.
+
+Benchmarks include implementations of:
+ - Adler32
+ - CRC
+ - 256 byte comparisons
+ - SIMD accelerated "slide hash" routine
+
+By default these benchmarks report things on the nanosecond scale and are small enough
+to measure very minute differences.
+
+*Alternative zlib library*
+
+To benchmark against an alternative zlib-compatible library, use the `ZLIB_LIBRARY`
+CMake argument. When set, only the public API benchmarks are built:
+
+```sh
+cmake -S . -B build-alt \
+ -DZLIB_COMPAT=ON \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DBUILD_TESTING=ON \
+ -DWITH_BENCHMARKS=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_RUNTIME_CPU_DETECTION=OFF \
+ -DZLIB_LIBRARY=/path/to/libz.a
+```
+
+### Benchmark benchmark_zlib_apps
+These benchmarks measure applications of zlib as a whole. Currently the only examples
+are PNG encoding and decoding. The PNG encode and decode tests leverage procedurally
+generated and highly compressible image data.
+
+Additionally, a test called `png_decode_realistic` will decode any RGB 8 BPP encoded
+set of PNGs in the working directory under a directory named "test_pngs" with files named
+{0..1}.png. If these images do not exist, the test will report an error and the benchmark
+will move on to the next set of benchmarks.
+
+*benchmark_zlib_apps_alt*
+
+The user can compile a comparison benchmark application linking to any zlib-compatible
+implementation of their choosing.
diff --git a/neozip/test/benchmarks/benchmark_adler32.cc b/neozip/test/benchmarks/benchmark_adler32.cc
new file mode 100644
index 0000000000..5ee9102e23
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_adler32.cc
@@ -0,0 +1,121 @@
+/* benchmark_adler32.cc -- benchmark adler32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
+
+class adler32: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark Adler32, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ }
+};
+
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+ BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);
+
+BENCHMARK_ADLER32(c, adler32_c, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_ADLER32(native, native_adler32, 1);
+#else
+
+#ifdef ARM_NEON
+BENCHMARK_ADLER32(neon, adler32_neon, test_cpu_features.arm.has_neon);
+#endif
+
+#ifdef PPC_VMX
+BENCHMARK_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+
+#ifdef RISCV_RVV
+BENCHMARK_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef X86_AVX512
+BENCHMARK_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef X86_AVX512VNNI
+BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+#endif
+
+#ifdef LOONGARCH_LSX
+BENCHMARK_ADLER32(lsx, adler32_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_ADLER32(lasx, adler32_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_adler32_copy.cc b/neozip/test/benchmarks/benchmark_adler32_copy.cc
new file mode 100644
index 0000000000..6d913b1d19
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_adler32_copy.cc
@@ -0,0 +1,176 @@
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
+
+class adler32_copy: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+ uint8_t *dstbuf;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL || dstbuf == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ zng_free_aligned(dstbuf);
+ }
+};
+
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy + the plain (non-copy) hash function as a baseline;
+// the MEMCPY macros call their argument as hashfunc(init_sum, buf, len), so they must
+// receive hashfunc, not the 4-argument copyfunc.
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
+#else
+
+#ifdef ARM_NEON
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
+#endif
+
+#ifdef PPC_VMX
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+
+#ifdef RISCV_RVV
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
+#ifdef X86_SSE42
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2); // hashfunc is adler32_avx2 (adler32_avx does not exist)
+#endif
+#ifdef X86_AVX512
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef X86_AVX512VNNI
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+#endif
+
+#ifdef LOONGARCH_LSX
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_compare256.cc b/neozip/test/benchmarks/benchmark_compare256.cc
new file mode 100644
index 0000000000..2d8352879d
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compare256.cc
@@ -0,0 +1,106 @@
+/* benchmark_compare256.cc -- benchmark compare256 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define MAX_COMPARE_SIZE (256 + 64)
+
+class compare256: public benchmark::Fixture {
+private:
+ uint8_t *str1;
+ uint8_t *str2;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ if (str1 == NULL || str2 == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ memset(str1, 'a', MAX_COMPARE_SIZE);
+ memset(str2, 'a', MAX_COMPARE_SIZE);
+ }
+
+ // Benchmark compare256, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, compare256_func compare256) {
+ int misalign = 0;
+ int32_t match_len = (int32_t)state.range(0) - 1;
+ uint32_t len = 0;
+
+ for (auto _ : state) {
+ str2[match_len + misalign] = 0; // Set new match limit
+
+ len = compare256((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign);
+
+ str2[match_len + misalign] = 'a'; // Reset match limit
+
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign++;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(len);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(str1);
+ free(str2);
+ }
+};
+
+#define BENCHMARK_COMPARE256(name, comparefunc, support_flag) \
+ BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, comparefunc); \
+ } \
+ BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_COMPARE256(native, native_compare256, 1);
+#else
+
+#ifdef WITH_ALL_FALLBACKS
+BENCHMARK_COMPARE256(8, compare256_8, 1);
+BENCHMARK_COMPARE256(64, compare256_64, 1);
+#endif
+
+#ifdef X86_SSE2
+BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef X86_AVX512
+BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
+#endif
+#ifdef ARM_NEON
+BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
+#endif
+#ifdef POWER9
+BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00);
+#endif
+#ifdef RISCV_RVV
+BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_compare256_rle.cc b/neozip/test/benchmarks/benchmark_compare256_rle.cc
new file mode 100644
index 0000000000..db5adacc19
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compare256_rle.cc
@@ -0,0 +1,72 @@
+/* benchmark_compare256_rle.cc -- benchmark compare256_rle variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "compare256_rle.h"
+}
+
+#define MAX_COMPARE_SIZE (256 + 64)
+
+class compare256_rle: public benchmark::Fixture {
+private:
+ uint8_t *str1;
+ uint8_t *str2;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE);
+ if (str1 == NULL || str2 == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ memset(str1, 'a', MAX_COMPARE_SIZE);
+ memset(str2, 'a', MAX_COMPARE_SIZE);
+ }
+
+ // Benchmark compare256_rle, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, compare256_rle_func compare256_rle) {
+ int misalign = 0;
+ int32_t match_len = (int32_t)state.range(0) - 1;
+ uint32_t len = 0;
+
+ for (auto _ : state) {
+ str2[match_len + misalign] = 0; // Set new match limit
+
+ len = compare256_rle((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign);
+
+ str2[match_len + misalign] = 'a'; // Reset match limit
+
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign++;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(len);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(str1);
+ free(str2);
+ }
+};
+
+#define BENCHMARK_COMPARE256_RLE(name, comparefunc, support_flag) \
+    BENCHMARK_DEFINE_F(compare256_rle, name)(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, comparefunc); \
+    } \
+    BENCHMARK_REGISTER_F(compare256_rle, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
+
+BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
+BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
diff --git a/neozip/test/benchmarks/benchmark_compress.cc b/neozip/test/benchmarks/benchmark_compress.cc
new file mode 100644
index 0000000000..df042f7153
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_compress.cc
@@ -0,0 +1,75 @@
+/* benchmark_compress.cc -- benchmark compress()
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (64 * 1024)
+
+class compress_bench: public benchmark::Fixture {
+private:
+ uint8_t *inbuff;
+ uint8_t *outbuff;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+ if (outbuff == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ // Initialize input buffer with highly compressible data, interspersed
+ // with small amounts of random data and 3-byte matches.
+ inbuff = gen_compressible_data(MAX_SIZE);
+ if (inbuff == NULL) {
+ free(outbuff);
+ outbuff = NULL;
+ state.SkipWithError("gen_compressible_data() failed");
+ return;
+ }
+ }
+
+ void Bench(benchmark::State& state) {
+ int err = 0;
+
+ for (auto _ : state) {
+ z_uintmax_t compressed_size = MAX_SIZE + 16;
+ err = PREFIX(compress)(outbuff, &compressed_size, inbuff, (size_t)state.range(0));
+ if (err != Z_OK) {
+ fprintf(stderr, "compress() failed with error %d\n", err);
+ abort();
+ }
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(err);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(inbuff);
+ free(outbuff);
+ }
+};
+
+#define BENCHMARK_COMPRESS(name) \
+ BENCHMARK_DEFINE_F(compress_bench, name)(benchmark::State& state) { \
+ Bench(state); \
+ } \
+ BENCHMARK_REGISTER_F(compress_bench, name)->Arg(1)->Arg(16)->Arg(48)->Arg(256)->Arg(1<<10)->Arg(4<<10)->Arg(16<<10)->Arg(64<<10);
+
+BENCHMARK_COMPRESS(compress_bench);
diff --git a/neozip/test/benchmarks/benchmark_crc32.cc b/neozip/test/benchmarks/benchmark_crc32.cc
new file mode 100644
index 0000000000..1e2cf88590
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_crc32.cc
@@ -0,0 +1,125 @@
+/* benchmark_crc32.cc -- benchmark crc32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
+
+class crc32: public benchmark::Fixture {
+private:
+ uint32_t *testdata;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
+ }
+ }
+
+ // Benchmark CRC32, with rolling buffer misalignment for consistent results
+ void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+ int misalign = 0;
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
+
+ // Prevent the result from being optimized away
+ benchmark::DoNotOptimize(hash);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(testdata);
+ }
+};
+
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+ BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
+BENCHMARK_CRC32(braid, crc32_braid, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+BENCHMARK_CRC32(native, native_crc32, 1);
+#else
+
+#ifndef WITHOUT_CHORBA
+BENCHMARK_CRC32(chorba_c, crc32_chorba, 1);
+#endif
+#ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
+ BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
+# endif
+# ifdef X86_SSE41
+ BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
+# endif
+#endif
+#ifdef ARM_CRC32
+BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32);
+#endif
+#ifdef ARM_PMULL_EOR3
+BENCHMARK_CRC32(armv8_pmull_eor3, crc32_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3);
+#endif
+#ifdef RISCV_CRC32_ZBC
+BENCHMARK_CRC32(riscv, crc32_riscv64_zbc, test_cpu_features.riscv.has_zbc);
+#endif
+#ifdef POWER8_VSX_CRC32
+BENCHMARK_CRC32(power8, crc32_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+#ifdef S390_CRC32_VX
+BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx);
+#endif
+#ifdef X86_PCLMULQDQ_CRC
+BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX2
+BENCHMARK_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+BENCHMARK_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef LOONGARCH_CRC
+BENCHMARK_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc);
+#endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_crc32_copy.cc b/neozip/test/benchmarks/benchmark_crc32_copy.cc
new file mode 100644
index 0000000000..eafa5aee44
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_crc32_copy.cc
@@ -0,0 +1,177 @@
+/* benchmark_crc32_copy.cc -- benchmark for crc32 implementations with copying
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
+
+// Fixture for benchmarking crc32 "hash while copying" implementations, which
+// checksum the source buffer and copy it to the destination in a single pass.
+class crc32_copy: public benchmark::Fixture {
+protected:
+    uint32_t *testdata;  // 64-byte aligned source buffer of random words
+    uint8_t *dstbuf;     // 64-byte aligned copy destination
+
+public:
+    // Allocate and randomize both buffers; skips the benchmark on allocation failure.
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
+        }
+    }
+
+    // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results.
+    // DO_ALIGNED steps the offset by 16 bytes per iteration; otherwise it steps
+    // by 1 byte so every misalignment in [0, 63] gets exercised.
+    void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
+        int misalign = 0;
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
+
+            // Prevent the result from being optimized away
+            benchmark::DoNotOptimize(hash);
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
+    }
+};
+
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+// NOTE: the MEMCPY variants wrap the plain 3-argument hash function in a
+// memcpy lambda, so they must receive 'hashfunc'; passing 'copyfunc' (the
+// 4-argument copy variant) does not compile under HASH_BASELINE.
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+// Base test
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+ // Native
+ BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
+#else
+ // Optimized functions
+# ifndef WITHOUT_CHORBA
+ BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
+# endif
+# ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
+ BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+# endif
+# ifdef X86_SSE41
+ BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+# endif
+# endif
+# ifdef ARM_CRC32
+ BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+# endif
+# ifdef ARM_PMULL_EOR3
+ BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+# endif
+# ifdef LOONGARCH_CRC
+ BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+# endif
+# ifdef POWER8_VSX_CRC32
+ BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+# endif
+# ifdef RISCV_CRC32_ZBC
+    // Hash-baseline argument must be the real symbol crc32_riscv64_zbc (as used
+    // in benchmark_crc32.cc); 'crc32_riscv' does not exist and broke HASH_BASELINE builds.
+    BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+# endif
+# ifdef S390_CRC32_VX
+ BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+# endif
+# ifdef X86_PCLMULQDQ_CRC
+ BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+# endif
+# ifdef X86_VPCLMULQDQ_AVX2
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+
+#endif
diff --git a/neozip/test/benchmarks/benchmark_deflate.cc b/neozip/test/benchmarks/benchmark_deflate.cc
new file mode 100644
index 0000000000..f60e2589d1
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_deflate.cc
@@ -0,0 +1,147 @@
+/* benchmark_deflate.cc -- benchmark deflate() with various levels and raw mode
+ * Copyright (C) 2026 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+
+/* Parameterized deflate benchmark: Args(size, level) */
+class deflate_bench: public benchmark::Fixture {
+private:
+ uint8_t *inbuff = nullptr;
+ uint8_t *outbuff = nullptr;
+ z_uintmax_t outbuff_size = 0;
+
+public:
+ void SetUp(::benchmark::State& state) {
+ outbuff_size = PREFIX(deflateBound)(NULL, MAX_SIZE);
+ outbuff = (uint8_t *)malloc(outbuff_size);
+ if (outbuff == NULL) {
+ state.SkipWithError("malloc failed");
+ return;
+ }
+
+ inbuff = gen_compressible_data(MAX_SIZE);
+ if (inbuff == NULL) {
+ free(outbuff);
+ outbuff = NULL;
+ state.SkipWithError("gen_compressible_data() failed");
+ return;
+ }
+ }
+
+ void Bench(benchmark::State& state, int window_bits, int strategy = Z_DEFAULT_STRATEGY) {
+ int err;
+ size_t size = (size_t)state.range(0);
+ int level = (int)state.range(1);
+
+ PREFIX3(stream) strm;
+ strm.zalloc = NULL;
+ strm.zfree = NULL;
+ strm.opaque = NULL;
+ strm.total_in = 0;
+ strm.total_out = 0;
+ strm.next_out = NULL;
+ strm.avail_out = 0;
+
+ err = PREFIX(deflateInit2)(&strm, level, Z_DEFLATED, window_bits, MAX_MEM_LEVEL, strategy);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateInit2 did not return Z_OK");
+ return;
+ }
+
+ for (auto _ : state) {
+ err = PREFIX(deflateReset)(&strm);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateReset did not return Z_OK");
+ PREFIX(deflateEnd)(&strm);
+ return;
+ }
+
+ strm.avail_in = (uint32_t)size;
+ strm.next_in = (z_const uint8_t *)inbuff;
+ strm.next_out = outbuff;
+ strm.avail_out = (uint32_t)outbuff_size;
+
+ err = PREFIX(deflate)(&strm, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ state.SkipWithError("deflate did not return Z_STREAM_END");
+ PREFIX(deflateEnd)(&strm);
+ return;
+ }
+ }
+
+ err = PREFIX(deflateEnd)(&strm);
+ if (err != Z_OK) {
+ state.SkipWithError("deflateEnd did not return Z_OK");
+ return;
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ free(inbuff);
+ free(outbuff);
+ }
+};
+
+#define BENCHMARK_DEFLATE_ARGS \
+ ->Args({1024, 1})->Args({1024, 3})->Args({1024, 6})->Args({1024, 9}) \
+ ->Args({16384, 1})->Args({16384, 3})->Args({16384, 6})->Args({16384, 9}) \
+ ->Args({131072, 1})->Args({131072, 3})->Args({131072, 6})->Args({131072, 9}) \
+ ->Args({1048576, 1})->Args({1048576, 3})->Args({1048576, 6})->Args({1048576, 9})
+
+/* Parameterized deflate with zlib wrapping (includes adler32 checksum) */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_level)(benchmark::State& state) {
+ Bench(state, MAX_WBITS);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_level) BENCHMARK_DEFLATE_ARGS;
+
+/* Parameterized raw deflate without checksum */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_nocrc)(benchmark::State& state) {
+ Bench(state, -MAX_WBITS);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_nocrc) BENCHMARK_DEFLATE_ARGS;
+
+/* Strategy benchmarks use fewer size/level combos to keep test count reasonable */
+#define BENCHMARK_DEFLATE_STRATEGY_ARGS \
+ ->Args({1024, 1})->Args({1024, 6})->Args({1024, 9}) \
+ ->Args({1048576, 1})->Args({1048576, 6})->Args({1048576, 9})
+
+/* Parameterized deflate with filtered strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_filtered)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_FILTERED);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_filtered) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with Huffman-only strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_huffman)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_HUFFMAN_ONLY);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_huffman) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with RLE strategy */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_rle)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_RLE);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_rle) BENCHMARK_DEFLATE_STRATEGY_ARGS;
+
+/* Parameterized deflate with fixed Huffman codes */
+BENCHMARK_DEFINE_F(deflate_bench, deflate_fixed)(benchmark::State& state) {
+ Bench(state, MAX_WBITS, Z_FIXED);
+}
+BENCHMARK_REGISTER_F(deflate_bench, deflate_fixed) BENCHMARK_DEFLATE_STRATEGY_ARGS;
diff --git a/neozip/test/benchmarks/benchmark_inflate.cc b/neozip/test/benchmarks/benchmark_inflate.cc
new file mode 100644
index 0000000000..ac6ef7229f
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_inflate.cc
@@ -0,0 +1,169 @@
+/* benchmark_inflate.cc -- benchmark inflate() without crc32/adler32
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+#define NUM_TESTS 6
+
+class inflate_bench: public benchmark::Fixture {
+private:
+    // All buffers start NULL/zero so TearDown() can safely free() them even
+    // when SetUp() bails out before every allocation has happened.
+    uint8_t *inbuff = nullptr;
+    uint8_t *outbuff = nullptr;
+    uint8_t *compressed_buff[NUM_TESTS] = {nullptr};
+    z_uintmax_t compressed_sizes[NUM_TESTS] = {0};
+    uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024};
+
+public:
+    // Generate compressible test data and pre-compress it once per test size,
+    // so the timed loop in Bench() measures inflate() only.
+    void SetUp(::benchmark::State& state) {
+        int err;
+        outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+        if (outbuff == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        // Initialize input buffer with highly compressible data, interspersed
+        // with small amounts of random data and 3-byte matches.
+        inbuff = gen_compressible_data(MAX_SIZE);
+        if (inbuff == NULL) {
+            free(outbuff);
+            outbuff = NULL;
+            state.SkipWithError("gen_compressible_data() failed");
+            return;
+        }
+
+        // Initialize Deflate state (raw deflate: windowBits -15, no header/checksum)
+        PREFIX3(stream) strm;
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        strm.total_in = 0;
+        strm.total_out = 0;
+        strm.next_out = NULL;
+        strm.avail_out = 0;
+
+        err = PREFIX(deflateInit2)(&strm, Z_BEST_COMPRESSION, Z_DEFLATED, -15, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (err != Z_OK) {
+            state.SkipWithError("deflateInit2 did not return Z_OK");
+            return;
+        }
+
+        // Compress data into different buffers
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            compressed_buff[i] = (uint8_t *)malloc(sizes[i] + 64);
+            if (compressed_buff[i] == NULL) {
+                state.SkipWithError("malloc failed");
+                PREFIX(deflateEnd)(&strm);     // don't leak the deflate state
+                return;
+            }
+
+            strm.avail_in = sizes[i];                  // Size of the input buffer
+            strm.next_in = (z_const uint8_t *)inbuff;  // Pointer to the input buffer
+            strm.next_out = compressed_buff[i];        // Pointer to the output buffer
+            strm.avail_out = sizes[i] + 64;            // Maximum size of the output buffer
+
+            err = PREFIX(deflate)(&strm, Z_FINISH);    // Perform compression
+            if (err != Z_STREAM_END ) {
+                state.SkipWithError("deflate did not return Z_STREAM_END");
+                PREFIX(deflateEnd)(&strm);
+                return;
+            }
+
+            compressed_sizes[i] = strm.total_out;      // Total compressed size
+
+            err = PREFIX(deflateReset)(&strm);         // Reset Deflate state
+            if (err != Z_OK) {
+                state.SkipWithError("deflateReset did not return Z_OK");
+                PREFIX(deflateEnd)(&strm);     // don't leak the deflate state
+                return;
+            }
+        }
+
+        err = PREFIX(deflateEnd)(&strm);               // Clean up the deflate stream
+        if (err != Z_OK) {
+            state.SkipWithError("deflateEnd did not return Z_OK");
+            return;
+        }
+    }
+
+    void Bench(benchmark::State& state) {
+        int err;
+        int index = 0;
+        // Map the benchmark argument back to its precompressed buffer; guard
+        // against an unregistered size running off the end of the array.
+        while (index < NUM_TESTS && sizes[index] != (uint32_t)state.range(0)) ++index;
+        if (index == NUM_TESTS) {
+            state.SkipWithError("benchmark size was not precompressed in SetUp");
+            return;
+        }
+
+        // Initialize the inflate stream
+        PREFIX3(stream) strm;
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        strm.next_in = NULL;
+        strm.avail_in = 0;
+
+        err = PREFIX(inflateInit2)(&strm, -15); // Initialize the inflate state, no crc/adler
+        if (err != Z_OK) {
+            state.SkipWithError("inflateInit did not return Z_OK");
+            return;
+        }
+
+        for (auto _ : state) {
+            // Perform reset, avoids benchmarking inflateInit and inflateEnd
+            err = PREFIX(inflateReset)(&strm);
+            if (err != Z_OK) {
+                state.SkipWithError("inflateReset did not return Z_OK");
+                PREFIX(inflateEnd)(&strm);     // don't leak the inflate state
+                return;
+            }
+
+            strm.avail_in = (uint32_t)compressed_sizes[index]; // Size of the input
+            strm.next_in = compressed_buff[index];             // Pointer to the compressed data
+            strm.avail_out = MAX_SIZE;                         // Max size for output
+            strm.next_out = outbuff;                           // Output buffer
+
+            // Perform decompression
+            err = PREFIX(inflate)(&strm, Z_FINISH);
+            if (err != Z_STREAM_END) {
+                state.SkipWithError("inflate did not return Z_STREAM_END");
+                PREFIX(inflateEnd)(&strm);
+                return;
+            }
+        }
+
+        // Finalize the inflation process
+        err = PREFIX(inflateEnd)(&strm);
+        if (err != Z_OK) {
+            state.SkipWithError("inflateEnd did not return Z_OK");
+            return;
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        // free(NULL) is a no-op, so this is safe even after a failed SetUp()
+        free(inbuff);
+        free(outbuff);
+
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            free(compressed_buff[i]);
+        }
+    }
+};
+
+#define BENCHMARK_INFLATE(name) \
+ BENCHMARK_DEFINE_F(inflate_bench, name)(benchmark::State& state) { \
+ Bench(state); \
+ } \
+ BENCHMARK_REGISTER_F(inflate_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10);
+
+BENCHMARK_INFLATE(inflate_nocrc);
diff --git a/neozip/test/benchmarks/benchmark_insert_string.cc b/neozip/test/benchmarks/benchmark_insert_string.cc
new file mode 100644
index 0000000000..fafba3c4cd
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_insert_string.cc
@@ -0,0 +1,164 @@
+/* benchmark_insert_string.cc -- benchmark insert_string variants
+ * Copyright (C) 2025 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <limits.h>
+#include <cstring>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "deflate.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+# include "insert_string_p.h"
+}
+
+#define MAX_WSIZE 32768
+#define TEST_WINDOW_SIZE (MAX_WSIZE * 2)
+
+typedef uint32_t (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
+
+// Base class with common setup/teardown for both insert_string benchmarks
+class insert_string_base: public benchmark::Fixture {
+protected:
+ deflate_state *s;
+
+public:
+ void SetUp(const ::benchmark::State&) {
+ s = (deflate_state*)zng_alloc_aligned(sizeof(deflate_state), 64);
+ memset(s, 0, sizeof(deflate_state));
+
+ // Set up window parameters
+ s->w_size = MAX_WSIZE;
+ s->window_size = TEST_WINDOW_SIZE;
+
+ // Allocate window
+ s->window = (uint8_t*)zng_alloc_aligned(TEST_WINDOW_SIZE, 64);
+
+ // Allocate hash tables
+ s->head = (Pos*)zng_alloc_aligned(HASH_SIZE * sizeof(Pos), 64);
+ s->prev = (Pos*)zng_alloc_aligned(MAX_WSIZE * sizeof(Pos), 64);
+
+ // Initialize hash tables
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+
+ // Initialize rolling hash state for rolling variant
+ s->ins_h = 0;
+
+ // Fill window with deterministic data patterns
+ for (size_t i = 0; i < TEST_WINDOW_SIZE; i++) {
+ // Create patterns that will exercise the hash function well
+ s->window[i] = (uint8_t)((i * 17 + (i >> 4) * 31 + (i >> 8) * 13) & 0xFF);
+ }
+ }
+
+ void TearDown(const ::benchmark::State&) {
+ zng_free_aligned(s->window);
+ zng_free_aligned(s->head);
+ zng_free_aligned(s->prev);
+ zng_free_aligned(s);
+ }
+};
+
+class insert_string_bench: public insert_string_base {
+public:
+ void Bench(benchmark::State& state, insert_string_cb insert_func) {
+ uint32_t str_pos = (uint32_t)state.range(0); // Starting position
+ uint32_t count = (uint32_t)state.range(1); // Number of strings to insert
+
+ // Ensure we don't go beyond window bounds
+ if (str_pos + count >= TEST_WINDOW_SIZE - 4) {
+ state.SkipWithError("Parameters exceed window size");
+ return;
+ }
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ // Reset hash tables to ensure consistent starting state
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+ s->ins_h = 0;
+
+ state.ResumeTiming();
+
+ // Benchmark the insert_string function
+ insert_func(s, str_pos, count);
+ }
+ }
+};
+
+#define BENCHMARK_INSERT_STRING(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(insert_string_bench, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("Function " #name " not supported"); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(insert_string_bench, name) \
+ ->Args({100, 3}) /* Most common case */ \
+ ->Args({100, 4}) \
+ ->Args({100, 5}) \
+ ->Args({100, 7}) \
+ ->Args({100, 14}) /* Mid-range cluster */ \
+ ->Args({100, 32}) /* Transition point */ \
+ ->Args({100, 127}) /* Large cluster around powers of 2 */ \
+ ->Args({100, 255}) /* Near maximum observed values */ \
+ ->Unit(benchmark::kNanosecond);
+
+// Benchmark the standard integer hash variant
+BENCHMARK_INSERT_STRING(integer_hash, ::insert_string, 1);
+
+// Benchmark the rolling hash variant
+BENCHMARK_INSERT_STRING(rolling_hash, ::insert_string_roll, 1);
+
+// Additional benchmark class for quick_insert_string functions
+class quick_insert_string_bench: public insert_string_base {
+public:
+ void Bench(benchmark::State& state, quick_insert_string_cb quick_insert_func) {
+ uint32_t start_pos = (uint32_t)state.range(0); // Starting position
+ uint32_t count = (uint32_t)state.range(1); // Number of insertions
+
+ if (start_pos + count >= TEST_WINDOW_SIZE - 4) {
+ state.SkipWithError("Parameters exceed window size");
+ return;
+ }
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ // Reset hash tables
+ memset(s->head, 0, HASH_SIZE * sizeof(Pos));
+ memset(s->prev, 0, MAX_WSIZE * sizeof(Pos));
+ s->ins_h = 0;
+
+ state.ResumeTiming();
+
+ // Benchmark quick_insert_string (single insertions)
+ for (uint32_t i = 0; i < count; i++) {
+ uint32_t result = quick_insert_func(s, start_pos + i);
+ benchmark::DoNotOptimize(result);
+ }
+ }
+ }
+};
+
+#define BENCHMARK_QUICK_INSERT_STRING(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(quick_insert_string_bench, name)(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("Function " #name " not supported"); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(quick_insert_string_bench, name) \
+ ->Args({100, 1}) /* Single insertion (baseline) */ \
+ ->Args({100, 100}) /* 100 insertions (measure amortized cost) */ \
+ ->Args({16000, 100}) /* 100 insertions at mid window (different hash distribution) */ \
+ ->Unit(benchmark::kNanosecond);
+
+BENCHMARK_QUICK_INSERT_STRING(quick_integer_hash, ::quick_insert_string, 1);
+BENCHMARK_QUICK_INSERT_STRING(quick_rolling_hash, ::quick_insert_string_roll, 1);
diff --git a/neozip/test/benchmarks/benchmark_main.cc b/neozip/test/benchmarks/benchmark_main.cc
new file mode 100644
index 0000000000..f3c227bdf7
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_main.cc
@@ -0,0 +1,32 @@
+/* benchmark_main.cc -- benchmark suite main entry point
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+
+#include <benchmark/benchmark.h>
+
+#ifndef BUILD_ALT
+extern "C" {
+# include "zbuild.h"
+# include "../test_cpu_features.h"
+
+# ifndef DISABLE_RUNTIME_CPU_DETECTION
+ struct cpu_features test_cpu_features;
+# endif
+}
+#endif
+
+int main(int argc, char** argv) {
+#ifndef BUILD_ALT
+# ifndef DISABLE_RUNTIME_CPU_DETECTION
+    // Populate the global test_cpu_features once so individual benchmarks can
+    // gate their SIMD variants on the detected CPU capabilities.
+    cpu_check_features(&test_cpu_features);
+# endif
+#endif
+
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+
+    return EXIT_SUCCESS;
+}
diff --git a/neozip/test/benchmarks/benchmark_png_decode.cc b/neozip/test/benchmarks/benchmark_png_decode.cc
new file mode 100644
index 0000000000..ce7c8f9304
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_decode.cc
@@ -0,0 +1,126 @@
+#include <stdio.h>
+#include <benchmark/benchmark.h>
+#include "benchmark_png_shared.h"
+#include <assert.h>
+
+class png_decode: public benchmark::Fixture {
+protected:
+ png_dat inpng[10];
+
+ /* Backing this on the heap is a more realistic benchmark */
+ uint8_t *output_img_buf = NULL;
+
+public:
+ /* Let's make the vanilla version have something extremely compressible */
+ virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+ init_compressible(img_bytes, width*height);
+ }
+
+ void SetUp(const ::benchmark::State&) {
+ output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ assert(output_img_buf != NULL);
+ init_img(output_img_buf, IMWIDTH, IMHEIGHT);
+
+ /* First we need to author the png bytes to be decoded */
+ for (int i = 0; i < 10; ++i) {
+ inpng[i] = {NULL, 0, 0};
+ encode_png(output_img_buf, &inpng[i], i, IMWIDTH, IMHEIGHT);
+ }
+ }
+
+ /* State in this circumstance will convey the compression level */
+ void Bench(benchmark::State &state) {
+ for (auto _ : state) {
+ int compress_lvl = state.range(0);
+ png_parse_dat in = { inpng[compress_lvl].buf };
+ uint32_t width, height;
+ decode_png(&in, (png_bytepp)&output_img_buf, IMWIDTH * IMHEIGHT * 3, width, height);
+ }
+ }
+
+ void TearDown(const ::benchmark::State &) {
+ free(output_img_buf);
+ for (int i = 0; i < 10; ++i) {
+ free(inpng[i].buf);
+ }
+ }
+};
+
+class png_decode_realistic: public png_decode {
+private:
+    bool test_files_found = false;
+
+public:
+    /* Load pre-made PNGs from test_pngs/ (one per compression level 0-9)
+     * instead of synthesizing them, for a more realistic decode workload. */
+    void SetUp(const ::benchmark::State &) {
+        /* Mark every slot empty up front so the inherited TearDown() never
+         * frees an uninitialized pointer when test files are missing. */
+        for (size_t i = 0; i < 10; ++i) {
+            inpng[i] = { NULL, 0, 0 };
+        }
+
+        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        assert(output_img_buf != NULL);
+
+        /* Let's take all the images at different compression levels and jam their bytes into buffers */
+        char test_fname[25];
+        FILE *files[10];
+
+        /* Set all to NULL (was memset(files, 0, sizeof(FILE*)), which only cleared one slot) */
+        memset(files, 0, sizeof(files));
+
+        for (size_t i = 0; i < 10; ++i) {
+            sprintf(test_fname, "test_pngs/%1lu.png", (unsigned long)i);
+            FILE *in_img = fopen(test_fname, "r");
+            if (in_img == NULL) {
+                /* Missing file: close whatever we already opened and bail out */
+                for (size_t j = 0; j < i; ++j) {
+                    if (files[j])
+                        fclose(files[j]);
+                }
+
+                return;
+            }
+            files[i] = in_img;
+        }
+
+        test_files_found = true;
+        /* Now that we've established we have all the png files, let's read all of their bytes into buffers */
+        for (size_t i = 0; i < 10; ++i) {
+            FILE *in_file = files[i];
+            fseek(in_file, 0, SEEK_END);
+            size_t num_bytes = ftell(in_file);
+            rewind(in_file);
+
+            uint8_t *raw_file = (uint8_t*)malloc(num_bytes);
+            if (raw_file == NULL)
+                abort();
+
+            inpng[i].buf = raw_file;
+            inpng[i].len = num_bytes;
+            inpng[i].buf_rem = 0;
+
+            size_t bytes_read = fread(raw_file, 1, num_bytes, in_file);
+            if (bytes_read != num_bytes) {
+                fprintf(stderr, "couldn't read all of the bytes for file test_pngs/%lu.png", (unsigned long)i);
+                abort();
+            }
+
+            fclose(in_file);
+        }
+    }
+
+    void Bench(benchmark::State &state) {
+        if (!test_files_found) {
+            state.SkipWithError("Test imagery in test_pngs not found");
+        }
+
+        png_decode::Bench(state);
+    }
+};
+
+BENCHMARK_DEFINE_F(png_decode, png_decode)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode, png_decode)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_DEFINE_F(png_decode_realistic, png_decode_realistic)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode_realistic, png_decode_realistic)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/neozip/test/benchmarks/benchmark_png_encode.cc b/neozip/test/benchmarks/benchmark_png_encode.cc
new file mode 100644
index 0000000000..d5e25cbc9d
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_encode.cc
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+#include "benchmark_png_shared.h"
+
+#define IMWIDTH 1024
+#define IMHEIGHT 1024
+
+class png_encode: public benchmark::Fixture {
+private:
+ png_dat outpng;
+
+ /* Backing this on the heap is a more realistic benchmark */
+ uint8_t *input_img_buf = NULL;
+
+public:
+ /* Let's make the vanilla version have something extremely compressible */
+ virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+ init_compressible(img_bytes, width * height);
+ }
+
+ void SetUp(const ::benchmark::State&) {
+ input_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ outpng.buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+ /* Using malloc rather than zng_alloc so that we can call realloc.
+ * IMWIDTH * IMHEIGHT is likely to be more than enough bytes, though,
+ * given that a simple run length encoding already pretty much can
+ * reduce to this */
+ outpng.len = 0;
+ outpng.buf_rem = IMWIDTH * IMHEIGHT * 3;
+ assert(input_img_buf != NULL);
+ assert(outpng.buf != NULL);
+ init_img(input_img_buf, IMWIDTH, IMHEIGHT);
+ }
+
+ /* State in this circumstance will convey the compression level */
+ void Bench(benchmark::State &state) {
+ for (auto _ : state) {
+ encode_png((png_bytep)input_img_buf, &outpng, state.range(0), IMWIDTH, IMHEIGHT);
+ outpng.buf_rem = outpng.len;
+ outpng.len = 0;
+ }
+ }
+
+ void TearDown(const ::benchmark::State &) {
+ free(input_img_buf);
+ free(outpng.buf);
+ }
+};
+
+BENCHMARK_DEFINE_F(png_encode, encode_compressible)(benchmark::State &state) {
+ Bench(state);
+}
+BENCHMARK_REGISTER_F(png_encode, encode_compressible)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/neozip/test/benchmarks/benchmark_png_shared.h b/neozip/test/benchmarks/benchmark_png_shared.h
new file mode 100644
index 0000000000..bde679e7d3
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_png_shared.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define IMWIDTH 1024
+#define IMHEIGHT 1024
+
+extern "C" {
+# include <png.h>
+}
+
+typedef struct _png_dat {
+ uint8_t *buf;
+ int64_t len;
+ size_t buf_rem;
+} png_dat;
+
+typedef struct _png_parse_dat {
+ uint8_t *cur_pos;
+} png_parse_dat;
+
+/* Write a customized write callback so that we write back to an in-memory buffer.
+ * This allows the testing to not involve disk IO.
+ * png_dat bookkeeping: 'len' is the number of bytes written so far, 'buf_rem'
+ * is the unused capacity remaining past those bytes. */
+static void png_write_cb(png_structp pngp, png_bytep data, png_size_t len) {
+    png_dat *dat = (png_dat*)png_get_io_ptr(pngp);
+    size_t curSize = dat->len + len;  // total bytes stored after this write
+
+    /* realloc double the requested buffer size to prevent excessive reallocs */
+    if (dat->buf_rem < len) {
+        dat->buf = (uint8_t*)realloc(dat->buf, dat->len + dat->buf_rem + 2 * len);
+
+        if (!dat->buf) {
+            /* Pretty unlikely but we'll put it here just in case */
+            fprintf(stderr, "realloc failed, exiting\n");
+            exit(1);
+        }
+
+        dat->buf_rem += 2 * len;
+    }
+
+    memcpy(dat->buf + dat->len, data, len);
+    dat->len = curSize;
+    dat->buf_rem -= len;
+}
+
+static void init_compressible(png_bytep buf, size_t num_pix) {
+ /* It doesn't actually matter what we make this, but for
+ * the sake of a reasonable test image, let's make this
+ * be a stripe of R, G, & B, with no alpha channel */
+ int32_t i = 0;
+ int32_t red_stop = num_pix / 3;
+ int32_t blue_stop = 2 * num_pix / 3;
+ int32_t green_stop = num_pix;
+
+ for (int32_t x = 0; i < red_stop; x += 3, ++i) {
+ buf[x] = 255;
+ buf[x + 1] = 0;
+ buf[x + 2] = 0;
+ }
+
+ for (int32_t x = 3 * i; i < blue_stop; x+= 3, ++i) {
+ buf[x] = 0;
+ buf[x + 1] = 255;
+ buf[x + 2] = 0;
+ }
+
+ for (int32_t x = 3 * i; i < green_stop; x += 3, ++i) {
+ buf[x] = 0;
+ buf[x + 1] = 0;
+ buf[x + 2] = 255;
+ }
+}
+
+/* Encode a width x height 8-bit RGB image from 'buf' into the in-memory
+ * png_dat 'outpng' at the given zlib compression level.
+ * Fix: row setup and png_set_IHDR now use the width/height parameters instead
+ * of the hard-coded IMWIDTH/IMHEIGHT, so images of any size encode correctly
+ * (the old code overran png_row_ptrs whenever height < IMHEIGHT). */
+static inline void encode_png(png_bytep buf, png_dat *outpng, int32_t comp_level, uint32_t width, uint32_t height) {
+    png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    /* Most of this error handling is _likely_ not necessary. Likewise it's likely
+     * a lot of this stuff can be done in the setup function to avoid measuring this
+     * fixed setup time, but for now we'll do it here */
+    if (!png) abort();
+
+    png_infop info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_write_fn(png, outpng, png_write_cb, NULL);
+    png_bytep *png_row_ptrs = new png_bytep[height];
+    for (uint32_t i = 0; i < height; ++i) {
+        png_row_ptrs[i] = (png_bytep)&buf[(size_t)3 * i * width];
+    }
+
+    png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
+                 PNG_FILTER_TYPE_DEFAULT);
+
+    png_write_info(png, info);
+    png_set_compression_level(png, comp_level);
+    png_set_filter(png, 0, PNG_FILTER_NONE);
+    png_write_image(png, (png_bytepp)png_row_ptrs);
+    png_write_end(png, NULL);
+    png_destroy_write_struct(&png, &info);
+    delete[] png_row_ptrs;
+}
+
+static void read_from_pngdat(png_structp png, png_bytep out, png_size_t bytes_to_read) {
+ png_parse_dat *io = (png_parse_dat*)png_get_io_ptr(png);
+ memcpy(out, io->cur_pos, bytes_to_read);
+ io->cur_pos += bytes_to_read;
+}
+
+/* Decode an 8-bit RGB PNG from 'dat' into *out_bytes, growing the caller's
+ * buffer via realloc when the decoded image exceeds in_size. Returns the
+ * decoded image size in bytes; reports dimensions through width/height. */
+static inline int decode_png(png_parse_dat *dat, png_bytepp out_bytes, size_t in_size, uint32_t &width, uint32_t &height) {
+    png_structp png = NULL;
+    png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    if (!png) abort();
+    png_infop info = NULL;
+    info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_read_fn(png, dat, read_from_pngdat);
+    png_read_info(png, info);
+
+    int bit_depth = 0, color_type = -1;
+    png_get_IHDR(png, info, &width, &height, &bit_depth, &color_type, NULL, NULL, NULL);
+
+    size_t im_size = width * height * bit_depth/8 * 3;
+    if (color_type != PNG_COLOR_TYPE_RGB) {
+        fprintf(stderr, "expected an 8 bpp RGB image\n");
+        abort();
+    }
+
+    if (im_size > in_size) {
+        /* Check the realloc result so a failure doesn't overwrite the caller's
+         * pointer with NULL (which leaked the old buffer and crashed below). */
+        png_bytep grown = (png_bytep)realloc(*out_bytes, im_size);
+        if (grown == NULL) {
+            fprintf(stderr, "realloc failed\n");
+            abort();
+        }
+        *out_bytes = grown;
+    }
+
+    png_bytep *out_rows = new png_bytep[height];
+    for (size_t i = 0; i < height; ++i)
+        out_rows[i] = *out_bytes + (width*i*3);
+
+    png_read_rows(png, out_rows, NULL, height);
+    png_destroy_read_struct(&png, &info, NULL);
+    delete[] out_rows;
+
+    return (int)im_size;
+}
diff --git a/neozip/test/benchmarks/benchmark_slidehash.cc b/neozip/test/benchmarks/benchmark_slidehash.cc
new file mode 100644
index 0000000000..e74c06e873
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_slidehash.cc
@@ -0,0 +1,116 @@
+/* benchmark_slidehash.cc -- benchmark slide_hash variants
+ * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <limits.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# include "deflate.h"
+# include "arch_functions.h"
+# include "../test_cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS 32768
+
+/* Fixture that builds a minimal deflate_state (just head/prev hash tables
+ * filled with pseudorandom values) so individual slide_hash implementations
+ * can be timed in isolation. */
+class slide_hash: public benchmark::Fixture {
+private:
+    uint16_t *l0;        // becomes s->head (HASH_SIZE entries)
+    uint16_t *l1;        // becomes s->prev (MAX_RANDOM_INTS entries)
+    deflate_state *s_g;  // minimal state wired to the two tables above
+
+public:
+    /**
+     * @brief Prepare the benchmark fixture by allocating and initializing working data.
+     *
+     * Allocates two 64-byte-aligned arrays of `uint16_t` (one of size HASH_SIZE, one of size MAX_RANDOM_INTS),
+     * fills them with pseudorandom `uint16_t` values, allocates a `deflate_state` structure, and sets
+     * its `head` and `prev` pointers to the allocated arrays.
+     *
+     * @param state Benchmark-provided state object from Google Benchmark (supplied by the framework).
+     */
+    void SetUp(const ::benchmark::State&) {
+        l0 = (uint16_t *)zng_alloc_aligned(HASH_SIZE * sizeof(uint16_t), 64);
+
+        for (uint32_t i = 0; i < HASH_SIZE; i++) {
+            l0[i] = (uint16_t)rand();
+        }
+
+        l1 = (uint16_t *)zng_alloc_aligned(MAX_RANDOM_INTS * sizeof(uint16_t), 64);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            l1[i] = (uint16_t)rand();
+        }
+
+        // NOTE(review): allocations are not checked; a failure here crashes
+        // the benchmark, which is tolerated in this test harness.
+        deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
+        s->head = l0;
+        s->prev = l1;
+        s_g = s;
+    }
+
+    /**
+     * @brief Time one slide_hash implementation.
+     *
+     * Sets the window size from the benchmark argument, then invokes the
+     * given function pointer once per iteration of the timed loop.
+     *
+     * @param state      Google Benchmark state (supplies range(0) = w_size).
+     * @param slide_hash Implementation under test.
+     */
+    void Bench(benchmark::State& state, slide_hash_func slide_hash) {
+        s_g->w_size = (uint32_t)state.range(0);
+
+        for (auto _ : state) {
+            slide_hash(s_g);
+            benchmark::DoNotOptimize(s_g);
+        }
+    }
+
+    /* Release the hash tables and the deflate_state allocated in SetUp(). */
+    void TearDown(const ::benchmark::State&) {
+        zng_free_aligned(l0);
+        zng_free_aligned(l1);
+        free(s_g);
+    }
+};
+
+/* Define and register one fixture benchmark per slide_hash implementation.
+ * When the CPU lacks the required feature the run is marked skipped; per
+ * Google Benchmark's documented SkipWithError() semantics the
+ * `for (auto _ : state)` loop in Bench() then executes zero iterations,
+ * so the unsupported function pointer is never actually called. */
+#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(512, MAX_RANDOM_INTS);
+
+/* Register the benchmarks.  The portable C implementation is built on
+ * non-x86-64 targets or when all fallbacks are explicitly requested; each
+ * SIMD variant below is guarded both at compile time (feature macro) and at
+ * run time (test_cpu_features flag passed to BENCHMARK_SLIDEHASH). */
+#if defined(WITH_ALL_FALLBACKS) || !(defined(__x86_64__) || defined(_M_X64))
+BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+#endif
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+/* Runtime detection disabled: only the statically selected implementation
+ * is available, and it is assumed supported. */
+BENCHMARK_SLIDEHASH(native, native_slide_hash, 1);
+#else
+
+#ifdef ARM_SIMD
+BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd);
+#endif
+#ifdef ARM_NEON
+BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon);
+#endif
+#ifdef POWER8_VSX
+BENCHMARK_SLIDEHASH(power8, slide_hash_power8, test_cpu_features.power.has_arch_2_07);
+#endif
+#ifdef PPC_VMX
+BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, test_cpu_features.power.has_altivec);
+#endif
+#ifdef RISCV_RVV
+BENCHMARK_SLIDEHASH(rvv, slide_hash_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+#ifdef X86_SSE2
+BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
+#endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
+
+#endif \ No newline at end of file
diff --git a/neozip/test/benchmarks/benchmark_uncompress.cc b/neozip/test/benchmarks/benchmark_uncompress.cc
new file mode 100644
index 0000000000..6a82c05d01
--- /dev/null
+++ b/neozip/test/benchmarks/benchmark_uncompress.cc
@@ -0,0 +1,97 @@
+/* benchmark_uncompress.cc -- benchmark uncompress()
+ * Copyright (C) 2024-2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# if defined(ZLIB_COMPAT)
+# include "zlib.h"
+# else
+# include "zlib-ng.h"
+# endif
+# include "test/compressible_data_p.h"
+}
+
+#define MAX_SIZE (1024 * 1024)
+#define NUM_TESTS 6
+
+/* Fixture that pre-compresses several buffer sizes once, so the timed loop
+ * measures PREFIX(uncompress)() only. */
+class uncompress_bench: public benchmark::Fixture {
+private:
+    /* NULL-initialized so TearDown() can free them safely even when SetUp()
+     * bailed out early via SkipWithError() (previously these were left
+     * uninitialized and freeing them was undefined behavior). */
+    uint8_t *inbuff = NULL;
+    uint8_t *outbuff = NULL;
+    uint8_t *compressed_buff[NUM_TESTS] = {NULL};
+    z_uintmax_t compressed_sizes[NUM_TESTS] = {0};
+    /* Uncompressed payload sizes; must match the Arg() list used when the
+     * benchmark is registered below. */
+    uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024};
+
+public:
+    /* Generate MAX_SIZE bytes of compressible input and pre-compress one
+     * buffer per test size.  On any failure the benchmark is skipped;
+     * TearDown() handles partial allocation. */
+    void SetUp(::benchmark::State& state) {
+        outbuff = (uint8_t *)malloc(MAX_SIZE + 16);
+        if (outbuff == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        // Initialize input buffer with highly compressible data, interspersed
+        // with small amounts of random data and 3-byte matches.
+        inbuff = gen_compressible_data(MAX_SIZE);
+        if (inbuff == NULL) {
+            state.SkipWithError("gen_compressible_data() failed");
+            return;
+        }
+
+        // Compress data into different buffers
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            compressed_buff[i] = (uint8_t *)zng_alloc(sizes[i] + 64);
+            if (compressed_buff[i] == NULL) {
+                // Explicit check instead of assert(): stays active in
+                // NDEBUG (release) builds.
+                state.SkipWithError("zng_alloc failed");
+                return;
+            }
+
+            z_uintmax_t compressed_size = sizes[i] + 64;
+            int err = PREFIX(compress2)(compressed_buff[i], &compressed_size, inbuff, sizes[i], Z_BEST_COMPRESSION);
+            if (err != Z_OK) {
+                fprintf(stderr, "compress() failed with error %d\n", err);
+                abort();
+            }
+            compressed_sizes[i] = compressed_size;
+        }
+    }
+
+    /* Repeatedly uncompress the pre-compressed buffer whose original size
+     * equals state.range(0). */
+    void Bench(benchmark::State& state) {
+        // Resolve the benchmark argument to a test index once, outside the
+        // timed loop, so the linear search is not part of the measurement.
+        int index = 0;
+        while (sizes[index] != (uint32_t)state.range(0)) ++index;
+
+        for (auto _ : state) {
+            z_uintmax_t out_size = MAX_SIZE;
+            int err = PREFIX(uncompress)(outbuff, &out_size, compressed_buff[index], compressed_sizes[index]);
+            if (err != Z_OK) {
+                fprintf(stderr, "uncompress() failed with error %d\n", err);
+                abort();
+            }
+        }
+    }
+
+    void TearDown(const ::benchmark::State&) {
+        free(inbuff);
+        free(outbuff);
+
+        for (int i = 0; i < NUM_TESTS; ++i) {
+            // NOTE(review): guarded in case zng_free() is not NULL-safe.
+            if (compressed_buff[i] != NULL)
+                zng_free(compressed_buff[i]);
+        }
+    }
+};
+
+/* Define and register the uncompress benchmark over the same payload sizes
+ * that SetUp() pre-compressed (1 B .. 1 MiB). */
+#define BENCHMARK_UNCOMPRESS(name) \
+    BENCHMARK_DEFINE_F(uncompress_bench, name)(benchmark::State& state) { \
+        Bench(state); \
+    } \
+    BENCHMARK_REGISTER_F(uncompress_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10);
+
+BENCHMARK_UNCOMPRESS(uncompress_bench);