diff options
| author | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
|---|---|---|
| committer | Mehmet Samet Duman <yongdohyun@projecttick.org> | 2026-04-02 19:56:09 +0300 |
| commit | 7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch) | |
| tree | b43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/test/benchmarks | |
| parent | b1e34e861b5d732afe828d58aad2c638135061fd (diff) | |
| parent | c2712b8a345191f6ed79558c089777df94590087 (diff) | |
| download | Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip | |
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip
git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd
git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/test/benchmarks')
18 files changed, 2092 insertions, 0 deletions
diff --git a/neozip/test/benchmarks/CMakeLists.txt b/neozip/test/benchmarks/CMakeLists.txt new file mode 100644 index 0000000000..df6f5a7e69 --- /dev/null +++ b/neozip/test/benchmarks/CMakeLists.txt @@ -0,0 +1,126 @@ +cmake_minimum_required(VERSION 3.14...4.2.1) + +include(FetchContent) + +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) +endif() +if(NOT DEFINED CMAKE_CXX_STANDARD_REQUIRED) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() +if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS ON) +endif() + +# Search for Google benchmark package +find_package(benchmark QUIET) +if(NOT benchmark_FOUND) + # Fetch google benchmark source code from official repository + set(BENCHMARK_ENABLE_TESTING OFF) + + # Allow specifying alternative Google benchmark repository + if(NOT DEFINED GBENCHMARK_REPOSITORY) + set(GBENCHMARK_REPOSITORY https://github.com/google/benchmark.git) + endif() + if(NOT DEFINED GBENCHMARK_TAG) + set(GBENCHMARK_TAG v1.9.4) + endif() + + FetchContent_Declare(benchmark + GIT_REPOSITORY ${GBENCHMARK_REPOSITORY} + GIT_TAG ${GBENCHMARK_TAG} + ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL}) + + ZNG_FetchContent_MakeAvailable(benchmark) +endif() + +# Public API benchmarks +set(BENCH_PUBLIC_SRCS + benchmark_compress.cc + benchmark_inflate.cc + benchmark_uncompress.cc + benchmark_main.cc + ) + +# Internal benchmarks +set(BENCH_INTERNAL_SRCS + benchmark_adler32.cc + benchmark_adler32_copy.cc + benchmark_compare256.cc + benchmark_compare256_rle.cc + benchmark_crc32.cc + benchmark_crc32_copy.cc + benchmark_deflate.cc + benchmark_insert_string.cc + benchmark_slidehash.cc + ) + +add_executable(benchmark_zlib ${BENCH_PUBLIC_SRCS}) + +target_compile_definitions(benchmark_zlib PRIVATE -DBENCHMARK_STATIC_DEFINE) +target_include_directories(benchmark_zlib PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_BINARY_DIR} + ${benchmark_SOURCE_DIR}/benchmark/include) + +target_link_libraries(benchmark_zlib benchmark::benchmark) +if(ZLIB_LIBRARY) + 
target_link_libraries(benchmark_zlib ${ZLIB_LIBRARY}) +else() + target_sources(benchmark_zlib PRIVATE ${BENCH_INTERNAL_SRCS}) + target_link_libraries(benchmark_zlib zlib-ng-static) +endif() + +if(WIN32) + target_link_libraries(benchmark_zlib shlwapi) +endif() + +add_test(NAME benchmark_zlib + COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib> "--benchmark_min_time=0") + +if(WITH_BENCHMARK_APPS) + option(BUILD_ALT_BENCH "Link against alternative zlib implementation" OFF) + + # Search for libpng package + find_package(PNG QUIET) + + if(NOT PNG_FOUND) + FetchContent_Declare(PNG + GIT_REPOSITORY https://github.com/glennrp/libpng.git + ${ZNG_FetchContent_Declare_EXCLUDE_FROM_ALL}) + + ZNG_FetchContent_MakeAvailable(PNG) + set(PNG_INCLUDE_DIR ${png_SOURCE_DIR}) + endif() + + set(BENCH_APP_SRCS + benchmark_png_encode.cc + benchmark_png_decode.cc + benchmark_main.cc + ) + + add_executable(benchmark_zlib_apps ${BENCH_APP_SRCS}) + + if(DEFINED BUILD_ALT_BENCH) + set(ZLIB_ALT_LIB "libz.a" CACHE FILEPATH "Optional alternative zlib implementation (defaults to stock zlib)") + add_executable(benchmark_zlib_apps_alt ${BENCH_APP_SRCS}) + target_link_libraries(benchmark_zlib_apps_alt libpng.a ${ZLIB_ALT_LIB} benchmark::benchmark) + target_compile_definitions(benchmark_zlib_apps_alt PRIVATE BUILD_ALT=1) + target_include_directories(benchmark_zlib_apps_alt PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_BINARY_DIR} + ${PNG_INCLUDE_DIR} + ${benchmark_SOURCE_DIR}/benchmark/include) + endif() + + target_include_directories(benchmark_zlib_apps PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_BINARY_DIR} + ${PNG_INCLUDE_DIR} + ${benchmark_SOURCE_DIR}/benchmark/include) + + # We need the static png library if we're statically linking to zlib, + # otherwise it will resolve these things in the system provided dynamic + # libraries (likely linked to stock zlib) + target_link_libraries(benchmark_zlib_apps libpng.a zlib-ng-static benchmark::benchmark) +endif() diff --git 
a/neozip/test/benchmarks/README.md b/neozip/test/benchmarks/README.md new file mode 100644 index 0000000000..08ccea233e --- /dev/null +++ b/neozip/test/benchmarks/README.md @@ -0,0 +1,63 @@ +## Benchmarks +These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark). + +*Repetitions* + +To increase the number of times each benchmark iteration is run use: + +``` +--benchmark_repetitions=20 +``` + +*Filters* + +To filter out which benchmarks are performed use: + +``` +--benchmark_filter="adler32*" +``` + +There are two different benchmarks, micro and macro. + +### Benchmark benchmark_zlib +These are microbenchmarks intended to test lower level subfunctions of the library. + +Benchmarks include implementations of: + - Adler32 + - CRC + - 256 byte comparisons + - SIMD accelerated "slide hash" routine + +By default these benchmarks report things on the nanosecond scale and are small enough +to measure very minute differences. + +*Alternative zlib library* + +To benchmark against an alternative zlib-compatible library, use the `ZLIB_LIBRARY` +CMake argument. When set, only the public API benchmarks are built: + +```sh +cmake -S . -B build-alt \ + -DZLIB_COMPAT=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DBUILD_TESTING=ON \ + -DWITH_BENCHMARKS=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_RUNTIME_CPU_DETECTION=OFF \ + -DZLIB_LIBRARY=/path/to/libz.a +``` + +### Benchmark benchmark_zlib_apps +These benchmarks measure applications of zlib as a whole. Currently the only examples +are PNG encoding and decoding. The PNG encode and decode tests leverage procedurally +generated and highly compressible image data. + +Additionally, there is a test called `png_decode_realistic` that will decode any RGB 8 BPP encoded +set of PNGs in the working directory under a directory named "test_pngs" with files named +{0..1}.png. If these images do not exist, the test will error out and the benchmark will move +on to the next set of benchmarks. 
+ +*benchmark_zlib_apps_alt* + +The user can compile a comparison benchmark application linking to any zlib-compatible +implementation of his or her choosing. diff --git a/neozip/test/benchmarks/benchmark_adler32.cc b/neozip/test/benchmarks/benchmark_adler32.cc new file mode 100644 index 0000000000..5ee9102e23 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_adler32.cc @@ -0,0 +1,121 @@ +/* benchmark_adler32.cc -- benchmark adler32 variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +#define BUFSIZE ((4 * 1024 * 1024) + 64) + +class adler32: public benchmark::Fixture { +private: + uint32_t *testdata; + +public: + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + // Benchmark Adler32, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) { + int misalign = 0; + uint32_t hash = 0; + + for (auto _ : state) { + hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 
16 : 1; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(hash); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(testdata); + } +}; + +#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 0); \ + } \ + BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \ + BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \ + BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag); + +BENCHMARK_ADLER32(c, adler32_c, 1); + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +BENCHMARK_ADLER32(native, native_adler32, 1); +#else + +#ifdef ARM_NEON +BENCHMARK_ADLER32(neon, adler32_neon, test_cpu_features.arm.has_neon); +#endif + +#ifdef PPC_VMX +BENCHMARK_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec); +#endif +#ifdef POWER8_VSX +BENCHMARK_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07); +#endif + +#ifdef RISCV_RVV +BENCHMARK_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv); +#endif + +#ifdef X86_SSSE3 +BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3); +#endif +#ifdef X86_AVX2 
+BENCHMARK_ADLER32(avx2, adler32_avx2, test_cpu_features.x86.has_avx2); +#endif +#ifdef X86_AVX512 +BENCHMARK_ADLER32(avx512, adler32_avx512, test_cpu_features.x86.has_avx512_common); +#endif +#ifdef X86_AVX512VNNI +BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni); +#endif + +#ifdef LOONGARCH_LSX +BENCHMARK_ADLER32(lsx, adler32_lsx, test_cpu_features.loongarch.has_lsx); +#endif +#ifdef LOONGARCH_LASX +BENCHMARK_ADLER32(lasx, adler32_lasx, test_cpu_features.loongarch.has_lasx); +#endif + +#endif diff --git a/neozip/test/benchmarks/benchmark_adler32_copy.cc b/neozip/test/benchmarks/benchmark_adler32_copy.cc new file mode 100644 index 0000000000..6d913b1d19 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_adler32_copy.cc @@ -0,0 +1,176 @@ +/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +// Hash copy functions are used on strm->next_in buffers, we process +// 512-32k sizes (x2 for initial fill) at a time if enough data is available. 
+#define BUFSIZE (65536 + 64) + +class adler32_copy: public benchmark::Fixture { +private: + uint32_t *testdata; + uint8_t *dstbuf; + +public: + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL || dstbuf == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) { + int misalign = 0; + uint32_t hash = 0; + + for (auto _ : state) { + hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(hash); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(testdata); + zng_free_aligned(dstbuf); + } +}; + +// Misaligned +#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 0); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + + +// Adler32 
+ memcpy benchmarks for reference +#ifdef HASH_BASELINE +#define MEMCPY_NAME(name) name##_memcpy +#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, [](uint32_t init_sum, unsigned char *dst, \ + const uint8_t *buf, size_t len) -> uint32_t { \ + memcpy(dst, buf, (size_t)len); \ + return hashfunc(init_sum, buf, len); \ + }, 0); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned +#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, [](uint32_t init_sum, unsigned char *dst, \ + const uint8_t *buf, size_t len) -> uint32_t { \ + memcpy(dst, buf, (size_t)len); \ + return hashfunc(init_sum, buf, len); \ + }, 1); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); +#endif + + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \ + BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); + +// Optionally also benchmark using memcpy with normal hash function for baseline +#ifdef HASH_BASELINE +#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \ + BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, 
copyfunc, support_flag); +#else +#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \ + BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) +#endif + +BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1); + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1); +#else + +#ifdef ARM_NEON +BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon); +#endif + +#ifdef PPC_VMX +BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec); +#endif +#ifdef POWER8_VSX +BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07); +#endif + +#ifdef RISCV_RVV +BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv); +#endif + +#ifdef X86_SSSE3 +BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3); +#endif +#ifdef X86_SSE42 +// There is no adler32_sse42, so only test the copy variant +BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42); +#endif +#ifdef X86_AVX2 +BENCHMARK_ADLER32_COPY(avx2, adler32_avx, adler32_copy_avx2, test_cpu_features.x86.has_avx2); +#endif +#ifdef X86_AVX512 +BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common); +#endif +#ifdef X86_AVX512VNNI +BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni); +#endif + +#ifdef LOONGARCH_LSX +BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx); +#endif +#ifdef LOONGARCH_LASX +BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx); +#endif + +#endif diff --git a/neozip/test/benchmarks/benchmark_compare256.cc b/neozip/test/benchmarks/benchmark_compare256.cc new file mode 100644 index 
0000000000..2d8352879d --- /dev/null +++ b/neozip/test/benchmarks/benchmark_compare256.cc @@ -0,0 +1,106 @@ +/* benchmark_compare256.cc -- benchmark compare256 variants + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +#define MAX_COMPARE_SIZE (256 + 64) + +class compare256: public benchmark::Fixture { +private: + uint8_t *str1; + uint8_t *str2; + +public: + void SetUp(::benchmark::State& state) { + str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE); + str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE); + if (str1 == NULL || str2 == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + memset(str1, 'a', MAX_COMPARE_SIZE); + memset(str2, 'a', MAX_COMPARE_SIZE); + } + + // Benchmark compare256, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, compare256_func compare256) { + int misalign = 0; + int32_t match_len = (int32_t)state.range(0) - 1; + uint32_t len = 0; + + for (auto _ : state) { + str2[match_len + misalign] = 0; // Set new match limit + + len = compare256((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign); + + str2[match_len + misalign] = 'a'; // Reset match limit + + if (misalign >= 63) + misalign = 0; + else + misalign++; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(len); + } + } + + void TearDown(const ::benchmark::State&) { + free(str1); + free(str2); + } +}; + +#define BENCHMARK_COMPARE256(name, comparefunc, support_flag) \ + BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, comparefunc); \ + } \ + BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256); + +#ifdef 
DISABLE_RUNTIME_CPU_DETECTION +BENCHMARK_COMPARE256(native, native_compare256, 1); +#else + +#ifdef WITH_ALL_FALLBACKS +BENCHMARK_COMPARE256(8, compare256_8, 1); +BENCHMARK_COMPARE256(64, compare256_64, 1); +#endif + +#ifdef X86_SSE2 +BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2); +#endif +#ifdef X86_AVX2 +BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2); +#endif +#ifdef X86_AVX512 +BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common); +#endif +#ifdef ARM_NEON +BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon); +#endif +#ifdef POWER9 +BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00); +#endif +#ifdef RISCV_RVV +BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv); +#endif +#ifdef LOONGARCH_LSX +BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx); +#endif +#ifdef LOONGARCH_LASX +BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx); +#endif + +#endif diff --git a/neozip/test/benchmarks/benchmark_compare256_rle.cc b/neozip/test/benchmarks/benchmark_compare256_rle.cc new file mode 100644 index 0000000000..db5adacc19 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_compare256_rle.cc @@ -0,0 +1,72 @@ +/* benchmark_compare256_rle.cc -- benchmark compare256_rle variants + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "compare256_rle.h" +} + +#define MAX_COMPARE_SIZE (256 + 64) + +class compare256_rle: public benchmark::Fixture { +private: + uint8_t *str1; + uint8_t *str2; + +public: + void SetUp(::benchmark::State& state) { + str1 = (uint8_t *)malloc(MAX_COMPARE_SIZE); + str2 = (uint8_t *)malloc(MAX_COMPARE_SIZE); + if (str1 == NULL || str2 == NULL) { + state.SkipWithError("malloc 
failed"); + return; + } + + memset(str1, 'a', MAX_COMPARE_SIZE); + memset(str2, 'a', MAX_COMPARE_SIZE); + } + + // Benchmark compare256_rle, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, compare256_rle_func compare256_rle) { + int misalign = 0; + int32_t match_len = (int32_t)state.range(0) - 1; + uint32_t len = 0; + + for (auto _ : state) { + str2[match_len + misalign] = 0; // Set new match limit + + len = compare256_rle((const uint8_t *)str1 + misalign, (const uint8_t *)str2 + misalign); + + str2[match_len + misalign] = 'a'; // Reset match limit + + if (misalign >= 63) + misalign = 0; + else + misalign++; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(len); + } + } + + void TearDown(const ::benchmark::State&) { + free(str1); + free(str2); + } +}; + +#define BENCHMARK_COMPARE256_RLE(name, comparefunc, support_flag) \ + BENCHMARK_DEFINE_F(compare256_rle, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, comparefunc); \ + } \ + BENCHMARK_REGISTER_F(compare256_rle, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);; + +BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1); +BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1); diff --git a/neozip/test/benchmarks/benchmark_compress.cc b/neozip/test/benchmarks/benchmark_compress.cc new file mode 100644 index 0000000000..df042f7153 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_compress.cc @@ -0,0 +1,75 @@ +/* benchmark_compress.cc -- benchmark compress() + * Copyright (C) 2024-2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# if defined(ZLIB_COMPAT) +# include "zlib.h" +# else +# include "zlib-ng.h" +# endif +# include 
"test/compressible_data_p.h" +} + +#define MAX_SIZE (64 * 1024) + +class compress_bench: public benchmark::Fixture { +private: + uint8_t *inbuff; + uint8_t *outbuff; + +public: + void SetUp(::benchmark::State& state) { + outbuff = (uint8_t *)malloc(MAX_SIZE + 16); + if (outbuff == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + // Initialize input buffer with highly compressible data, interspersed + // with small amounts of random data and 3-byte matches. + inbuff = gen_compressible_data(MAX_SIZE); + if (inbuff == NULL) { + free(outbuff); + outbuff = NULL; + state.SkipWithError("gen_compressible_data() failed"); + return; + } + } + + void Bench(benchmark::State& state) { + int err = 0; + + for (auto _ : state) { + z_uintmax_t compressed_size = MAX_SIZE + 16; + err = PREFIX(compress)(outbuff, &compressed_size, inbuff, (size_t)state.range(0)); + if (err != Z_OK) { + fprintf(stderr, "compress() failed with error %d\n", err); + abort(); + } + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(err); + } + } + + void TearDown(const ::benchmark::State&) { + free(inbuff); + free(outbuff); + } +}; + +#define BENCHMARK_COMPRESS(name) \ + BENCHMARK_DEFINE_F(compress_bench, name)(benchmark::State& state) { \ + Bench(state); \ + } \ + BENCHMARK_REGISTER_F(compress_bench, name)->Arg(1)->Arg(16)->Arg(48)->Arg(256)->Arg(1<<10)->Arg(4<<10)->Arg(16<<10)->Arg(64<<10); + +BENCHMARK_COMPRESS(compress_bench); diff --git a/neozip/test/benchmarks/benchmark_crc32.cc b/neozip/test/benchmarks/benchmark_crc32.cc new file mode 100644 index 0000000000..1e2cf88590 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_crc32.cc @@ -0,0 +1,125 @@ +/* benchmark_crc32.cc -- benchmark crc32 variants + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} 
+ +#define BUFSIZE ((4 * 1024 * 1024) + 64) + +class crc32: public benchmark::Fixture { +private: + uint32_t *testdata; + +public: + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + // Benchmark CRC32, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) { + int misalign = 0; + uint32_t hash = 0; + + for (auto _ : state) { + hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(hash); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(testdata); + } +}; + +#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 0); \ + } \ + BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_CRC32(name, hashfunc, support_flag) \ + 
BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \ + BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag); + +BENCHMARK_CRC32(braid, crc32_braid, 1); + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +BENCHMARK_CRC32(native, native_crc32, 1); +#else + +#ifndef WITHOUT_CHORBA +BENCHMARK_CRC32(chorba_c, crc32_chorba, 1); +#endif +#ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2); +# endif +# ifdef X86_SSE41 + BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41); +# endif +#endif +#ifdef ARM_CRC32 +BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32); +#endif +#ifdef ARM_PMULL_EOR3 +BENCHMARK_CRC32(armv8_pmull_eor3, crc32_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3); +#endif +#ifdef RISCV_CRC32_ZBC +BENCHMARK_CRC32(riscv, crc32_riscv64_zbc, test_cpu_features.riscv.has_zbc); +#endif +#ifdef POWER8_VSX_CRC32 +BENCHMARK_CRC32(power8, crc32_power8, test_cpu_features.power.has_arch_2_07); +#endif +#ifdef S390_CRC32_VX +BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx); +#endif +#ifdef X86_PCLMULQDQ_CRC +BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq); +#endif +#ifdef X86_VPCLMULQDQ_AVX2 +BENCHMARK_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq)); +#endif +#ifdef X86_VPCLMULQDQ_AVX512 +BENCHMARK_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)); +#endif +#ifdef LOONGARCH_CRC +BENCHMARK_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc); +#endif + +#endif diff --git a/neozip/test/benchmarks/benchmark_crc32_copy.cc b/neozip/test/benchmarks/benchmark_crc32_copy.cc new file mode 
100644 index 0000000000..eafa5aee44 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_crc32_copy.cc @@ -0,0 +1,177 @@ +/* benchmark_crc32_copy.cc -- benchmark for crc32 implementations with copying + * Copyright (C) 2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +// Hash copy functions are used on strm->next_in buffers, we process +// 512-32k sizes (x2 for initial fill) at a time if enough data is available. +#define BUFSIZE (65536 + 64) + +class crc32_copy: public benchmark::Fixture { +protected: + uint32_t *testdata; + uint8_t *dstbuf; + +public: + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL || dstbuf == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) { + int misalign = 0; + uint32_t hash = 0; + + for (auto _ : state) { + hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 
16 : 1; + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(hash); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(testdata); + zng_free_aligned(dstbuf); + } +}; + +// Misaligned +#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 0); \ + } \ + BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// CRC32 + memcpy benchmarks for reference +#ifdef HASH_BASELINE +#define MEMCPY_NAME(name) name##_memcpy +#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, [](uint32_t init_sum, unsigned char *dst, \ + const uint8_t *buf, size_t len) -> uint32_t { \ + memcpy(dst, buf, (size_t)len); \ + return hashfunc(init_sum, buf, len); \ + }, 0); \ + } \ + BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned +#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not 
support " #name); \ + } \ + Bench(state, [](uint32_t init_sum, unsigned char *dst, \ + const uint8_t *buf, size_t len) -> uint32_t { \ + memcpy(dst, buf, (size_t)len); \ + return hashfunc(init_sum, buf, len); \ + }, 1); \ + } \ + BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); +#endif + + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \ + BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); + +// Optionally also benchmark using memcpy with normal hash function for baseline +#ifdef HASH_BASELINE +// Note: the MEMCPY baseline variants must receive the plain 3-arg hash function +// (hashfunc), not the 4-arg copy variant; their lambdas do the memcpy themselves. +#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \ + BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \ + BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag); +#else +#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \ + BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) +#endif + +// Base test +BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1); + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + // Native + BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1) +#else + // Optimized functions +# ifndef WITHOUT_CHORBA + BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1) +# endif +# ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2); +# endif +# ifdef X86_SSE41 + BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41); +# endif +# endif +# ifdef ARM_CRC32 + BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32) +# endif +# ifdef ARM_PMULL_EOR3 
+ BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3) +# endif +# ifdef LOONGARCH_CRC + BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc) +# endif +# ifdef POWER8_VSX_CRC32 + BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07) +# endif +# ifdef RISCV_CRC32_ZBC + BENCHMARK_CRC32_COPY(riscv, crc32_riscv, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc) +# endif +# ifdef S390_CRC32_VX + BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx) +# endif +# ifdef X86_PCLMULQDQ_CRC + BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq) +# endif +# ifdef X86_VPCLMULQDQ_AVX2 + BENCHMARK_CRC32_COPY(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq)) +# endif +# ifdef X86_VPCLMULQDQ_AVX512 + BENCHMARK_CRC32_COPY(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) +# endif + +#endif diff --git a/neozip/test/benchmarks/benchmark_deflate.cc b/neozip/test/benchmarks/benchmark_deflate.cc new file mode 100644 index 0000000000..f60e2589d1 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_deflate.cc @@ -0,0 +1,147 @@ +/* benchmark_deflate.cc -- benchmark deflate() with various levels and raw mode + * Copyright (C) 2026 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# if 
defined(ZLIB_COMPAT) +# include "zlib.h" +# else +# include "zlib-ng.h" +# endif +# include "test/compressible_data_p.h" +} + +#define MAX_SIZE (1024 * 1024) + +/* Parameterized deflate benchmark: Args(size, level) */ +class deflate_bench: public benchmark::Fixture { +private: + uint8_t *inbuff = nullptr; + uint8_t *outbuff = nullptr; + z_uintmax_t outbuff_size = 0; + +public: + void SetUp(::benchmark::State& state) { + outbuff_size = PREFIX(deflateBound)(NULL, MAX_SIZE); + outbuff = (uint8_t *)malloc(outbuff_size); + if (outbuff == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + inbuff = gen_compressible_data(MAX_SIZE); + if (inbuff == NULL) { + free(outbuff); + outbuff = NULL; + state.SkipWithError("gen_compressible_data() failed"); + return; + } + } + + void Bench(benchmark::State& state, int window_bits, int strategy = Z_DEFAULT_STRATEGY) { + int err; + size_t size = (size_t)state.range(0); + int level = (int)state.range(1); + + PREFIX3(stream) strm; + strm.zalloc = NULL; + strm.zfree = NULL; + strm.opaque = NULL; + strm.total_in = 0; + strm.total_out = 0; + strm.next_out = NULL; + strm.avail_out = 0; + + err = PREFIX(deflateInit2)(&strm, level, Z_DEFLATED, window_bits, MAX_MEM_LEVEL, strategy); + if (err != Z_OK) { + state.SkipWithError("deflateInit2 did not return Z_OK"); + return; + } + + for (auto _ : state) { + err = PREFIX(deflateReset)(&strm); + if (err != Z_OK) { + state.SkipWithError("deflateReset did not return Z_OK"); + PREFIX(deflateEnd)(&strm); + return; + } + + strm.avail_in = (uint32_t)size; + strm.next_in = (z_const uint8_t *)inbuff; + strm.next_out = outbuff; + strm.avail_out = (uint32_t)outbuff_size; + + err = PREFIX(deflate)(&strm, Z_FINISH); + if (err != Z_STREAM_END) { + state.SkipWithError("deflate did not return Z_STREAM_END"); + PREFIX(deflateEnd)(&strm); + return; + } + } + + err = PREFIX(deflateEnd)(&strm); + if (err != Z_OK) { + state.SkipWithError("deflateEnd did not return Z_OK"); + return; + } + } + + void 
TearDown(const ::benchmark::State&) { + free(inbuff); + free(outbuff); + } +}; + +#define BENCHMARK_DEFLATE_ARGS \ + ->Args({1024, 1})->Args({1024, 3})->Args({1024, 6})->Args({1024, 9}) \ + ->Args({16384, 1})->Args({16384, 3})->Args({16384, 6})->Args({16384, 9}) \ + ->Args({131072, 1})->Args({131072, 3})->Args({131072, 6})->Args({131072, 9}) \ + ->Args({1048576, 1})->Args({1048576, 3})->Args({1048576, 6})->Args({1048576, 9}) + +/* Parameterized deflate with zlib wrapping (includes adler32 checksum) */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_level)(benchmark::State& state) { + Bench(state, MAX_WBITS); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_level) BENCHMARK_DEFLATE_ARGS; + +/* Parameterized raw deflate without checksum */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_nocrc)(benchmark::State& state) { + Bench(state, -MAX_WBITS); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_nocrc) BENCHMARK_DEFLATE_ARGS; + +/* Strategy benchmarks use fewer size/level combos to keep test count reasonable */ +#define BENCHMARK_DEFLATE_STRATEGY_ARGS \ + ->Args({1024, 1})->Args({1024, 6})->Args({1024, 9}) \ + ->Args({1048576, 1})->Args({1048576, 6})->Args({1048576, 9}) + +/* Parameterized deflate with filtered strategy */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_filtered)(benchmark::State& state) { + Bench(state, MAX_WBITS, Z_FILTERED); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_filtered) BENCHMARK_DEFLATE_STRATEGY_ARGS; + +/* Parameterized deflate with Huffman-only strategy */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_huffman)(benchmark::State& state) { + Bench(state, MAX_WBITS, Z_HUFFMAN_ONLY); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_huffman) BENCHMARK_DEFLATE_STRATEGY_ARGS; + +/* Parameterized deflate with RLE strategy */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_rle)(benchmark::State& state) { + Bench(state, MAX_WBITS, Z_RLE); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_rle) BENCHMARK_DEFLATE_STRATEGY_ARGS; + +/* Parameterized deflate with fixed Huffman 
codes */ +BENCHMARK_DEFINE_F(deflate_bench, deflate_fixed)(benchmark::State& state) { + Bench(state, MAX_WBITS, Z_FIXED); +} +BENCHMARK_REGISTER_F(deflate_bench, deflate_fixed) BENCHMARK_DEFLATE_STRATEGY_ARGS; diff --git a/neozip/test/benchmarks/benchmark_inflate.cc b/neozip/test/benchmarks/benchmark_inflate.cc new file mode 100644 index 0000000000..ac6ef7229f --- /dev/null +++ b/neozip/test/benchmarks/benchmark_inflate.cc @@ -0,0 +1,169 @@ +/* benchmark_inflate.cc -- benchmark inflate() without crc32/adler32 + * Copyright (C) 2024-2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# if defined(ZLIB_COMPAT) +# include "zlib.h" +# else +# include "zlib-ng.h" +# endif +# include "test/compressible_data_p.h" +} + +#define MAX_SIZE (1024 * 1024) +#define NUM_TESTS 6 + +class inflate_bench: public benchmark::Fixture { +private: + uint8_t *inbuff; + uint8_t *outbuff; + uint8_t *compressed_buff[NUM_TESTS]; + z_uintmax_t compressed_sizes[NUM_TESTS]; + uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024}; + +public: + void SetUp(::benchmark::State& state) { + int err; + outbuff = (uint8_t *)malloc(MAX_SIZE + 16); + if (outbuff == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + // Initialize input buffer with highly compressible data, interspersed + // with small amounts of random data and 3-byte matches. 
+ inbuff = gen_compressible_data(MAX_SIZE); + if (inbuff == NULL) { + free(outbuff); + outbuff = NULL; + state.SkipWithError("gen_compressible_data() failed"); + return; + } + + // Initialize Deflate state + PREFIX3(stream) strm; + strm.zalloc = NULL; + strm.zfree = NULL; + strm.opaque = NULL; + strm.total_in = 0; + strm.total_out = 0; + strm.next_out = NULL; + strm.avail_out = 0; + + err = PREFIX(deflateInit2)(&strm, Z_BEST_COMPRESSION, Z_DEFLATED, -15, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) { + state.SkipWithError("deflateInit2 did not return Z_OK"); + return; + } + + + // Compress data into different buffers + for (int i = 0; i < NUM_TESTS; ++i) { + compressed_buff[i] = (uint8_t *)malloc(sizes[i] + 64); + if (compressed_buff[i] == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + strm.avail_in = sizes[i]; // Size of the input buffer + strm.next_in = (z_const uint8_t *)inbuff; // Pointer to the input buffer + strm.next_out = compressed_buff[i]; // Pointer to the output buffer + strm.avail_out = sizes[i] + 64; // Maximum size of the output buffer + + err = PREFIX(deflate)(&strm, Z_FINISH); // Perform compression + if (err != Z_STREAM_END ) { + state.SkipWithError("deflate did not return Z_STREAM_END"); + PREFIX(deflateEnd)(&strm); + return; + } + + compressed_sizes[i] = strm.total_out; // Total compressed size + + err = PREFIX(deflateReset)(&strm); // Reset Deflate state + if (err != Z_OK) { + state.SkipWithError("deflateReset did not return Z_OK"); + return; + } + } + + err = PREFIX(deflateEnd)(&strm); // Clean up the deflate stream + if (err != Z_OK) { + state.SkipWithError("deflateEnd did not return Z_OK"); + return; + } + } + + void Bench(benchmark::State& state) { + int err; + int index = 0; + while (sizes[index] != (uint32_t)state.range(0)) ++index; + + // Initialize the inflate stream + PREFIX3(stream) strm; + strm.zalloc = NULL; + strm.zfree = NULL; + strm.opaque = NULL; + strm.next_in = NULL; + strm.avail_in = 0; + + err = 
PREFIX(inflateInit2)(&strm, -15); // Initialize the inflate state, no crc/adler + if (err != Z_OK) { + state.SkipWithError("inflateInit did not return Z_OK"); + return; + } + + for (auto _ : state) { + // Perform reset, avoids benchmarking inflateInit and inflateEnd + err = PREFIX(inflateReset)(&strm); + if (err != Z_OK) { + state.SkipWithError("inflateReset did not return Z_OK"); + return; + } + + strm.avail_in = (uint32_t)compressed_sizes[index]; // Size of the input + strm.next_in = compressed_buff[index]; // Pointer to the compressed data + strm.avail_out = MAX_SIZE; // Max size for output + strm.next_out = outbuff; // Output buffer + + // Perform decompression + err = PREFIX(inflate)(&strm, Z_FINISH); + if (err != Z_STREAM_END) { + state.SkipWithError("inflate did not return Z_STREAM_END"); + PREFIX(inflateEnd)(&strm); + return; + } + } + + // Finalize the inflation process + err = PREFIX(inflateEnd)(&strm); + if (err != Z_OK) { + state.SkipWithError("inflateEnd did not return Z_OK"); + return; + } + } + + void TearDown(const ::benchmark::State&) { + free(inbuff); + free(outbuff); + + for (int i = 0; i < NUM_TESTS; ++i) { + free(compressed_buff[i]); + } + } +}; + +#define BENCHMARK_INFLATE(name) \ + BENCHMARK_DEFINE_F(inflate_bench, name)(benchmark::State& state) { \ + Bench(state); \ + } \ + BENCHMARK_REGISTER_F(inflate_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10); + +BENCHMARK_INFLATE(inflate_nocrc); diff --git a/neozip/test/benchmarks/benchmark_insert_string.cc b/neozip/test/benchmarks/benchmark_insert_string.cc new file mode 100644 index 0000000000..fafba3c4cd --- /dev/null +++ b/neozip/test/benchmarks/benchmark_insert_string.cc @@ -0,0 +1,164 @@ +/* benchmark_insert_string.cc -- benchmark insert_string variants + * Copyright (C) 2025 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <limits.h> +#include <cstring> + +#include <benchmark/benchmark.h> + +extern 
"C" { +# include "zbuild.h" +# include "deflate.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +# include "insert_string_p.h" +} + +#define MAX_WSIZE 32768 +#define TEST_WINDOW_SIZE (MAX_WSIZE * 2) + +typedef uint32_t (* quick_insert_string_cb)(deflate_state *const s, uint32_t str); + +// Base class with common setup/teardown for both insert_string benchmarks +class insert_string_base: public benchmark::Fixture { +protected: + deflate_state *s; + +public: + void SetUp(const ::benchmark::State&) { + s = (deflate_state*)zng_alloc_aligned(sizeof(deflate_state), 64); + memset(s, 0, sizeof(deflate_state)); + + // Set up window parameters + s->w_size = MAX_WSIZE; + s->window_size = TEST_WINDOW_SIZE; + + // Allocate window + s->window = (uint8_t*)zng_alloc_aligned(TEST_WINDOW_SIZE, 64); + + // Allocate hash tables + s->head = (Pos*)zng_alloc_aligned(HASH_SIZE * sizeof(Pos), 64); + s->prev = (Pos*)zng_alloc_aligned(MAX_WSIZE * sizeof(Pos), 64); + + // Initialize hash tables + memset(s->head, 0, HASH_SIZE * sizeof(Pos)); + memset(s->prev, 0, MAX_WSIZE * sizeof(Pos)); + + // Initialize rolling hash state for rolling variant + s->ins_h = 0; + + // Fill window with deterministic data patterns + for (size_t i = 0; i < TEST_WINDOW_SIZE; i++) { + // Create patterns that will exercise the hash function well + s->window[i] = (uint8_t)((i * 17 + (i >> 4) * 31 + (i >> 8) * 13) & 0xFF); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(s->window); + zng_free_aligned(s->head); + zng_free_aligned(s->prev); + zng_free_aligned(s); + } +}; + +class insert_string_bench: public insert_string_base { +public: + void Bench(benchmark::State& state, insert_string_cb insert_func) { + uint32_t str_pos = (uint32_t)state.range(0); // Starting position + uint32_t count = (uint32_t)state.range(1); // Number of strings to insert + + // Ensure we don't go beyond window bounds + if (str_pos + count >= TEST_WINDOW_SIZE - 4) { + state.SkipWithError("Parameters 
exceed window size"); + return; + } + + for (auto _ : state) { + state.PauseTiming(); + + // Reset hash tables to ensure consistent starting state + memset(s->head, 0, HASH_SIZE * sizeof(Pos)); + memset(s->prev, 0, MAX_WSIZE * sizeof(Pos)); + s->ins_h = 0; + + state.ResumeTiming(); + + // Benchmark the insert_string function + insert_func(s, str_pos, count); + } + } +}; + +#define BENCHMARK_INSERT_STRING(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(insert_string_bench, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("Function " #name " not supported"); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(insert_string_bench, name) \ + ->Args({100, 3}) /* Most common case */ \ + ->Args({100, 4}) \ + ->Args({100, 5}) \ + ->Args({100, 7}) \ + ->Args({100, 14}) /* Mid-range cluster */ \ + ->Args({100, 32}) /* Transition point */ \ + ->Args({100, 127}) /* Large cluster around powers of 2 */ \ + ->Args({100, 255}) /* Near maximum observed values */ \ + ->Unit(benchmark::kNanosecond); + +// Benchmark the standard integer hash variant +BENCHMARK_INSERT_STRING(integer_hash, ::insert_string, 1); + +// Benchmark the rolling hash variant +BENCHMARK_INSERT_STRING(rolling_hash, ::insert_string_roll, 1); + +// Additional benchmark class for quick_insert_string functions +class quick_insert_string_bench: public insert_string_base { +public: + void Bench(benchmark::State& state, quick_insert_string_cb quick_insert_func) { + uint32_t start_pos = (uint32_t)state.range(0); // Starting position + uint32_t count = (uint32_t)state.range(1); // Number of insertions + + if (start_pos + count >= TEST_WINDOW_SIZE - 4) { + state.SkipWithError("Parameters exceed window size"); + return; + } + + for (auto _ : state) { + state.PauseTiming(); + + // Reset hash tables + memset(s->head, 0, HASH_SIZE * sizeof(Pos)); + memset(s->prev, 0, MAX_WSIZE * sizeof(Pos)); + s->ins_h = 0; + + state.ResumeTiming(); + + // Benchmark quick_insert_string (single 
insertions) + for (uint32_t i = 0; i < count; i++) { + uint32_t result = quick_insert_func(s, start_pos + i); + benchmark::DoNotOptimize(result); + } + } + } +}; + +#define BENCHMARK_QUICK_INSERT_STRING(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(quick_insert_string_bench, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("Function " #name " not supported"); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(quick_insert_string_bench, name) \ + ->Args({100, 1}) /* Single insertion (baseline) */ \ + ->Args({100, 100}) /* 100 insertions (measure amortized cost) */ \ + ->Args({16000, 100}) /* 100 insertions at mid window (different hash distribution) */ \ + ->Unit(benchmark::kNanosecond); + +BENCHMARK_QUICK_INSERT_STRING(quick_integer_hash, ::quick_insert_string, 1); +BENCHMARK_QUICK_INSERT_STRING(quick_rolling_hash, ::quick_insert_string_roll, 1); diff --git a/neozip/test/benchmarks/benchmark_main.cc b/neozip/test/benchmarks/benchmark_main.cc new file mode 100644 index 0000000000..f3c227bdf7 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_main.cc @@ -0,0 +1,32 @@ +/* benchmark_main.cc -- benchmark suite main entry point + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> + +#include <benchmark/benchmark.h> + +#ifndef BUILD_ALT +extern "C" { +# include "zbuild.h" +# include "../test_cpu_features.h" + +# ifndef DISABLE_RUNTIME_CPU_DETECTION + struct cpu_features test_cpu_features; +# endif +} +#endif + +int main(int argc, char** argv) { +#ifndef BUILD_ALT +# ifndef DISABLE_RUNTIME_CPU_DETECTION + cpu_check_features(&test_cpu_features); +# endif +#endif + + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + + return EXIT_SUCCESS; +} diff --git a/neozip/test/benchmarks/benchmark_png_decode.cc b/neozip/test/benchmarks/benchmark_png_decode.cc new file mode 100644 index 0000000000..ce7c8f9304 --- 
/dev/null +++ b/neozip/test/benchmarks/benchmark_png_decode.cc @@ -0,0 +1,126 @@ +#include <stdio.h> +#include <benchmark/benchmark.h> +#include "benchmark_png_shared.h" +#include <assert.h> + +class png_decode: public benchmark::Fixture { +protected: + png_dat inpng[10]; + + /* Backing this on the heap is a more realistic benchmark */ + uint8_t *output_img_buf = NULL; + +public: + /* Let's make the vanilla version have something extremely compressible */ + virtual void init_img(png_bytep img_bytes, size_t width, size_t height) { + init_compressible(img_bytes, width*height); + } + + void SetUp(const ::benchmark::State&) { + output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3); + assert(output_img_buf != NULL); + init_img(output_img_buf, IMWIDTH, IMHEIGHT); + + /* First we need to author the png bytes to be decoded */ + for (int i = 0; i < 10; ++i) { + inpng[i] = {NULL, 0, 0}; + encode_png(output_img_buf, &inpng[i], i, IMWIDTH, IMHEIGHT); + } + } + + /* State in this circumstance will convey the compression level */ + void Bench(benchmark::State &state) { + for (auto _ : state) { + int compress_lvl = state.range(0); + png_parse_dat in = { inpng[compress_lvl].buf }; + uint32_t width, height; + decode_png(&in, (png_bytepp)&output_img_buf, IMWIDTH * IMHEIGHT * 3, width, height); + } + } + + void TearDown(const ::benchmark::State &) { + free(output_img_buf); + for (int i = 0; i < 10; ++i) { + free(inpng[i].buf); + } + } +}; + +class png_decode_realistic: public png_decode { +private: + bool test_files_found = false; + +public: + void SetUp(const ::benchmark::State &) { + output_img_buf = NULL; + output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3); + /* Let's take all the images at different compression levels and jam their bytes into buffers */ + char test_fname[25]; + FILE *files[10]; + + /* Set the whole array to NULL (sizeof(files), not sizeof(FILE*), + * which would only clear the first pointer) */ + memset(files, 0, sizeof(files)); + + for (size_t i = 0; i < 10; ++i) { + sprintf(test_fname, "test_pngs/%1lu.png", i); + FILE *in_img = 
fopen(test_fname, "r"); + if (in_img == NULL) { + for (size_t j = 0; j < i; ++j) { + if (files[j]) + fclose(files[j]); + } + + /* For proper cleanup: NULL-init every slot (this override never ran the + * base-class SetUp, so all ten entries are uninitialized) so that + * TearDown's free() on each inpng[].buf is safe. Index with j, not i. */ + for (size_t j = 0; j < 10; ++j) { + inpng[j] = { NULL, 0, 0 }; + } + + return; + } + files[i] = in_img; + } + + test_files_found = true; + /* Now that we've established we have all the png files, let's read all of their bytes into buffers */ + for (size_t i = 0; i < 10; ++i) { + FILE *in_file = files[i]; + fseek(in_file, 0, SEEK_END); + size_t num_bytes = ftell(in_file); + rewind(in_file); + + uint8_t *raw_file = (uint8_t*)malloc(num_bytes); + if (raw_file == NULL) + abort(); + + inpng[i].buf = raw_file; + inpng[i].len = num_bytes; + inpng[i].buf_rem = 0; + + size_t bytes_read = fread(raw_file, 1, num_bytes, in_file); + if (bytes_read != num_bytes) { + fprintf(stderr, "couldn't read all of the bytes for file test_pngs/%lu.png", i); + abort(); + } + + fclose(in_file); + } + } + + void Bench(benchmark::State &state) { + if (!test_files_found) { + state.SkipWithError("Test imagery in test_pngs not found"); + } + + png_decode::Bench(state); + } +}; + +BENCHMARK_DEFINE_F(png_decode, png_decode)(benchmark::State &state) { + Bench(state); +} +BENCHMARK_REGISTER_F(png_decode, png_decode)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond); + +BENCHMARK_DEFINE_F(png_decode_realistic, png_decode_realistic)(benchmark::State &state) { + Bench(state); +} +BENCHMARK_REGISTER_F(png_decode_realistic, png_decode_realistic)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond); diff --git a/neozip/test/benchmarks/benchmark_png_encode.cc b/neozip/test/benchmarks/benchmark_png_encode.cc new file mode 100644 index 0000000000..d5e25cbc9d --- /dev/null +++ b/neozip/test/benchmarks/benchmark_png_encode.cc @@ -0,0 +1,54 @@ +#include <stdio.h> +#include <assert.h> +#include <benchmark/benchmark.h> +#include "benchmark_png_shared.h" + +#define IMWIDTH 1024 +#define IMHEIGHT 1024 + +class png_encode: public benchmark::Fixture { +private: + 
png_dat outpng; + + /* Backing this on the heap is a more realistic benchmark */ + uint8_t *input_img_buf = NULL; + +public: + /* Let's make the vanilla version have something extremely compressible */ + virtual void init_img(png_bytep img_bytes, size_t width, size_t height) { + init_compressible(img_bytes, width * height); + } + + void SetUp(const ::benchmark::State&) { + input_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3); + outpng.buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3); + /* Using malloc rather than zng_alloc so that we can call realloc. + * IMWIDTH * IMHEIGHT is likely to be more than enough bytes, though, + * given that a simple run length encoding already pretty much can + * reduce to this */ + outpng.len = 0; + outpng.buf_rem = IMWIDTH * IMHEIGHT * 3; + assert(input_img_buf != NULL); + assert(outpng.buf != NULL); + init_img(input_img_buf, IMWIDTH, IMHEIGHT); + } + + /* State in this circumstance will convey the compression level */ + void Bench(benchmark::State &state) { + for (auto _ : state) { + encode_png((png_bytep)input_img_buf, &outpng, state.range(0), IMWIDTH, IMHEIGHT); + outpng.buf_rem = outpng.len; + outpng.len = 0; + } + } + + void TearDown(const ::benchmark::State &) { + free(input_img_buf); + free(outpng.buf); + } +}; + +BENCHMARK_DEFINE_F(png_encode, encode_compressible)(benchmark::State &state) { + Bench(state); +} +BENCHMARK_REGISTER_F(png_encode, encode_compressible)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond); diff --git a/neozip/test/benchmarks/benchmark_png_shared.h b/neozip/test/benchmarks/benchmark_png_shared.h new file mode 100644 index 0000000000..bde679e7d3 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_png_shared.h @@ -0,0 +1,146 @@ +#pragma once + +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + +#define IMWIDTH 1024 +#define IMHEIGHT 1024 + +extern "C" { +# include <png.h> +} + +typedef struct _png_dat { + uint8_t *buf; + int64_t len; + size_t buf_rem; +} png_dat; + +typedef struct 
_png_parse_dat { + uint8_t *cur_pos; +} png_parse_dat; + +/* Write a customized write callback so that we write back to an in-memory buffer. + * This allows the testing to not involve disk IO */ +static void png_write_cb(png_structp pngp, png_bytep data, png_size_t len) { + png_dat *dat = (png_dat*)png_get_io_ptr(pngp); + size_t curSize = dat->len + len; + + /* realloc double the requested buffer size to prevent excessive reallocs */ + if (dat->buf_rem < len) { + dat->buf = (uint8_t*)realloc(dat->buf, dat->len + dat->buf_rem + 2 * len); + + if (!dat->buf) { + /* Pretty unlikely but we'll put it here just in case */ + fprintf(stderr, "realloc failed, exiting\n"); + exit(1); + } + + dat->buf_rem += 2 * len; + } + + memcpy(dat->buf + dat->len, data, len); + dat->len = curSize; + dat->buf_rem -= len; +} + +static void init_compressible(png_bytep buf, size_t num_pix) { + /* It doesn't actually matter what we make this, but for + * the sake of a reasonable test image, let's make this + * be a stripe of R, G, & B, with no alpha channel */ + int32_t i = 0; + int32_t red_stop = num_pix / 3; + int32_t blue_stop = 2 * num_pix / 3; + int32_t green_stop = num_pix; + + for (int32_t x = 0; i < red_stop; x += 3, ++i) { + buf[x] = 255; + buf[x + 1] = 0; + buf[x + 2] = 0; + } + + for (int32_t x = 3 * i; i < blue_stop; x+= 3, ++i) { + buf[x] = 0; + buf[x + 1] = 255; + buf[x + 2] = 0; + } + + for (int32_t x = 3 * i; i < green_stop; x += 3, ++i) { + buf[x] = 0; + buf[x + 1] = 0; + buf[x + 2] = 255; + } +} + +static inline void encode_png(png_bytep buf, png_dat *outpng, int32_t comp_level, uint32_t width, uint32_t height) { + png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + + /* Most of this error handling is _likely_ not necessary. 
Likewise it's likely + * a lot of this stuff can be done in the setup function to avoid measuring this + * fixed setup time, but for now we'll do it here */ + if (!png) abort(); + + png_infop info = png_create_info_struct(png); + if (!info) abort(); + + png_set_write_fn(png, outpng, png_write_cb, NULL); + png_bytep *png_row_ptrs = new png_bytep[height]; + for (int i = 0; i < IMHEIGHT; ++i) { + png_row_ptrs[i] = (png_bytep)&buf[3*i*width]; + } + + png_set_IHDR(png, info, IMWIDTH, IMHEIGHT, 8, PNG_COLOR_TYPE_RGB, + PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, + PNG_FILTER_TYPE_DEFAULT); + + png_write_info(png, info); + png_set_compression_level(png, comp_level); + png_set_filter(png, 0, PNG_FILTER_NONE); + png_write_image(png, (png_bytepp)png_row_ptrs); + png_write_end(png, NULL); + png_destroy_write_struct(&png, &info); + delete[] png_row_ptrs; +} + +static void read_from_pngdat(png_structp png, png_bytep out, png_size_t bytes_to_read) { + png_parse_dat *io = (png_parse_dat*)png_get_io_ptr(png); + memcpy(out, io->cur_pos, bytes_to_read); + io->cur_pos += bytes_to_read; +} + +static inline int decode_png(png_parse_dat *dat, png_bytepp out_bytes, size_t in_size, uint32_t &width, uint32_t &height) { + png_structp png = NULL; + png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + + if (!png) abort(); + png_infop info = NULL; + info = png_create_info_struct(png); + if (!info) abort(); + + png_set_read_fn(png, dat, read_from_pngdat); + png_read_info(png, info); + + int bit_depth = 0, color_type = -1; + png_get_IHDR(png, info, &width, &height, &bit_depth, &color_type, NULL, NULL, NULL); + + size_t im_size = width * height * bit_depth/8 * 3; + if (color_type != PNG_COLOR_TYPE_RGB) { + fprintf(stderr, "expected an 8 bpp RGB image\n"); + abort(); + } + + if (im_size > in_size) { + *out_bytes = (png_bytep)realloc(*out_bytes, im_size); + } + + png_bytep *out_rows = new png_bytep[height]; + for (size_t i = 0; i < height; ++i) + out_rows[i] = *out_bytes + 
(width*i*3); + + png_read_rows(png, out_rows, NULL, height); + png_destroy_read_struct(&png, &info, NULL); + delete[] out_rows; + + return im_size; +} diff --git a/neozip/test/benchmarks/benchmark_slidehash.cc b/neozip/test/benchmarks/benchmark_slidehash.cc new file mode 100644 index 0000000000..e74c06e873 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_slidehash.cc @@ -0,0 +1,116 @@ +/* benchmark_slidehash.cc -- benchmark slide_hash variants + * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <limits.h> + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# include "deflate.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +#define MAX_RANDOM_INTS 32768 + +class slide_hash: public benchmark::Fixture { +private: + uint16_t *l0; + uint16_t *l1; + deflate_state *s_g; + +public: + /** + * @brief Prepare the benchmark fixture by allocating and initializing working data. + * + * Allocates two 64-byte-aligned arrays of `uint16_t` (one of size HASH_SIZE, one of size MAX_RANDOM_INTS), + * fills them with pseudorandom `uint16_t` values, allocates a `deflate_state` structure, and sets + * its `head` and `prev` pointers to the allocated arrays. + * + * @param state Benchmark-provided state object from Google Benchmark (supplied by the framework). 
+ */ + void SetUp(const ::benchmark::State&) { + l0 = (uint16_t *)zng_alloc_aligned(HASH_SIZE * sizeof(uint16_t), 64); + + for (uint32_t i = 0; i < HASH_SIZE; i++) { + l0[i] = (uint16_t)rand(); + } + + l1 = (uint16_t *)zng_alloc_aligned(MAX_RANDOM_INTS * sizeof(uint16_t), 64); + + for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { + l1[i] = (uint16_t)rand(); + } + + deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state)); + s->head = l0; + s->prev = l1; + s_g = s; + } + + void Bench(benchmark::State& state, slide_hash_func slide_hash) { + s_g->w_size = (uint32_t)state.range(0); + + for (auto _ : state) { + slide_hash(s_g); + benchmark::DoNotOptimize(s_g); + } + } + + void TearDown(const ::benchmark::State&) { + zng_free_aligned(l0); + zng_free_aligned(l1); + free(s_g); + } +}; + +#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(512, MAX_RANDOM_INTS); + +#if defined(WITH_ALL_FALLBACKS) || !(defined(__x86_64__) || defined(_M_X64)) +BENCHMARK_SLIDEHASH(c, slide_hash_c, 1); +#endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +BENCHMARK_SLIDEHASH(native, native_slide_hash, 1); +#else + +#ifdef ARM_SIMD +BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd); +#endif +#ifdef ARM_NEON +BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon); +#endif +#ifdef POWER8_VSX +BENCHMARK_SLIDEHASH(power8, slide_hash_power8, test_cpu_features.power.has_arch_2_07); +#endif +#ifdef PPC_VMX +BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, test_cpu_features.power.has_altivec); +#endif +#ifdef RISCV_RVV +BENCHMARK_SLIDEHASH(rvv, slide_hash_rvv, test_cpu_features.riscv.has_rvv); +#endif +#ifdef X86_SSE2 +BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2); +#endif +#ifdef 
X86_AVX2 +BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2); +#endif +#ifdef LOONGARCH_LSX +BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx); +#endif +#ifdef LOONGARCH_LASX +BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx); +#endif + +#endif
\ No newline at end of file diff --git a/neozip/test/benchmarks/benchmark_uncompress.cc b/neozip/test/benchmarks/benchmark_uncompress.cc new file mode 100644 index 0000000000..6a82c05d01 --- /dev/null +++ b/neozip/test/benchmarks/benchmark_uncompress.cc @@ -0,0 +1,97 @@ +/* benchmark_uncompress.cc -- benchmark uncompress() + * Copyright (C) 2024-2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# if defined(ZLIB_COMPAT) +# include "zlib.h" +# else +# include "zlib-ng.h" +# endif +# include "test/compressible_data_p.h" +} + +#define MAX_SIZE (1024 * 1024) +#define NUM_TESTS 6 + +class uncompress_bench: public benchmark::Fixture { +private: + uint8_t *inbuff; + uint8_t *outbuff; + uint8_t *compressed_buff[NUM_TESTS]; + z_uintmax_t compressed_sizes[NUM_TESTS]; + uint32_t sizes[NUM_TESTS] = {1, 64, 1024, 16384, 128*1024, 1024*1024}; + +public: + void SetUp(::benchmark::State& state) { + outbuff = (uint8_t *)malloc(MAX_SIZE + 16); + if (outbuff == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + // Initialize input buffer with highly compressible data, interspersed + // with small amounts of random data and 3-byte matches. 
+ inbuff = gen_compressible_data(MAX_SIZE); + if (inbuff == NULL) { + free(outbuff); + outbuff = NULL; + state.SkipWithError("gen_compressible_data() failed"); + return; + } + + // Compress data into different buffers + for (int i = 0; i < NUM_TESTS; ++i) { + compressed_buff[i] = (uint8_t *)zng_alloc(sizes[i] + 64); + assert(compressed_buff[i] != NULL); + + z_uintmax_t compressed_size = sizes[i] + 64; + int err = PREFIX(compress2)(compressed_buff[i], &compressed_size, inbuff, sizes[i], Z_BEST_COMPRESSION); + if (err != Z_OK) { + fprintf(stderr, "compress() failed with error %d\n", err); + abort(); + } + compressed_sizes[i] = compressed_size; + } + } + + void Bench(benchmark::State& state) { + int err; + + for (auto _ : state) { + int index = 0; + while (sizes[index] != (uint32_t)state.range(0)) ++index; + + z_uintmax_t out_size = MAX_SIZE; + err = PREFIX(uncompress)(outbuff, &out_size, compressed_buff[index], compressed_sizes[index]); + if (err != Z_OK) { + fprintf(stderr, "uncompress() failed with error %d\n", err); + abort(); + } + } + } + + void TearDown(const ::benchmark::State&) { + free(inbuff); + free(outbuff); + + for (int i = 0; i < NUM_TESTS; ++i) { + zng_free(compressed_buff[i]); + } + } +}; + +#define BENCHMARK_UNCOMPRESS(name) \ + BENCHMARK_DEFINE_F(uncompress_bench, name)(benchmark::State& state) { \ + Bench(state); \ + } \ + BENCHMARK_REGISTER_F(uncompress_bench, name)->Arg(1)->Arg(64)->Arg(1024)->Arg(16<<10)->Arg(128<<10)->Arg(1024<<10); + +BENCHMARK_UNCOMPRESS(uncompress_bench); |
