diff options
| author | Hans Kristian Rosbach <hk-git@circlestorm.org> | 2026-01-14 17:18:02 +0100 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-01-20 22:59:40 +0100 |
| commit | b233f8675b08220aa0db9207cb4295df871299fd (patch) | |
| tree | d506652c31b78df7d58808e8755bc446f546e173 /test | |
| parent | 3bae47cffbf4030843d6e061f1cc094eeab0e53f (diff) | |
| download | Project-Tick-b233f8675b08220aa0db9207cb4295df871299fd.tar.gz Project-Tick-b233f8675b08220aa0db9207cb4295df871299fd.zip | |
Unify adler32/crc32 benchmarks and add rotating misalignment
Add aligned benchmarks for adler32/crc32
Diffstat (limited to 'test')
| -rw-r--r-- | test/benchmarks/benchmark_adler32.cc | 66 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_adler32_copy.cc | 89 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_crc32.cc | 55 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_crc32_copy.cc | 58 |
4 files changed, 169 insertions, 99 deletions
diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc index 48121db428..8fb2130041 100644 --- a/test/benchmarks/benchmark_adler32.cc +++ b/test/benchmarks/benchmark_adler32.cc @@ -3,65 +3,79 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include <stdio.h> -#include <assert.h> - #include <benchmark/benchmark.h> extern "C" { # include "zbuild.h" -# include "zutil_p.h" # include "arch_functions.h" # include "../test_cpu_features.h" } -#define MAX_RANDOM_INTS (1024 * 1024) -#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) +#define BUFSIZE ((4 * 1024 * 1024) + 64) class adler32: public benchmark::Fixture { private: - uint32_t *random_ints; + uint32_t *testdata; public: - void SetUp(const ::benchmark::State&) { - /* Control the alignment so that we have the best case scenario for loads. With - * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. - * And while this is a realistic scenario, it makes it difficult to compare benchmark - * to benchmark because one allocation could have been aligned perfectly for the loads - * while the subsequent one happened to not be. This is not to be advantageous to AVX512 - * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to - * control the _consistency_ of the results */ - random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); - assert(random_ints != NULL); - - for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { - random_ints[i] = rand(); + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); } } - void Bench(benchmark::State& state, adler32_func adler32) { + // Benchmark Adler32, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) { + int misalign = 0; uint32_t hash = 0; for (auto _ : state) { - hash = adler32(hash, (const unsigned char *)random_ints, (size_t)state.range(0)); + hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; } + // Prevent the result from being optimized away benchmark::DoNotOptimize(hash); } void TearDown(const ::benchmark::State&) { - zng_free(random_ints); + zng_free_aligned(testdata); } }; -#define BENCHMARK_ADLER32(name, fptr, support_flag) \ +#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \ BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \ if (!(support_flag)) { \ state.SkipWithError("CPU does not support " #name); \ } \ - Bench(state, fptr); \ + Bench(state, hashfunc, 0); \ } \ - BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10) + BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \ + BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \ + BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag); BENCHMARK_ADLER32(c, adler32_c, 1); diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc index d8efa0d22e..fbfb85602e 100644 --- a/test/benchmarks/benchmark_adler32_copy.cc +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -3,75 +3,88 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include <stdio.h> -#include <assert.h> -#include <string.h> - #include <benchmark/benchmark.h> extern "C" { # include "zbuild.h" -# include "zutil_p.h" # include "arch_functions.h" # include "../test_cpu_features.h" } -#define MAX_RANDOM_INTS (1024 * 1024) -#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) - -typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len); +// Hash copy functions are used on strm->next_in buffers, we process +// 512-32k sizes (x2 for initial fill) at a time if enough data is available. +#define BUFSIZE (65536 + 64) class adler32_copy: public benchmark::Fixture { private: - uint32_t *random_ints_src; - uint32_t *random_ints_dst; + uint32_t *testdata; + uint8_t *dstbuf; public: - void SetUp(const ::benchmark::State&) { - /* Control the alignment so that we have the best case scenario for loads. With - * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. - * And while this is a realistic scenario, it makes it difficult to compare benchmark - * to benchmark because one allocation could have been aligned perfectly for the loads - * while the subsequent one happened to not be. This is not to be advantageous to AVX512 - * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to - * control the _consistency_ of the results */ - random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); - random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); - assert(random_ints_src != NULL); - assert(random_ints_dst != NULL); - - for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { - random_ints_src[i] = rand(); + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL || dstbuf == NULL) { + state.SkipWithError("malloc failed"); + return; + } + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); } } - void Bench(benchmark::State& state, adler32_cpy_func adler32_func) { + // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) { + int misalign = 0; uint32_t hash = 0; for (auto _ : state) { - hash = adler32_func(hash, (unsigned char *)random_ints_dst, - (const unsigned char*)random_ints_src, (size_t)state.range(0)); + hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; } + // Prevent the result from being optimized away benchmark::DoNotOptimize(hash); } void TearDown(const ::benchmark::State&) { - zng_free(random_ints_src); - zng_free(random_ints_dst); + zng_free_aligned(testdata); + zng_free_aligned(dstbuf); } }; -#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \ +// Misaligned +#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \ if (!(support_flag)) { \ state.SkipWithError("CPU does not support " #name); \ } \ - Bench(state, fptr); \ + Bench(state, copyfunc, 0); \ + } \ + BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 1); \ } \ - BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE); + BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \ + BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); -#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \ +// Adler32 + memcpy benchmark for reference +#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \ if (!(support_flag)) { \ state.SkipWithError("CPU does not support " #name); \ @@ -79,10 +92,10 @@ public: Bench(state, [](uint32_t init_sum, unsigned char *dst, \ const uint8_t *buf, size_t len) -> uint32_t { \ memcpy(dst, buf, (size_t)len); \ - return fptr(init_sum, buf, len); \ - }); \ + return copyfunc(init_sum, buf, len); \ + }, 1); \ } \ - BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE); + BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10); BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1); diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc index e3c4d9e2c2..df7eaec3e6 100644 --- a/test/benchmarks/benchmark_crc32.cc +++ b/test/benchmarks/benchmark_crc32.cc @@ -3,59 +3,80 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include <stdio.h> -#include <assert.h> - #include <benchmark/benchmark.h> extern "C" { # include "zbuild.h" -# include "zutil_p.h" # include "arch_functions.h" # include "../test_cpu_features.h" } -#define MAX_RANDOM_INTS (1024 * 1024) -#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) +#define BUFSIZE ((4 * 1024 * 1024) + 64) class crc32: public benchmark::Fixture { private: - uint32_t *random_ints; + uint32_t *testdata; public: - void SetUp(const ::benchmark::State&) { - random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); - assert(random_ints != NULL); + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL) { + state.SkipWithError("malloc failed"); + return; + } - for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { - random_ints[i] = rand(); + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); } } - void Bench(benchmark::State& state, crc32_func crc32) { + // Benchmark CRC32, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) { + int misalign = 0; uint32_t hash = 0; for (auto _ : state) { - hash = crc32(hash, (const unsigned char *)random_ints, (size_t)state.range(0)); + hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) + misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; } + // Prevent the result from being optimized away benchmark::DoNotOptimize(hash); } void TearDown(const ::benchmark::State&) { - zng_free(random_ints); + zng_free_aligned(testdata); } }; -#define BENCHMARK_CRC32(name, fptr, support_flag) \ +#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \ BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \ if (!(support_flag)) { \ state.SkipWithError("CPU does not support " #name); \ } \ - Bench(state, fptr); \ + Bench(state, hashfunc, 0); \ } \ BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, hashfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_CRC32(name, hashfunc, support_flag) \ + BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \ + BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag); + BENCHMARK_CRC32(braid, crc32_braid, 1); #ifdef DISABLE_RUNTIME_CPU_DETECTION diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc index e2de0f5a59..71497e9aca 100644 --- a/test/benchmarks/benchmark_crc32_copy.cc +++ b/test/benchmarks/benchmark_crc32_copy.cc @@ -4,7 +4,6 @@ */ #include <benchmark/benchmark.h> -#include <assert.h> extern "C" { # include "zbuild.h" @@ -12,7 +11,9 @@ extern "C" { # include "../test_cpu_features.h" } -#define BUFSIZE (32768 + 16 + 16) +// Hash copy functions are used on strm->next_in buffers, we process +// 512-32k sizes (x2 for initial fill) at a time if enough data is available. +#define BUFSIZE (65536 + 64) class crc32_copy: public benchmark::Fixture { protected: @@ -20,46 +21,67 @@ protected: uint8_t *dstbuf; public: - void SetUp(const ::benchmark::State&) { - testdata = (uint32_t *)malloc(BUFSIZE); - dstbuf = (uint8_t *)malloc(BUFSIZE); - assert((testdata != NULL) && (dstbuf != NULL)); + void SetUp(::benchmark::State& state) { + testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64); + dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64); + if (testdata == NULL || dstbuf == NULL) { + state.SkipWithError("malloc failed"); + return; + } for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { testdata[i] = rand(); } } - void Bench(benchmark::State& state, crc32_copy_func crc32_copy) { + // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results + void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) { int misalign = 0; - uint32_t crc = 0; + uint32_t hash = 0; - // Benchmark the CRC32 copy operation for (auto _ : state) { - crc = crc32_copy(crc, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); - misalign++; - if (misalign > 14) + hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + if (misalign >= 63) misalign = 0; + else + misalign += (DO_ALIGNED) ? 16 : 1; } // Prevent the result from being optimized away - benchmark::DoNotOptimize(crc); + benchmark::DoNotOptimize(hash); } void TearDown(const ::benchmark::State&) { - free(testdata); - free(dstbuf); + zng_free_aligned(testdata); + zng_free_aligned(dstbuf); } }; -#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \ +// Misaligned +#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \ BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \ if (!(support_flag)) { \ state.SkipWithError("CPU does not support " #name); \ } \ - Bench(state, copyfunc); \ + Bench(state, copyfunc, 0); \ } \ - BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10); + BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10); + +// Aligned +#define ALIGNED_NAME(name) name##_aligned +#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, copyfunc, 1); \ + } \ + BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512); + +// Queue both misaligned and aligned for each benchmark +#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \ + BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \ + BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); // Base test BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1); |
