diff options
| author | Adam Stylinski <kungfujesus06@gmail.com> | 2022-04-08 13:24:21 -0400 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2022-05-23 16:13:39 +0200 |
| commit | d79984b5bcaccab15e6cd13d7d1edea32ac36977 (patch) | |
| tree | 7b8e0053bfc6d237bb3ff493e0ad580923ef2526 /test | |
| parent | b8269bb7d4702f8e694441112bb4ba7c59ff2362 (diff) | |
| download | Project-Tick-d79984b5bcaccab15e6cd13d7d1edea32ac36977.tar.gz Project-Tick-d79984b5bcaccab15e6cd13d7d1edea32ac36977.zip | |
Adding avx512_vnni inline + copy elision
Interesting revelation while benchmarking all of this is that our
chunkmemset_avx seems to be slower in a lot of use cases than
chunkmemset_sse. That will be an interesting function to attempt to
optimize.
Right now though, we're beating google for all PNG decode and
encode benchmarks. Some combinations of flags have us trading
blows, but we're as much as 14% faster than chromium's zlib
patches.
While we're here, add a more direct benchmark of the folded copy method
versus the explicit copy + checksum.
Diffstat (limited to 'test')
| -rw-r--r-- | test/benchmarks/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_adler32_copy.cc | 117 |
2 files changed, 118 insertions, 0 deletions
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index df1df49731..19762fc738 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
 add_executable(benchmark_zlib
     benchmark_adler32.cc
+    benchmark_adler32_copy.cc
     benchmark_compare256.cc
     benchmark_crc32.cc
     benchmark_main.cc
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
new file mode 100644
index 0000000000..fac4c7f1cd
--- /dev/null
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -0,0 +1,117 @@
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+/* Signature shared by the "folded" adler32+copy variants: checksum `len`
+ * bytes of `buf` while copying them into `dst`, returning the new adler. */
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len);
+
+class adler32_copy: public benchmark::Fixture {
+private:
+    uint32_t *random_ints_src;
+    uint32_t *random_ints_dst;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        /* Control the alignment so that we have the best case scenario for loads. With
+         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+         * And while this is a realistic scenario, it makes it difficult to compare benchmark
+         * to benchmark because one allocation could have been aligned perfectly for the loads
+         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
+         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
+         * control the _consistency_ of the results */
+        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        /* Check both allocations (the original asserted an undeclared
+         * `random_ints`, which failed to compile in debug builds). */
+        assert(random_ints_src != NULL && random_ints_dst != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints_src[i] = rand();
+        }
+    }
+
+    /* Drive one benchmark: repeatedly checksum+copy state.range(0) bytes,
+     * chaining the hash so the call cannot be optimized away. */
+    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+                (const unsigned char*)random_ints_src, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints_src);
+        zng_free(random_ints_dst);
+    }
+};
+
+/* Register a benchmark for a fused adler32+copy implementation. */
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+/* Register a baseline: explicit memcpy followed by a plain adler32, to
+ * compare against the fused variants above. */
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const unsigned char *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, len); \
+            return fptr(init_sum, buf, len); \
+        }); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
