diff options
| author | Adam Stylinski <kungfujesus06@gmail.com> | 2022-04-08 13:24:21 -0400 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2022-05-23 16:13:39 +0200 |
| commit | d79984b5bcaccab15e6cd13d7d1edea32ac36977 (patch) | |
| tree | 7b8e0053bfc6d237bb3ff493e0ad580923ef2526 /test | |
| parent | b8269bb7d4702f8e694441112bb4ba7c59ff2362 (diff) | |
| download | Project-Tick-d79984b5bcaccab15e6cd13d7d1edea32ac36977.tar.gz Project-Tick-d79984b5bcaccab15e6cd13d7d1edea32ac36977.zip | |
Adding avx512_vnni inline + copy elision
Interesting revelation while benchmarking all of this is that our
chunkmemset_avx seems to be slower in a lot of use cases than
chunkmemset_sse. That will be an interesting function to attempt to
optimize.
Right now though, we're beating google for all PNG decode and
encode benchmarks. Some combinations of flags have us trading
blows, but we're as much as 14% faster than chromium's zlib
patches.
While we're here, add a more direct benchmark of the folded copy method
versus the explicit copy + checksum.
Diffstat (limited to 'test')
| -rw-r--r-- | test/benchmarks/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | test/benchmarks/benchmark_adler32_copy.cc | 117 |
2 files changed, 118 insertions, 0 deletions
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index df1df49731..19762fc738 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
 add_executable(benchmark_zlib
     benchmark_adler32.cc
+    benchmark_adler32_copy.cc
     benchmark_compare256.cc
     benchmark_crc32.cc
     benchmark_main.cc
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
new file mode 100644
index 0000000000..fac4c7f1cd
--- /dev/null
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -0,0 +1,117 @@
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+/* Signature shared by the "folded" adler32+copy variants: checksum `len`
+ * bytes of `buf` while copying them into `dst`, returning the new adler. */
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len);
+
+class adler32_copy: public benchmark::Fixture {
+private:
+    uint32_t *random_ints_src;
+    uint32_t *random_ints_dst;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        /* Control the alignment so that we have the best case scenario for loads. With
+         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+         * And while this is a realistic scenario, it makes it difficult to compare benchmark
+         * to benchmark because one allocation could have been aligned perfectly for the loads
+         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
+         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
+         * control the _consistency_ of the results */
+        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        /* Check both allocations (the original asserted an undeclared
+         * `random_ints`, which failed to compile in debug builds). */
+        assert(random_ints_src != NULL && random_ints_dst != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints_src[i] = rand();
+        }
+    }
+
+    /* Drive one benchmark: repeatedly checksum+copy state.range(0) bytes,
+     * chaining the hash so the call cannot be optimized away. */
+    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+                (const unsigned char*)random_ints_src, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints_src);
+        zng_free(random_ints_dst);
+    }
+};
+
+/* Register a benchmark for a fused adler32+copy implementation. */
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+/* Register a baseline: explicit memcpy followed by a plain adler32, to
+ * compare against the fused variants above. */
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const unsigned char *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, len); \
+            return fptr(init_sum, buf, len); \
+        }); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
