From c1fed4ba05a49fd3b5ca8743461d9782f70fa820 Mon Sep 17 00:00:00 2001 From: Nathan Moinvaziri Date: Thu, 18 Dec 2025 16:17:18 -0800 Subject: Refactor crc32_fold functions into single crc32_copy --- test/CMakeLists.txt | 2 +- test/benchmarks/CMakeLists.txt | 2 +- test/benchmarks/benchmark_crc32_copy.cc | 131 +++++++++++++++++++++++++++ test/benchmarks/benchmark_crc32_fold_copy.cc | 131 --------------------------- test/test_crc32.cc | 2 +- test/test_crc32_copy.cc | 81 +++++++++++++++++ test/test_crc32_fold_copy.cc | 81 ----------------- 7 files changed, 215 insertions(+), 215 deletions(-) create mode 100644 test/benchmarks/benchmark_crc32_copy.cc delete mode 100644 test/benchmarks/benchmark_crc32_fold_copy.cc create mode 100644 test/test_crc32_copy.cc delete mode 100644 test/test_crc32_fold_copy.cc (limited to 'test') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2ce549e8d8..bbaf1035c2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -180,7 +180,7 @@ if(WITH_GTEST) test_compare256.cc # compare256_neon(), etc test_compare256_rle.cc # compare256_rle(), etc test_crc32.cc # crc32_armv8(), etc - test_crc32_fold_copy.cc # crc32_fold_copy implementations + test_crc32_copy.cc # crc32_copy implementations test_inflate_sync.cc # expects a certain compressed block layout test_main.cc # cpu_check_features() test_version.cc # expects a fixed version string diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt index 79f49b1fa5..c9fe520ad3 100644 --- a/test/benchmarks/CMakeLists.txt +++ b/test/benchmarks/CMakeLists.txt @@ -44,7 +44,7 @@ add_executable(benchmark_zlib benchmark_compare256_rle.cc benchmark_compress.cc benchmark_crc32.cc - benchmark_crc32_fold_copy.cc + benchmark_crc32_copy.cc benchmark_insert_string.cc benchmark_main.cc benchmark_slidehash.cc diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc new file mode 100644 index 0000000000..7b16952018 --- /dev/null +++ 
b/test/benchmarks/benchmark_crc32_copy.cc @@ -0,0 +1,131 @@ +/* benchmark_crc32_fold_copy.cc -- benchmark for crc32 implementations doing folded copying + * Copyright (C) 2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include +#include + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +#define BUFSIZE (32768 + 16 + 16) + +// We have no function that gives us direct access to these, so we have a local implementation for benchmarks +static void crc32_fold_copy_braid(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_braid(crc->value, src, len); + memcpy(dst, src, len); +} +#ifndef WITHOUT_CHORBA +static void crc32_fold_copy_chorba(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba(crc->value, src, len); + memcpy(dst, src, len); +} +#endif +#ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + static void crc32_fold_copy_chorba_sse2(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba_sse2(crc->value, src, len); + memcpy(dst, src, len); + } +# endif +# ifdef X86_SSE41 + static void crc32_fold_copy_chorba_sse41(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba_sse41(crc->value, src, len); + memcpy(dst, src, len); + } +# endif +#endif + +class crc32_fc: public benchmark::Fixture { +protected: + uint32_t *testdata; + uint8_t *dstbuf; + uint32_t crc; + +public: + void SetUp(const ::benchmark::State&) { + testdata = (uint32_t *)malloc(BUFSIZE); + dstbuf = (uint8_t *)malloc(BUFSIZE); + assert((testdata != NULL) && (dstbuf != NULL)); + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + void Bench(benchmark::State& state, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy, + crc32_fold_final_func fold_final) { + ALIGNED_(16) crc32_fold crc_st; + 
int misalign = 0; + // Prepare an initial crc state + fold_reset(&crc_st); + crc = 0; + + // Benchmark the CRC32 fold copy operation + for (auto _ : state) { + fold_copy(&crc_st, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + misalign++; + if (misalign > 14) + misalign = 0; + } + + // Finalize the CRC32 calculation + crc = fold_final(&crc_st); + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(crc); + } + + void TearDown(const ::benchmark::State&) { + free(testdata); + free(dstbuf); + } +}; + +#define BENCHMARK_CRC32_FOLD(name, resfunc, copyfunc, finfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_fc, name)(benchmark::State& state) { \ + if (!(support_flag)) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, resfunc, copyfunc, finfunc); \ + } \ + BENCHMARK_REGISTER_F(crc32_fc, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10); + +// Generic +BENCHMARK_CRC32_FOLD(braid_c, crc32_fold_reset_c, crc32_fold_copy_braid, crc32_fold_final_c, 1) + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + // Native + BENCHMARK_CRC32_FOLD(native, native_crc32_fold_reset, native_crc32_fold_copy, native_crc32_fold_final, 1) +#else + + // Optimized functions +# ifndef WITHOUT_CHORBA + BENCHMARK_CRC32_FOLD(chorba_c, crc32_fold_reset_c, crc32_fold_copy_chorba, crc32_fold_final_c, 1) +# endif +# ifdef ARM_CRC32 + BENCHMARK_CRC32_FOLD(armv8, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32) +# endif +# ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + BENCHMARK_CRC32_FOLD(chorba_sse2, crc32_fold_reset_c, crc32_fold_copy_chorba_sse2, crc32_fold_final_c, test_cpu_features.x86.has_sse2) +# endif +# ifdef X86_SSE41 + BENCHMARK_CRC32_FOLD(chorba_sse41, crc32_fold_reset_c, crc32_fold_copy_chorba_sse41, crc32_fold_final_c, test_cpu_features.x86.has_sse41) +# endif +# endif +# ifdef X86_PCLMULQDQ_CRC + BENCHMARK_CRC32_FOLD(pclmulqdq, 
crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq) +# endif +# ifdef X86_VPCLMULQDQ_CRC + BENCHMARK_CRC32_FOLD(vpclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) +# endif +# ifdef LOONGARCH_CRC + BENCHMARK_CRC32_FOLD(loongarch64, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc) +# endif + +#endif diff --git a/test/benchmarks/benchmark_crc32_fold_copy.cc b/test/benchmarks/benchmark_crc32_fold_copy.cc deleted file mode 100644 index 7b16952018..0000000000 --- a/test/benchmarks/benchmark_crc32_fold_copy.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* benchmark_crc32_fold_copy.cc -- benchmark for crc32 implementations doing folded copying - * Copyright (C) 2025 Hans Kristian Rosbach - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include -#include - -extern "C" { -# include "zbuild.h" -# include "arch_functions.h" -# include "../test_cpu_features.h" -} - -#define BUFSIZE (32768 + 16 + 16) - -// We have no function that gives us direct access to these, so we have a local implementation for benchmarks -static void crc32_fold_copy_braid(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = crc32_braid(crc->value, src, len); - memcpy(dst, src, len); -} -#ifndef WITHOUT_CHORBA -static void crc32_fold_copy_chorba(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = crc32_chorba(crc->value, src, len); - memcpy(dst, src, len); -} -#endif -#ifndef WITHOUT_CHORBA_SSE -# ifdef X86_SSE2 - static void crc32_fold_copy_chorba_sse2(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = crc32_chorba_sse2(crc->value, src, len); - memcpy(dst, src, len); - } -# endif -# ifdef X86_SSE41 - static void 
crc32_fold_copy_chorba_sse41(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - crc->value = crc32_chorba_sse41(crc->value, src, len); - memcpy(dst, src, len); - } -# endif -#endif - -class crc32_fc: public benchmark::Fixture { -protected: - uint32_t *testdata; - uint8_t *dstbuf; - uint32_t crc; - -public: - void SetUp(const ::benchmark::State&) { - testdata = (uint32_t *)malloc(BUFSIZE); - dstbuf = (uint8_t *)malloc(BUFSIZE); - assert((testdata != NULL) && (dstbuf != NULL)); - - for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { - testdata[i] = rand(); - } - } - - void Bench(benchmark::State& state, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy, - crc32_fold_final_func fold_final) { - ALIGNED_(16) crc32_fold crc_st; - int misalign = 0; - // Prepare an initial crc state - fold_reset(&crc_st); - crc = 0; - - // Benchmark the CRC32 fold copy operation - for (auto _ : state) { - fold_copy(&crc_st, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); - misalign++; - if (misalign > 14) - misalign = 0; - } - - // Finalize the CRC32 calculation - crc = fold_final(&crc_st); - - // Prevent the result from being optimized away - benchmark::DoNotOptimize(crc); - } - - void TearDown(const ::benchmark::State&) { - free(testdata); - free(dstbuf); - } -}; - -#define BENCHMARK_CRC32_FOLD(name, resfunc, copyfunc, finfunc, support_flag) \ - BENCHMARK_DEFINE_F(crc32_fc, name)(benchmark::State& state) { \ - if (!(support_flag)) { \ - state.SkipWithError("CPU does not support " #name); \ - } \ - Bench(state, resfunc, copyfunc, finfunc); \ - } \ - BENCHMARK_REGISTER_F(crc32_fc, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10); - -// Generic -BENCHMARK_CRC32_FOLD(braid_c, crc32_fold_reset_c, crc32_fold_copy_braid, crc32_fold_final_c, 1) - -#ifdef DISABLE_RUNTIME_CPU_DETECTION - // Native - BENCHMARK_CRC32_FOLD(native, native_crc32_fold_reset, native_crc32_fold_copy, 
native_crc32_fold_final, 1) -#else - - // Optimized functions -# ifndef WITHOUT_CHORBA - BENCHMARK_CRC32_FOLD(chorba_c, crc32_fold_reset_c, crc32_fold_copy_chorba, crc32_fold_final_c, 1) -# endif -# ifdef ARM_CRC32 - BENCHMARK_CRC32_FOLD(armv8, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32) -# endif -# ifndef WITHOUT_CHORBA_SSE -# ifdef X86_SSE2 - BENCHMARK_CRC32_FOLD(chorba_sse2, crc32_fold_reset_c, crc32_fold_copy_chorba_sse2, crc32_fold_final_c, test_cpu_features.x86.has_sse2) -# endif -# ifdef X86_SSE41 - BENCHMARK_CRC32_FOLD(chorba_sse41, crc32_fold_reset_c, crc32_fold_copy_chorba_sse41, crc32_fold_final_c, test_cpu_features.x86.has_sse41) -# endif -# endif -# ifdef X86_PCLMULQDQ_CRC - BENCHMARK_CRC32_FOLD(pclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq) -# endif -# ifdef X86_VPCLMULQDQ_CRC - BENCHMARK_CRC32_FOLD(vpclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) -# endif -# ifdef LOONGARCH_CRC - BENCHMARK_CRC32_FOLD(loongarch64, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc) -# endif - -#endif diff --git a/test/test_crc32.cc b/test/test_crc32.cc index a49c9d72bc..2a0da7eb98 100644 --- a/test/test_crc32.cc +++ b/test/test_crc32.cc @@ -140,7 +140,7 @@ TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41) # endif #endif -#if defined(LOONGARCH_CRC) +#ifdef LOONGARCH_CRC INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets)); TEST_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc) TEST_CRC32_ALIGN(loongarch64_align, crc32_loongarch64, 
test_cpu_features.loongarch.has_crc) diff --git a/test/test_crc32_copy.cc b/test/test_crc32_copy.cc new file mode 100644 index 0000000000..81881d8e82 --- /dev/null +++ b/test/test_crc32_copy.cc @@ -0,0 +1,81 @@ +/* test_crc32_fold_copy.cc -- test for crc32 implementations doing folded copying + * Copyright (C) 2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <gtest/gtest.h> + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "test_cpu_features.h" +# include "crc32_test_strings_p.h" +} + +#define BUFSIZE 615336U + +class crc32_fc_variant : public ::testing::TestWithParam<crc32_test> { +protected: + uint8_t dstbuf[BUFSIZE]; + +public: + /* Ensure that crc32 fold copy functions returns the correct crc and copies the data */ + void crc32_fold_test(size_t minlen, int onlyzero, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy, + crc32_fold_final_func fold_final, crc32_test params) { + ALIGNED_(16) crc32_fold crc_st; + uint32_t crc; + + ASSERT_LE(params.len, BUFSIZE); + + // Some optimized functions cannot take a crc value as start point + // and some have minimum length requirements + if (params.buf == NULL || params.len < minlen || (onlyzero && params.crc != 0)) { + GTEST_SKIP(); + } + + fold_reset(&crc_st); + crc_st.value = params.crc; + + fold_copy(&crc_st, dstbuf, params.buf, params.len); + crc = fold_final(&crc_st); + + EXPECT_EQ(crc, params.expect); + EXPECT_EQ(0, memcmp(params.buf, dstbuf, params.len)); + } +}; + +INSTANTIATE_TEST_SUITE_P(crc32_fc, crc32_fc_variant, testing::ValuesIn(crc32_tests)); + +#define TEST_CRC32_FOLD(name, minlen, onlyzero, resfunc, copyfunc, finfunc, support_flag) \ + TEST_P(crc32_fc_variant, name) { \ + if (!(support_flag)) { \ + GTEST_SKIP(); \ + return; \ + } \ + crc32_fold_test(minlen, onlyzero, resfunc, copyfunc, finfunc, GetParam()); \ + } + +// Generic test +TEST_CRC32_FOLD(generic, 0, 0, crc32_fold_reset_c, crc32_fold_copy_c, crc32_fold_final_c, 
1) + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + // Native test + TEST_CRC32_FOLD(native, 16, 1, native_crc32_fold_reset, native_crc32_fold_copy, native_crc32_fold_final, 1) +#else + + // Tests of optimized functions +# ifdef ARM_CRC32 + TEST_CRC32_FOLD(armv8, 0, 0, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32) +# endif +# ifdef X86_PCLMULQDQ_CRC + // Is 16 bytes len the minimum for pclmul functions? + TEST_CRC32_FOLD(pclmulqdq, 16, 1, crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq) +# endif +# ifdef X86_VPCLMULQDQ_CRC + TEST_CRC32_FOLD(vpclmulqdq, 16, 1, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) +# endif +# ifdef LOONGARCH_CRC + TEST_CRC32_FOLD(loongarch64, 0, 0, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc) +# endif + +#endif diff --git a/test/test_crc32_fold_copy.cc b/test/test_crc32_fold_copy.cc deleted file mode 100644 index 81881d8e82..0000000000 --- a/test/test_crc32_fold_copy.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* test_crc32_fold_copy.cc -- test for crc32 implementations doing folded copying - * Copyright (C) 2025 Hans Kristian Rosbach - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include <gtest/gtest.h> - -extern "C" { -# include "zbuild.h" -# include "arch_functions.h" -# include "test_cpu_features.h" -# include "crc32_test_strings_p.h" -} - -#define BUFSIZE 615336U - -class crc32_fc_variant : public ::testing::TestWithParam<crc32_test> { -protected: - uint8_t dstbuf[BUFSIZE]; - -public: - /* Ensure that crc32 fold copy functions returns the correct crc and copies the data */ - void crc32_fold_test(size_t minlen, int onlyzero, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy, - 
crc32_fold_final_func fold_final, crc32_test params) { - ALIGNED_(16) crc32_fold crc_st; - uint32_t crc; - - ASSERT_LE(params.len, BUFSIZE); - - // Some optimized functions cannot take a crc value as start point - // and some have minimum length requirements - if (params.buf == NULL || params.len < minlen || (onlyzero && params.crc != 0)) { - GTEST_SKIP(); - } - - fold_reset(&crc_st); - crc_st.value = params.crc; - - fold_copy(&crc_st, dstbuf, params.buf, params.len); - crc = fold_final(&crc_st); - - EXPECT_EQ(crc, params.expect); - EXPECT_EQ(0, memcmp(params.buf, dstbuf, params.len)); - } -}; - -INSTANTIATE_TEST_SUITE_P(crc32_fc, crc32_fc_variant, testing::ValuesIn(crc32_tests)); - -#define TEST_CRC32_FOLD(name, minlen, onlyzero, resfunc, copyfunc, finfunc, support_flag) \ - TEST_P(crc32_fc_variant, name) { \ - if (!(support_flag)) { \ - GTEST_SKIP(); \ - return; \ - } \ - crc32_fold_test(minlen, onlyzero, resfunc, copyfunc, finfunc, GetParam()); \ - } - -// Generic test -TEST_CRC32_FOLD(generic, 0, 0, crc32_fold_reset_c, crc32_fold_copy_c, crc32_fold_final_c, 1) - -#ifdef DISABLE_RUNTIME_CPU_DETECTION - // Native test - TEST_CRC32_FOLD(native, 16, 1, native_crc32_fold_reset, native_crc32_fold_copy, native_crc32_fold_final, 1) -#else - - // Tests of optimized functions -# ifdef ARM_CRC32 - TEST_CRC32_FOLD(armv8, 0, 0, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32) -# endif -# ifdef X86_PCLMULQDQ_CRC - // Is 16 bytes len the minimum for pclmul functions? 
- TEST_CRC32_FOLD(pclmulqdq, 16, 1, crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq) -# endif -# ifdef X86_VPCLMULQDQ_CRC - TEST_CRC32_FOLD(vpclmulqdq, 16, 1, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) -# endif -# ifdef LOONGARCH_CRC - TEST_CRC32_FOLD(loongarch64, 0, 0, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc) -# endif - -#endif -- cgit 0.0.5-2-1-g0f52