| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-03-09 00:30:04 -0700 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-10 14:14:03 +0100 |
| commit | 36278cbf2e22ca37af07178356e25c7f9c874664 | |
| tree | 044453559bb77b8ff8a6323813ab3e752b3c32ad | |
| parent | 006166b5c451cae55fca312c1866cb022ce5ce0c | |
Add 256-bit VPCLMULQDQ CRC32 path for systems without AVX-512.
Split VPCLMULQDQ CRC32 into separate AVX2 and AVX-512 compilation
units. Compute fold-by-8 constants for the AVX2 path using
bitreverse(x^d mod G(x), 33) with d=992 and d=1056.
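For reference, the constant derivation stated in the message can be reproduced with a short standalone program. The sketch below is illustrative and not part of this commit; `xpow_mod` and `bitreverse33` are hypothetical helper names, and G(x) is the standard CRC-32 polynomial with the x^32 term kept implicit. Its output should correspond to the two qwords of `ymm_fold8` in the diff below (0x14a7fe880 and 0x1e88ef372).

```c
/* Sketch: derive the fold-by-8 constants from the formula in the commit
 * message. xpow_mod() and bitreverse33() are hypothetical helpers, not
 * functions from this repository. */
#include <stdint.h>
#include <stdio.h>

#define CRC32_POLY 0x04C11DB7u  /* G(x), with the x^32 term implicit */

/* Compute x^d mod G(x) over GF(2) by d repeated multiply-by-x steps. */
static uint32_t xpow_mod(unsigned d) {
    uint32_t r = 1;  /* the polynomial "1" */
    while (d--) {
        uint32_t msb = r & 0x80000000u;
        r <<= 1;                   /* multiply by x */
        if (msb) r ^= CRC32_POLY;  /* reduce: x^32 == 0x04C11DB7 (mod G) */
    }
    return r;
}

/* Reverse the low 33 bits: bit i <-> bit 32-i. */
static uint64_t bitreverse33(uint64_t v) {
    uint64_t r = 0;
    for (int i = 0; i <= 32; i++)
        if (v & (1ull << i))
            r |= 1ull << (32 - i);
    return r;
}

int main(void) {
    /* Expected to reproduce the two ymm_fold8 qwords used in the AVX2
     * path: 0x14a7fe880 and 0x1e88ef372. */
    printf("d=992:  0x%llx\n", (unsigned long long)bitreverse33(xpow_mod(992)));
    printf("d=1056: 0x%llx\n", (unsigned long long)bitreverse33(xpow_mod(1056)));
    return 0;
}
```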
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | CMakeLists.txt | 24 |
| -rw-r--r-- | arch/x86/Makefile.in | 17 |
| -rw-r--r-- | arch/x86/crc32_pclmulqdq_tpl.h | 121 |
| -rw-r--r-- | arch/x86/crc32_vpclmulqdq.c | 18 |
| -rw-r--r-- | arch/x86/crc32_vpclmulqdq_avx2.c | 17 |
| -rw-r--r-- | arch/x86/crc32_vpclmulqdq_avx512.c | 17 |
| -rw-r--r-- | arch/x86/x86_features.c | 2 |
| -rw-r--r-- | arch/x86/x86_functions.h | 27 |
| -rw-r--r-- | arch/x86/x86_natives.h | 11 |
| -rw-r--r-- | cmake/detect-intrinsics.cmake | 12 |
| -rwxr-xr-x | configure | 27 |
| -rw-r--r-- | functable.c | 20 |
| -rw-r--r-- | test/benchmarks/benchmark_crc32.cc | 7 |
| -rw-r--r-- | test/benchmarks/benchmark_crc32_copy.cc | 7 |
| -rw-r--r-- | test/test_crc32.cc | 7 |
| -rw-r--r-- | test/test_crc32_copy.cc | 7 |
16 files changed, 266 insertions, 75 deletions
```diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 550ddcde03..8029c4a68b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,7 +172,7 @@ elseif(BASEARCH_X86_FOUND)
     cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
     cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
     cmake_dependent_option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON "WITH_AVX512" OFF)
-    cmake_dependent_option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX512" OFF)
+    cmake_dependent_option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX2" OFF)
 endif()
 
 option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF)
@@ -1130,12 +1130,22 @@ if(WITH_OPTIM)
         endif()
         if(WITH_VPCLMULQDQ)
             check_vpclmulqdq_intrinsics()
-            if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX512)
-                add_definitions(-DX86_VPCLMULQDQ_CRC)
-                set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
-                add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"")
-                list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
-                set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+            if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX2)
+                add_definitions(-DX86_VPCLMULQDQ_AVX2)
+                set(VPCLMULQDQ_AVX2_SRCS ${ARCHDIR}/crc32_vpclmulqdq_avx2.c)
+                set(VPCLMULQDQ_AVX2_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX2FLAG}")
+                list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_AVX2_SRCS})
+                set_property(SOURCE ${VPCLMULQDQ_AVX2_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULQDQ_AVX2_FLAGS} ${NOLTOFLAG}")
+                add_feature_info(VPCLMULQDQ_AVX2 1 "Support CRC using VPCLMULQDQ (AVX2), using \"${VPCLMULQDQ_AVX2_FLAGS}\"")
+
+                if(WITH_AVX512)
+                    add_definitions(-DX86_VPCLMULQDQ_AVX512)
+                    set(VPCLMULQDQ_AVX512_SRCS ${ARCHDIR}/crc32_vpclmulqdq_avx512.c)
+                    set(VPCLMULQDQ_AVX512_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}")
+                    list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_AVX512_SRCS})
+                    set_property(SOURCE ${VPCLMULQDQ_AVX512_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULQDQ_AVX512_FLAGS} ${NOLTOFLAG}")
+                    add_feature_info(VPCLMULQDQ_AVX512 1 "Support CRC using VPCLMULQDQ (AVX-512), using \"${VPCLMULQDQ_AVX512_FLAGS}\"")
+                endif()
             else()
                 set(WITH_VPCLMULQDQ OFF)
             endif()
```

```diff
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index addf5c6ea3..f756844a9f 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -41,7 +41,8 @@ all: \
 	crc32_chorba_sse2.o crc32_chorba_sse2.lo \
 	crc32_chorba_sse41.o crc32_chorba_sse41.lo \
 	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
-	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
+	crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \
+	crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \
 	slide_hash_avx2.o slide_hash_avx2.lo \
 	slide_hash_sse2.o slide_hash_sse2.lo
 
@@ -111,11 +112,17 @@ crc32_pclmulqdq.o:
 crc32_pclmulqdq.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
 
-crc32_vpclmulqdq.o:
-	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+crc32_vpclmulqdq_avx2.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
 
-crc32_vpclmulqdq.lo:
-	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+crc32_vpclmulqdq_avx2.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx512.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+crc32_vpclmulqdq_avx512.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
 
 slide_hash_avx2.o:
 	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
```
```diff
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index 8677f1e872..e4ea546afd 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -28,7 +28,8 @@
 #include "crc32_p.h"
 #include "x86_intrins.h"
 
-#ifdef X86_VPCLMULQDQ
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
 # if defined(_MSC_VER) && _MSC_VER < 1920
 /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
 # define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96)
@@ -43,6 +44,10 @@
 # define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96)
 # endif
 #endif
+/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c)
+#endif
 
 #ifndef z128_xor3_epi64
 # define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
@@ -117,7 +122,8 @@ static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_cr
     *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
 }
 
-#ifdef X86_VPCLMULQDQ
+/* 512-bit fold function requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
 static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
                            const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) {
     __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
@@ -135,6 +141,25 @@ static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_cr
     *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
 }
 #endif
+/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
+                          const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) {
+    __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
+    __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
+    __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
+    __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
+    __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
+    __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
+    __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
+    __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
+
+    *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
+    *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
+    *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
+    *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
+}
+#endif
 
 Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
     size_t copy_len = len;
@@ -181,7 +206,8 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const
         xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
     }
 
-#ifdef X86_VPCLMULQDQ
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
     if (len >= 256) {
         len -= 256;
 
@@ -253,6 +279,95 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const
         xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
         xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
     }
+/* 256-bit VPCLMULQDQ path */
+#elif defined(X86_VPCLMULQDQ)
+    if (len >= 128) {
+        len -= 128;
+
+        __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
+        __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
+        __m256i y_low0, y_high0;
+        const __m256i ymm_fold4 = _mm256_set_epi32(
+            0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
+            0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+        const __m256i ymm_fold8 = _mm256_set_epi32(
+            0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
+            0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
+
+        ymm_crc0 = _mm256_loadu_si256((__m256i *)src);
+        ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1);
+        ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2);
+        ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3);
+        src += 128;
+        if (COPY) {
+            _mm256_storeu_si256((__m256i *)dst, ymm_crc0);
+            _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1);
+            _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2);
+            _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3);
+            dst += 128;
+        }
+
+        // Fold existing xmm state into first 32 bytes
+        ymm_t0 = _mm256_castsi128_si256(xmm_crc0);
+        ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1);
+
+        y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+        y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+        ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
+
+        ymm_t0 = _mm256_castsi128_si256(xmm_crc2);
+        ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1);
+
+        y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+        y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+        ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
+
+        while (len >= 128) {
+            len -= 128;
+            ymm_t0 = _mm256_loadu_si256((__m256i *)src);
+            ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1);
+            ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2);
+            ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3);
+            src += 128;
+
+            fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
+            if (COPY) {
+                _mm256_storeu_si256((__m256i *)dst, ymm_t0);
+                _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1);
+                _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2);
+                _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3);
+                dst += 128;
+            }
+        }
+
+        // Extract 8 x 128-bit lanes from 4 x 256-bit registers
+        __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
+        __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
+        __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
+        __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
+        __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
+        __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
+        __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
+        __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
+
+        // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
+        __m128i x_low, x_high;
+        x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
+        xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
+
+        x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
+        xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
+
+        x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
+        xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
+
+        x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
+        x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
+        xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
+    }
 #else
     /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
      * We interleave the PCLMUL-base folds with 8x scaled generator
```
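In fold_8 above, the two immediates select which 64-bit halves each carry-less multiply combines: per 128-bit lane, imm 0x01 multiplies the high qword of the running state by the low qword of the constant, and imm 0x10 multiplies the low qword of the state by the high qword of the constant. The following plain-C model of one lane is a sketch for illustration only; the `u128` struct and `clmul64` helper are hypothetical, not code from this commit.

```c
/* Software model of one 128-bit lane of _mm256_clmulepi64_epi128 as used
 * in fold_8. Illustrative sketch; u128 and clmul64 are hypothetical. */
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* Carry-less (GF(2)) 64x64 -> 128-bit multiply. */
static u128 clmul64(uint64_t a, uint64_t b) {
    u128 r = {0, 0};
    for (int i = 0; i < 64; i++) {
        if (b & (1ull << i)) {
            r.lo ^= a << i;
            if (i) r.hi ^= a >> (64 - i);  /* guard avoids shift-by-64 UB */
        }
    }
    return r;
}

/* One lane of fold_8: crc' = clmul(crc.hi, k.lo) ^ clmul(crc.lo, k.hi) ^ data,
 * matching the 0x01/0x10 immediates and the xor3 in the diff. */
static u128 fold_lane(u128 crc, u128 k, u128 data) {
    u128 p0 = clmul64(crc.hi, k.lo);  /* imm 0x01 */
    u128 p1 = clmul64(crc.lo, k.hi);  /* imm 0x10 */
    u128 out = { p0.lo ^ p1.lo ^ data.lo, p0.hi ^ p1.hi ^ data.hi };
    return out;
}
```

The same pairing reappears in the scalar 8-to-4 reduction at the end of the 256-bit path, where each 128-bit lane is folded across the 64-byte gap to lane N+4 using xmm_fold4.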
```diff
diff --git a/arch/x86/crc32_vpclmulqdq.c b/arch/x86/crc32_vpclmulqdq.c
deleted file mode 100644
index 793d8ab99a..0000000000
--- a/arch/x86/crc32_vpclmulqdq.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
- * Copyright Wangyang Guo (wangyang.guo@intel.com)
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifdef X86_VPCLMULQDQ_CRC
-
-#define X86_VPCLMULQDQ
-#include "crc32_pclmulqdq_tpl.h"
-
-Z_INTERNAL uint32_t crc32_vpclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
-    return crc32_copy_impl(crc, NULL, buf, len, 0);
-}
-
-Z_INTERNAL uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
-    return crc32_copy_impl(crc, dst, src, len, 1);
-}
-#endif
```
```diff
diff --git a/arch/x86/crc32_vpclmulqdq_avx2.c b/arch/x86/crc32_vpclmulqdq_avx2.c
new file mode 100644
index 0000000000..1cdef13b09
--- /dev/null
+++ b/arch/x86/crc32_vpclmulqdq_avx2.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX2
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
```
```diff
diff --git a/arch/x86/crc32_vpclmulqdq_avx512.c b/arch/x86/crc32_vpclmulqdq_avx512.c
new file mode 100644
index 0000000000..a95a448f49
--- /dev/null
+++ b/arch/x86/crc32_vpclmulqdq_avx512.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX512
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
+    return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
```

```diff
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c
index 30d17fb063..5eba18bf8a 100644
--- a/arch/x86/x86_features.c
+++ b/arch/x86/x86_features.c
@@ -105,6 +105,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
         // check AVX2 bit if the OS supports saving YMM registers
         if (features->has_os_save_ymm) {
             features->has_avx2 = ebx & 0x20;
+            features->has_vpclmulqdq = ecx & 0x400;
         }
 
         // check AVX512 bits if the OS supports saving ZMM registers
@@ -120,7 +121,6 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
             features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
                 && features->has_avx512vl && features->has_bmi2;
             features->has_avx512vnni = ecx & 0x800;
-            features->has_vpclmulqdq = ecx & 0x400;
         }
     }
 }
```

```diff
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 7b628a851a..881c6efe23 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -75,9 +75,13 @@ uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *s
 uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
 uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
-#ifdef X86_VPCLMULQDQ_CRC
-uint32_t crc32_vpclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
-uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#ifdef X86_VPCLMULQDQ_AVX2
+uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -174,13 +178,18 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
 #    undef native_adler32_copy
 #    define native_adler32_copy adler32_copy_avx512_vnni
 #   endif
+# endif
 // X86 - VPCLMULQDQ
-# ifdef X86_VPCLMULQDQ_NATIVE
-#  undef native_crc32
-#  define native_crc32 crc32_vpclmulqdq
-#  undef native_crc32_copy
-#  define native_crc32_copy crc32_copy_vpclmulqdq
-# endif
+# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
+#  undef native_crc32
+#  define native_crc32 crc32_vpclmulqdq_avx512
+#  undef native_crc32_copy
+#  define native_crc32_copy crc32_copy_vpclmulqdq_avx512
+# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
+#  undef native_crc32
+#  define native_crc32 crc32_vpclmulqdq_avx2
+#  undef native_crc32_copy
+#  define native_crc32_copy crc32_copy_vpclmulqdq_avx2
 # endif
 #endif
 
```
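The x86_features.c change is the detection-side counterpart of the split: the VPCLMULQDQ CPUID bit (leaf 7, ECX bit 10, the 0x400 mask above) is now read under the has_os_save_ymm branch instead of the ZMM branch, so AVX2-only systems report it. A standalone sketch of the resulting runtime gate for the AVX2 path follows; this is a hypothetical helper, not repository code, GCC/Clang specific, and must be compiled with -mxsave so that _xgetbv is available.

```c
/* Sketch: standalone equivalent of the has_pclmulqdq/has_avx2/has_vpclmulqdq
 * gate used by functable.c for the AVX2 path. Hypothetical helper, GCC/Clang
 * only; build with -mxsave. */
#include <cpuid.h>
#include <immintrin.h>

static int can_use_vpclmulqdq_avx2(void) {
    unsigned int eax, ebx, ecx, edx;

    /* CPUID.1:ECX -- bit 1 = PCLMULQDQ, bit 27 = OSXSAVE (XGETBV is valid). */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    if (!(ecx & (1u << 1)) || !(ecx & (1u << 27)))
        return 0;

    /* XCR0 bits 1 and 2: the OS saves XMM and YMM state (has_os_save_ymm). */
    if ((_xgetbv(0) & 0x6) != 0x6)
        return 0;

    /* CPUID.(7,0): EBX bit 5 = AVX2 (0x20), ECX bit 10 = VPCLMULQDQ (0x400). */
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;
    return (ebx & (1u << 5)) && (ecx & (1u << 10)) ? 1 : 0;
}
```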
```diff
diff --git a/arch/x86/x86_natives.h b/arch/x86/x86_natives.h
index 75f249d909..a39b7a51f0 100644
--- a/arch/x86/x86_natives.h
+++ b/arch/x86/x86_natives.h
@@ -29,9 +29,6 @@
 # ifdef X86_PCLMULQDQ_CRC
 #  define X86_PCLMULQDQ_NATIVE
 # endif
-# if defined(__AVX512F__) && defined(__VPCLMULQDQ__)
-#  define X86_VPCLMULQDQ_NATIVE
-# endif
 #endif
 #if defined(__AVX2__)
 # ifdef X86_AVX2
@@ -48,5 +45,13 @@
 #  define X86_AVX512VNNI_NATIVE
 # endif
 #endif
+#if defined(__VPCLMULQDQ__)
+# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE)
+#  define X86_VPCLMULQDQ_AVX2_NATIVE
+# endif
+# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE)
+#  define X86_VPCLMULQDQ_AVX512_NATIVE
+# endif
+#endif
 
 #endif /* X86_NATIVES_H_ */
```

```diff
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 118bfbc2a5..c524c17bbe 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -359,19 +359,19 @@ endmacro()
 
 macro(check_vpclmulqdq_intrinsics)
     if(NOT NATIVEFLAG)
-        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|IntelLLVM|NVHPC")
+            set(VPCLMULFLAG "-mvpclmulqdq")
         endif()
     endif()
     # Check whether compiler supports VPCLMULQDQ intrinsics
     if(NOT (APPLE AND ARCH_32BIT))
-        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
         check_c_source_compiles(
             "#include <immintrin.h>
             #include <wmmintrin.h>
-            __m512i f(__m512i a) {
-                __m512i b = _mm512_setzero_si512();
-                return _mm512_clmulepi64_epi128(a, b, 0x10);
+            __m256i f(__m256i a) {
+                __m256i b = _mm256_setzero_si256();
+                return _mm256_clmulepi64_epi128(a, b, 0x10);
             }
             int main(void) { return 0; }"
             HAVE_VPCLMULQDQ_INTRIN
```
```diff
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -122,7 +122,7 @@ ssse3flag="-mssse3"
 sse41flag="-msse4.1"
 sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
-vpclmulflag="-mvpclmulqdq -mavx512f"
+vpclmulflag="-mvpclmulqdq"
 xsaveflag="-mxsave"
 lsxflag="-mlsx"
 lasxflag="-mlasx"
@@ -1498,13 +1498,13 @@ check_vpclmulqdq_intrinsics() {
     cat > $test.c << EOF
 #include <immintrin.h>
 #include <wmmintrin.h>
-__m512i f(__m512i a) {
-    __m512i b = _mm512_setzero_si512();
-    return _mm512_clmulepi64_epi128(a, b, 0x10);
+__m256i f(__m256i a) {
+    __m256i b = _mm256_setzero_si256();
+    return _mm256_clmulepi64_epi128(a, b, 0x10);
 }
 int main(void) { return 0; }
 EOF
-    if try ${CC} ${CFLAGS} ${vpclmulflag} $test.c; then
+    if try ${CC} ${CFLAGS} ${vpclmulflag} ${avx2flag} $test.c; then
         echo "Checking for VPCLMULQDQ intrinsics ... Yes." | tee -a configure.log
         HAVE_VPCLMULQDQ_INTRIN=1
     else
@@ -1993,14 +1993,21 @@ case "${ARCH}" in
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo"
         fi
 
-        if test $buildvpclmulqdq -eq 1 && test ${HAVE_PCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
+        if test $buildvpclmulqdq -eq 1 && test ${HAVE_PCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX2_INTRIN} -eq 1; then
             check_vpclmulqdq_intrinsics
             if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
-                SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq.lo"
+                CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_AVX2"
+                SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_AVX2"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq_avx2.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq_avx2.lo"
+
+                if test ${HAVE_AVX512_INTRIN} -eq 1; then
+                    CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_AVX512"
+                    SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_AVX512"
+                    ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq_avx512.o"
+                    ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq_avx512.lo"
+                fi
             fi
         fi
     fi
 
```

```diff
diff --git a/functable.c b/functable.c
index 478051f40d..eec5d07f67 100644
--- a/functable.c
+++ b/functable.c
@@ -256,14 +256,24 @@ static int init_functable(void) {
         ft.adler32_copy = &adler32_copy_avx512_vnni;
     }
 #endif
-    // X86 - VPCLMULQDQ
-#ifdef X86_VPCLMULQDQ_CRC
-# ifndef X86_VPCLMULQDQ_NATIVE
+    // X86 - VPCLMULQDQ (AVX2)
+#ifdef X86_VPCLMULQDQ_AVX2
+# ifndef X86_VPCLMULQDQ_AVX2_NATIVE
+    if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq)
+# endif
+    {
+        ft.crc32 = &crc32_vpclmulqdq_avx2;
+        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2;
+    }
+#endif
+    // X86 - VPCLMULQDQ (AVX-512)
+#ifdef X86_VPCLMULQDQ_AVX512
+# ifndef X86_VPCLMULQDQ_AVX512_NATIVE
     if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
 # endif
     {
-        ft.crc32 = &crc32_vpclmulqdq;
-        ft.crc32_copy = &crc32_copy_vpclmulqdq;
+        ft.crc32 = &crc32_vpclmulqdq_avx512;
+        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512;
     }
 #endif
 
```
```diff
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
index b30b1855fa..1e2cf88590 100644
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -112,8 +112,11 @@ BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx);
 #ifdef X86_PCLMULQDQ_CRC
 BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq);
 #endif
-#ifdef X86_VPCLMULQDQ_CRC
-BENCHMARK_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
+#ifdef X86_VPCLMULQDQ_AVX2
+BENCHMARK_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+BENCHMARK_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
 #endif
 #ifdef LOONGARCH_CRC
 BENCHMARK_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc);
```

```diff
diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index 2d054dba49..eafa5aee44 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -167,8 +167,11 @@ BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
 # ifdef X86_PCLMULQDQ_CRC
     BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 # endif
-# ifdef X86_VPCLMULQDQ_CRC
-    BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# ifdef X86_VPCLMULQDQ_AVX2
+    BENCHMARK_CRC32_COPY(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+    BENCHMARK_CRC32_COPY(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 # endif
 #endif
 
```

```diff
diff --git a/test/test_crc32.cc b/test/test_crc32.cc
index a44433193e..1fb0771119 100644
--- a/test/test_crc32.cc
+++ b/test/test_crc32.cc
@@ -123,8 +123,11 @@ TEST_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx)
 #ifdef X86_PCLMULQDQ_CRC
 TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 #endif
-#ifdef X86_VPCLMULQDQ_CRC
-TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+#ifdef X86_VPCLMULQDQ_AVX2
+TEST_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+TEST_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 #endif
 #ifndef WITHOUT_CHORBA_SSE
 # ifdef X86_SSE2
```

```diff
diff --git a/test/test_crc32_copy.cc b/test/test_crc32_copy.cc
index d397448017..fa3059e544 100644
--- a/test/test_crc32_copy.cc
+++ b/test/test_crc32_copy.cc
@@ -79,8 +79,11 @@ TEST_CRC32_COPY(braid, crc32_copy_braid, 1)
 # ifdef X86_PCLMULQDQ_CRC
     TEST_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 # endif
-# ifdef X86_VPCLMULQDQ_CRC
-    TEST_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# ifdef X86_VPCLMULQDQ_AVX2
+    TEST_CRC32_COPY(vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+    TEST_CRC32_COPY(vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 # endif
 #endif
 
```
