summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Moinvaziri <nathan@nathanm.com>2026-03-09 00:30:04 -0700
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-10 14:14:03 +0100
commit36278cbf2e22ca37af07178356e25c7f9c874664 (patch)
tree044453559bb77b8ff8a6323813ab3e752b3c32ad
parent006166b5c451cae55fca312c1866cb022ce5ce0c (diff)
downloadProject-Tick-36278cbf2e22ca37af07178356e25c7f9c874664.tar.gz
Project-Tick-36278cbf2e22ca37af07178356e25c7f9c874664.zip
Add 256-bit VPCLMULQDQ CRC32 path for systems without AVX-512.
Split VPCLMULQDQ CRC32 into separate AVX2 and AVX-512 compilation units. Compute fold-by-8 constants for the AVX2 path using bitreverse(x^d mod G(x), 33) with d=992 and d=1056.
-rw-r--r--CMakeLists.txt24
-rw-r--r--arch/x86/Makefile.in17
-rw-r--r--arch/x86/crc32_pclmulqdq_tpl.h121
-rw-r--r--arch/x86/crc32_vpclmulqdq.c18
-rw-r--r--arch/x86/crc32_vpclmulqdq_avx2.c17
-rw-r--r--arch/x86/crc32_vpclmulqdq_avx512.c17
-rw-r--r--arch/x86/x86_features.c2
-rw-r--r--arch/x86/x86_functions.h27
-rw-r--r--arch/x86/x86_natives.h11
-rw-r--r--cmake/detect-intrinsics.cmake12
-rwxr-xr-xconfigure27
-rw-r--r--functable.c20
-rw-r--r--test/benchmarks/benchmark_crc32.cc7
-rw-r--r--test/benchmarks/benchmark_crc32_copy.cc7
-rw-r--r--test/test_crc32.cc7
-rw-r--r--test/test_crc32_copy.cc7
16 files changed, 266 insertions, 75 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 550ddcde03..8029c4a68b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,7 +172,7 @@ elseif(BASEARCH_X86_FOUND)
cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
cmake_dependent_option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON "WITH_AVX512" OFF)
- cmake_dependent_option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX512" OFF)
+ cmake_dependent_option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON "WITH_PCLMULQDQ;WITH_AVX2" OFF)
endif()
option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF)
@@ -1130,12 +1130,22 @@ if(WITH_OPTIM)
endif()
if(WITH_VPCLMULQDQ)
check_vpclmulqdq_intrinsics()
- if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX512)
- add_definitions(-DX86_VPCLMULQDQ_CRC)
- set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
- add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}\"")
- list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
- set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+ if(HAVE_VPCLMULQDQ_INTRIN AND WITH_PCLMULQDQ AND WITH_AVX2)
+ add_definitions(-DX86_VPCLMULQDQ_AVX2)
+ set(VPCLMULQDQ_AVX2_SRCS ${ARCHDIR}/crc32_vpclmulqdq_avx2.c)
+ set(VPCLMULQDQ_AVX2_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX2FLAG}")
+ list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_AVX2_SRCS})
+ set_property(SOURCE ${VPCLMULQDQ_AVX2_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULQDQ_AVX2_FLAGS} ${NOLTOFLAG}")
+ add_feature_info(VPCLMULQDQ_AVX2 1 "Support CRC using VPCLMULQDQ (AVX2), using \"${VPCLMULQDQ_AVX2_FLAGS}\"")
+
+ if(WITH_AVX512)
+ add_definitions(-DX86_VPCLMULQDQ_AVX512)
+ set(VPCLMULQDQ_AVX512_SRCS ${ARCHDIR}/crc32_vpclmulqdq_avx512.c)
+ set(VPCLMULQDQ_AVX512_FLAGS "${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG}")
+ list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_AVX512_SRCS})
+ set_property(SOURCE ${VPCLMULQDQ_AVX512_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULQDQ_AVX512_FLAGS} ${NOLTOFLAG}")
+ add_feature_info(VPCLMULQDQ_AVX512 1 "Support CRC using VPCLMULQDQ (AVX-512), using \"${VPCLMULQDQ_AVX512_FLAGS}\"")
+ endif()
else()
set(WITH_VPCLMULQDQ OFF)
endif()
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index addf5c6ea3..f756844a9f 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -41,7 +41,8 @@ all: \
crc32_chorba_sse2.o crc32_chorba_sse2.lo \
crc32_chorba_sse41.o crc32_chorba_sse41.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
- crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
+ crc32_vpclmulqdq_avx2.o crc32_vpclmulqdq_avx2.lo \
+ crc32_vpclmulqdq_avx512.o crc32_vpclmulqdq_avx512.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
slide_hash_sse2.o slide_hash_sse2.lo
@@ -111,11 +112,17 @@ crc32_pclmulqdq.o:
crc32_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
-crc32_vpclmulqdq.o:
- $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+crc32_vpclmulqdq_avx2.o:
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
-crc32_vpclmulqdq.lo:
- $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+crc32_vpclmulqdq_avx2.lo:
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx2.c
+
+crc32_vpclmulqdq_avx512.o:
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
+
+crc32_vpclmulqdq_avx512.lo:
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq_avx512.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index 8677f1e872..e4ea546afd 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -28,7 +28,8 @@
#include "crc32_p.h"
#include "x86_intrins.h"
-#ifdef X86_VPCLMULQDQ
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
# if defined(_MSC_VER) && _MSC_VER < 1920
/* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
# define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96)
@@ -43,6 +44,10 @@
# define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96)
# endif
#endif
+/* 256-bit VPCLMULQDQ macros (don't require AVX-512) */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+# define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c)
+#endif
#ifndef z128_xor3_epi64
# define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
@@ -117,7 +122,8 @@ static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_cr
*xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
}
-#ifdef X86_VPCLMULQDQ
+/* 512-bit fold function requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) {
__m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
@@ -135,6 +141,25 @@ static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_cr
*zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
}
#endif
+/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
+#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
+static inline void fold_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
+ const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, const __m256i ymm_t3, const __m256i ymm_fold8) {
+ __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
+ __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
+ __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
+ __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
+ __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
+ __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
+ __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
+ __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
+
+ *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
+ *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
+ *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
+ *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
+}
+#endif
Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
size_t copy_len = len;
@@ -181,7 +206,8 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const
xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
}
-#ifdef X86_VPCLMULQDQ
+/* 512-bit VPCLMULQDQ path requires AVX-512F */
+#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
if (len >= 256) {
len -= 256;
@@ -253,6 +279,95 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const
xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
}
+/* 256-bit VPCLMULQDQ path */
+#elif defined(X86_VPCLMULQDQ)
+ if (len >= 128) {
+ len -= 128;
+
+ __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
+ __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
+ __m256i y_low0, y_high0;
+ const __m256i ymm_fold4 = _mm256_set_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m256i ymm_fold8 = _mm256_set_epi32(
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
+ 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
+
+ ymm_crc0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_crc1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_crc2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_crc3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_crc0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_crc1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_crc2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_crc3);
+ dst += 128;
+ }
+
+ // Fold existing xmm state into first 32 bytes
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc0);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc1, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
+
+ ymm_t0 = _mm256_castsi128_si256(xmm_crc2);
+ ymm_t0 = _mm256_inserti128_si256(ymm_t0, xmm_crc3, 1);
+
+ y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
+ y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
+ ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
+
+ while (len >= 128) {
+ len -= 128;
+ ymm_t0 = _mm256_loadu_si256((__m256i *)src);
+ ymm_t1 = _mm256_loadu_si256((__m256i *)src + 1);
+ ymm_t2 = _mm256_loadu_si256((__m256i *)src + 2);
+ ymm_t3 = _mm256_loadu_si256((__m256i *)src + 3);
+ src += 128;
+
+ fold_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
+ if (COPY) {
+ _mm256_storeu_si256((__m256i *)dst, ymm_t0);
+ _mm256_storeu_si256((__m256i *)dst + 1, ymm_t1);
+ _mm256_storeu_si256((__m256i *)dst + 2, ymm_t2);
+ _mm256_storeu_si256((__m256i *)dst + 3, ymm_t3);
+ dst += 128;
+ }
+ }
+
+ // Extract 8 x 128-bit lanes from 4 x 256-bit registers
+ __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
+ __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
+ __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
+ __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
+ __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
+ __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
+ __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
+ __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
+
+ // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
+ __m128i x_low, x_high;
+ x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
+ xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
+
+ x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
+ xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
+
+ x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
+ xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
+
+ x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
+ x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
+ xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
+ }
#else
/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
* We interleave the PCLMUL-base folds with 8x scaled generator
diff --git a/arch/x86/crc32_vpclmulqdq.c b/arch/x86/crc32_vpclmulqdq.c
deleted file mode 100644
index 793d8ab99a..0000000000
--- a/arch/x86/crc32_vpclmulqdq.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
- * Copyright Wangyang Guo (wangyang.guo@intel.com)
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifdef X86_VPCLMULQDQ_CRC
-
-#define X86_VPCLMULQDQ
-#include "crc32_pclmulqdq_tpl.h"
-
-Z_INTERNAL uint32_t crc32_vpclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
- return crc32_copy_impl(crc, NULL, buf, len, 0);
-}
-
-Z_INTERNAL uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
- return crc32_copy_impl(crc, dst, src, len, 1);
-}
-#endif
diff --git a/arch/x86/crc32_vpclmulqdq_avx2.c b/arch/x86/crc32_vpclmulqdq_avx2.c
new file mode 100644
index 0000000000..1cdef13b09
--- /dev/null
+++ b/arch/x86/crc32_vpclmulqdq_avx2.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx2.c -- VPCLMULQDQ-based CRC32 with AVX2.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX2
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/arch/x86/crc32_vpclmulqdq_avx512.c b/arch/x86/crc32_vpclmulqdq_avx512.c
new file mode 100644
index 0000000000..a95a448f49
--- /dev/null
+++ b/arch/x86/crc32_vpclmulqdq_avx512.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq_avx512.c -- VPCLMULQDQ-based CRC32 with AVX-512.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_AVX512
+
+#define X86_VPCLMULQDQ
+#include "crc32_pclmulqdq_tpl.h"
+
+Z_INTERNAL uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len) {
+ return crc32_copy_impl(crc, NULL, buf, len, 0);
+}
+
+Z_INTERNAL uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
+ return crc32_copy_impl(crc, dst, src, len, 1);
+}
+#endif
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c
index 30d17fb063..5eba18bf8a 100644
--- a/arch/x86/x86_features.c
+++ b/arch/x86/x86_features.c
@@ -105,6 +105,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
// check AVX2 bit if the OS supports saving YMM registers
if (features->has_os_save_ymm) {
features->has_avx2 = ebx & 0x20;
+ features->has_vpclmulqdq = ecx & 0x400;
}
// check AVX512 bits if the OS supports saving ZMM registers
@@ -120,7 +121,6 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
&& features->has_avx512vl && features->has_bmi2;
features->has_avx512vnni = ecx & 0x800;
- features->has_vpclmulqdq = ecx & 0x400;
}
}
}
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 7b628a851a..881c6efe23 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -75,9 +75,13 @@ uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *s
uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif
-#ifdef X86_VPCLMULQDQ_CRC
-uint32_t crc32_vpclmulqdq(uint32_t crc, const uint8_t *buf, size_t len);
-uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#ifdef X86_VPCLMULQDQ_AVX2
+uint32_t crc32_vpclmulqdq_avx2(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx2(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+uint32_t crc32_vpclmulqdq_avx512(uint32_t crc, const uint8_t *buf, size_t len);
+uint32_t crc32_copy_vpclmulqdq_avx512(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -174,13 +178,18 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
# undef native_adler32_copy
# define native_adler32_copy adler32_copy_avx512_vnni
# endif
+# endif
// X86 - VPCLMULQDQ
-# ifdef X86_VPCLMULQDQ_NATIVE
-# undef native_crc32
-# define native_crc32 crc32_vpclmulqdq
-# undef native_crc32_copy
-# define native_crc32_copy crc32_copy_vpclmulqdq
-# endif
+# ifdef X86_VPCLMULQDQ_AVX512_NATIVE
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx512
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx512
+# elif defined(X86_VPCLMULQDQ_AVX2_NATIVE)
+# undef native_crc32
+# define native_crc32 crc32_vpclmulqdq_avx2
+# undef native_crc32_copy
+# define native_crc32_copy crc32_copy_vpclmulqdq_avx2
# endif
#endif
diff --git a/arch/x86/x86_natives.h b/arch/x86/x86_natives.h
index 75f249d909..a39b7a51f0 100644
--- a/arch/x86/x86_natives.h
+++ b/arch/x86/x86_natives.h
@@ -29,9 +29,6 @@
# ifdef X86_PCLMULQDQ_CRC
# define X86_PCLMULQDQ_NATIVE
# endif
-# if defined(__AVX512F__) && defined(__VPCLMULQDQ__)
-# define X86_VPCLMULQDQ_NATIVE
-# endif
#endif
#if defined(__AVX2__)
# ifdef X86_AVX2
@@ -48,5 +45,13 @@
# define X86_AVX512VNNI_NATIVE
# endif
#endif
+#if defined(__VPCLMULQDQ__)
+# if defined(X86_VPCLMULQDQ_AVX2) && defined(X86_AVX2_NATIVE)
+# define X86_VPCLMULQDQ_AVX2_NATIVE
+# endif
+# if defined(X86_VPCLMULQDQ_AVX512) && defined(X86_AVX512_NATIVE)
+# define X86_VPCLMULQDQ_AVX512_NATIVE
+# endif
+#endif
#endif /* X86_NATIVES_H_ */
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 118bfbc2a5..c524c17bbe 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -359,19 +359,19 @@ endmacro()
macro(check_vpclmulqdq_intrinsics)
if(NOT NATIVEFLAG)
- if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
- set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|IntelLLVM|NVHPC")
+ set(VPCLMULFLAG "-mvpclmulqdq")
endif()
endif()
# Check whether compiler supports VPCLMULQDQ intrinsics
if(NOT (APPLE AND ARCH_32BIT))
- set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+ set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <immintrin.h>
#include <wmmintrin.h>
- __m512i f(__m512i a) {
- __m512i b = _mm512_setzero_si512();
- return _mm512_clmulepi64_epi128(a, b, 0x10);
+ __m256i f(__m256i a) {
+ __m256i b = _mm256_setzero_si256();
+ return _mm256_clmulepi64_epi128(a, b, 0x10);
}
int main(void) { return 0; }"
HAVE_VPCLMULQDQ_INTRIN
diff --git a/configure b/configure
index 169fbc83cc..cfa8947bd2 100755
--- a/configure
+++ b/configure
@@ -122,7 +122,7 @@ ssse3flag="-mssse3"
sse41flag="-msse4.1"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
-vpclmulflag="-mvpclmulqdq -mavx512f"
+vpclmulflag="-mvpclmulqdq"
xsaveflag="-mxsave"
lsxflag="-mlsx"
lasxflag="-mlasx"
@@ -1498,13 +1498,13 @@ check_vpclmulqdq_intrinsics() {
cat > $test.c << EOF
#include <immintrin.h>
#include <wmmintrin.h>
-__m512i f(__m512i a) {
- __m512i b = _mm512_setzero_si512();
- return _mm512_clmulepi64_epi128(a, b, 0x10);
+__m256i f(__m256i a) {
+ __m256i b = _mm256_setzero_si256();
+ return _mm256_clmulepi64_epi128(a, b, 0x10);
}
int main(void) { return 0; }
EOF
- if try ${CC} ${CFLAGS} ${vpclmulflag} $test.c; then
+ if try ${CC} ${CFLAGS} ${vpclmulflag} ${avx2flag} $test.c; then
echo "Checking for VPCLMULQDQ intrinsics ... Yes." | tee -a configure.log
HAVE_VPCLMULQDQ_INTRIN=1
else
@@ -1993,14 +1993,21 @@ case "${ARCH}" in
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512_vnni.lo"
fi
- if test $buildvpclmulqdq -eq 1 && test ${HAVE_PCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
+ if test $buildvpclmulqdq -eq 1 && test ${HAVE_PCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX2_INTRIN} -eq 1; then
check_vpclmulqdq_intrinsics
if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
- SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq.lo"
+ CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_AVX2"
+ SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_AVX2"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq_avx2.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq_avx2.lo"
+
+ if test ${HAVE_AVX512_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_AVX512"
+ SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_AVX512"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq_avx512.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq_avx512.lo"
+ fi
fi
fi
fi
diff --git a/functable.c b/functable.c
index 478051f40d..eec5d07f67 100644
--- a/functable.c
+++ b/functable.c
@@ -256,14 +256,24 @@ static int init_functable(void) {
ft.adler32_copy = &adler32_copy_avx512_vnni;
}
#endif
- // X86 - VPCLMULQDQ
-#ifdef X86_VPCLMULQDQ_CRC
-# ifndef X86_VPCLMULQDQ_NATIVE
+ // X86 - VPCLMULQDQ (AVX2)
+#ifdef X86_VPCLMULQDQ_AVX2
+# ifndef X86_VPCLMULQDQ_AVX2_NATIVE
+ if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq)
+# endif
+ {
+ ft.crc32 = &crc32_vpclmulqdq_avx2;
+ ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2;
+ }
+#endif
+ // X86 - VPCLMULQDQ (AVX-512)
+#ifdef X86_VPCLMULQDQ_AVX512
+# ifndef X86_VPCLMULQDQ_AVX512_NATIVE
if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
# endif
{
- ft.crc32 = &crc32_vpclmulqdq;
- ft.crc32_copy = &crc32_copy_vpclmulqdq;
+ ft.crc32 = &crc32_vpclmulqdq_avx512;
+ ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512;
}
#endif
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
index b30b1855fa..1e2cf88590 100644
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -112,8 +112,11 @@ BENCHMARK_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx);
#ifdef X86_PCLMULQDQ_CRC
BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq);
#endif
-#ifdef X86_VPCLMULQDQ_CRC
-BENCHMARK_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
+#ifdef X86_VPCLMULQDQ_AVX2
+BENCHMARK_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq));
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+BENCHMARK_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq));
#endif
#ifdef LOONGARCH_CRC
BENCHMARK_CRC32(loongarch64, crc32_loongarch64, test_cpu_features.loongarch.has_crc);
diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index 2d054dba49..eafa5aee44 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -167,8 +167,11 @@ BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
# ifdef X86_PCLMULQDQ_CRC
BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
# endif
-# ifdef X86_VPCLMULQDQ_CRC
- BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# ifdef X86_VPCLMULQDQ_AVX2
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+ BENCHMARK_CRC32_COPY(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
# endif
#endif
diff --git a/test/test_crc32.cc b/test/test_crc32.cc
index a44433193e..1fb0771119 100644
--- a/test/test_crc32.cc
+++ b/test/test_crc32.cc
@@ -123,8 +123,11 @@ TEST_CRC32(vx, crc32_s390_vx, test_cpu_features.s390.has_vx)
#ifdef X86_PCLMULQDQ_CRC
TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
#endif
-#ifdef X86_VPCLMULQDQ_CRC
-TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+#ifdef X86_VPCLMULQDQ_AVX2
+TEST_CRC32(vpclmulqdq_avx2, crc32_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+#endif
+#ifdef X86_VPCLMULQDQ_AVX512
+TEST_CRC32(vpclmulqdq_avx512, crc32_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
#endif
#ifndef WITHOUT_CHORBA_SSE
# ifdef X86_SSE2
diff --git a/test/test_crc32_copy.cc b/test/test_crc32_copy.cc
index d397448017..fa3059e544 100644
--- a/test/test_crc32_copy.cc
+++ b/test/test_crc32_copy.cc
@@ -79,8 +79,11 @@ TEST_CRC32_COPY(braid, crc32_copy_braid, 1)
# ifdef X86_PCLMULQDQ_CRC
TEST_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
# endif
-# ifdef X86_VPCLMULQDQ_CRC
- TEST_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+# ifdef X86_VPCLMULQDQ_AVX2
+ TEST_CRC32_COPY(vpclmulqdq_avx2, crc32_copy_vpclmulqdq_avx2, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx2 && test_cpu_features.x86.has_vpclmulqdq))
+# endif
+# ifdef X86_VPCLMULQDQ_AVX512
+ TEST_CRC32_COPY(vpclmulqdq_avx512, crc32_copy_vpclmulqdq_avx512, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
# endif
#endif