diff options
| author | Nathan Moinvaziri <nathan@nathanm.com> | 2026-01-13 08:43:26 -0800 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-01-17 20:37:25 +0100 |
| commit | 214254edce6fa5a6069db7b13daabe47ab953bc6 (patch) | |
| tree | 8f28e8ae9a28731e059d95f8946e397ff302a90e | |
| parent | 4254f67d194e1ecb28c92a33dc8bdf210fac8272 (diff) | |
| download | Project-Tick-214254edce6fa5a6069db7b13daabe47ab953bc6.tar.gz Project-Tick-214254edce6fa5a6069db7b13daabe47ab953bc6.zip | |
Use epi32 variants for older MSVC (v141/v140) to avoid cast warnings
| -rw-r--r-- | arch/x86/crc32_pclmulqdq_tpl.h | 35 |
1 files changed, 24 insertions, 11 deletions
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h index 2b51b252dd..f695d6d251 100644 --- a/arch/x86/crc32_pclmulqdq_tpl.h +++ b/arch/x86/crc32_pclmulqdq_tpl.h @@ -28,10 +28,23 @@ #include "crc32_braid_tbl.h" #include "x86_intrins.h" -#if defined(X86_VPCLMULQDQ) && defined(__AVX512VL__) -# define mm_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, (uint8_t)0x96) -# define mm512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, (uint8_t)0x96) -#else +#ifdef X86_VPCLMULQDQ +# if defined(_MSC_VER) && _MSC_VER < 1920 + /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */ +# define mm512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96) +# define mm512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) +# define mm512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) +# else +# define mm512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96) +# define mm512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm) +# define mm512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm) +# endif +# ifdef __AVX512VL__ +# define mm_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96) +# endif +#endif + +#ifndef mm_xor3_epi64 # define mm_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c) #endif @@ -210,9 +223,9 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const // Fold existing xmm state into first 64 bytes zmm_t0 = _mm512_castsi128_si512(xmm_crc0); - zmm_t0 = _mm512_inserti64x2(zmm_t0, xmm_crc1, 1); - zmm_t0 = _mm512_inserti64x2(zmm_t0, xmm_crc2, 2); - zmm_t0 = _mm512_inserti64x2(zmm_t0, xmm_crc3, 3); + zmm_t0 = mm512_inserti64x2(zmm_t0, xmm_crc1, 1); + zmm_t0 = mm512_inserti64x2(zmm_t0, xmm_crc2, 2); + zmm_t0 = mm512_inserti64x2(zmm_t0, xmm_crc3, 3); z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); @@ -250,10 +263,10 @@ Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const zmm_crc0 = mm512_xor3_epi64(z_low0, z_high0, zmm_crc3); // zmm_crc0 -> xmm_crc[0, 1, 2, 3] - xmm_crc0 = _mm512_extracti64x2_epi64(zmm_crc0, 0); - xmm_crc1 = _mm512_extracti64x2_epi64(zmm_crc0, 1); - xmm_crc2 = _mm512_extracti64x2_epi64(zmm_crc0, 2); - xmm_crc3 = _mm512_extracti64x2_epi64(zmm_crc0, 3); + xmm_crc0 = mm512_extracti64x2(zmm_crc0, 0); + xmm_crc1 = mm512_extracti64x2(zmm_crc0, 1); + xmm_crc2 = mm512_extracti64x2(zmm_crc0, 2); + xmm_crc3 = mm512_extracti64x2(zmm_crc0, 3); } #else /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 |
