author     Hans Kristian Rosbach <hk-git@circlestorm.org>     2025-11-11 20:23:24 +0100
committer  Hans Kristian Rosbach <hk-github@circlestorm.org>  2025-11-13 23:17:07 +0100
commit     8003f57828f7310aaa035519bfa17c93b5621977 (patch)
tree       df66259bbee3156c56cb24559a0e4b997aeda059
parent     29cf6242ebb56810c3516e3ce5b4ee8f27ad9b07 (diff)
download   Project-Tick-8003f57828f7310aaa035519bfa17c93b5621977.tar.gz
           Project-Tick-8003f57828f7310aaa035519bfa17c93b5621977.zip
Reorganize Chorba activation.
Now WITHOUT_CHORBA only disables the generic crc32_chorba C fallback. The SSE2, SSE4.1 and PCLMUL variants can still use their Chorba-based code, but the SSE2 and SSE4.1 variants lose their fallback to the generic crc32_chorba C code, reducing their performance on very large input buffers (this path is not used during deflate/inflate, only when calling crc32 directly).

Remove the crc32_c function (and its file crc32_c.c); instead use the normal functable routing to select between crc32_braid and crc32_chorba.

Disable the SSE2 and SSE4.1 variants of Chorba crc32 on MSVC older than 2022, due to a code generation bug in MSVC 2019 that causes segfaults.

Compile either crc32_chorba_small_nondestructive or crc32_chorba_small_nondestructive_32bit, not both.

Don't compile crc32_chorba_32768_nondestructive on 32-bit architectures.
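For readers skimming the diff, here is a minimal sketch (not part of this commit) of the generic-path routing described above. crc32_braid() and crc32_chorba() are the real entry points touched by this change; select_generic_crc32() is a hypothetical helper standing in for the functable / native_crc32 selection:

    /* Illustrative sketch only, not part of this commit: how the generic-path
     * selection described above behaves after the change. crc32_braid() and
     * crc32_chorba() are the real entry points from this diff; the helper
     * select_generic_crc32() is hypothetical and stands in for the
     * functable / native_crc32 routing. */
    #include <stddef.h>
    #include <stdint.h>

    uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);
    #ifndef WITHOUT_CHORBA
    uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
    #endif

    typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);

    static crc32_func select_generic_crc32(void) {
    #ifndef WITHOUT_CHORBA
        /* Chorba C fallback enabled: preferred generic implementation. */
        return &crc32_chorba;
    #else
        /* WITHOUT_CHORBA now removes only this C fallback; braid remains. */
        return &crc32_braid;
    #endif
    }

The arch-specific SSE2/SSE4.1/PCLMUL Chorba variants layer on top of this and are now gated by WITHOUT_CHORBA_SSE instead.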
-rw-r--r--  CMakeLists.txt                          6
-rw-r--r--  Makefile.in                             2
-rw-r--r--  arch/generic/Makefile.in                7
-rw-r--r--  arch/generic/crc32_c.c                 42
-rw-r--r--  arch/generic/crc32_chorba_c.c          38
-rw-r--r--  arch/generic/generic_functions.h        8
-rw-r--r--  arch/riscv/crc32_zbc.c                  7
-rw-r--r--  arch/s390/crc32-vx.c                    6
-rw-r--r--  arch/x86/chorba_sse2.c                 12
-rw-r--r--  arch/x86/chorba_sse41.c                13
-rw-r--r--  arch/x86/crc32_fold_pclmulqdq_tpl.h   384
-rw-r--r--  arch/x86/crc32_pclmulqdq_tpl.h          5
-rw-r--r--  arch/x86/x86_functions.h               17
-rw-r--r--  crc32.h                                 2
-rw-r--r--  functable.c                            15
-rw-r--r--  test/benchmarks/benchmark_crc32.cc     18
-rw-r--r--  test/test_crc32.cc                     21
17 files changed, 291 insertions, 312 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbe245a43f..df83113386 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,10 @@ elseif(MSVC)
if(MSVC_VERSION VERSION_LESS 1800)
message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).")
endif()
+ if(MSVC_VERSION VERSION_LESS 1930)
+ message(STATUS "Old Visual Studio compiler version, disabling SSE2/SSE4.1 Chorba variants (requires 2022 or later).")
+ add_definitions(-DWITHOUT_CHORBA_SSE)
+ endif()
# TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination
# (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should
# avoid mistakes.
@@ -1312,7 +1316,6 @@ set(ZLIB_ALL_FALLBACK_SRCS
arch/generic/chunkset_c.c
arch/generic/compare256_c.c
arch/generic/crc32_braid_c.c
- arch/generic/crc32_c.c
arch/generic/crc32_fold_c.c
arch/generic/slide_hash_c.c
)
@@ -1326,7 +1329,6 @@ elseif(${ARCH} STREQUAL "x86_64" AND WITH_SSE2)
arch/generic/adler32_c.c
arch/generic/adler32_fold_c.c
arch/generic/crc32_braid_c.c
- arch/generic/crc32_c.c
arch/generic/crc32_fold_c.c
)
diff --git a/Makefile.in b/Makefile.in
index fbf1e3f4c8..1785e5f1f8 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -80,7 +80,6 @@ OBJZ = \
arch/generic/chunkset_c.o \
arch/generic/compare256_c.o \
arch/generic/crc32_braid_c.o \
- arch/generic/crc32_c.o \
arch/generic/crc32_fold_c.o \
arch/generic/slide_hash_c.o \
adler32.o \
@@ -122,7 +121,6 @@ PIC_OBJZ = \
arch/generic/chunkset_c.lo \
arch/generic/compare256_c.lo \
arch/generic/crc32_braid_c.lo \
- arch/generic/crc32_c.lo \
arch/generic/crc32_fold_c.lo \
arch/generic/slide_hash_c.lo \
adler32.lo \
diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in
index 6040083f66..ba20e9e5fb 100644
--- a/arch/generic/Makefile.in
+++ b/arch/generic/Makefile.in
@@ -18,7 +18,6 @@ all: \
chunkset_c.o chunkset_c.lo \
compare256_c.o compare256_c.lo \
crc32_braid_c.o crc32_braid_c.lo \
- crc32_c.o crc32_c.lo \
crc32_chorba_c.o crc32_chorba_c.lo \
crc32_fold_c.o crc32_fold_c.lo \
slide_hash_c.o slide_hash_c.lo
@@ -54,12 +53,6 @@ crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_b
crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
-crc32_c.o: $(SRCDIR)/crc32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h
- $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_c.c
-
-crc32_c.lo: $(SRCDIR)/crc32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h
- $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_c.c
-
crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c
diff --git a/arch/generic/crc32_c.c b/arch/generic/crc32_c.c
deleted file mode 100644
index e7394a8c93..0000000000
--- a/arch/generic/crc32_c.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include "zbuild.h"
-#include "crc32.h"
-#include "crc32_braid_p.h"
-#include "generic_functions.h"
-
-Z_INTERNAL uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len) {
- uint32_t c = (~crc) & 0xffffffff;
-
-#ifndef WITHOUT_CHORBA
- uint64_t* aligned_buf;
- size_t aligned_len;
- unsigned long algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 0xF)) & 0xF;
- if (algn_diff < len) {
- if (algn_diff) {
- c = crc32_braid_internal(c, buf, algn_diff);
- }
- aligned_buf = (uint64_t*) (buf + algn_diff);
- aligned_len = len - algn_diff;
- if(aligned_len > CHORBA_LARGE_THRESHOLD)
- c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
-# if OPTIMAL_CMP == 64
- else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
- c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
- else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT)
- c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
-# else
- else if (aligned_len > CHORBA_SMALL_THRESHOLD_32BIT)
- c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, aligned_len);
-# endif
- else
- c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
- }
- else {
- c = crc32_braid_internal(c, buf, len);
- }
-#else
- c = crc32_braid_internal(c, buf, len);
-#endif /* WITHOUT_CHORBA */
-
- /* Return the CRC, post-conditioned. */
- return c ^ 0xffffffff;
-}
diff --git a/arch/generic/crc32_chorba_c.c b/arch/generic/crc32_chorba_c.c
index 76b050f295..4041abd46e 100644
--- a/arch/generic/crc32_chorba_c.c
+++ b/arch/generic/crc32_chorba_c.c
@@ -495,6 +495,7 @@ Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_wo
return crc;
}
+# if OPTIMAL_CMP == 64
/* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */
Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len) {
const uint64_t* input = buf;
@@ -1230,6 +1231,8 @@ Z_INTERNAL uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint6
return crc;
}
+#else // OPTIMAL_CMP == 64
+
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const uint32_t* buf, size_t len) {
const uint32_t* input = buf;
uint32_t final[20] = {0};
@@ -1442,3 +1445,38 @@ Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const
return crc;
}
+#endif // OPTIMAL_CMP == 64
+
+Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) {
+ uint32_t c = (~crc) & 0xffffffff;
+
+ uint64_t* aligned_buf;
+ size_t aligned_len;
+ unsigned long algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 0xF)) & 0xF;
+ if (algn_diff < len) {
+ if (algn_diff) {
+ c = crc32_braid_internal(c, buf, algn_diff);
+ }
+ aligned_buf = (uint64_t*) (buf + algn_diff);
+ aligned_len = len - algn_diff;
+ if(aligned_len > CHORBA_LARGE_THRESHOLD)
+ c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
+# if OPTIMAL_CMP == 64
+ else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
+ c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
+ else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT)
+ c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
+# else
+ else if (aligned_len > CHORBA_SMALL_THRESHOLD_32BIT)
+ c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, aligned_len);
+# endif
+ else
+ c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
+ }
+ else {
+ c = crc32_braid_internal(c, buf, len);
+ }
+
+ /* Return the CRC, post-conditioned. */
+ return c ^ 0xffffffff;
+}
diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h
index 21358f0691..cb92bd3738 100644
--- a/arch/generic/generic_functions.h
+++ b/arch/generic/generic_functions.h
@@ -22,11 +22,11 @@ uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
-uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len);
uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);
#ifndef WITHOUT_CHORBA
+ uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_word_t* input, size_t len);
uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
@@ -50,7 +50,11 @@ void slide_hash_c(deflate_state *s);
# define native_adler32 adler32_c
# define native_adler32_fold_copy adler32_fold_copy_c
# define native_chunkmemset_safe chunkmemset_safe_c
-# define native_crc32 crc32_c
+#ifndef WITHOUT_CHORBA
+# define native_crc32 crc32_chorba
+#else
+# define native_crc32 crc32_braid
+#endif
# define native_crc32_fold crc32_fold_c
# define native_crc32_fold_copy crc32_fold_copy_c
# define native_crc32_fold_final crc32_fold_final_c
diff --git a/arch/riscv/crc32_zbc.c b/arch/riscv/crc32_zbc.c
index d5dc71cc9b..e3f3c71649 100644
--- a/arch/riscv/crc32_zbc.c
+++ b/arch/riscv/crc32_zbc.c
@@ -6,13 +6,12 @@
#if defined(RISCV_CRC32_ZBC)
#include "zbuild.h"
+#include "arch_functions.h"
#include <stdint.h>
#define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul
#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul
-extern uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len);
-
#define CONSTANT_R3 0x1751997d0ULL
#define CONSTANT_R4 0x0ccaa009eULL
#define CONSTANT_R5 0x163cd6124ULL
@@ -84,12 +83,12 @@ finish_fold:
Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf,
size_t len) {
if (len < CLMUL_MIN_LEN) {
- return crc32_c(crc, buf, len);
+ return crc32_braid(crc, buf, len);
}
uint64_t unaligned_length = len % CLMUL_CHUNK_LEN;
if (unaligned_length) {
- crc = crc32_c(crc, buf, unaligned_length);
+ crc = crc32_braid(crc, buf, unaligned_length);
buf += unaligned_length;
len -= unaligned_length;
}
diff --git a/arch/s390/crc32-vx.c b/arch/s390/crc32-vx.c
index d8fcf79cb2..155eee4946 100644
--- a/arch/s390/crc32-vx.c
+++ b/arch/s390/crc32-vx.c
@@ -202,12 +202,12 @@ uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t
size_t prealign, aligned, remaining;
if (len < VX_MIN_LEN + VX_ALIGN_MASK)
- return crc32_c(crc, buf, len);
+ return crc32_braid(crc, buf, len);
if ((uintptr_t)buf & VX_ALIGN_MASK) {
prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
len -= prealign;
- crc = crc32_c(crc, buf, prealign);
+ crc = crc32_braid(crc, buf, prealign);
buf += prealign;
}
aligned = len & ~VX_ALIGN_MASK;
@@ -216,7 +216,7 @@ uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
if (remaining)
- crc = crc32_c(crc, buf + aligned, remaining);
+ crc = crc32_braid(crc, buf + aligned, remaining);
return crc;
}
diff --git a/arch/x86/chorba_sse2.c b/arch/x86/chorba_sse2.c
index ac98e994c6..3e25d7586b 100644
--- a/arch/x86/chorba_sse2.c
+++ b/arch/x86/chorba_sse2.c
@@ -1,4 +1,4 @@
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2)
+#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE)
#include "zbuild.h"
#include "crc32_braid_p.h"
@@ -6,10 +6,7 @@
#include "crc32.h"
#include <emmintrin.h>
#include "arch/x86/x86_intrins.h"
-#include "arch/generic/generic_functions.h"
-#include <assert.h>
-
-uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
+#include "arch_functions.h"
#define READ_NEXT(in, off, a, b) do { \
a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
@@ -862,9 +859,12 @@ Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t l
}
aligned_buf = (uint64_t*) (buf + algn_diff);
aligned_len = len - algn_diff;
+#if !defined(WITHOUT_CHORBA)
if(aligned_len > CHORBA_LARGE_THRESHOLD) {
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
- } else if (aligned_len > 72) {
+ } else
+#endif
+ if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
} else {
c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
diff --git a/arch/x86/chorba_sse41.c b/arch/x86/chorba_sse41.c
index 53d6e156c4..aebede45e2 100644
--- a/arch/x86/chorba_sse41.c
+++ b/arch/x86/chorba_sse41.c
@@ -1,4 +1,4 @@
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
#include "zbuild.h"
#include "crc32_braid_p.h"
@@ -7,11 +7,7 @@
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
-#include "arch/generic/generic_functions.h"
-#include <assert.h>
-
-uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
-uint32_t chorba_small_nondestructive_sse2(uint32_t c, const uint64_t *aligned_buf, size_t aligned_len);
+#include "arch_functions.h"
#define READ_NEXT(in, off, a, b) do { \
a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
@@ -321,9 +317,12 @@ Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t
}
aligned_buf = (uint64_t*) (buf + algn_diff);
aligned_len = len - algn_diff;
+#if !defined(WITHOUT_CHORBA)
if(aligned_len > CHORBA_LARGE_THRESHOLD) {
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
- } else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
+ } else
+#endif
+ if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
} else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
diff --git a/arch/x86/crc32_fold_pclmulqdq_tpl.h b/arch/x86/crc32_fold_pclmulqdq_tpl.h
index 4e5b11bf99..f4c924903d 100644
--- a/arch/x86/crc32_fold_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_fold_pclmulqdq_tpl.h
@@ -105,225 +105,223 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
}
#endif
-#ifndef WITHOUT_CHORBA
- /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
- * We interleave the PCLMUL-base folds with 8x scaled generator
- * polynomial copies; we read 8x QWORDS and then XOR them into
- * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
- * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
- * as "generator_64_bits_unrolled_8" */
- while (len >= 512 + 64 + 16*8) {
- __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
- __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
- __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
- __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
- __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
- __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
- __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
- __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
+ /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
+ * We interleave the PCLMUL-base folds with 8x scaled generator
+ * polynomial copies; we read 8x QWORDS and then XOR them into
+ * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
+ * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
+ * as "generator_64_bits_unrolled_8" */
+ while (len >= 512 + 64 + 16*8) {
+ __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
+ __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
+ __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
+ __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
+ __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
+ __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
+ __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
+ __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, chorba8);
- _mm_storeu_si128((__m128i *)dst + 1, chorba7);
- _mm_storeu_si128((__m128i *)dst + 2, chorba6);
- _mm_storeu_si128((__m128i *)dst + 3, chorba5);
- _mm_storeu_si128((__m128i *)dst + 4, chorba4);
- _mm_storeu_si128((__m128i *)dst + 5, chorba3);
- _mm_storeu_si128((__m128i *)dst + 6, chorba2);
- _mm_storeu_si128((__m128i *)dst + 7, chorba1);
- dst += 16*8;
+ _mm_storeu_si128((__m128i *)dst, chorba8);
+ _mm_storeu_si128((__m128i *)dst + 1, chorba7);
+ _mm_storeu_si128((__m128i *)dst + 2, chorba6);
+ _mm_storeu_si128((__m128i *)dst + 3, chorba5);
+ _mm_storeu_si128((__m128i *)dst + 4, chorba4);
+ _mm_storeu_si128((__m128i *)dst + 5, chorba3);
+ _mm_storeu_si128((__m128i *)dst + 6, chorba2);
+ _mm_storeu_si128((__m128i *)dst + 7, chorba1);
+ dst += 16*8;
#else
- XOR_INITIAL128(chorba8);
+ XOR_INITIAL128(chorba8);
#endif
- chorba2 = _mm_xor_si128(chorba2, chorba8);
- chorba1 = _mm_xor_si128(chorba1, chorba7);
- src += 16*8;
- len -= 16*8;
+ chorba2 = _mm_xor_si128(chorba2, chorba8);
+ chorba1 = _mm_xor_si128(chorba1, chorba7);
+ src += 16*8;
+ len -= 16*8;
- xmm_t0 = _mm_loadu_si128((__m128i *)src);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
+ xmm_t0 = _mm_loadu_si128((__m128i *)src);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
- fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
- xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
- xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
+ xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
+ xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
- xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- len -= 512;
- src += 512;
- }
-#endif /* WITHOUT_CHORBA */
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
+ xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ len -= 512;
+ src += 512;
+ }
while (len >= 64) {
len -= 64;
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index 933733af27..c6c4c8f8b3 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -22,9 +22,6 @@
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
-#ifdef X86_VPCLMULQDQ
-# include <immintrin.h>
-#endif
#include "crc32.h"
#include "crc32_braid_p.h"
@@ -168,7 +165,6 @@ static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m1
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
-#ifndef WITHOUT_CHORBA
static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
@@ -210,7 +206,6 @@ static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
-#endif
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 918b7e0f67..5d9065e1b3 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -24,9 +24,10 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsign
void slide_hash_sse2(deflate_state *s);
# endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
-# if !defined(WITHOUT_CHORBA)
+# if !defined(WITHOUT_CHORBA_SSE)
uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len);
-# endif
+ uint32_t chorba_small_nondestructive_sse2(uint32_t c, const uint64_t *aligned_buf, size_t aligned_len);
+# endif
#endif
#ifdef X86_SSSE3
@@ -35,10 +36,8 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
-#ifdef X86_SSE41
-# if !defined(WITHOUT_CHORBA)
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
-# endif
#endif
#ifdef X86_SSE42
@@ -105,9 +104,9 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
-# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
-# undef native_crc32
-# define native_crc32 crc32_chorba_sse2
+# if !defined(WITHOUT_CHORBA_SSE)
+# undef native_crc32
+# define native_crc32 crc32_chorba_sse2
# endif
# endif
# endif
@@ -121,7 +120,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# define native_inflate_fast inflate_fast_ssse3
# endif
// X86 - SSE4.1
-# if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
+# if defined(X86_SSE41) && defined(__SSE4_1__) && !defined(WITHOUT_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse41
# endif
diff --git a/crc32.h b/crc32.h
index 4c1eacaea6..e26b59e520 100644
--- a/crc32.h
+++ b/crc32.h
@@ -15,8 +15,6 @@
#define CHORBA_SMALL_THRESHOLD_64BIT 72
#define CHORBA_SMALL_THRESHOLD_32BIT 80
-Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);
-
typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
diff --git a/functable.c b/functable.c
index f8a122d8da..8924f7351f 100644
--- a/functable.c
+++ b/functable.c
@@ -80,7 +80,7 @@ static int init_functable(void) {
// x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available.
ft.adler32 = &adler32_c;
ft.adler32_fold_copy = &adler32_fold_copy_c;
- ft.crc32 = &crc32_c;
+ ft.crc32 = &crc32_braid;
ft.crc32_fold = &crc32_fold_c;
ft.crc32_fold_copy = &crc32_fold_copy_c;
ft.crc32_fold_final = &crc32_fold_final_c;
@@ -95,7 +95,7 @@ static int init_functable(void) {
ft.adler32 = &adler32_c;
ft.adler32_fold_copy = &adler32_fold_copy_c;
ft.chunkmemset_safe = &chunkmemset_safe_c;
- ft.crc32 = &crc32_c;
+ ft.crc32 = &crc32_braid;
ft.crc32_fold = &crc32_fold_c;
ft.crc32_fold_copy = &crc32_fold_copy_c;
ft.crc32_fold_final = &crc32_fold_final_c;
@@ -110,6 +110,11 @@ static int init_functable(void) {
// Select arch-optimized functions
#ifdef WITH_OPTIM
+ // Chorba generic C fallback
+#ifndef WITHOUT_CHORBA
+ ft.crc32 = &crc32_chorba;
+#endif
+
// X86 - SSE2
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64)
@@ -117,7 +122,7 @@ static int init_functable(void) {
# endif
{
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
-# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
+# if !defined(WITHOUT_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse2;
# endif
ft.inflate_fast = &inflate_fast_sse2;
@@ -139,11 +144,9 @@ static int init_functable(void) {
#endif
// X86 - SSE4.1
-#ifdef X86_SSE41
+#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
if (cf.x86.has_sse41) {
-#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse41;
-#endif
}
#endif
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
index 1e95b27770..3b00f87d72 100644
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -56,12 +56,6 @@ public:
} \
BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
-#ifndef WITHOUT_CHORBA
-BENCHMARK_CRC32(generic_chorba, crc32_c, 1);
-#else
-BENCHMARK_CRC32(generic, crc32_c, 1);
-#endif
-
BENCHMARK_CRC32(braid, crc32_braid, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -69,14 +63,16 @@ BENCHMARK_CRC32(native, native_crc32, 1);
#else
#ifndef WITHOUT_CHORBA
-# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
+BENCHMARK_CRC32(chorba_c, crc32_chorba, 1);
+#endif
+#ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
-# if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
- BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
-# endif
+# endif
+# ifdef X86_SSE41
+ BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
#endif
-
#ifdef ARM_CRC32
BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32);
#endif
diff --git a/test/test_crc32.cc b/test/test_crc32.cc
index d44d079e97..ca0767d468 100644
--- a/test/test_crc32.cc
+++ b/test/test_crc32.cc
@@ -269,12 +269,6 @@ INSTANTIATE_TEST_SUITE_P(crc32, crc32_variant, testing::ValuesIn(tests));
hash(func); \
}
-#ifndef WITHOUT_CHORBA
-TEST_CRC32(generic_chorba, crc32_c, 1)
-#else
-TEST_CRC32(generic, crc32_c, 1)
-#endif
-
TEST_CRC32(braid, crc32_braid, 1)
#ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -297,6 +291,9 @@ static const int align_offsets[] = {
}
#endif
+#ifndef WITHOUT_CHORBA
+TEST_CRC32(chorba_c, crc32_chorba, 1)
+#endif
#ifdef ARM_CRC32
INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets));
TEST_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32)
@@ -317,11 +314,13 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
#ifdef X86_VPCLMULQDQ_CRC
TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
#endif
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
-TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
-#endif
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
-TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
+#ifndef WITHOUT_CHORBA_SSE
+# ifdef X86_SSE2
+ TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
+# endif
+# ifdef X86_SSE41
+ TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
+# endif
#endif
#if defined(LOONGARCH_CRC)
INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets));