summary refs log tree commit diff
diff options
context:
space:
mode:
author Nathan Moinvaziri <nathan@nathanm.com> 2025-12-17 16:35:18 -0800
committer Hans Kristian Rosbach <hk-github@circlestorm.org> 2025-12-28 14:07:36 +0100
commit 0dfd7c0acbd463fc2d083756afb97a5e3d84e9ec (patch)
tree 35adba91174f976e7420dbc76ab15b41fa8c1453
parent 1a32c9977f6d220e178d9bf0856d713ceb0837dc (diff)
download Project-Tick-0dfd7c0acbd463fc2d083756afb97a5e3d84e9ec.tar.gz
Project-Tick-0dfd7c0acbd463fc2d083756afb97a5e3d84e9ec.zip
Add missing adler32_copy_ssse3 implementation
-rw-r--r-- arch/x86/adler32_ssse3.c 12
-rw-r--r-- arch/x86/x86_functions.h 3
-rw-r--r-- functable.c 1
-rw-r--r-- test/benchmarks/benchmark_adler32_copy.cc 4
4 files changed, 18 insertions, 2 deletions
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 7dab9b4973..7c1dc84c9b 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -14,7 +14,7 @@
#include <immintrin.h>
-Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+static inline uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
@@ -153,4 +153,14 @@ unaligned_jmp:
return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
}
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) { /* checksum-only entry point: forwards its arguments unchanged to adler32_impl */
+ return adler32_impl(adler, buf, len);
+}
+
+/* SSSE3 unaligned stores have a huge penalty, so we use memcpy.
+ * The checksum is computed over src by adler32_impl first; afterwards len
+ * bytes are copied from src to dst and the updated checksum is returned. */
+Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_impl(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+}
#endif
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 0f9aa1824c..1c197e849c 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -33,6 +33,7 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsign
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
@@ -110,6 +111,8 @@ uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, s
# if defined(X86_SSSE3) && defined(__SSSE3__)
# undef native_adler32
# define native_adler32 adler32_ssse3
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_ssse3
# undef native_chunkmemset_safe
# define native_chunkmemset_safe chunkmemset_safe_ssse3
# undef native_inflate_fast
diff --git a/functable.c b/functable.c
index 1085751916..758a335538 100644
--- a/functable.c
+++ b/functable.c
@@ -135,6 +135,7 @@ static int init_functable(void) {
#ifdef X86_SSSE3
if (cf.x86.has_ssse3) {
ft.adler32 = &adler32_ssse3;
+ ft.adler32_copy = &adler32_copy_ssse3;
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
ft.inflate_fast = &inflate_fast_ssse3;
}
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index 05b1f0fac4..c506fccd8a 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -108,7 +108,9 @@ BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, test_cpu_features.power.
//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
#endif
-
+#ifdef X86_SSSE3
+BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+#endif
#ifdef X86_SSE42
BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);