summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nathan Moinvaziri <nathan@nathanm.com> 2025-12-26 08:56:41 -0800
committer Hans Kristian Rosbach <hk-github@circlestorm.org> 2025-12-28 14:07:36 +0100
commit 1a32c9977f6d220e178d9bf0856d713ceb0837dc (patch)
tree ae4a4fc919592df0a86772f0214b16daf2cca781
parent a8a12f465758accea6a9dd37d6979c014a42a7c6 (diff)
download Project-Tick-1a32c9977f6d220e178d9bf0856d713ceb0837dc.tar.gz
Project-Tick-1a32c9977f6d220e178d9bf0856d713ceb0837dc.zip
Add missing adler32_copy_vmx implementation
-rw-r--r-- arch/power/adler32_vmx.c 13
-rw-r--r-- arch/power/power_functions.h 3
-rw-r--r-- functable.c 1
-rw-r--r-- test/benchmarks/benchmark_adler32_copy.cc 3
4 files changed, 17 insertions, 3 deletions
diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 9ab53e1fa0..004d3fce68 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -118,7 +118,7 @@ static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
vec_ste(s2acc, 0, s+1);
}
-Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+static inline uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
@@ -183,4 +183,15 @@ Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len)
/* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
return (pair[1] << 16) | pair[0];
}
+
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) { /* non-static entry point declared in power_functions.h; forwards its arguments unchanged to the file-local adler32_impl */
+ return adler32_impl(adler, buf, len);
+}
+
+/* Checksum len bytes of src with adler32_impl, then copy them to dst with plain memcpy: VMX stores can have higher latency than optimized memcpy */
+Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_impl(adler, src, len); /* checksum is taken over the source bytes, before the copy */
+ memcpy(dst, src, len); /* NOTE(review): memcpy requires dst and src not to overlap -- confirm callers guarantee this */
+ return adler; /* value produced by adler32_impl for (adler, src, len) */
+}
#endif
diff --git a/arch/power/power_functions.h b/arch/power/power_functions.h
index fe3a64821a..7697073afb 100644
--- a/arch/power/power_functions.h
+++ b/arch/power/power_functions.h
@@ -9,6 +9,7 @@
#ifdef PPC_VMX
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
void slide_hash_vmx(deflate_state *s);
#endif
@@ -33,6 +34,8 @@ uint32_t longest_match_slow_power9(deflate_state *const s, uint32_t cur_match);
# if defined(PPC_VMX) && defined(__ALTIVEC__)
# undef native_adler32
# define native_adler32 adler32_vmx
+# undef native_adler32_copy
+# define native_adler32_copy adler32_copy_vmx
# undef native_slide_hash
# define native_slide_hash slide_hash_vmx
# endif
diff --git a/functable.c b/functable.c
index e33d8ef3aa..1085751916 100644
--- a/functable.c
+++ b/functable.c
@@ -249,6 +249,7 @@ static int init_functable(void) {
#ifdef PPC_VMX
if (cf.power.has_altivec) {
ft.adler32 = &adler32_vmx;
+ ft.adler32_copy = &adler32_copy_vmx;
ft.slide_hash = &slide_hash_vmx;
}
#endif
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index 58c3b4a01e..05b1f0fac4 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -97,8 +97,7 @@ BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_featu
#endif
#ifdef PPC_VMX
-//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_copy_vmx, test_cpu_features.power.has_altivec);
-BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, test_cpu_features.power.has_altivec);
+BENCHMARK_ADLER32_COPY(vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef POWER8_VSX
//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);