summaryrefslogtreecommitdiff
path: root/neozip/arch/power/adler32_vmx.c
diff options
context:
space:
mode:
authorMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
committerMehmet Samet Duman <yongdohyun@projecttick.org>2026-04-02 19:56:09 +0300
commit7fb132859fda54aa96bc9dd46d302b343eeb5a02 (patch)
treeb43ae77d7451fb470a260c03349a1caf2846c5e5 /neozip/arch/power/adler32_vmx.c
parentb1e34e861b5d732afe828d58aad2c638135061fd (diff)
parentc2712b8a345191f6ed79558c089777df94590087 (diff)
downloadProject-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.tar.gz
Project-Tick-7fb132859fda54aa96bc9dd46d302b343eeb5a02.zip
Add 'neozip/' from commit 'c2712b8a345191f6ed79558c089777df94590087'
git-subtree-dir: neozip git-subtree-mainline: b1e34e861b5d732afe828d58aad2c638135061fd git-subtree-split: c2712b8a345191f6ed79558c089777df94590087
Diffstat (limited to 'neozip/arch/power/adler32_vmx.c')
-rw-r--r--neozip/arch/power/adler32_vmx.c168
1 files changed, 168 insertions, 0 deletions
diff --git a/neozip/arch/power/adler32_vmx.c b/neozip/arch/power/adler32_vmx.c
new file mode 100644
index 0000000000..5171bab35b
--- /dev/null
+++ b/neozip/arch/power/adler32_vmx.c
@@ -0,0 +1,168 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "adler32_p.h"
+
+#include <altivec.h>
+
+#define vmx_zero() (vec_splat_u32(0))
+
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+ /* Different taps for the separable components of sums */
+ const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+ const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+ const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+ const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+ /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+ * a 2 element vector from a single load + a subsequent shift is just barely faster
+ * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+ const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+ const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+ vector unsigned int adacc, s2acc;
+ vector unsigned int pair_vec = vec_ld(0, s);
+ adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+#if BYTE_ORDER == LITTLE_ENDIAN
+ s2acc = vec_sro(pair_vec, shift_vec);
+#else
+ s2acc = vec_slo(pair_vec, shift_vec);
+#endif
+
+ vector unsigned int zero = vmx_zero();
+ vector unsigned int s3acc = zero;
+ vector unsigned int s3acc_0 = zero;
+ vector unsigned int adacc_prev = adacc;
+ vector unsigned int adacc_prev_0 = zero;
+
+ vector unsigned int s2acc_0 = zero;
+ vector unsigned int s2acc_1 = zero;
+ vector unsigned int s2acc_2 = zero;
+
+ /* Maintain a running sum of a second half, this might help use break yet another
+ * data dependency bubble in the sum */
+ vector unsigned int adacc_0 = zero;
+
+ int num_iter = len / 4;
+ int rem = len & 3;
+
+ for (int i = 0; i < num_iter; ++i) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ vector unsigned char d1 = vec_ld(16, buf);
+ vector unsigned char d2 = vec_ld(32, buf);
+ vector unsigned char d3 = vec_ld(48, buf);
+
+ /* The core operation of the loop, basically
+ * what is being unrolled below */
+ adacc = vec_sum4s(d0, adacc);
+ s3acc = vec_add(s3acc, adacc_prev);
+ s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+ s2acc = vec_msum(t0, d0, s2acc);
+
+ /* interleave dependent sums in here */
+ adacc_0 = vec_sum4s(d1, adacc_0);
+ s2acc_0 = vec_msum(t1, d1, s2acc_0);
+ adacc = vec_sum4s(d2, adacc);
+ s2acc_1 = vec_msum(t2, d2, s2acc_1);
+ s2acc_2 = vec_msum(t3, d3, s2acc_2);
+ adacc_0 = vec_sum4s(d3, adacc_0);
+
+ adacc_prev = adacc;
+ adacc_prev_0 = adacc_0;
+ buf += 64;
+ }
+
+ adacc = vec_add(adacc, adacc_0);
+ s3acc = vec_add(s3acc, s3acc_0);
+ s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+ if (rem) {
+ adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+ adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+ while (rem--) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ adacc = vec_sum4s(d0, adacc);
+ s3acc = vec_add(s3acc, adacc_prev);
+ s2acc = vec_msum(t3, d0, s2acc);
+ adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+ buf += 16;
+ }
+ }
+
+
+ /* Sum up independent second sums */
+ s2acc = vec_add(s2acc, s2acc_0);
+ s2acc_2 = vec_add(s2acc_1, s2acc_2);
+ s2acc = vec_add(s2acc, s2acc_2);
+
+ s2acc = vec_add(s2acc, s3acc);
+
+ adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+ s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+ adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+ s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+ vec_ste(adacc, 0, s);
+ vec_ste(s2acc, 0, s+1);
+}
+
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+ /* Split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_copy_tail(adler, NULL, buf, 1, sum2, 1, 1, 0);
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
+
+ uint32_t pair[4] ALIGNED_(16);
+ pair[0] = adler;
+ pair[1] = sum2;
+ pair[2] = 0;
+ pair[3] = 0;
+
+ // Align buffer
+ size_t align_diff = MIN(ALIGN_DIFF(buf, 16), len);
+ size_t n = NMAX;
+ if (align_diff) {
+ adler32_copy_align(&pair[0], NULL, buf, align_diff, &pair[1], 15, 0);
+
+ buf += align_diff;
+ len -= align_diff;
+ n -= align_diff;
+ }
+
+ while (len >= 16) {
+ n = MIN(len, n);
+
+ vmx_accum32(pair, buf, n / 16);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ size_t k = (n / 16) * 16;
+ buf += k;
+ len -= k;
+ n = NMAX;
+ }
+
+ /* Process tail (len < 16). */
+ return adler32_copy_tail(pair[0], NULL, buf, len, pair[1], len != 0 || align_diff, 15, 0);
+}
+
+/* VMX stores can have higher latency than optimized memcpy */
+Z_INTERNAL uint32_t adler32_copy_vmx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ adler = adler32_vmx(adler, src, len);
+ memcpy(dst, src, len);
+ return adler;
+}
+#endif