summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCameron Cawley <ccawley2011@gmail.com>2025-02-08 20:36:58 +0000
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-05 17:04:25 +0100
commitd8136aea2a074c950b91f6c609c43a55a7990056 (patch)
tree8d7f5723922db7524fc5bdb41db87b2062c7c316
parent53abd150bca334b424c966def67f0c06258aaec4 (diff)
downloadProject-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.tar.gz
Project-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.zip
Make use of NEON alignment hints
-rw-r--r--arch/arm/adler32_neon.c47
-rw-r--r--arch/arm/chunkset_neon.c6
-rw-r--r--arch/arm/crc32_armv8_pmull_eor3.c44
-rw-r--r--arch/arm/neon_intrins.h12
-rw-r--r--arch/arm/slide_hash_neon.c8
5 files changed, 65 insertions, 52 deletions
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index cbb1c784ef..a55c8c1353 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -45,10 +45,10 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
- uint8x16_t d0 = vld1q_u8(buf);
- uint8x16_t d1 = vld1q_u8(buf + 16);
- uint8x16_t d2 = vld1q_u8(buf + 32);
- uint8x16_t d3 = vld1q_u8(buf + 48);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
vst1q_u8(dst, d0);
vst1q_u8(dst + 16, d1);
@@ -93,7 +93,7 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
if (rem) {
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
- uint8x16_t d0 = vld1q_u8(buf);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
vst1q_u8(dst, d0);
dst += 16;
uint16x8_t adler;
@@ -110,8 +110,8 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
s3acc = vaddq_u32(s3acc_0, s3acc);
}
- uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
- uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -169,7 +169,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
- uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
 * bit instruction, we'll have to make do summing to 16 bits first */
@@ -208,7 +208,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
if (rem) {
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
- uint8x16_t d0 = vld1q_u8(buf);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
uint16x8_t adler;
adler = vpaddlq_u8(d0);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
@@ -223,8 +223,8 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
s3acc = vaddq_u32(s3acc_0, s3acc);
}
- uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
- uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -285,21 +285,22 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
/* If memory is not SIMD aligned, do scalar sums to an aligned
* offset, provided that doing so doesn't completely eliminate
* SIMD operation. Aligned loads are still faster on ARM, even
- * though there's no explicit aligned load instruction. Note:
- * on Android and iOS, their ABIs specify stricter alignment
- * requirements for the 2,3,4x register ld1 variants. Clang for
- * these platforms emits an alignment hint in the instruction for exactly
- * 256 bits. Several ARM SIPs have small penalties for cacheline
- * crossing loads as well (so really 512 bits is the optimal alignment
- * of the buffer). 32 bytes should strike a balance, though. Clang and
- * GCC on Linux will not emit this hint in the encoded instruction and
- * it's unclear how many SIPs will benefit from it. For Android/iOS, we
- * fallback to 4x loads and 4x stores, instead. In the copying variant we
- * do this anyway, as ld1x4 seems to block ILP when stores are in the mix */
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we fall back to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
unsigned int align_offset = ((uintptr_t)src & 31);
unsigned int align_adj = (align_offset) ? 32 - align_offset : 0;
- if (align_offset && len >= (16 + align_adj)) {
+ if (len < (16 + align_adj)) {
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY);
+ } else if (align_offset) {
adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY);
n -= align_adj;
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index 7bc932f939..0a06122ae1 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -54,12 +54,12 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist
#if defined(ARCH_ARM) && defined(ARCH_64BIT)
uint8x16_t ret_vec = vld1q_u8(buf);
- uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+ uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
return vqtbl1q_u8(ret_vec, perm_vec);
#else
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
- perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
- perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+ perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+ perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
a = vld1_u8(buf);
b = vld1_u8(buf + 8);
ret0 = vtbl1_u8(a, perm_vec0);
diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c
index 38f2a854b1..5b491be4ab 100644
--- a/arch/arm/crc32_armv8_pmull_eor3.c
+++ b/arch/arm/crc32_armv8_pmull_eor3.c
@@ -113,18 +113,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
uint64_t vc;
/* Load first 9 vector chunks (144 bytes) */
- uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0;
- uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1;
- uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2;
- uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3;
- uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4;
- uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5;
- uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6;
- uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7;
- uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8;
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8;
uint64x2_t k;
/* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
- { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
buf2 += 144;
/* Fold 9 vectors + 3-way parallel scalar CRC */
@@ -144,15 +144,15 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
/* EOR3: combine hi*k, lo*k, and new data in one instruction */
- x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2));
- x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16)));
- x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32)));
- x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48)));
- x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64)));
- x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80)));
- x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96)));
- x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112)));
- x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128)));
+ x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128));
+ x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128));
+ x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128));
+ x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128));
+ x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128));
+ x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128));
+ x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128));
+ x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128));
+ x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128));
/* 3-way parallel scalar CRC (16 bytes each) */
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
@@ -168,7 +168,7 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
/* Reduce 9 vectors to 1 using tree reduction */
/* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
- { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x1);
x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
@@ -184,14 +184,14 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
x6 = veor3q_u64(x6, y6, x7);
/* Step 3: fold pairs (x0,x2), (x4,x6) */
- { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
x0 = veor3q_u64(x0, y0, x2);
x4 = veor3q_u64(x4, y4, x6);
/* Step 4: final fold (x0, x4) -> x0 */
- { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x4);
diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h
index ac9c86bf4c..449916e0b7 100644
--- a/arch/arm/neon_intrins.h
+++ b/arch/arm/neon_intrins.h
@@ -62,6 +62,18 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
vst1q_u16(p + 24, a.val[3]);
}
# endif // HASLD4 check
+
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
#endif
#endif // include guard ARM_NEON_INTRINS_H
diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c
index e5a636baef..2f9e94a33d 100644
--- a/arch/arm/slide_hash_neon.c
+++ b/arch/arm/slide_hash_neon.c
@@ -28,12 +28,12 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
n = size / (sizeof(uint16x8_t) * 8);
do {
- p0 = vld1q_u16_x4(table);
- p1 = vld1q_u16_x4(table+32);
+ p0 = vld1q_u16_x4_ex(table, 256);
+ p1 = vld1q_u16_x4_ex(table+32, 256);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
- vst1q_u16_x4(table, p0);
- vst1q_u16_x4(table+32, p1);
+ vst1q_u16_x4_ex(table, p0, 256);
+ vst1q_u16_x4_ex(table+32, p1, 256);
table += 64;
} while (--n);
}