diff options
| author | Cameron Cawley <ccawley2011@gmail.com> | 2025-02-08 20:36:58 +0000 |
|---|---|---|
| committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2026-03-05 17:04:25 +0100 |
| commit | d8136aea2a074c950b91f6c609c43a55a7990056 (patch) | |
| tree | 8d7f5723922db7524fc5bdb41db87b2062c7c316 | |
| parent | 53abd150bca334b424c966def67f0c06258aaec4 (diff) | |
| download | Project-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.tar.gz Project-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.zip | |
Make use of NEON alignment hints
| -rw-r--r-- | arch/arm/adler32_neon.c | 47 | ||||
| -rw-r--r-- | arch/arm/chunkset_neon.c | 6 | ||||
| -rw-r--r-- | arch/arm/crc32_armv8_pmull_eor3.c | 44 | ||||
| -rw-r--r-- | arch/arm/neon_intrins.h | 12 | ||||
| -rw-r--r-- | arch/arm/slide_hash_neon.c | 8 |
5 files changed, 65 insertions(+), 52 deletions(-)
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index cbb1c784ef..a55c8c1353 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -45,10 +45,10 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin int rem = len & 3; for (size_t i = 0; i < num_iter; ++i) { - uint8x16_t d0 = vld1q_u8(buf); - uint8x16_t d1 = vld1q_u8(buf + 16); - uint8x16_t d2 = vld1q_u8(buf + 32); - uint8x16_t d3 = vld1q_u8(buf + 48); + uint8x16_t d0 = vld1q_u8_ex(buf, 128); + uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128); + uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128); + uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128); vst1q_u8(dst, d0); vst1q_u8(dst + 16, d1); @@ -93,7 +93,7 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin if (rem) { uint32x4_t s3acc_0 = vdupq_n_u32(0); while (rem--) { - uint8x16_t d0 = vld1q_u8(buf); + uint8x16_t d0 = vld1q_u8_ex(buf, 128); vst1q_u8(dst, d0); dst += 16; uint16x8_t adler; @@ -110,8 +110,8 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin s3acc = vaddq_u32(s3acc_0, s3acc); } - uint16x8x4_t t0_t3 = vld1q_u16_x4(taps); - uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32); + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); @@ -169,7 +169,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l int rem = len & 3; for (size_t i = 0; i < num_iter; ++i) { - uint8x16x4_t d0_d3 = vld1q_u8_x4(buf); + uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256); /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32 * bit instruction, we'll have to make due summing to 16 bits first */ @@ -208,7 +208,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l if (rem) { uint32x4_t s3acc_0 = vdupq_n_u32(0); while (rem--) { - 
uint8x16_t d0 = vld1q_u8(buf); + uint8x16_t d0 = vld1q_u8_ex(buf, 128); uint16x8_t adler; adler = vpaddlq_u8(d0); s2_6 = vaddw_u8(s2_6, vget_low_u8(d0)); @@ -223,8 +223,8 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l s3acc = vaddq_u32(s3acc_0, s3acc); } - uint16x8x4_t t0_t3 = vld1q_u16_x4(taps); - uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32); + uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256); + uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256); s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0); s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0)); @@ -285,21 +285,22 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co /* If memory is not SIMD aligned, do scalar sums to an aligned * offset, provided that doing so doesn't completely eliminate * SIMD operation. Aligned loads are still faster on ARM, even - * though there's no explicit aligned load instruction. Note: - * on Android and iOS, their ABIs specify stricter alignment - * requirements for the 2,3,4x register ld1 variants. Clang for - * these platforms emits an alignment hint in the instruction for exactly - * 256 bits. Several ARM SIPs have small penalties for cacheline - * crossing loads as well (so really 512 bits is the optimal alignment - * of the buffer). 32 bytes should strike a balance, though. Clang and - * GCC on Linux will not emit this hint in the encoded instruction and - * it's unclear how many SIPs will benefit from it. For Android/iOS, we - * fallback to 4x loads and 4x stores, instead. In the copying variant we - * do this anyway, as ld1x4 seems to block ILP when stores are in the mix */ + * when there's no explicit aligned load instruction. Note: + * the code currently emits an alignment hint in the instruction + * for exactly 256 bits when supported by the compiler. 
Several ARM + * SIPs have small penalties for cacheline crossing loads as well (so + * really 512 bits is the optimal alignment of the buffer). 32 bytes + * should strike a balance, though. The Cortex-A8 and Cortex-A9 + * processors are documented to benefit from 128 bit and 64 bit + * alignment, but it's unclear which other SIPs will benefit from it. + * In the copying variant we use fallback to 4x loads and 4x stores, + * as ld1x4 seems to block ILP when stores are in the mix */ unsigned int align_offset = ((uintptr_t)src & 31); unsigned int align_adj = (align_offset) ? 32 - align_offset : 0; - if (align_offset && len >= (16 + align_adj)) { + if (len < (16 + align_adj)) { + return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY); + } else if (align_offset) { adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY); n -= align_adj; diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 7bc932f939..0a06122ae1 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -54,12 +54,12 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist #if defined(ARCH_ARM) && defined(ARCH_64BIT) uint8x16_t ret_vec = vld1q_u8(buf); - uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx); + uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128); return vqtbl1q_u8(ret_vec, perm_vec); #else uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1; - perm_vec0 = vld1_u8(permute_table + lut_rem.idx); - perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8); + perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64); + perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64); a = vld1_u8(buf); b = vld1_u8(buf + 8); ret0 = vtbl1_u8(a, perm_vec0); diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c index 38f2a854b1..5b491be4ab 100644 --- a/arch/arm/crc32_armv8_pmull_eor3.c +++ b/arch/arm/crc32_armv8_pmull_eor3.c @@ -113,18 +113,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 
uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con uint64_t vc; /* Load first 9 vector chunks (144 bytes) */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; + uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0; + uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1; + uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2; + uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3; + uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4; + uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5; + uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6; + uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7; + uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8; uint64x2_t k; /* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */ - { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); } + { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); } buf2 += 144; /* Fold 9 vectors + 3-way parallel scalar CRC */ @@ -144,15 +144,15 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); /* EOR3: combine hi*k, lo*k, and new data in one instruction */ - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, 
vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); + x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128)); + x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128)); + x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128)); + x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128)); + x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128)); + x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128)); + x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128)); + x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128)); + x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128)); /* 3-way parallel scalar CRC (16 bytes each) */ crc0 = __crc32d(crc0, *(const uint64_t*)buf); @@ -168,7 +168,7 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con /* Reduce 9 vectors to 1 using tree reduction */ /* Step 1: x0 = fold(x0, x1), shift x2..x8 down */ - { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } + { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); x0 = veor3q_u64(x0, y0, x1); x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; @@ -184,14 +184,14 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con x6 = veor3q_u64(x6, y6, x7); /* Step 3: fold pairs (x0,x2), (x4,x6) */ - { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 
0x81256527}; k = vld1q_u64(k_); } + { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); x0 = veor3q_u64(x0, y0, x2); x4 = veor3q_u64(x4, y4, x6); /* Step 4: final fold (x0, x4) -> x0 */ - { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } + { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); } y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); x0 = veor3q_u64(x0, y0, x4); diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h index ac9c86bf4c..449916e0b7 100644 --- a/arch/arm/neon_intrins.h +++ b/arch/arm/neon_intrins.h @@ -62,6 +62,18 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) { vst1q_u16(p + 24, a.val[3]); } # endif // HASLD4 check + +# ifndef _MSC_VER +# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8)) +# endif +# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4) +# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8)) +# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8)) +# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a) +# endif + #endif #endif // include guard ARM_NEON_INTRINS_H diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c index e5a636baef..2f9e94a33d 100644 --- a/arch/arm/slide_hash_neon.c +++ b/arch/arm/slide_hash_neon.c @@ -28,12 +28,12 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize n = size / (sizeof(uint16x8_t) * 8); do { - p0 = vld1q_u16_x4(table); - p1 = vld1q_u16_x4(table+32); + p0 = vld1q_u16_x4_ex(table, 256); + p1 = vld1q_u16_x4_ex(table+32, 256); vqsubq_u16_x4_x1(p0, p0, v); vqsubq_u16_x4_x1(p1, p1, v); - 
vst1q_u16_x4(table, p0); - vst1q_u16_x4(table+32, p1); + vst1q_u16_x4_ex(table, p0, 256); + vst1q_u16_x4_ex(table+32, p1, 256); table += 64; } while (--n); } |
