summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCameron Cawley <ccawley2011@gmail.com>2025-02-08 20:36:58 +0000
committerHans Kristian Rosbach <hk-github@circlestorm.org>2026-03-05 17:04:25 +0100
commitd8136aea2a074c950b91f6c609c43a55a7990056 (patch)
tree8d7f5723922db7524fc5bdb41db87b2062c7c316
parent53abd150bca334b424c966def67f0c06258aaec4 (diff)
downloadProject-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.tar.gz
Project-Tick-d8136aea2a074c950b91f6c609c43a55a7990056.zip
Make use of NEON alignment hints
-rw-r--r--arch/arm/adler32_neon.c47
-rw-r--r--arch/arm/chunkset_neon.c6
-rw-r--r--arch/arm/crc32_armv8_pmull_eor3.c44
-rw-r--r--arch/arm/neon_intrins.h12
-rw-r--r--arch/arm/slide_hash_neon.c8
5 files changed, 65 insertions, 52 deletions
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index cbb1c784ef..a55c8c1353 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -45,10 +45,10 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
- uint8x16_t d0 = vld1q_u8(buf);
- uint8x16_t d1 = vld1q_u8(buf + 16);
- uint8x16_t d2 = vld1q_u8(buf + 32);
- uint8x16_t d3 = vld1q_u8(buf + 48);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
+ uint8x16_t d1 = vld1q_u8_ex(buf + 16, 128);
+ uint8x16_t d2 = vld1q_u8_ex(buf + 32, 128);
+ uint8x16_t d3 = vld1q_u8_ex(buf + 48, 128);
vst1q_u8(dst, d0);
vst1q_u8(dst + 16, d1);
@@ -93,7 +93,7 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
if (rem) {
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
- uint8x16_t d0 = vld1q_u8(buf);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
vst1q_u8(dst, d0);
dst += 16;
uint16x8_t adler;
@@ -110,8 +110,8 @@ Z_FORCEINLINE static void NEON_accum32_copy(uint32_t *s, uint8_t *dst, const uin
s3acc = vaddq_u32(s3acc_0, s3acc);
}
- uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
- uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -169,7 +169,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
int rem = len & 3;
for (size_t i = 0; i < num_iter; ++i) {
- uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+ uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 256);
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
 * bit instruction, we'll have to make do summing to 16 bits first */
@@ -208,7 +208,7 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
if (rem) {
uint32x4_t s3acc_0 = vdupq_n_u32(0);
while (rem--) {
- uint8x16_t d0 = vld1q_u8(buf);
+ uint8x16_t d0 = vld1q_u8_ex(buf, 128);
uint16x8_t adler;
adler = vpaddlq_u8(d0);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
@@ -223,8 +223,8 @@ Z_FORCEINLINE static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t l
s3acc = vaddq_u32(s3acc_0, s3acc);
}
- uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
- uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+ uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+ uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);
s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -285,21 +285,22 @@ Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, co
/* If memory is not SIMD aligned, do scalar sums to an aligned
* offset, provided that doing so doesn't completely eliminate
* SIMD operation. Aligned loads are still faster on ARM, even
- * though there's no explicit aligned load instruction. Note:
- * on Android and iOS, their ABIs specify stricter alignment
- * requirements for the 2,3,4x register ld1 variants. Clang for
- * these platforms emits an alignment hint in the instruction for exactly
- * 256 bits. Several ARM SIPs have small penalties for cacheline
- * crossing loads as well (so really 512 bits is the optimal alignment
- * of the buffer). 32 bytes should strike a balance, though. Clang and
- * GCC on Linux will not emit this hint in the encoded instruction and
- * it's unclear how many SIPs will benefit from it. For Android/iOS, we
- * fallback to 4x loads and 4x stores, instead. In the copying variant we
- * do this anyway, as ld1x4 seems to block ILP when stores are in the mix */
+ * when there's no explicit aligned load instruction. Note:
+ * the code currently emits an alignment hint in the instruction
+ * for exactly 256 bits when supported by the compiler. Several ARM
+ * SIPs have small penalties for cacheline crossing loads as well (so
+ * really 512 bits is the optimal alignment of the buffer). 32 bytes
+ * should strike a balance, though. The Cortex-A8 and Cortex-A9
+ * processors are documented to benefit from 128 bit and 64 bit
+ * alignment, but it's unclear which other SIPs will benefit from it.
+ * In the copying variant we fall back to 4x loads and 4x stores,
+ * as ld1x4 seems to block ILP when stores are in the mix */
unsigned int align_offset = ((uintptr_t)src & 31);
unsigned int align_adj = (align_offset) ? 32 - align_offset : 0;
- if (align_offset && len >= (16 + align_adj)) {
+ if (len < (16 + align_adj)) {
+ return adler32_copy_tail(pair[0], dst, src, len, pair[1], 1, 15, COPY);
+ } else if (align_offset) {
adler32_copy_align(&pair[0], dst, src, align_adj, &pair[1], 31, COPY);
n -= align_adj;
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index 7bc932f939..0a06122ae1 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -54,12 +54,12 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist
#if defined(ARCH_ARM) && defined(ARCH_64BIT)
uint8x16_t ret_vec = vld1q_u8(buf);
- uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+ uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 128);
return vqtbl1q_u8(ret_vec, perm_vec);
#else
uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
- perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
- perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+ perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 64);
+ perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
a = vld1_u8(buf);
b = vld1_u8(buf + 8);
ret0 = vtbl1_u8(a, perm_vec0);
diff --git a/arch/arm/crc32_armv8_pmull_eor3.c b/arch/arm/crc32_armv8_pmull_eor3.c
index 38f2a854b1..5b491be4ab 100644
--- a/arch/arm/crc32_armv8_pmull_eor3.c
+++ b/arch/arm/crc32_armv8_pmull_eor3.c
@@ -113,18 +113,18 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
uint64_t vc;
/* Load first 9 vector chunks (144 bytes) */
- uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0;
- uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1;
- uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2;
- uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3;
- uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4;
- uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5;
- uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6;
- uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7;
- uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8;
+ uint64x2_t x0 = vld1q_u64_ex((const uint64_t*)buf2, 128), y0;
+ uint64x2_t x1 = vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128), y1;
+ uint64x2_t x2 = vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128), y2;
+ uint64x2_t x3 = vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128), y3;
+ uint64x2_t x4 = vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128), y4;
+ uint64x2_t x5 = vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128), y5;
+ uint64x2_t x6 = vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128), y6;
+ uint64x2_t x7 = vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128), y7;
+ uint64x2_t x8 = vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128), y8;
uint64x2_t k;
/* k = {x^144 mod P, x^144+64 mod P} for 144-byte fold */
- { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64_ex(k_, 128); }
buf2 += 144;
/* Fold 9 vectors + 3-way parallel scalar CRC */
@@ -144,15 +144,15 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
/* EOR3: combine hi*k, lo*k, and new data in one instruction */
- x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2));
- x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16)));
- x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32)));
- x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48)));
- x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64)));
- x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80)));
- x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96)));
- x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112)));
- x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128)));
+ x0 = veor3q_u64(x0, y0, vld1q_u64_ex((const uint64_t*)buf2, 128));
+ x1 = veor3q_u64(x1, y1, vld1q_u64_ex((const uint64_t*)(buf2 + 16), 128));
+ x2 = veor3q_u64(x2, y2, vld1q_u64_ex((const uint64_t*)(buf2 + 32), 128));
+ x3 = veor3q_u64(x3, y3, vld1q_u64_ex((const uint64_t*)(buf2 + 48), 128));
+ x4 = veor3q_u64(x4, y4, vld1q_u64_ex((const uint64_t*)(buf2 + 64), 128));
+ x5 = veor3q_u64(x5, y5, vld1q_u64_ex((const uint64_t*)(buf2 + 80), 128));
+ x6 = veor3q_u64(x6, y6, vld1q_u64_ex((const uint64_t*)(buf2 + 96), 128));
+ x7 = veor3q_u64(x7, y7, vld1q_u64_ex((const uint64_t*)(buf2 + 112), 128));
+ x8 = veor3q_u64(x8, y8, vld1q_u64_ex((const uint64_t*)(buf2 + 128), 128));
/* 3-way parallel scalar CRC (16 bytes each) */
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
@@ -168,7 +168,7 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
/* Reduce 9 vectors to 1 using tree reduction */
/* Step 1: x0 = fold(x0, x1), shift x2..x8 down */
- { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x1);
x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
@@ -184,14 +184,14 @@ Z_INTERNAL Z_TARGET_PMULL_EOR3 uint32_t crc32_armv8_pmull_eor3(uint32_t crc, con
x6 = veor3q_u64(x6, y6, x7);
/* Step 3: fold pairs (x0,x2), (x4,x6) */
- { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
x0 = veor3q_u64(x0, y0, x2);
x4 = veor3q_u64(x4, y4, x6);
/* Step 4: final fold (x0, x4) -> x0 */
- { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); }
+ { static const uint64_t ALIGNED_(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64_ex(k_, 128); }
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
x0 = veor3q_u64(x0, y0, x4);
diff --git a/arch/arm/neon_intrins.h b/arch/arm/neon_intrins.h
index ac9c86bf4c..449916e0b7 100644
--- a/arch/arm/neon_intrins.h
+++ b/arch/arm/neon_intrins.h
@@ -62,6 +62,18 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
vst1q_u16(p + 24, a.val[3]);
}
# endif // HASLD4 check
+
+# ifndef _MSC_VER
+# define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u64_ex(p, align) vld1q_u64(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+# define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+# define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+# define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
#endif
#endif // include guard ARM_NEON_INTRINS_H
diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c
index e5a636baef..2f9e94a33d 100644
--- a/arch/arm/slide_hash_neon.c
+++ b/arch/arm/slide_hash_neon.c
@@ -28,12 +28,12 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
n = size / (sizeof(uint16x8_t) * 8);
do {
- p0 = vld1q_u16_x4(table);
- p1 = vld1q_u16_x4(table+32);
+ p0 = vld1q_u16_x4_ex(table, 256);
+ p1 = vld1q_u16_x4_ex(table+32, 256);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
- vst1q_u16_x4(table, p0);
- vst1q_u16_x4(table+32, p1);
+ vst1q_u16_x4_ex(table, p0, 256);
+ vst1q_u16_x4_ex(table+32, p1, 256);
table += 64;
} while (--n);
}