summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Stylinski <kungfujesus06@gmail.com>2024-11-30 09:23:28 -0500
committerHans Kristian Rosbach <hk-github@circlestorm.org>2024-12-10 22:17:14 +0100
commit43d74a223b30902b44b01bf4c4888d8deb35e253 (patch)
treeef1813e6dfbeee03b01156404456cb81c23fd713
parenta4e7c34a4ac171ba878eec86bdd2a58c1d03f8e5 (diff)
downloadProject-Tick-43d74a223b30902b44b01bf4c4888d8deb35e253.tar.gz
Project-Tick-43d74a223b30902b44b01bf4c4888d8deb35e253.zip
Improve pipelining for AVX512 chunking
For reasons that aren't quite so clear, using the masked writes here did not pipeline very well. Either setting up the mask stalled things or masked moves have issues overlapping regular moves. Simply putting the masked moves behind a branch that is rarely taken seemed to do the trick in improving the ILP. While here, put masked loads behind the same branch in case there were ever a hazard for overreading.
-rw-r--r--arch/x86/chunkset_avx512.c49
-rw-r--r--chunkset_tpl.h4
2 files changed, 19 insertions, 34 deletions
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c
index b2fab4884f..3d51ad1d9b 100644
--- a/arch/x86/chunkset_avx512.c
+++ b/arch/x86/chunkset_avx512.c
@@ -62,20 +62,22 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
-static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) {
- _mm256_mask_storeu_epi8(out, mask, *chunk);
-}
-
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
- unsigned rem = len % sizeof(chunk_t);
- mask_t rem_mask = gen_mask(rem);
-
- /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
chunk_t chunk;
+ uint32_t rem = len % sizeof(chunk_t);
+
+ if (len < sizeof(chunk_t)) {
+ mask_t rem_mask = gen_mask(rem);
+ chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
+ _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ return out + rem;
+ }
+
loadchunk(from, &chunk);
- _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+ rem = (rem == 0) ? sizeof(chunk_t) : rem;
+ storechunk(out, &chunk);
out += rem;
from += rem;
len -= rem;
@@ -122,10 +124,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}
-static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
- *chunk = _mm_loadu_si128((__m128i *)s);
-}
-
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
@@ -151,27 +149,18 @@ static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, u
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
-
- unsigned rem = len % sizeof(halfchunk_t);
- halfmask_t rem_mask = gen_half_mask(rem);
-
- /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
halfchunk_t chunk;
- loadhalfchunk(from, &chunk);
- _mm_mask_storeu_epi8(out, rem_mask, chunk);
- out += rem;
- from += rem;
- len -= rem;
- while (len > 0) {
- loadhalfchunk(from, &chunk);
- storehalfchunk(out, &chunk);
- out += sizeof(halfchunk_t);
- from += sizeof(halfchunk_t);
- len -= sizeof(halfchunk_t);
+ uint32_t rem = len % sizeof(halfchunk_t);
+ if (rem == 0) {
+ rem = sizeof(halfchunk_t);
}
- return out;
+ halfmask_t rem_mask = gen_half_mask(rem);
+ chunk = _mm_maskz_loadu_epi8(rem_mask, from);
+ _mm_mask_storeu_epi8(out, rem_mask, chunk);
+
+ return out + rem;
}
#define CHUNKSIZE chunksize_avx512
diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index 5af1fbe8aa..5d4cacbd9d 100644
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -219,11 +219,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
rem_bytes:
#endif
if (len) {
-#ifndef HAVE_MASKED_READWRITE
memcpy(out, &chunk_load, len);
-#else
- storechunk_mask(out, gen_mask(len), &chunk_load);
-#endif
out += len;
}