diff options
| -rw-r--r-- | arch/x86/chunkset_avx512.c | 49 | ||||
| -rw-r--r-- | chunkset_tpl.h | 4 |
2 files changed, 19 insertions, 34 deletions
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c index b2fab4884f..3d51ad1d9b 100644 --- a/arch/x86/chunkset_avx512.c +++ b/arch/x86/chunkset_avx512.c @@ -62,20 +62,22 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { _mm256_storeu_si256((__m256i *)out, *chunk); } -static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) { - _mm256_mask_storeu_epi8(out, mask, *chunk); -} - static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); - unsigned rem = len % sizeof(chunk_t); - mask_t rem_mask = gen_mask(rem); - - /* Since this is only ever called if dist >= a chunk, we don't need a masked load */ chunk_t chunk; + uint32_t rem = len % sizeof(chunk_t); + + if (len < sizeof(chunk_t)) { + mask_t rem_mask = gen_mask(rem); + chunk = _mm256_maskz_loadu_epi8(rem_mask, from); + _mm256_mask_storeu_epi8(out, rem_mask, chunk); + return out + rem; + } + loadchunk(from, &chunk); - _mm256_mask_storeu_epi8(out, rem_mask, chunk); + rem = (rem == 0) ? sizeof(chunk_t) : rem; + storechunk(out, &chunk); out += rem; from += rem; len -= rem; @@ -122,10 +124,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t return ret_vec; } -static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { - *chunk = _mm_loadu_si128((__m128i *)s); -} - static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { _mm_storeu_si128((__m128i *)out, *chunk); } @@ -151,27 +149,18 @@ static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, u static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); - - unsigned rem = len % sizeof(halfchunk_t); - halfmask_t rem_mask = gen_half_mask(rem); - - /* Since this is only ever called if dist >= a chunk, we don't need a masked load */ halfchunk_t chunk; - loadhalfchunk(from, &chunk); - _mm_mask_storeu_epi8(out, rem_mask, chunk); - out += rem; - from += rem; - len -= rem; - while (len > 0) { - loadhalfchunk(from, &chunk); - storehalfchunk(out, &chunk); - out += sizeof(halfchunk_t); - from += sizeof(halfchunk_t); - len -= sizeof(halfchunk_t); + uint32_t rem = len % sizeof(halfchunk_t); + if (rem == 0) { + rem = sizeof(halfchunk_t); } - return out; + halfmask_t rem_mask = gen_half_mask(rem); + chunk = _mm_maskz_loadu_epi8(rem_mask, from); + _mm_mask_storeu_epi8(out, rem_mask, chunk); + + return out + rem; } #define CHUNKSIZE chunksize_avx512 diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 5af1fbe8aa..5d4cacbd9d 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -219,11 +219,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) { rem_bytes: #endif if (len) { -#ifndef HAVE_MASKED_READWRITE memcpy(out, &chunk_load, len); -#else - storechunk_mask(out, gen_mask(len), &chunk_load); -#endif out += len; } |
