diff options
Diffstat (limited to 'neozip/arch/x86/compare256_avx2.c')
| -rw-r--r-- | neozip/arch/x86/compare256_avx2.c | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/neozip/arch/x86/compare256_avx2.c b/neozip/arch/x86/compare256_avx2.c new file mode 100644 index 0000000000..5e2b1716cf --- /dev/null +++ b/neozip/arch/x86/compare256_avx2.c @@ -0,0 +1,61 @@ +/* compare256_avx2.c -- AVX2 version of compare256 + * Copyright Mika T. Lindqvist <postmaster@raasu.org> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zendian.h" +#include "zmemory.h" +#include "deflate.h" +#include "fallback_builtins.h" + +#ifdef X86_AVX2 + +#include <immintrin.h> +#ifdef _MSC_VER +# include <nmmintrin.h> +#endif + +static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + + do { + __m256i ymm_src0, ymm_src1, ymm_cmp; + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ + unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */ + + src0 += 32, src1 += 32, len += 32; + + ymm_src0 = _mm256_loadu_si256((__m256i*)src0); + ymm_src1 = _mm256_loadu_si256((__m256i*)src1); + ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); + mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); + if (mask != 0xFFFFFFFF) + return len + zng_ctz32(~mask); + + src0 += 32, src1 += 32, len += 32; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" + +#endif |
