 adler32_fold.c                             |   2
 adler32_fold.h                             |   2
 adler32_p.h                                |  16
 arch/x86/adler32_avx2.c                    |  18
 arch/x86/adler32_avx2_p.h                  |   6
 arch/x86/adler32_avx2_tpl.h                |  13
 arch/x86/adler32_avx512.c                  |  11
 arch/x86/adler32_avx512_tpl.h              |  12
 arch/x86/adler32_avx512_vnni.c             | 191
 arch/x86/adler32_sse42.c                   |   3
 arch/x86/adler32_ssse3_tpl.h               | 188
 cpu_features.h                             |  21
 deflate.c                                  |   2
 deflate.h                                  |   2
 functable.c                                |   6
 inflate.c                                  |   1
 inflate.h                                  |   2
 test/benchmarks/CMakeLists.txt             |   1
 test/benchmarks/benchmark_adler32_copy.cc  | 117
 win32/Makefile.msc                         |  22
 20 files changed, 317 insertions(+), 319 deletions(-)
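The common thread in this patch is the "fold copy" interface: a single pass that computes the Adler-32 checksum of src while copying the same bytes to dst, declared as adler32_fold_copy_c in adler32_fold.h and dispatched per-CPU through functable. As a minimal scalar sketch of that contract (illustrative only — the project's actual adler32_fold_copy_c is not shown in this diff and may be structured differently):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U   /* largest prime smaller than 65536 */
#define NMAX 5552     /* max bytes before s1/s2 must be reduced mod BASE to avoid 32-bit overflow */

static uint32_t adler32_copy_sketch(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t s1 = adler & 0xffff;           /* low 16 bits: running byte sum */
    uint32_t s2 = (adler >> 16) & 0xffff;   /* high 16 bits: running sum of s1 */
    while (len > 0) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        while (n--) {
            *dst = *src++;   /* copy folded into the checksum loop, as in adler32_copy_len_16 below */
            s1 += *dst++;
            s2 += s1;
        }
        s1 %= BASE;          /* deferred modulus, once per NMAX-byte block */
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}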
diff --git a/adler32_fold.c b/adler32_fold.c
index 688f848533..20fec2bd3c 100644
--- a/adler32_fold.c
+++ b/adler32_fold.c
@@ -1,4 +1,4 @@
-/* crc32_fold.c -- adler32 folding interface
+/* adler32_fold.c -- adler32 folding interface
  * Copyright (C) 2022 Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
diff --git a/adler32_fold.h b/adler32_fold.h
index ea456adc31..20aa1c7400 100644
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -6,8 +6,6 @@
 #ifndef ADLER32_FOLD_H_
 #define ADLER32_FOLD_H_
 
-#include <stdint.h>
-
 Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 
 #endif
diff --git a/adler32_p.h b/adler32_p.h
index 5a14172f73..1d2e77f49f 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,10 +26,10 @@ static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, u
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
-    while (len--) {
-        *dst = *buf++;
-        adler += *dst++;
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++;
         sum2 += adler;
     }
     adler %= BASE;
@@ -38,10 +38,10 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
-    while (len) {
-        --len;
-        adler += *buf++;
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++;
+        adler += *dst++;
         sum2 += adler;
     }
     adler %= BASE;
diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c
index fcca34ec53..dcd1166f34 100644
--- a/arch/x86/adler32_avx2.c
+++ b/arch/x86/adler32_avx2.c
@@ -5,29 +5,13 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../fallback_builtins.h"
-#include "adler32_avx2_p.h"
-#include "../../adler32_fold.h"
-#include <stdio.h>
-
 #include <immintrin.h>
 
 #ifdef X86_AVX2_ADLER32
 
 #include "adler32_avx2_tpl.h"
-#undef ADLER32_AVX2_TPL_H_
+
 #define COPY
 #include "adler32_avx2_tpl.h"
-#undef COPY
-
-/*
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
-    if (buf == NULL) return 1L;
-    if (len == 0) return adler;
-    return adler32_fold_avx2(adler, buf, len);
-}
-*/
 
 #endif
diff --git a/arch/x86/adler32_avx2_p.h b/arch/x86/adler32_avx2_p.h
index 1c80bde057..f7079bf3eb 100644
--- a/arch/x86/adler32_avx2_p.h
+++ b/arch/x86/adler32_avx2_p.h
@@ -6,10 +6,10 @@
 #ifndef ADLER32_AVX2_P_H_
 #define ADLER32_AVX2_P_H_
 
-#ifdef X86_AVX2_ADLER32
+#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32)
 
 /* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
-static inline uint32_t hsum(__m256i x) {
+static inline uint32_t hsum256(__m256i x) {
     __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), _mm256_castsi256_si128(x));
     __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
@@ -17,7 +17,7 @@ static inline uint32_t hsum(__m256i x) {
     return (uint32_t)_mm_cvtsi128_si32(sum3);
 }
 
-static inline uint32_t partial_hsum(__m256i x) {
+static inline uint32_t partial_hsum256(__m256i x) {
     /* We need a permutation vector to extract every other integer. The
      * rest are going to be zeros */
     const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h
index 7df51d573d..59cacfa483 100644
--- a/arch/x86/adler32_avx2_tpl.h
+++ b/arch/x86/adler32_avx2_tpl.h
@@ -3,9 +3,6 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifndef ADLER32_AVX2_TPL_H_
-#define ADLER32_AVX2_TPL_H_
-
 #include "../../zbuild.h"
 #include <immintrin.h>
 #include "../../adler32_fold.h"
@@ -38,9 +35,9 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len)
 rem_peel:
     if (len < 16) {
 #ifdef COPY
-        return adler32_copy_len_16(adler0, src, dst, len, adler1);
+        return adler32_copy_len_16(adler0, src, dst, len, adler1);
 #else
-        return adler32_len_16(adler0, src, len, adler1);
+        return adler32_len_16(adler0, src, len, adler1);
 #endif
     } else if (len < 32) {
 #ifdef COPY
@@ -129,8 +126,8 @@
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
          * what the compiler is doing to avoid integer divisions. */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
     }
 
     adler = adler0 | (adler1 << 16);
@@ -141,5 +138,3 @@ rem_peel:
 
     return adler;
 }
-
-#endif
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c
index e26b9cc524..c0bf0721f2 100644
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -6,20 +6,11 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
-#include "../../fallback_builtins.h"
-#include <immintrin.h>
-#include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
-
 #ifdef X86_AVX512_ADLER32
 
 #include "adler32_avx512_tpl.h"
-#undef ADLER32_AVX512_TPL_H_
+
 #define COPY
 #include "adler32_avx512_tpl.h"
-#undef COPY
 
 #endif
diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h
index df5dd3810f..d324ce9859 100644
--- a/arch/x86/adler32_avx512_tpl.h
+++ b/arch/x86/adler32_avx512_tpl.h
@@ -3,16 +3,13 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifndef ADLER32_AVX512_TPL_H_
-#define ADLER32_AVX512_TPL_H_
-
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../adler32_fold.h"
 #include "../../cpu_features.h"
 #include "../../fallback_builtins.h"
 #include <immintrin.h>
 #include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
 
 #ifdef X86_AVX512_ADLER32
 
@@ -22,13 +19,13 @@ Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const
 Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
 #endif
 
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
     uint32_t adler0, adler1;
     adler1 = (adler >> 16) & 0xffff;
     adler0 = adler & 0xffff;
 
-    if (src == NULL) return 1L;
-    if (len == 0) return adler;
-
 rem_peel:
     if (len < 64) {
         /* This handles the remaining copies, just call normal adler checksum after this */
@@ -107,4 +104,3 @@ rem_peel:
 }
 
 #endif
-#endif
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index 253eed9c6a..330bfe38e7 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -7,66 +7,54 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
+#ifdef X86_AVX512VNNI_ADLER32
+
#include "../../zbuild.h" #include "../../adler32_p.h" #include "../../cpu_features.h" #include "../../fallback_builtins.h" #include <immintrin.h> +#include "../../adler32_fold.h" #include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" -#ifdef X86_AVX512VNNI_ADLER32 -Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) { - uint32_t sum2; +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; - /* For impossibly tiny sizes, use the smaller width versions. We still need - * to check for compile time support for these but they are likely there */ -#ifdef X86_SSE41_ADLER32 +rem_peel: if (len < 32) - return adler32_sse41(adler, buf, len); +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif -#ifdef X86_AVX2_ADLER32 if (len < 64) - return adler32_avx2(adler, buf, len); -#endif - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support - * This should make for shorter compiled code */ -#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32) - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); #endif - /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up - * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial, - * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better - * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be - * aliased up to a 512 bit wide zmm */ - __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); - const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; while (len >= 64) { - int k = (len < NMAX ? 
-        int k = (len < NMAX ? (int)len : NMAX);
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
         k -= k % 64;
         len -= k;
         __m512i vs1_0 = vs1;
@@ -77,8 +65,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
 
         /* Remainder peeling */
         if (k % 128) {
-            vbuf1 = _mm512_loadu_si512(buf);
-            buf += 64;
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
             k -= 64;
 
             __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
@@ -94,9 +83,9 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
                vs1 = adler + sum(c[i])
                vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
             */
-            vbuf0 = _mm512_loadu_si512(buf);
-            vbuf1 = _mm512_loadu_si512(buf + 64);
-            buf += 128;
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
             k -= 128;
 
             __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
@@ -117,14 +106,120 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
         vs2 = _mm512_add_epi32(vs2, vs3);
         vs2 = _mm512_add_epi32(vs2, vs2_1);
 
-        adler = partial_hsum(vs1) % BASE;
-        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
-        sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
-        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3_ADLER32)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
     }
 
+    adler = adler0 | (adler1 << 16);
+
     /* Process tail (len < 64). */
-    return adler32_len_16(adler, buf, len, sum2);
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler;
 }
 #endif
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c
index 4f21702aaf..92efe4d8db 100644
--- a/arch/x86/adler32_sse42.c
+++ b/arch/x86/adler32_sse42.c
@@ -1,4 +1,4 @@
-/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Authors:
  *   Adam Stylinski <kungfujesus06@gmail.com>
@@ -15,7 +15,6 @@
 #ifdef X86_SSE42_ADLER32
 
 Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-
     uint32_t adler0, adler1;
     adler1 = (adler >> 16) & 0xffff;
     adler0 = adler & 0xffff;
diff --git a/arch/x86/adler32_ssse3_tpl.h b/arch/x86/adler32_ssse3_tpl.h
deleted file mode 100644
index aedfa81241..0000000000
--- a/arch/x86/adler32_ssse3_tpl.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_SSSE3_TPL_H_
-#define ADLER32_SSSE3_TPL_H_
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
-#include "adler32_ssse3_p.h"
-
-#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
-#else
-Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
-#endif
-    uint32_t adler0, adler1;
-
-    /* split Adler-32 into component sums */
-    adler1 = (adler->nsums >> 16) & 0xffff;
-    adler0 = adler->nsums & 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1)) {
-#ifdef COPY
-        *(dst++) = *src;
-#endif
-        adler->nsums = adler32_len_1(adler0, src, adler1);
-        return;
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(src == NULL)) {
-        adler->nsums = 1L;
-        return;
-    }
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16)) {
-        goto sub16;
-    }
-
-    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
-    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-    const __m128i dot3v = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-
-    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
-            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
-    /* If our buffer is unaligned (likely), make the determination whether
-     * or not there's enough of a buffer to consume to make the scalar, aligning
-     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
-     * load. This is a pretty simple test, just test if 16 - the remainder + len is
-     * < 16 */
-    size_t max_iters = NMAX;
-    size_t rem = (uintptr_t)src & 15;
-    size_t align_offset = 16 - rem;
-    size_t k = 0;
-    if (rem) {
-        if (len < 16 + align_offset) {
-            /* Let's eat the cost of this one unaligned load so that
-             * we don't completely skip over the vectorization. Doing
-             * 16 bytes at a time unaligned is is better than 16 + <= 15
-             * sums */
-            vbuf = _mm_loadu_si128((__m128i*)src);
-            len -= 16;
-            src += 16;
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_cvtsi32_si128(adler0);
-            vs2 = _mm_cvtsi32_si128(adler1);
-            vs3 = _mm_setzero_si128();
-            vs1_0 = vs1;
-            goto unaligned_jmp;
-        }
-
-#ifdef COPY
-        memcpy(dst, src, align_offset);
-        dst += align_offset;
-#endif
-        for (size_t i = 0; i < align_offset; ++i) {
-            adler0 += *(src++);
-            adler1 += adler0;
-        }
-
-        /* lop off the max number of sums based on the scalar sums done
-         * above */
-        len -= align_offset;
-        max_iters -= align_offset;
-    }
-
-    while (len >= 16) {
-        vs1 = _mm_cvtsi32_si128(adler0);
-        vs2 = _mm_cvtsi32_si128(adler1);
-        vs3 = _mm_setzero_si128();
-        vs2_0 = _mm_setzero_si128();
-        vs1_0 = vs1;
-
-        k = (len < max_iters ? len : max_iters);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
-            src += 32;
-            k -= 32;
-
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
-            dst += 32;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-
-            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
-            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
-            vs1_0 = vs1;
-        }
-
-        vs2 = _mm_add_epi32(vs2_0, vs2);
-        vs3 = _mm_slli_epi32(vs3, 5);
-        vs2 = _mm_add_epi32(vs3, vs2);
-        vs3 = _mm_setzero_si128();
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)src);
-            src += 16;
-            k -= 16;
-
-unaligned_jmp:
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-#ifdef COPY
-            _mm_storeu_si128((__m128i*)dst, vbuf);
-            dst += 16;
-#endif
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vs1_0 = vs1;
-        }
-
-        vs3 = _mm_slli_epi32(vs3, 4);
-        vs2 = _mm_add_epi32(vs2, vs3);
-
-        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
-         * a partial reduction sum implicitly and only summing to integers in vector positions
-         * 0 and 2. This saves us some contention on the shuffle port(s) */
-        adler0 = partial_hsum(vs1) % BASE;
-        adler1 = hsum(vs2) % BASE;
-        max_iters = NMAX;
-    }
-
-sub16:
-#ifdef COPY
-    adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
-#else
-    /* Process tail (len < 16). */
-    adler->nsums = adler32_len_16(adler0, src, len, adler1);
-#endif
-}
-
-#endif
diff --git a/cpu_features.h b/cpu_features.h
index 9e0d5cb95d..fc1b5d7339 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -6,8 +6,8 @@
 #ifndef CPU_FEATURES_H_
 #define CPU_FEATURES_H_
 
-#include "crc32_fold.h"
 #include "adler32_fold.h"
+#include "crc32_fold.h"
 
 #if defined(X86_FEATURES)
 #  include "arch/x86/x86_features.h"
@@ -35,16 +35,11 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len
 #ifdef X86_SSSE3_ADLER32
 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
-#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
 #ifdef X86_AVX2_ADLER32
 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512_ADLER32
 extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512VNNI_ADLER32
 extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
@@ -53,6 +48,20 @@ extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, si
 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
 #endif
 
+/* adler32 folding */
+#ifdef X86_SSE42_ADLER32
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
 /* CRC32 folding */
 #ifdef X86_PCLMULQDQ_CRC
 extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
diff --git a/deflate.c b/deflate.c
--- a/deflate.c
+++ b/deflate.c
@@ -52,7 +52,6 @@
 #include "deflate.h"
 #include "deflate_p.h"
 #include "functable.h"
-#include <stdio.h>
 
 const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
 /*
@@ -446,7 +445,6 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
 
 #ifdef GZIP
     if (s->wrap == 2) {
-        /* Ensure that there's always a reset, regardless of "wrap" */
         strm->adler = functable.crc32_fold_reset(&s->crc_fold);
     } else
 #endif
diff --git a/deflate.h b/deflate.h
--- a/deflate.h
+++ b/deflate.h
@@ -10,9 +10,9 @@
    subject to change. Applications should only use zlib.h.
*/ -#include "adler32_fold.h" #include "zutil.h" #include "zendian.h" +#include "adler32_fold.h" #include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and diff --git a/functable.c b/functable.c index 8328359702..3945323125 100644 --- a/functable.c +++ b/functable.c @@ -204,7 +204,7 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { functable.adler32_fold_copy = &adler32_fold_copy_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) +#if (defined X86_SSE42_ADLER32) if (x86_cpu_has_sse42) functable.adler32_fold_copy = &adler32_fold_copy_sse42; #endif @@ -216,6 +216,10 @@ Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const u if (x86_cpu_has_avx512) functable.adler32_fold_copy = &adler32_fold_copy_avx512; #endif +#ifdef X86_AVX512VNNI_ADLER32 + if (x86_cpu_has_avx512vnni) + functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; +#endif return functable.adler32_fold_copy(adler, dst, src, len); } @@ -609,7 +609,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #endif case DICTID: NEEDBITS(32); - //strm->adler = state->check = ZSWAP32(hold); strm->adler = state->check = ZSWAP32(hold); INITBITS(); state->mode = DICT; @@ -11,8 +11,8 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "crc32_fold.h" #include "adler32_fold.h" +#include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt index df1df49731..19762fc738 100644 --- a/test/benchmarks/CMakeLists.txt +++ b/test/benchmarks/CMakeLists.txt @@ -24,6 +24,7 @@ endif() add_executable(benchmark_zlib benchmark_adler32.cc + benchmark_adler32_copy.cc benchmark_compare256.cc benchmark_crc32.cc benchmark_main.cc diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc new file mode 100644 index 0000000000..fac4c7f1cd --- /dev/null +++ b/test/benchmarks/benchmark_adler32_copy.cc @@ -0,0 +1,117 @@ +/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <stdio.h> +#include <assert.h> +#include <string.h> + +#include <benchmark/benchmark.h> + +extern "C" { +# include "zbuild.h" +# include "zutil_p.h" +# include "cpu_features.h" +} + +#define MAX_RANDOM_INTS (1024 * 1024) +#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) + +typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len); + +class adler32_copy: public benchmark::Fixture { +private: + uint32_t *random_ints_src; + uint32_t *random_ints_dst; + +public: + void SetUp(const ::benchmark::State& state) { + /* Control the alignment so that we have the best case scenario for loads. With + * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. + * And while this is a realistic scenario, it makes it difficult to compare benchmark + * to benchmark because one allocation could have been aligned perfectly for the loads + * while the subsequent one happened to not be. 
+         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
+         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
+         * control the _consistency_ of the results */
+        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints_src[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+                (const unsigned char*)random_ints_src, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints_src);
+        zng_free(random_ints_dst);
+    }
+};
+
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const unsigned char *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, len); \
+            return fptr(init_sum, buf, len); \
+        }); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 8db2633b44..8a398e4993 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -30,15 +30,15 @@
 WFLAGS = \
     -DX86_FEATURES \
     -DX86_PCLMULQDQ_CRC \
    -DX86_SSE2 \
-    -DX86_SSE42_ADLER32 \
+    -DX86_SSE42_ADLER32 \
     -DX86_SSE42_CRC_INTRIN \
     -DX86_SSE42_CRC_HASH \
-    -DX86_SSSE3_ADLER32 \
+    -DX86_SSSE3_ADLER32 \
     -DX86_AVX2 \
-    -DX86_AVX2_ADLER32 \
+    -DX86_AVX2_ADLER32 \
     -DX86_AVX_CHUNKSET \
-    -DX86_SSE2_CHUNKSET \
-    #
+    -DX86_SSE2_CHUNKSET
+
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dWIN32 /r
@@ -51,12 +51,12 @@ SUFFIX =
 
 OBJS = \
     adler32.obj \
-    adler32_avx2.obj \
-    adler32_avx512.obj \
-    adler32_avx512_vnni.obj \
-    adler32_sse42.obj \
-    adler32_ssse3.obj \
-    adler32_fold.obj \
+    adler32_avx2.obj \
+    adler32_avx512.obj \
+    adler32_avx512_vnni.obj \
+    adler32_sse42.obj \
+    adler32_ssse3.obj \
+    adler32_fold.obj \
     chunkset.obj \
     chunkset_avx.obj \
     chunkset_sse2.obj \
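All of the SIMD variants above (SSSE3/SSE4.2, AVX2, AVX-512, AVX-512 VNNI) vectorize the same per-block recurrence: over a block of W bytes, vs1 grows by the plain byte sum while vs2 grows by W times the previous vs1 plus a byte sum weighted W..1. Those weights are what the dot2v constants encode, and the weighted sum is what _mm512_dpbusd_epi32 / _mm256_dpbusd_epi32 (VPDPBUSD) accumulate directly, which is why the VNNI paths no longer need the maddubs/madd pair from the deleted SSSE3 template. A scalar model of one 64-byte step, for orientation only (this code is not part of the patch):

/* One 64-byte block step: byte-at-a-time Adler-32 does s1 += c; s2 += s1;
 * per byte, which over a 64-byte block collapses to the two updates below. */
static void adler32_block64(uint32_t *s1, uint32_t *s2, const unsigned char *p) {
    uint32_t sum = 0, weighted = 0;
    for (int i = 0; i < 64; i++) {
        sum      += p[i];
        weighted += (uint32_t)(64 - i) * p[i];   /* weights 64..1, i.e. the dot2v vector */
    }
    *s2 += 64 * (*s1) + weighted;   /* must use the pre-block value of s1 */
    *s1 += sum;
}

Both sums still have to be reduced modulo BASE (65521) at least once every NMAX (5552) input bytes, which is what the k = MIN(len, NMAX) blocking inside the vector loops is for.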
