summaryrefslogtreecommitdiff
path: root/neozip/chunkset_tpl.h
diff options
context:
space:
mode:
Diffstat (limited to 'neozip/chunkset_tpl.h')
-rw-r--r--neozip/chunkset_tpl.h281
1 files changed, 281 insertions, 0 deletions
diff --git a/neozip/chunkset_tpl.h b/neozip/chunkset_tpl.h
new file mode 100644
index 0000000000..82511f0e8c
--- /dev/null
+++ b/neozip/chunkset_tpl.h
@@ -0,0 +1,281 @@
+/* chunkset_tpl.h -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include <stdlib.h>
+
+/* Report the size, in bytes, of one chunk_t -- the unit in which the chunk
+ * copy helpers of this template load and store data. */
+static inline size_t CHUNKSIZE(void) {
+    const size_t chunk_bytes = sizeof(chunk_t);
+    return chunk_bytes;
+}
+
+/* memcpy-like copy of `len` bytes from `from` to `out`, carried out with
+ * whole chunk_t loads and stores. Returns `out + len`.
+ *
+ * The caller has to guarantee that:
+ *   - `len` is non-zero;
+ *   - it is OK to overwrite at least sizeof(chunk_t) bytes of output, even
+ *     when `len` is shorter than that;
+ *   - `from` lags `out` by at least sizeof(chunk_t) bytes (or that they don't
+ *     overlap at all or simply that the distance is less than the length of
+ *     the copy).
+ *
+ * The first store covers the ((len - 1) % sizeof(chunk_t)) + 1 leading bytes,
+ * which leaves an exact multiple of sizeof(chunk_t) for the loop. Short copies
+ * (sizeof(chunk_t) bytes or fewer) therefore fall straight through without
+ * iterating, which will hopefully make branch prediction more reliable, and
+ * longer copies get better memory bus utilization. */
+#ifndef HAVE_CHUNKCOPY
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    chunk_t tmp;
+    const size_t head = ((len - 1) % sizeof(chunk_t)) + 1;
+
+    /* Always move one full chunk, but only step past the leading bytes. */
+    loadchunk(from, &tmp);
+    storechunk(out, &tmp);
+    out += head;
+    from += head;
+
+    /* The remainder is a whole number of chunks (possibly zero). */
+    for (size_t todo = len - head; todo > 0; todo -= sizeof(chunk_t)) {
+        loadchunk(from, &tmp);
+        storechunk(out, &tmp);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+    }
+    return out;
+}
+#endif
+
+/* Widen a short back-reference distance by performing short copies.
+ *
+ * As long as *dist is smaller than both *len and sizeof(chunk_t), one chunk
+ * is copied to `out`, `out` moves forward and *len shrinks by *dist bytes,
+ * and *dist is doubled. Returns the advanced `out`; *dist and *len are
+ * updated in place.
+ *
+ * This assumes that it's OK to overwrite at least the first
+ * 2*sizeof(chunk_t) bytes of output even if the copy is shorter than that.
+ * The assumption holds because inflate_fast() starts every iteration with at
+ * least 258 bytes of output space available (258 being the maximum length
+ * output from a single token; see the assumptions documented with
+ * inflate_fast()). */
+static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+    /* `out` advances by *dist while *dist doubles, so `out - *dist` stays
+     * put: the source pointer only needs to be computed once. */
+    unsigned char const *src = out - *dist;
+    chunk_t tmp;
+
+    for (;;) {
+        if (*dist >= *len || *dist >= sizeof(chunk_t))
+            break;
+        loadchunk(src, &tmp);
+        storechunk(out, &tmp);
+        out += *dist;
+        *len -= *dist;
+        *dist *= 2;
+    }
+    return out;
+}
+
+#ifndef HAVE_CHUNK_MAG
+/* Load a "magazine": a chunk_t filled with the `dist`-byte string found at
+ * `buf`, repeated back to back as many times as fits (the final repetition
+ * is cut short when it does not fit completely).
+ *
+ * On return *chunk_rem holds the size of the last piece that was copied.
+ * The filled chunk is returned by value. `dist` must be non-zero, otherwise
+ * the fill loop makes no progress. */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
+    chunk_t pattern;
+    uint8_t *dst = (uint8_t *)&pattern;
+    size_t room = sizeof(chunk_t);
+
+    do {
+        const size_t piece = MIN(dist, room);
+        memcpy(dst, buf, piece);
+        dst += piece;
+        room -= piece;
+        /* Recording the piece size on every pass yields the remainder for
+         * free, bypassing an expensive integer division. */
+        *chunk_rem = piece;
+    } while (room);
+
+    return pattern;
+}
+#endif
+
+#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
+/* Copy `len` bytes from `from` to `out` using halfchunk_t sized loads and
+ * stores, and return `out + len`. `len` must be non-zero.
+ *
+ * The first store always writes a full halfchunk_t but only advances by the
+ * ((len - 1) % sizeof(halfchunk_t)) + 1 leading bytes, so the loop that
+ * follows deals with a whole number of half chunks. */
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+    Assert(len > 0, "halfchunkcopy should never have a length 0");
+    halfchunk_t tmp;
+    const size_t head = ((len - 1) % sizeof(halfchunk_t)) + 1;
+
+    loadhalfchunk(from, &tmp);
+    storehalfchunk(out, &tmp);
+    out += head;
+    from += head;
+
+    /* The remainder is a whole number of half chunks (possibly zero). */
+    for (size_t todo = len - head; todo > 0; todo -= sizeof(halfchunk_t)) {
+        loadhalfchunk(from, &tmp);
+        storehalfchunk(out, &tmp);
+        out += sizeof(halfchunk_t);
+        from += sizeof(halfchunk_t);
+    }
+    return out;
+}
+#endif
+
+/* Fill LEN bytes at OUT by repeating the byte pattern that starts at FROM.
+ The pattern period is DIST = |OUT - FROM|, so in the usual case (FROM behind
+ OUT) this copies DIST bytes from OUT - DIST into OUT + DIST * k, for
+ 0 <= k < LEN/DIST, plus the partial repetition needed to reach LEN bytes.
+ FROM must differ from OUT. Return OUT + LEN.
+
+ Cases, in the order they are tested:
+ - FROM ahead of OUT with DIST < LEN: memmove() (or a single CHUNKCOPY when
+ HAVE_MASKED_READWRITE is defined and LEN fits in one chunk_t);
+ - DIST == 1: memset() with the byte at FROM;
+ - DIST >= sizeof(chunk_t): CHUNKCOPY();
+ - otherwise: build one chunk_t holding the repeated pattern, store it as
+ often as needed, then memcpy() the remaining tail bytes. */
+static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, size_t len) {
+ /* To debug performance related issues with calls where
+ len < sizeof(uint64_t), enable:
+ Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
+ Assert(from != out, "chunkmemset cannot have a distance 0");
+
+ chunk_t chunk_load;
+ size_t chunk_mod = 0; /* filled in by GET_CHUNK_MAG / GET_HALFCHUNK_MAG; left at 0 by the chunkmemset_N loaders */
+ size_t adv_amount; /* bytes to advance per stored chunk: sizeof(chunk_t) - chunk_mod */
+ size_t dist = (size_t)ABS(out - from); /* pattern period, regardless of which pointer comes first */
+
+ /* FROM may lie ahead of OUT, i.e. we are reading bytes from further along
+ * in the buffer. This has to be handled now, though it wasn't _quite_ clear
+ * whether this rare circumstance always needed to be handled here or
+ * whether it only shows up because more call sites dispatch to this
+ * function. */
+ if (out < from && dist < len) {
+#ifdef HAVE_MASKED_READWRITE
+ /* We can still handle this case with a chunk copy if we can mitigate
+ * over writing _and_ the entire copy length fits in one load */
+ if (len <= sizeof(chunk_t)) {
+ /* Tempting to add a goto to the CHUNKCOPY call further down, but
+ * hopefully most compilers collapse these identical code segments
+ * into one label to jump to */
+ return CHUNKCOPY(out, from, len);
+ }
+#endif
+ /* Here the memmove semantics match perfectly: when this happens we are
+ * effectively sliding the contents of memory down by dist bytes */
+ memmove(out, from, len);
+ return out + len;
+ }
+
+ if (dist == 1) { /* period of one byte: a plain byte fill */
+ memset(out, *from, len);
+ return out + len;
+ } else if (dist >= sizeof(chunk_t)) { /* source is at least one chunk away: ordinary chunked copy */
+ return CHUNKCOPY(out, from, len);
+ }
+
+ /* Half-chunk path. Only relevant for AVX2 and wider, where both 128 bit and
+ * 256 bit vectors exist. We allow for the shorter vector length because it
+ * lets more cases fall into a chunk copy, as a distance of the shorter
+ * length is still deemed a safe distance. This is written out here rather
+ * than calling the ssse3 variant directly because doing so required
+ * dispatching to another function and broke inlining for this function
+ * entirely. It also lets us merge an assert and some remainder peeling
+ * behavior into the same code blocks, making the code a little smaller. */
+#ifdef HAVE_HALF_CHUNK
+ if (len <= sizeof(halfchunk_t)) {
+ if (dist >= sizeof(halfchunk_t))
+ return HALFCHUNKCOPY(out, from, len);
+
+ if ((dist % 2) != 0 || dist == 6) { /* NOTE(review): odd distances and 6 only; other even distances fall through to the full-chunk code -- rationale not visible here */
+ halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
+
+ if (len == sizeof(halfchunk_t)) { /* exactly one half chunk: store it whole, nothing left for rem_bytes */
+ storehalfchunk(out, &halfchunk_load);
+ len -= sizeof(halfchunk_t);
+ out += sizeof(halfchunk_t);
+ }
+
+ chunk_load = halfchunk2whole(&halfchunk_load);
+ goto rem_bytes; /* rem_bytes copies the remaining len (possibly 0) bytes out of chunk_load */
+ }
+ }
+#endif
+
+#ifdef HAVE_CHUNKMEMSET_2
+ if (dist == 2) {
+ chunkmemset_2(from, &chunk_load);
+ } else /* each "else" chains into the next enabled case, or into the GET_CHUNK_MAG fallback */
+#endif
+#ifdef HAVE_CHUNKMEMSET_4
+ if (dist == 4) {
+ chunkmemset_4(from, &chunk_load);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_8
+ if (dist == 8) {
+ chunkmemset_8(from, &chunk_load);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_16
+ if (dist == 16) {
+ chunkmemset_16(from, &chunk_load);
+ } else
+#endif
+ chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist); /* generic loader; the only one here that sets chunk_mod */
+
+ adv_amount = sizeof(chunk_t) - chunk_mod;
+
+ while (len >= (2 * sizeof(chunk_t))) { /* two chunk stores per iteration */
+ storechunk(out, &chunk_load);
+ storechunk(out + adv_amount, &chunk_load);
+ out += 2 * adv_amount;
+ len -= 2 * adv_amount;
+ }
+
+ /* If "dist" does not divide evenly into a vector register, we can still
+ * write the whole vector register, but we may only advance by the part of
+ * it that holds whole repetitions of the string (adv_amount).
+ * If chunk_mod is 0 (as on the chunkmemset_N paths), adv_amount is the
+ * full chunk_t size. */
+ while (len >= sizeof(chunk_t)) {
+ storechunk(out, &chunk_load);
+ len -= adv_amount;
+ out += adv_amount;
+ }
+
+#ifdef HAVE_HALF_CHUNK
+rem_bytes:
+#endif
+ if (len) { /* fewer than sizeof(chunk_t) bytes remain: copy just those from the pattern chunk */
+ memcpy(out, &chunk_load, len);
+ out += len;
+ }
+
+ return out;
+}
+
+/* Bounds-aware front end for CHUNKMEMSET.
+ *
+ * Copies MIN(len, left) bytes from `from` to `out` and returns the advanced
+ * `out`. `left` is presumably the number of bytes that may still be written
+ * at `out` -- confirm against the callers.
+ *
+ * Steps:
+ *   1. When OPTIMAL_CMP < 64, bytes are copied one at a time until `out` is
+ *      aligned (8 bytes for OPTIMAL_CMP < 32, 4 bytes for OPTIMAL_CMP == 32)
+ *      or the copy is complete.
+ *   2. Unless HAVE_MASKED_READWRITE is defined, a copy with less than
+ *      sizeof(chunk_t) bytes of `left` is finished byte by byte.
+ *   3. Whatever remains is handed to CHUNKMEMSET. */
+Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, size_t len, size_t left) {
+#if OPTIMAL_CMP < 32
+    static const uintptr_t misalign_bits = 7;
+#elif OPTIMAL_CMP == 32
+    static const uintptr_t misalign_bits = 3;
+#endif
+
+    len = MIN(len, left);
+
+#if OPTIMAL_CMP < 64
+    for (; len > 0 && ((uintptr_t)out & misalign_bits) != 0; --len, --left)
+        *out++ = *from++;
+#endif
+
+#ifndef HAVE_MASKED_READWRITE
+    if (UNLIKELY(left < sizeof(chunk_t))) {
+        for (; len > 0; --len)
+            *out++ = *from++;
+
+        return out;
+    }
+#endif
+
+    return len ? CHUNKMEMSET(out, from, len) : out;
+}
+
+/* Copy from `from` to `out` with the length limited by `safe`.
+ *
+ * `len` is clamped to `safe - out` (`safe` presumably marks the end of the
+ * writable output -- confirm against the callers). When `out` and `from` are
+ * the same pointer nothing is copied and `out + len` is returned using the
+ * caller's unclamped `len`. Otherwise the advanced `out` is returned.
+ *
+ * Unless HAVE_MASKED_READWRITE is defined, the copy is done byte by byte
+ * whenever less than sizeof(chunk_t) bytes separate `out` from `safe`, or
+ * `from` from `safe`; every other case goes through CHUNKMEMSET. */
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe)
+{
+    if (from == out)
+        return out + len;
+
+    const size_t room = (safe - out);
+    len = MIN(len, room);
+
+#ifndef HAVE_MASKED_READWRITE
+    const size_t src_gap = (size_t)ABS(safe - from);
+    if (UNLIKELY(room < sizeof(chunk_t) || src_gap < sizeof(chunk_t))) {
+        for (; len != 0; --len)
+            *out++ = *from++;
+
+        return out;
+    }
+#endif
+
+    return CHUNKMEMSET(out, from, len);
+}