summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim Kukunas <james.t.kukunas@linux.intel.com>2013-07-02 12:09:37 -0700
committerJim Kukunas <james.t.kukunas@linux.intel.com>2014-06-03 15:37:51 -0700
commit5640481288aaf91efcc27622ae973d373bfc2cf3 (patch)
tree54fae19f4d106fcfb94f3c6bc2761d443fd199c8
parentfd80ca4fb96b8bdeb4e7f9ec863ebc34a9303968 (diff)
downloadProject-Tick-5640481288aaf91efcc27622ae973d373bfc2cf3.tar.gz
Project-Tick-5640481288aaf91efcc27622ae973d373bfc2cf3.zip
Adds SSE2 optimized hash shifting to fill_window.
Uses SSE2 subtraction with saturation to shift the hash in 16B chunks. Renames the old fill_window implementation to fill_window_c(), and adds a new fill_window_sse() implementation in fill_window_sse.c. Moves UPDATE_HASH into deflate.h and changes the scope of read_buf from local to ZLIB_INTERNAL for sharing between the two implementations. Updates the configure script to check for SSE2 intrinsics and enables this optimization by default on x86. The runtime check for SSE2 support only occurs on 32-bit, as x86_64 requires SSE2. Adds an explicit rule in Makefile.in to build fill_window_sse.c with the -msse2 compiler flag, which is required for SSE2 intrinsics.
-rw-r--r--Makefile.in15
-rwxr-xr-xconfigure45
-rw-r--r--deflate.c59
-rw-r--r--deflate.h8
-rw-r--r--fill_window_sse.c172
5 files changed, 279 insertions, 20 deletions
diff --git a/Makefile.in b/Makefile.in
index c61aa3008d..4774810f22 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -45,6 +45,9 @@ TAR=tar
SHELL=/bin/sh
EXE=
+FILL_WINDOW_SSE_o=
+FILL_WINDOW_SSE_lo=
+
prefix = /usr/local
exec_prefix = ${prefix}
libdir = ${exec_prefix}/lib
@@ -54,11 +57,11 @@ mandir = ${prefix}/share/man
man3dir = ${mandir}/man3
pkgconfigdir = ${libdir}/pkgconfig
-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
+OBJZ = adler32.o crc32.o ${FILL_WINDOW_SSE_o} deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
OBJC = $(OBJZ) $(OBJG)
-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
+PIC_OBJZ = adler32.lo crc32.lo ${FILL_WINDOW_SSE_lo} deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
@@ -113,6 +116,14 @@ test64: all64
fi; \
rm -f $$TMP64
+fill_window_sse.lo: fill_window_sse.c
+ -@mkdir objs 2>/dev/null || test -d objs
+ $(CC) $(SFLAGS) -msse2 -DPIC -c -o objs/$*.o $<
+ -@mv objs/$*.o $@
+
+fill_window_sse.o: fill_window_sse.c
+ ${CC} ${CFLAGS} -msse2 -I. -c -o $@ fill_window_sse.c
+
infcover.o: test/infcover.c zlib.h zconf.h
$(CC) $(CFLAGS) -I. -c -o $@ test/infcover.c
diff --git a/configure b/configure
index ff66ab32e5..9755cbeba5 100755
--- a/configure
+++ b/configure
@@ -760,6 +760,23 @@ EOF
fi
fi
+# Check for SSE2 intrinsics
+cat > $test.c << EOF
+#include <immintrin.h>
+int main(void)
+{
+ __m128i zero = _mm_setzero_si128();
+ return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} -msse2 $test.c; then
+ echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log
+ HAVE_SSE2_INTRIN=1
+else
+ echo "Checking for SSE2 intrinsics ... No." | tee -a configure.log
+ HAVE_SSE2_INTRIN=0
+fi
+
# Set ARCH specific FLAGS
case "${ARCH}" in
x86_64)
@@ -774,6 +791,18 @@ case "${ARCH}" in
CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
+ if test ${HAVE_SSE2_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+ SFLAGS="${SFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+ FILL_WINDOW_SSE_o="fill_window_sse.o"
+ FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+ OBJS="${OBJS} ${FILL_WINDOW_SSE_o}"
+ PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}"
+ else
+ FILL_WINDOW_SSE_o=""
+ FILL_WINDOW_SSE_lo=""
+ fi
;;
i386 | i486 | i586 | i686)
OBJC="${OBJC} x86.o"
@@ -787,6 +816,18 @@ case "${ARCH}" in
CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
+ if test ${HAVE_SSE2_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+ SFLAGS="${SFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+ FILL_WINDOW_SSE_o="fill_window_sse.o"
+ FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+ OBJS="${OBJS} ${FILL_WINDOW_SSE_o}"
+ PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}"
+ else
+ FILL_WINDOW_SSE_o=""
+ FILL_WINDOW_SSE_lo=""
+ fi
;;
esac
@@ -821,6 +862,8 @@ echo mandir = $mandir >> configure.log
echo prefix = $prefix >> configure.log
echo sharedlibdir = $sharedlibdir >> configure.log
echo uname = $uname >> configure.log
+echo FILL_WINDOW_SSE_o = ${FILL_WINDOW_SSE_o} >> configure.log
+echo FILL_WINDOW_SSE_lo= ${FILL_WINDOW_SSE_lo} >> configure.log
# udpate Makefile with the configure results
sed < Makefile.in "
@@ -850,6 +893,8 @@ sed < Makefile.in "
/^PIC_OBJC *=/s#=.*#= $PIC_OBJC#
/^all: */s#:.*#: $ALL#
/^test: */s#:.*#: $TEST#
+/^FILL_WINDOW_SSE_o *=/s#=.*#=$FILL_WINDOW_SSE_o#
+/^FILL_WINDOW_SSE_lo *=/s#=.*#=$FILL_WINDOW_SSE_lo#
" > Makefile
# create zlib.pc with the configure results
diff --git a/deflate.c b/deflate.c
index 96f555b016..32df2119bf 100644
--- a/deflate.c
+++ b/deflate.c
@@ -84,7 +84,7 @@ local block_state deflate_huff OF((deflate_state *s, int flush));
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
-local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match));
@@ -158,14 +158,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
/* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */
#define RANK(f) (((f) << 1) - ((f) > 4 ? 9 : 0))
-/* ===========================================================================
- * Update a hash value with the given input byte
- * IN assertion: all calls to to UPDATE_HASH are made with consecutive
- * input characters, so that a running hash key can be computed from the
- * previous key instead of complete recalculation each time.
- */
-#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
@@ -179,12 +171,12 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
*/
#ifdef FASTEST
#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ (UPDATE_HASH(s, s->ins_h, (str)), \
match_head = s->head[s->ins_h], \
s->head[s->ins_h] = (Pos)(str))
#else
#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ (UPDATE_HASH(s, s->ins_h, (str)), \
match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
s->head[s->ins_h] = (Pos)(str))
#endif
@@ -197,6 +189,10 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
s->head[s->hash_size-1] = NIL; \
zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
+#ifdef CHECK_SSE2
+#include "x86.h"
+#endif
+
/* ========================================================================= */
int ZEXPORT deflateInit_(strm, level, version, stream_size)
z_streamp strm;
@@ -230,6 +226,10 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
* output size for (length,distance) codes is <= 24 bits.
*/
+#ifdef CHECK_SSE2
+ x86_check_features();
+#endif
+
if (version == Z_NULL || version[0] != my_version[0] ||
stream_size != sizeof(z_stream)) {
return Z_VERSION_ERROR;
@@ -365,7 +365,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
str = s->strstart;
n = s->lookahead - (MIN_MATCH-1);
do {
- UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+ UPDATE_HASH(s, s->ins_h, str);
#ifndef FASTEST
s->prev[str & s->w_mask] = s->head[s->ins_h];
#endif
@@ -1073,7 +1073,7 @@ int ZEXPORT deflateCopy (dest, source)
* allocating a large strm->next_in buffer and copying from it.
* (See also flush_pending()).
*/
-local int read_buf(strm, buf, size)
+ZLIB_INTERNAL int read_buf(strm, buf, size)
z_streamp strm;
Bytef *buf;
unsigned size;
@@ -1171,10 +1171,31 @@ local void check_match(s, start, match, length)
* performed for at least two bytes (required for the zip translate_eol
* option -- not supported here).
*/
-local void fill_window(s)
+#ifdef HAVE_SSE2
+extern void fill_window_sse(deflate_state *s);
+#endif
+local void fill_window_c(deflate_state *s);
+
+local void fill_window(deflate_state *s)
+{
+#ifdef HAVE_SSE2
+#ifdef CHECK_SSE2
+ if (x86_cpu_has_sse2) {
+#endif
+ fill_window_sse(s);
+ return;
+#ifdef CHECK_SSE2
+ }
+#endif
+#endif
+
+ fill_window_c(s);
+}
+
+local void fill_window_c(s)
deflate_state *s;
{
- register unsigned n, m;
+ register unsigned n;
register Posf *p;
unsigned more; /* Amount of free space at the end of the window. */
uInt wsize = s->w_size;
@@ -1216,6 +1237,7 @@ local void fill_window(s)
n = s->hash_size;
p = &s->head[n];
do {
+ unsigned m;
m = *--p;
*p = (Pos)(m >= wsize ? m-wsize : NIL);
} while (--n);
@@ -1224,6 +1246,7 @@ local void fill_window(s)
#ifndef FASTEST
p = &s->prev[n];
do {
+ unsigned m;
m = *--p;
*p = (Pos)(m >= wsize ? m-wsize : NIL);
/* If n is not on any hash chain, prev[n] is garbage but
@@ -1255,12 +1278,12 @@ local void fill_window(s)
if (s->lookahead + s->insert >= MIN_MATCH) {
uInt str = s->strstart - s->insert;
s->ins_h = s->window[str];
- UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
+ UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
while (s->insert) {
- UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+ UPDATE_HASH(s, s->ins_h, str);
#ifndef FASTEST
s->prev[str & s->w_mask] = s->head[s->ins_h];
#endif
@@ -1478,7 +1501,7 @@ local block_state deflate_fast(s, flush)
s->strstart += s->match_length;
s->match_length = 0;
s->ins_h = s->window[s->strstart];
- UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+ UPDATE_HASH(s, s->ins_h, s->strstart+1 - (MIN_MATCH-1));
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
diff --git a/deflate.h b/deflate.h
index ce0299edd1..f1c1ed9ba6 100644
--- a/deflate.h
+++ b/deflate.h
@@ -343,4 +343,12 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
flush = _tr_tally(s, distance, length)
#endif
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN assertion: all calls to to UPDATE_HASH are made with consecutive
+ * input characters, so that a running hash key can be computed from the
+ * previous key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,i) (h = (((h)<<s->hash_shift) ^ (s->window[i + (MIN_MATCH-1)])) & s->hash_mask)
+
#endif /* DEFLATE_H */
diff --git a/fill_window_sse.c b/fill_window_sse.c
new file mode 100644
index 0000000000..e07fd1c439
--- /dev/null
+++ b/fill_window_sse.c
@@ -0,0 +1,172 @@
+/*
+ * Fill Window with SSE2-optimized hash shifting
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef HAVE_SSE2
+
+#include <immintrin.h>
+#include "deflate.h"
+
+void fill_window_sse(deflate_state *s)
+{
+ z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+ register unsigned n;
+ register Posf *p;
+ unsigned more; /* Amount of free space at the end of the window. */
+ uInt wsize = s->w_size;
+
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+ do {
+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+ /* Deal with !@#$% 64K limit: */
+ if (sizeof(int) <= 2) {
+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+ more = wsize;
+
+ } else if (more == (unsigned)(-1)) {
+ /* Very unlikely, but possible on 16 bit machine if
+ * strstart == 0 && lookahead == 1 (input done a byte at time)
+ */
+ more--;
+ }
+ }
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+
+ zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= (long) wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ n = s->hash_size;
+ p = &s->head[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+
+ n = wsize;
+#ifndef FASTEST
+ p = &s->prev[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+#endif
+ more += wsize;
+ }
+ if (s->strm->avail_in == 0) break;
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead + s->insert >= MIN_MATCH) {
+ uInt str = s->strstart - s->insert;
+ s->ins_h = s->window[str];
+ if (str >= 1)
+ UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ while (s->insert) {
+ UPDATE_HASH(s, s->ins_h, str);
+#ifndef FASTEST
+ s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+ s->head[s->ins_h] = (Pos)str;
+ str++;
+ s->insert--;
+ if (s->lookahead + s->insert < MIN_MATCH)
+ break;
+ }
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ ulg curr = s->strstart + (ulg)(s->lookahead);
+ ulg init;
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ zmemzero(s->window + curr, (unsigned)init);
+ s->high_water = curr + init;
+ }
+ else if (s->high_water < (ulg)curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = (ulg)curr + WIN_INIT - s->high_water;
+ if (init > s->window_size - s->high_water)
+ init = s->window_size - s->high_water;
+ zmemzero(s->window + s->high_water, (unsigned)init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
+}
+#endif