From ee2dd805d6418dee2cce27d6bbef20f400fc934e Mon Sep 17 00:00:00 2001 From: Nathan Moinvaziri Date: Sat, 31 Jan 2026 16:57:24 -0800 Subject: Optimize symbol buffer access based on platform unaligned access --- deflate.c | 2 +- deflate.h | 8 +++++--- deflate_p.h | 9 +++++++++ trees.c | 12 +++++++++--- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/deflate.c b/deflate.c index e0f89fab9d..81e1ac5a5f 100644 --- a/deflate.c +++ b/deflate.c @@ -167,7 +167,7 @@ Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits, int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2); int prev_size = (1 << windowBits) * (int)sizeof(Pos); int head_size = HASH_SIZE * sizeof(Pos); - int pending_size = lit_bufsize * LIT_BUFS; + int pending_size = (lit_bufsize * LIT_BUFS) + 1; int state_size = sizeof(deflate_state); int alloc_size = sizeof(deflate_allocs); diff --git a/deflate.h b/deflate.h index 85435636d4..3f9f8f4686 100644 --- a/deflate.h +++ b/deflate.h @@ -27,9 +27,11 @@ # define GZIP #endif -/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at - the cost of a larger memory footprint */ -#ifndef NO_LIT_MEM +/* LIT_MEM uses separate distance/length buffers instead of the overlaid sym_buf. + This uses ~20% more memory but is 1-2% faster on platforms without fast unaligned + access. By default, LIT_MEM is only enabled when OPTIMAL_CMP < 32. Define LIT_MEM + to force separate buffers, or NO_LIT_MEM to force sym_buf usage. */ +#if !defined(LIT_MEM) && !defined(NO_LIT_MEM) && (OPTIMAL_CMP < 32) # define LIT_MEM #endif diff --git a/deflate_p.h b/deflate_p.h index ae340f8f37..f60970bab3 100644 --- a/deflate_p.h +++ b/deflate_p.h @@ -11,6 +11,7 @@ #include "functable.h" #include "fallback_builtins.h" +#include "zmemory.h" /* Forward declare common non-inlined functions declared in deflate.c */ @@ -68,9 +69,13 @@ static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) { s->l_buf[sym_next] = c; s->sym_next = sym_next + 1; #else +# if OPTIMAL_CMP >= 32 + zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE((uint32_t)c << 16)); +# else s->sym_buf[sym_next] = 0; s->sym_buf[sym_next+1] = 0; s->sym_buf[sym_next+2] = c; +# endif s->sym_next = sym_next + 3; #endif s->dyn_ltree[c].Freq++; @@ -90,9 +95,13 @@ static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t le s->l_buf[sym_next] = (uint8_t)len; s->sym_next = sym_next + 1; #else +# if OPTIMAL_CMP >= 32 + zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE(dist | ((uint32_t)len << 16))); +# else s->sym_buf[sym_next] = (uint8_t)(dist); s->sym_buf[sym_next+1] = (uint8_t)(dist >> 8); s->sym_buf[sym_next+2] = (uint8_t)len; +# endif s->sym_next = sym_next + 3; #endif s->matches++; diff --git a/trees.c b/trees.c index 28ea5d2f1f..0f35b68d93 100644 --- a/trees.c +++ b/trees.c @@ -730,9 +730,15 @@ static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data dist = d_buf[sx]; lc = l_buf[sx++]; #else - dist = sym_buf[sx++] & 0xff; - dist += (unsigned)(sym_buf[sx++] & 0xff) << 8; - lc = sym_buf[sx++]; +# if OPTIMAL_CMP >= 32 + uint32_t val = Z_U32_FROM_LE(zng_memread_4(&sym_buf[sx])); + dist = val & 0xffff; + lc = (val >> 16) & 0xff; +# else + dist = sym_buf[sx] + ((unsigned)sym_buf[sx + 1] << 8); + lc = sym_buf[sx + 2]; +# endif + sx += 3; #endif if (dist == 0) { zng_emit_lit(s, ltree, lc); -- cgit 0.0.5-2-1-g0f52