From ee2dd805d6418dee2cce27d6bbef20f400fc934e Mon Sep 17 00:00:00 2001
From: Nathan Moinvaziri <nathan@nathanm.com>
Date: Sat, 31 Jan 2026 16:57:24 -0800
Subject: Optimize symbol buffer access based on platform unaligned access

---
 deflate.c   |  2 +-
 deflate.h   |  8 +++++---
 deflate_p.h |  9 +++++++++
 trees.c     | 12 +++++++++---
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/deflate.c b/deflate.c
index e0f89fab9d..81e1ac5a5f 100644
--- a/deflate.c
+++ b/deflate.c
@@ -167,7 +167,7 @@ Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits,
     int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2);
     int prev_size = (1 << windowBits) * (int)sizeof(Pos);
     int head_size = HASH_SIZE * sizeof(Pos);
-    int pending_size = lit_bufsize * LIT_BUFS;
+    int pending_size = (lit_bufsize * LIT_BUFS) + 1;
     int state_size = sizeof(deflate_state);
     int alloc_size = sizeof(deflate_allocs);
 
diff --git a/deflate.h b/deflate.h
index 85435636d4..3f9f8f4686 100644
--- a/deflate.h
+++ b/deflate.h
@@ -27,9 +27,11 @@
 #  define GZIP
 #endif
 
-/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
-   the cost of a larger memory footprint */
-#ifndef NO_LIT_MEM
+/* LIT_MEM uses separate distance/length buffers instead of the overlaid sym_buf.
+   This uses ~20% more memory but is 1-2% faster on platforms without fast unaligned
+   access. By default, LIT_MEM is only enabled when OPTIMAL_CMP < 32. Define LIT_MEM to
+   force separate buffers or NO_LIT_MEM to force sym_buf; if both are set, LIT_MEM wins. */
+#if !defined(LIT_MEM) && !defined(NO_LIT_MEM) && (OPTIMAL_CMP < 32)
 #  define LIT_MEM
 #endif
 
diff --git a/deflate_p.h b/deflate_p.h
index ae340f8f37..f60970bab3 100644
--- a/deflate_p.h
+++ b/deflate_p.h
@@ -11,6 +11,7 @@
 
 #include "functable.h"
 #include "fallback_builtins.h"
+#include "zmemory.h"
 
 /* Forward declare common non-inlined functions declared in deflate.c */
 
@@ -68,9 +69,13 @@ static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
     s->l_buf[sym_next] = c;
     s->sym_next = sym_next + 1;
 #else
+#  if OPTIMAL_CMP >= 32
+    zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE((uint32_t)c << 16));
+#  else
     s->sym_buf[sym_next] = 0;
     s->sym_buf[sym_next+1] = 0;
     s->sym_buf[sym_next+2] = c;
+#  endif
     s->sym_next = sym_next + 3;
 #endif
     s->dyn_ltree[c].Freq++;
@@ -90,9 +95,13 @@ static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t le
     s->l_buf[sym_next] = (uint8_t)len;
     s->sym_next = sym_next + 1;
 #else
+#  if OPTIMAL_CMP >= 32
+    zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE(dist | ((uint32_t)len << 16)));
+#  else
     s->sym_buf[sym_next] = (uint8_t)(dist);
     s->sym_buf[sym_next+1] = (uint8_t)(dist >> 8);
     s->sym_buf[sym_next+2] = (uint8_t)len;
+#  endif
     s->sym_next = sym_next + 3;
 #endif
     s->matches++;
diff --git a/trees.c b/trees.c
index 28ea5d2f1f..0f35b68d93 100644
--- a/trees.c
+++ b/trees.c
@@ -730,9 +730,15 @@ static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data
             dist = d_buf[sx];
             lc = l_buf[sx++];
 #else
-            dist = sym_buf[sx++] & 0xff;
-            dist += (unsigned)(sym_buf[sx++] & 0xff) << 8;
-            lc = sym_buf[sx++];
+#  if OPTIMAL_CMP >= 32
+            uint32_t val = Z_U32_FROM_LE(zng_memread_4(&sym_buf[sx]));
+            dist = val & 0xffff;
+            lc = (val >> 16) & 0xff;
+#  else
+            dist = sym_buf[sx] + ((unsigned)sym_buf[sx + 1] << 8);
+            lc = sym_buf[sx + 2];
+#  endif
+            sx += 3;
 #endif
             if (dist == 0) {
                 zng_emit_lit(s, ltree, lc);
-- 
cgit 0.0.5-2-1-g0f52