summaryrefslogtreecommitdiff
path: root/docs/handbook/neozip/gzip-support.md
diff options
context:
space:
mode:
Diffstat (limited to 'docs/handbook/neozip/gzip-support.md')
-rw-r--r--docs/handbook/neozip/gzip-support.md413
1 files changed, 413 insertions, 0 deletions
diff --git a/docs/handbook/neozip/gzip-support.md b/docs/handbook/neozip/gzip-support.md
new file mode 100644
index 0000000000..ada310fa88
--- /dev/null
+++ b/docs/handbook/neozip/gzip-support.md
@@ -0,0 +1,413 @@
+# Gzip Support
+
+## Overview
+
+Neozip provides a complete gzip file I/O layer on top of the core
+deflate/inflate engine. This layer is implemented in four files:
+
+- `gzlib.c` — Shared state management, file open/close, seeking
+- `gzread.c` — Gzip file reading (decompression)
+- `gzwrite.c` — Gzip file writing (compression)
+- `gzguts.h` — Internal structures and constants
+
+The gzip API is enabled by the `WITH_GZFILEOP` CMake option (ON by default).
+
+---
+
+## The `gz_state` Structure
+
+From `gzguts.h`:
+
+```c
+typedef struct {
+ // Identification
+ PREFIX3(stream) strm; // Inflate/deflate stream
+ int mode; // GZ_READ or GZ_WRITE
+ int fd; // File descriptor
+ char *path; // Path for error messages
+ unsigned size; // Allocated buffer size (0 until buffers are allocated)
+
+ // Buffering
+ unsigned want; // Requested buffer size (default GZBUFSIZE)
+ unsigned char *in; // Input buffer (read mode)
+ unsigned char *out; // Output buffer
+ int direct; // 0=compressed, 1=passthrough (not gzip)
+
+ // Position tracking
+ z_off64_t start; // Start of compressed data (after header)
+ z_off64_t raw; // Raw (compressed) file position
+ z_off64_t pos; // Uncompressed data position
+ int eof; // End of input file reached
+ int past; // Read past end of input
+
+ // Error tracking
+ int err; // Error code
+ char *msg; // Error message (or NULL)
+ int how; // 0=look for header, 1=copy, 2=decompress
+
+ // Write mode
+ int level; // Compression level
+ int strategy; // Compression strategy
+ int reset; // true if deflateReset needed
+
+ // Seeking
+ z_off64_t skip; // Bytes to skip during next read
+
+ // Peek
+ int seek; // Seek request pending
+} gz_state;
+```
+
+### Constants
+
+```c
+#define GZBUFSIZE 131072 // Default buffer size (128 KB)
+#define GZ_READ 7247 // Sentinel for read mode
+#define GZ_WRITE 31153 // Sentinel for write mode
+#define GZ_APPEND 1 // Mode flag for append
+```
+
+The sentinel values `GZ_READ` and `GZ_WRITE` are non-obvious integers
+chosen to catch state corruption.
+
+---
+
+## File Open (`gzlib.c`)
+
+### `gzopen()` / `gzdopen()`
+
+```c
+gzFile PREFIX(gzopen)(const char *path, const char *mode);
+gzFile PREFIX(gzdopen)(int fd, const char *mode);
+gzFile PREFIX(gzopen64)(const char *path, const char *mode);
+```
+
+The mode string supports:
+- `r` — Read (decompress)
+- `w` — Write (compress)
+- `a` — Append (compress, append to existing file)
+- `0-9` — Compression level
+- `f` — `Z_FILTERED` strategy
+- `h` — `Z_HUFFMAN_ONLY` strategy
+- `R` — `Z_RLE` strategy
+- `F` — `Z_FIXED` strategy
+- `T` — Direct/transparent (no compression)
+
+### `gz_state_init()`
+
+```c
+static void gz_state_init(gz_state *state) {
+ state->size = 0;
+ state->want = GZBUFSIZE;
+ state->in = NULL;
+ state->out = NULL;
+ state->direct = 0;
+ state->err = Z_OK;
+ state->pos = 0;
+ state->strm.avail_in = 0;
+}
+```
+
+### `gz_buffer_alloc()`
+
+Allocates I/O buffers:
+
+```c
+static int gz_buffer_alloc(gz_state *state) {
+ unsigned size = state->want;
+
+ if (state->mode == GZ_READ) {
+ // Read: input buffer = size, output buffer = size * 2
+ state->in = malloc(size);
+ state->out = malloc(size << 1);
+ state->size = size;
+ } else {
+ // Write: output buffer = size
+ state->in = NULL;
+ state->out = malloc(size);
+ state->size = size;
+ }
+ return 0;
+}
+```
+
+In read mode, the output buffer is doubled to handle cases where
+decompression expands data significantly within a single call.
+
+---
+
+## Reading (`gzread.c`)
+
+### Read Pipeline
+
+```
+gz_read() → gz_fetch() → gz_decomp() → inflate()
+ ↘ gz_look() (header detection)
+```
+
+### `gz_look()` — Header Detection
+
+Determines if the file is gzip-compressed or raw:
+
+```c
+static int gz_look(gz_state *state) {
+ // Read enough to check for gzip magic number
+ if (state->strm.avail_in < 2) {
+ // Read from file
+ int got = read(state->fd, state->in, state->size);
+ state->strm.avail_in = got;
+ state->strm.next_in = state->in;
+ }
+
+ // Check for gzip magic (1f 8b)
+ if (state->strm.avail_in >= 2 &&
+ state->in[0] == 0x1f && state->in[1] == 0x8b) {
+ // Initialize inflate for gzip
+ inflateInit2(&state->strm, 15 + 16); // windowBits + 16 = gzip
+ state->how = 2; // Decompress mode
+ } else {
+ // Not gzip — pass through directly
+ state->direct = 1;
+ state->how = 1; // Copy mode
+ }
+}
+```
+
+### `gz_decomp()` — Decompression
+
+```c
+static int gz_decomp(gz_state *state) {
+ int ret;
+ unsigned had = state->strm.avail_out;
+
+ // Call inflate
+ ret = PREFIX(inflate)(&state->strm, Z_NO_FLUSH);
+ state->pos += had - state->strm.avail_out;
+
+ if (ret == Z_STREAM_END) {
+ // End of gzip member — may be concatenated gzip
+ inflateReset(&state->strm);
+ state->how = 0; // Need to look for next member
+ }
+ return 0;
+}
+```
+
+### `gz_fetch()` — Fetch More Data
+
+```c
+static int gz_fetch(gz_state *state) {
+ do {
+ switch (state->how) {
+ case 0: // Look for gzip header
+ if (gz_look(state) == -1) return -1;
+ if (state->how == 0) return 0; // EOF
+ break;
+ case 1: // Copy raw data
+ if (gz_load(state, state->out, state->size << 1, &got) == -1)
+ return -1;
+ state->pos += got;
+ break;
+ case 2: // Decompress
+ if (state->strm.avail_in == 0) {
+ // Refill input buffer
+ int got = read(state->fd, state->in, state->size);
+ state->strm.avail_in = got;
+ state->strm.next_in = state->in;
+ }
+ if (gz_decomp(state) == -1) return -1;
+ break;
+ }
+ } while (state->strm.avail_out && !state->eof);
+ return 0;
+}
+```
+
+### Public Read API
+
+```c
+int PREFIX(gzread)(gzFile file, void *buf, unsigned len);
+int PREFIX(gzgetc)(gzFile file); // Read single character
+char *PREFIX(gzgets)(gzFile file, char *buf, int len); // Read line
+int PREFIX(gzungetc)(int c, gzFile file); // Push back character
+int PREFIX(gzdirect)(gzFile file); // Check if raw
+```
+
+---
+
+## Writing (`gzwrite.c`)
+
+### Write Pipeline
+
+```
+gz_write() → gz_comp() → deflate()
+```
+
+### `gz_write_init()` — Lazy Initialisation
+
+```c
+static int gz_write_init(gz_state *state) {
+ // Allocate output buffer
+ gz_buffer_alloc(state);
+
+ // Initialize deflate
+ state->strm.next_out = state->out;
+ state->strm.avail_out = state->size;
+
+ int ret = PREFIX(deflateInit2)(&state->strm,
+ state->level, Z_DEFLATED,
+ 15 + 16, // windowBits + 16 = gzip wrapping
+ DEF_MEM_LEVEL, state->strategy);
+
+ return ret == Z_OK ? 0 : -1;
+}
+```
+
+### `gz_comp()` — Compress Buffered Data
+
+```c
+static int gz_comp(gz_state *state, int flush) {
+ int ret;
+ unsigned have;
+
+ // Deflate until done
+ do {
+ if (state->strm.avail_out == 0) {
+ // Flush output buffer to file
+ have = state->size;
+ if (write(state->fd, state->out, have) != have) {
+ state->err = Z_ERRNO;
+ return -1;
+ }
+ state->strm.next_out = state->out;
+ state->strm.avail_out = state->size;
+ }
+ ret = PREFIX(deflate)(&state->strm, flush);
+ } while (ret == Z_OK && state->strm.avail_out == 0);
+
+ if (flush == Z_FINISH && ret == Z_STREAM_END) {
+ // Write final output
+ have = state->size - state->strm.avail_out;
+ if (have && write(state->fd, state->out, have) != have) {
+ state->err = Z_ERRNO;
+ return -1;
+ }
+ }
+ return 0;
+}
+```
+
+### Public Write API
+
+```c
+int PREFIX(gzwrite)(gzFile file, const void *buf, unsigned len);
+int PREFIX(gzputc)(gzFile file, int c);
+int PREFIX(gzputs)(gzFile file, const char *s);
+int PREFIX(gzprintf)(gzFile file, const char *format, ...);
+int PREFIX(gzflush)(gzFile file, int flush);
+int PREFIX(gzsetparams)(gzFile file, int level, int strategy);
+```
+
+---
+
+## Seeking and Position
+
+```c
+z_off64_t PREFIX(gzseek64)(gzFile file, z_off64_t offset, int whence);
+z_off64_t PREFIX(gztell64)(gzFile file);
+z_off64_t PREFIX(gzoffset64)(gzFile file);
+int PREFIX(gzrewind)(gzFile file);
+int PREFIX(gzeof)(gzFile file);
+```
+
+### Forward Seeking
+
+For read mode, seeking forward decompresses and discards data:
+
+```c
+// In gzseek: forward seek in read mode
+state->skip = offset; // Will be consumed during next gz_fetch
+```
+
+### Backward Seeking
+
+Backward seeking requires a full rewind and re-decompression:
+
+```c
+// Must reset and decompress from the beginning
+gzrewind(file);
+state->skip = offset;
+```
+
+---
+
+## Gzip Format
+
+A gzip file (RFC 1952) consists of:
+
+```
+┌──────────────────────────────────┐
+│ Header (10+ bytes) │
+│ 1F 8B — magic number │
+│ 08 — compression method │
+│ FLG — flags │
+│ MTIME — modification time │
+│ XFL — extra flags │
+│ OS — operating system │
+│ [EXTRA] [NAME] [COMMENT] [HCRC]│
+├──────────────────────────────────┤
+│ Compressed data (deflate) │
+├──────────────────────────────────┤
+│ Trailer (8 bytes) │
+│ CRC32 — CRC of original data │
+│ ISIZE — size of original data │
+└──────────────────────────────────┘
+```
+
+FLG bits:
+- `FTEXT` (0x01) — Text mode hint
+- `FHCRC` (0x02) — Header CRC present
+- `FEXTRA` (0x04) — Extra field present
+- `FNAME` (0x08) — Original filename present
+- `FCOMMENT` (0x10) — Comment present
+
+### Concatenated Gzip
+
+Multiple gzip members can be concatenated. `gzread()` transparently
+decompresses all members in sequence, resetting the inflate state at
+each `Z_STREAM_END` boundary.
+
+---
+
+## Error Handling
+
+```c
+const char *PREFIX(gzerror)(gzFile file, int *errnum); // Get error message
+void PREFIX(gzclearerr)(gzFile file); // Clear error state
+```
+
+The `gz_state.err` field tracks errors:
+- `Z_OK` — No error
+- `Z_ERRNO` — System I/O error (check `errno`)
+- `Z_STREAM_ERROR` — Invalid state
+- `Z_DATA_ERROR` — Corrupted gzip data
+- `Z_MEM_ERROR` — Memory allocation failure
+- `Z_BUF_ERROR` — Insufficient buffer space
+
+---
+
+## Close
+
+```c
+int PREFIX(gzclose)(gzFile file);
+int PREFIX(gzclose_r)(gzFile file); // Close read-mode file
+int PREFIX(gzclose_w)(gzFile file); // Close write-mode file
+```
+
+`gzclose_w()` flushes pending output with `Z_FINISH`, writes the
+remaining compressed data, then calls `deflateEnd()`.
+
+`gzclose_r()` calls `inflateEnd()` and frees buffers.
+
+Both close the underlying file descriptor, including one supplied via
+`gzdopen()`; pass a `dup()`'d descriptor if the original must stay open.