summaryrefslogtreecommitdiff
path: root/docs/handbook/neozip/arm-optimizations.md
blob: c7fa94e505758d6862e169a98c0e1dc42000b387 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
# ARM Optimizations

## Overview

Neozip provides ARM SIMD optimizations using NEON (Advanced SIMD), CRC32
hardware instructions, and PMULL (polynomial multiply long). These cover
both AArch32 (ARMv7+) and AArch64 (ARMv8+) targets. All implementations
reside in `arch/arm/`.

---

## Source Files

| File | ISA Extension | Function |
|---|---|---|
| `arm_features.c/h` | — | Feature detection |
| `adler32_neon.c` | NEON | Adler-32 checksum |
| `chunkset_neon.c` | NEON | Pattern fill for inflate |
| `compare256_neon.c` | NEON | 256-byte string comparison |
| `crc32_acle.c` | CRC32 | Hardware CRC-32 |
| `crc32_pmull.c` | PMULL | CLMUL-based CRC-32 |
| `insert_string_acle.c` | CRC32 | CRC-based hash insertion |
| `slide_hash_neon.c` | NEON | Hash table slide |
| `inffast_neon.c` | NEON | Fast inflate inner loop |

---

## Feature Detection

### `arm_cpu_features` Structure

```c
struct arm_cpu_features {
    int has_simd;        // ARMv6 SIMD (AArch32 only)
    int has_neon;        // NEON / ASIMD
    int has_crc32;       // CRC32 instructions (ARMv8.0-A optional, ARMv8.1-A mandatory)
    int has_pmull;       // PMULL (polynomial multiply long, 64→128-bit)
    int has_eor3;        // SHA3 EOR3 instruction (ARMv8.2-A+SHA3)
    int has_fast_pmull;  // High-perf PMULL
};
```

### Linux Detection

```c
void Z_INTERNAL arm_check_features(struct cpu_features *features) {
#if defined(__linux__)
    unsigned long hwcap = getauxval(AT_HWCAP);
#if defined(__aarch64__)
    features->arm.has_neon  = !!(hwcap & HWCAP_ASIMD);
    features->arm.has_crc32 = !!(hwcap & HWCAP_CRC32);
    features->arm.has_pmull = !!(hwcap & HWCAP_PMULL);
    unsigned long hwcap2 = getauxval(AT_HWCAP2);
    features->arm.has_eor3 = !!(hwcap2 & HWCAP2_SHA3);
#else  // AArch32
    unsigned long hwcap2 = getauxval(AT_HWCAP2);  // CRC32/PMULL live in HWCAP2 on AArch32
    features->arm.has_simd = !!(hwcap & HWCAP_ARM_VFPv3);
    features->arm.has_neon = !!(hwcap & HWCAP_ARM_NEON);
    features->arm.has_crc32 = !!(hwcap2 & HWCAP2_CRC32);
    features->arm.has_pmull = !!(hwcap2 & HWCAP2_PMULL);
#endif
#endif
}
```

### macOS/iOS Detection

```c
#if defined(__APPLE__)
    // NEON is always available on Apple Silicon
    features->arm.has_neon = 1;
    features->arm.has_crc32 = has_feature("hw.optional.armv8_crc32");
    features->arm.has_pmull = has_feature("hw.optional.arm.FEAT_PMULL");
#endif
```

### Windows Detection

```c
#if defined(_WIN32)
    features->arm.has_neon  = IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
    features->arm.has_crc32 = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
#endif
```

---

## NEON Adler-32 (`adler32_neon.c`)

Uses 128-bit NEON registers to process 16 bytes per iteration:

```c
Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    // Position weight vector: {16,15,14,...,1}
    static const uint8_t taps[] = {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
    uint8x16_t vtaps = vld1q_u8(taps);

    while (len >= 16) {
        uint32x4_t vs1 = vdupq_n_u32(0);
        uint32x4_t vs2 = vdupq_n_u32(0);
        uint32x4_t vs1_0 = vdupq_n_u32(s1);

        // Process up to NMAX bytes before reduction
        size_t block = MIN(len, NMAX);
        size_t nblocks = block / 16;

        for (size_t i = 0; i < nblocks; i++) {
            uint8x16_t vbuf = vld1q_u8(buf);

            // s1 += sum(bytes)
            uint16x8_t sum16 = vpaddlq_u8(vbuf);
            uint32x4_t sum32 = vpaddlq_u16(sum16);
            vs1 = vaddq_u32(vs1, sum32);

            // s2 += 16 * s1_prev + weighted_sum(bytes)
            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_0, 4));  // accumulate 16 * s1
            // Multiply-accumulate: weighted position sum
            uint16x8_t prod = vmull_u8(vget_low_u8(vbuf), vget_low_u8(vtaps));
            prod = vmlal_u8(prod, vget_high_u8(vbuf), vget_high_u8(vtaps));
            vs2 = vaddq_u32(vs2, vpaddlq_u16(prod));

            vs1_0 = vs1;
            buf += 16;
        }

        // Horizontal reduction
        s1 += vaddvq_u32(vs1);
        s2 += vaddvq_u32(vs2);
        s1 %= BASE;
        s2 %= BASE;
        len -= nblocks * 16;
    }
    // NOTE: the remaining (len % 16) tail bytes are processed with scalar
    // code in the real implementation; elided here for brevity.
    return s1 | (s2 << 16);
}
```

Key NEON intrinsics used:
- `vpaddlq_u8` — Pairwise add long (u8→u16)
- `vpaddlq_u16` — Pairwise add long (u16→u32)
- `vmull_u8` — Multiply long (u8×u8→u16)
- `vmlal_u8` — Multiply-accumulate long
- `vaddvq_u32` — Horizontal sum across vector (AArch64)

---

## Hardware CRC-32 (`crc32_acle.c`)

Uses ARMv8 CRC32 instructions via ACLE (ARM C Language Extensions):

```c
Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
    crc = ~crc;  // CRC32 instructions use inverted convention

    // Process 8 bytes at a time. ARMv8 permits unaligned loads, but the
    // direct cast is a strict-aliasing hazard; production code loads via memcpy.
    while (len >= 8) {
        crc = __crc32d(crc, *(uint64_t *)buf);
        buf += 8;
        len -= 8;
    }

    // Process 4 bytes
    if (len >= 4) {
        crc = __crc32w(crc, *(uint32_t *)buf);
        buf += 4;
        len -= 4;
    }

    // Process remaining bytes
    while (len--) {
        crc = __crc32b(crc, *buf++);
    }

    return ~crc;
}
```

The `__crc32b`, `__crc32w`, `__crc32d` intrinsics compile to single CRC32
instructions, computing CRC-32 of 1/4/8 bytes per instruction.

---

## PMULL CRC-32 (`crc32_pmull.c`)

For larger data, polynomial multiply (PMULL) provides higher throughput
via carry-less multiplication, similar to x86 PCLMULQDQ:

```c
Z_INTERNAL uint32_t crc32_pmull(uint32_t crc, const uint8_t *buf, size_t len) {
    poly128_t fold_const;
    uint64x2_t crc0, crc1, crc2, crc3;

    // Initialize four accumulators with first 64 bytes
    crc0 = veorq_u64(vld1q_u64((uint64_t *)buf),
                      vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
    // ... crc1, crc2, crc3

    // Main fold loop: 64 bytes per iteration
    while (len >= 64) {
        // vmull_p64: 64×64→128-bit polynomial multiply
        poly128_t h0 = vmull_p64(vgetq_lane_u64(crc0, 0), fold_lo);
        poly128_t h1 = vmull_p64(vgetq_lane_u64(crc0, 1), fold_hi);
        crc0 = veorq_u64(vreinterpretq_u64_p128(h0),
                          vreinterpretq_u64_p128(h1));
        crc0 = veorq_u64(crc0, vld1q_u64((uint64_t *)buf));
        // repeat for crc1..crc3, then advance buf and decrement len by 64
    }

    // Barrett reduction to 32-bit CRC
}
```

With `has_eor3` (SHA3 extension), three-way XOR is done in a single
instruction:

```c
#ifdef ARM_FEATURE_SHA3
    // EOR3: a ^= b ^ c in one instruction
    crc0 = vreinterpretq_u64_u8(veor3q_u8(
        vreinterpretq_u8_p128(h0),
        vreinterpretq_u8_p128(h1),
        vreinterpretq_u8_u64(data)));
#endif
```

---

## NEON String Comparison (`compare256_neon.c`)

```c
Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    do {
        uint8x16_t v0 = vld1q_u8(src0 + len);
        uint8x16_t v1 = vld1q_u8(src1 + len);
        uint8x16_t cmp = vceqq_u8(v0, v1);

        // Check if all bytes matched
        uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
        uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);

        if (mask_lo != ~0ULL) {
            // First mismatch in lower 8 bytes
            return len + (__builtin_ctzll(~mask_lo) >> 3);
        }
        if (mask_hi != ~0ULL) {
            return len + 8 + (__builtin_ctzll(~mask_hi) >> 3);
        }
        len += 16;
    } while (len < 256);
    return 256;
}
```

---

## NEON Slide Hash (`slide_hash_neon.c`)

```c
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    unsigned n;
    Pos *p;
    uint16x8_t vw = vdupq_n_u16((uint16_t)s->w_size);

    n = HASH_SIZE;
    p = &s->head[n];
    do {
        p -= 8;
        uint16x8_t val = vld1q_u16(p);
        val = vqsubq_u16(val, vw);   // Saturating subtract
        vst1q_u16(p, val);
        n -= 8;
    } while (n);

    // Same loop for s->prev[0..w_size-1]
    n = s->w_size;
    p = &s->prev[n];
    do {
        p -= 8;
        uint16x8_t val = vld1q_u16(p);
        val = vqsubq_u16(val, vw);
        vst1q_u16(p, val);
        n -= 8;
    } while (n);
}
```

`vqsubq_u16` performs unsigned saturating subtract — values below zero
clamp to zero rather than wrapping.

---

## NEON Chunk Memory Set (`chunkset_neon.c`)

Used during inflate for back-reference copies:

```c
Z_INTERNAL uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from,
                                           unsigned dist, unsigned len) {
    if (dist == 1) {
        // Broadcast single byte
        uint8x16_t vfill = vdupq_n_u8(*from);
        while (len >= 16) {
            vst1q_u8(out, vfill);
            out += 16;
            len -= 16;
        }
    } else if (dist == 2) {
        uint8x16_t v = vreinterpretq_u8_u16(vdupq_n_u16(*(uint16_t *)from));
        // ...
    } else if (dist >= 16) {
        // Standard copy
        while (len >= 16) {
            vst1q_u8(out, vld1q_u8(from));
            out += 16;
            from += 16;
            len -= 16;
        }
    } else {
        // Replicate dist-byte pattern into 16 bytes
        // ...
    }
    return out;
}
```

---

## CRC-Based Hash Insertion (`insert_string_acle.c`)

When ARMv8 CRC32 instructions are available, they provide excellent hash
distribution:

```c
Z_INTERNAL Pos insert_string_acle(deflate_state *s, Pos str, unsigned count) {
    Pos idx = 0;  // avoid returning an indeterminate value when count == 0
    for (unsigned i = 0; i < count; i++) {
        uint32_t val = *(uint32_t *)(s->window + str + i);
        uint32_t h = __crc32w(0, val);
        h &= s->hash_mask;
        idx = s->head[h];
        s->prev[(str + i) & s->w_mask] = idx;
        s->head[h] = (Pos)(str + i);
    }
    return idx;
}
```

---

## CMake Configuration

ARM features are detected via compiler intrinsic checks:

```cmake
option(WITH_NEON "Build with NEON SIMD" ON)
option(WITH_ACLE "Build with ACLE CRC" ON)

# AArch64 compiler flags
if(WITH_NEON)
    check_c_compiler_flag("-march=armv8-a+simd" HAS_NEON)
    if(HAS_NEON)
        set_property(SOURCE arch/arm/adler32_neon.c APPEND
                     PROPERTY COMPILE_OPTIONS -march=armv8-a+simd)
        # ... other NEON sources
        add_definitions(-DARM_NEON)
    endif()
endif()

if(WITH_ACLE)
    check_c_compiler_flag("-march=armv8-a+crc" HAS_CRC32)
    if(HAS_CRC32)
        set_property(SOURCE arch/arm/crc32_acle.c APPEND
                     PROPERTY COMPILE_OPTIONS -march=armv8-a+crc)
        add_definitions(-DARM_ACLE_CRC_HASH)
    endif()
    check_c_compiler_flag("-march=armv8-a+crypto" HAS_PMULL)
    if(HAS_PMULL)
        set_property(SOURCE arch/arm/crc32_pmull.c APPEND
                     PROPERTY COMPILE_OPTIONS -march=armv8-a+crypto)
        add_definitions(-DARM_PMULL_CRC)
    endif()
endif()
```

---

## Performance Notes

| Operation | NEON | CRC32 HW | PMULL |
|---|---|---|---|
| Adler-32 | ~8 bytes/cycle | — | — |
| CRC-32 | — | ~4 bytes/cycle | ~16 bytes/cycle |
| CRC-32+Copy | — | — | ~12 bytes/cycle |
| Compare256 | ~16 bytes/cycle | — | — |
| Slide Hash | ~8 entries/cycle | — | — |

Apple Silicon (M1+) provides particularly fast CRC32 and PMULL
implementations with low latency per instruction.

On Cortex-A55 and similar in-order cores, the throughput numbers are roughly
halved compared to Cortex-A76/A78 and Apple Silicon out-of-order cores.