summaryrefslogtreecommitdiff
path: root/neozip/arch/riscv/crc32_zbc.c
blob: cf52279b8038b84d470597239eb9af1978936ddf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* crc32_zbc.c - RISCV Zbc version of crc32
 * Copyright (C) 2025 ByteDance. All rights reserved.
 * Contributed by Yin Tong <yintong.ustc@bytedance.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_CRC32_ZBC

#include "zbuild.h"
#include "arch_functions.h"

#define CLMUL_MIN_LEN 16   // Minimum size of buffer for _crc32_clmul
#define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul

#define CONSTANT_R3 0x1751997d0ULL
#define CONSTANT_R4 0x0ccaa009eULL
#define CONSTANT_R5 0x163cd6124ULL
#define MASK32 0xFFFFFFFF
#define CRCPOLY_TRUE_LE_FULL 0x1DB710641ULL
#define CONSTANT_RU 0x1F7011641ULL

/* Carry-less multiply, low half: emits the RISC-V Zbc `clmul` instruction,
 * returning the low 64 bits of the 128-bit carry-less product of a and b.
 * Only compiled when RISCV_CRC32_ZBC is defined, so the Zbc extension is
 * presumed available at runtime — the instruction traps otherwise. */
static inline uint64_t clmul(uint64_t a, uint64_t b) {
    uint64_t res;
    __asm__ volatile("clmul %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
    return res;
}

/* Carry-less multiply, high half: emits the RISC-V Zbc `clmulh` instruction,
 * returning the high 64 bits of the 128-bit carry-less product of a and b.
 * Together with clmul() above this yields the full 128-bit product needed
 * for the CRC folding steps. */
static inline uint64_t clmulh(uint64_t a, uint64_t b) {
    uint64_t res;
    __asm__ volatile("clmulh %0, %1, %2" : "=r"(res) : "r"(a), "r"(b));
    return res;
}

/* Core CRC-32 kernel: folds the buffer 16 bytes at a time with carry-less
 * multiplies, then reduces the remaining 128-bit value to a 32-bit CRC via
 * a final fold and Barrett reduction.
 *
 * crc is the pre-inverted running CRC (caller applies the ~ on both sides);
 * the return value is the folded-and-reduced 32-bit remainder, still in the
 * inverted domain.
 *
 * Contract: the caller (crc32_riscv64_zbc) passes len as a multiple of 16
 * with len >= 16. The first two 64-bit words are loaded unconditionally
 * below, so the len < 16 early-out is defensive only — with len < 16 those
 * loads would already have read past the buffer.
 *
 * NOTE(review): buf is cast to const uint64_t * with no alignment check;
 * this assumes the target tolerates 8-byte loads at arbitrary alignment
 * (or that callers supply aligned buffers) — confirm for the deployment
 * target, as misaligned loads may trap or be slow on some RISC-V cores. */
Z_FORCEINLINE static uint32_t crc32_clmul_impl(uint64_t crc, const unsigned char *buf, uint64_t len) {
    const uint64_t *buf64 = (const uint64_t *)buf;
    uint64_t low = buf64[0] ^ crc;  /* fold incoming CRC into the first word */
    uint64_t high = buf64[1];

    if (len < 16)
        goto finish_fold;
    len -= 16;
    buf64 += 2;

    // process each 16-byte block
    while (len >= 16) {
        /* Multiply the current 128-bit accumulator by the distance-16
         * folding constants R3 (low lane) and R4 (high lane)... */
        uint64_t t2 = clmul(CONSTANT_R4, high);
        uint64_t t3 = clmulh(CONSTANT_R4, high);

        uint64_t t0_new = clmul(CONSTANT_R3, low);
        uint64_t t1_new = clmulh(CONSTANT_R3, low);

        // Combine the results and XOR with new data
        low = t0_new ^ t2;
        high = t1_new ^ t3;
        low ^= buf64[0];
        high ^= buf64[1];

        buf64 += 2;
        len -= 16;
    }

finish_fold:
    // Fold the 128-bit result into 64 bits
    uint64_t fold_t3 = clmulh(low, CONSTANT_R4);
    uint64_t fold_t2 = clmul(low, CONSTANT_R4);
    low = high ^ fold_t2;
    high = fold_t3;

    // Combine the low and high parts and perform polynomial reduction
    uint64_t combined = (low >> 32) | ((high & MASK32) << 32);
    uint64_t reduced_low = clmul(low & MASK32, CONSTANT_R5) ^ combined;

    // Barrett reduction step
    uint64_t barrett = clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32;
    barrett = clmul(barrett, CRCPOLY_TRUE_LE_FULL);
    uint64_t final = barrett ^ reduced_low;

    // Return the high 32 bits as the final CRC
    return (uint32_t)(final >> 32);
}

/* Public CRC-32 entry point for the RISC-V Zbc path.
 *
 * Buffers too small for the folding kernel go entirely through the scalar
 * braid implementation. Otherwise the leading remainder (len mod 16) is
 * consumed by the scalar code first, so the CLMUL kernel only ever sees a
 * whole number of 16-byte chunks. The kernel works on the bit-inverted CRC,
 * hence the ~ on the way in and out. */
Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) {
    if (len < CLMUL_MIN_LEN)
        return crc32_braid(crc, buf, len);

    size_t remainder = len % CLMUL_CHUNK_LEN;
    if (remainder != 0) {
        crc = crc32_braid(crc, buf, remainder);
        buf += remainder;
        len -= remainder;
    }

    return ~crc32_clmul_impl(~crc, buf, len);
}

/* Combined checksum-and-copy: computes the CRC-32 of src, then duplicates
 * the bytes into dst. The checksum is taken from src before the copy, so
 * the result is unaffected by what dst previously held. */
Z_INTERNAL uint32_t crc32_copy_riscv64_zbc(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t result = crc32_riscv64_zbc(crc, src, len);
    memcpy(dst, src, len);
    return result;
}
#endif