# path: root/cmark/tools/make_case_fold_inc.py
# blob: 3347d291b9cd8d3db60b35008c2f0bbc162eb698
# Creates a C lookup table for Unicode case folding (https://unicode.org/Public/UCD/latest/ucd/CaseFolding.txt).
# Usage: python3 tools/make_case_fold_inc.py < data/CaseFolding.txt > src/case_fold.inc

import sys, re

# Matches lines of the form
#   "<code point>; <C or F>; <one or more replacement code points>;"
# Group 1 is the source code point, group 2 the space-separated replacements.
entry_re = re.compile('([0-9A-F]+); [CF];((?: [0-9A-F]+)+);')

# Bit budget of one packed 32-bit table entry.
MAX_CODE_POINT = 1 << 17    # 17 bits for the code point
MAX_REPL_SLOTS = 1 << 12    # 12 bits for the halved replacement offset
MAX_REPL_SIZE = 1 << 3      # 3 bits for the replacement size in bytes

main_table = []      # packed entries, in input order
repl_table = []      # UTF-8 bytes of all replacements, padded to even offsets
repl_offset = 0      # offset in repl_table where the next replacement starts
sample_input = []    # source characters sampled for the test case
sample_folded = []   # their case-folded counterparts

for entry in sys.stdin:
    match = entry_re.match(entry)
    if match is None:
        # Not a C or F mapping line.
        continue

    code_point = int(match[1], 16)
    if code_point < 0x80:
        # The ASCII range is left out of the table.
        continue

    # UTF-8 encoding of the replacement sequence.
    folded = b''.join(chr(int(field, 16)).encode('UTF-8')
                      for field in match[2].split())

    # Sample every 20th table entry into the test case.
    if len(main_table) % 20 == 0:
        sample_input.append(chr(code_point))
        sample_folded.append(folded.decode('UTF-8'))

    if code_point >= MAX_CODE_POINT:
        raise Exception("code point too large")

    # Offsets are always even, so only offset / 2 is stored.
    slot = repl_offset // 2
    if slot >= MAX_REPL_SLOTS:
        raise Exception("too many replacements")

    folded_size = len(folded)
    if folded_size >= MAX_REPL_SIZE:
        raise Exception("too many replacement chars")

    # Layout: bits 0-16 code point, bits 17-28 offset / 2, bits 29-31 size.
    main_table.append(code_point | (slot << 17) | (folded_size << 29))
    repl_table.extend(folded)
    repl_offset += folded_size

    # Pad with a zero byte so that the next offset stays even.
    if repl_offset % 2 == 1:
        repl_table.append(0)
        repl_offset += 1

# Debugging aid: change the condition to True to print the sampled test case
# and exit instead of generating the table.
if False:
    print("test:", ''.join(sample_input))
    print("test_result:", ''.join(sample_folded))
    sys.exit(0)

# Emit the accessor macros and open the cf_table initializer.
entry_count = len(main_table)
print("""// Generated by tools/make_case_fold_inc.py

#define CF_MAX            (1 << 17)
#define CF_TABLE_SIZE     %d
#define CF_CODE_POINT(x)  ((x) & 0x1FFFF)
#define CF_REPL_IDX(x)    ((((x) >> 17) & 0xFFF) * 2)
#define CF_REPL_SIZE(x)   ((x) >> 29)

static const uint32_t cf_table[%d] = {""" % (entry_count, entry_count))

# Write the packed entries six per line; every line but the last ends with
# a comma. Nothing is written for an empty table.
for start in range(0, entry_count, 6):
    row = ", ".join("0x%X" % packed for packed in main_table[start:start + 6])
    is_last_row = start + 6 >= entry_count
    print("  " + row + ("" if is_last_row else ","))

# Close the cf_table initializer and open the cf_repl initializer.
byte_count = len(repl_table)
print("""};

static const unsigned char cf_repl[%d] = {""" % byte_count)

# Write the replacement bytes twelve per line; every line but the last ends
# with a comma. Nothing is written for an empty table.
for start in range(0, byte_count, 12):
    chunk = repl_table[start:start + 12]
    text = "  " + ", ".join("0x%02X" % byte for byte in chunk)
    if start + 12 < byte_count:
        text += ","
    print(text)

# Close the cf_repl initializer.
print("};")