# Creates C data structures for binary lookup table of entities,
# using python's html5 entity data.
# Usage: python3 tools/make_entities_inc.py > src/entities.inc

# Import the submodule explicitly: a bare `import html` does not by itself
# guarantee that `html.entities` has been loaded.
import html.entities

# Each main-table entry packs three fields into one uint32_t value:
#   bits  0-14  offset of the entity's bytes inside the text table
#   bits 15-19  length in bytes of the entity name
#   bits 20-22  length in bytes of the replacement text
# These widths must stay in sync with the ENT_* macros in HEADER_TEMPLATE.
TEXT_IDX_BITS = 15
NAME_SIZE_BITS = 5
REPL_SIZE_BITS = 3
NAME_SIZE_SHIFT = TEXT_IDX_BITS
REPL_SIZE_SHIFT = TEXT_IDX_BITS + NAME_SIZE_BITS

# Number of values emitted per output line for each C array.
MAIN_VALUES_PER_LINE = 6
TEXT_VALUES_PER_LINE = 12

# Leading whitespace of every array row in the generated C source.
INDENT = "    "

HEADER_TEMPLATE = """/* Autogenerated by tools/make_entities_inc.py */

#define ENT_MIN_LENGTH 2
#define ENT_MAX_LENGTH 32
#define ENT_TABLE_SIZE %d

#define ENT_TEXT_IDX(x) ((x) & 0x7FFF)
#define ENT_NAME_SIZE(x) (((x) >> 15) & 0x1F)
#define ENT_REPL_SIZE(x) ((x) >> 20)

static const uint32_t cmark_entities[%d] = {
"""

TEXT_TEMPLATE = """};

static const unsigned char cmark_entity_text[%d] = {
"""

FOOTER = "};\n"


def collect_entities(entities5=None):
    """Return a sorted list of ``(name, replacement)`` pairs.

    ``entities5`` maps entity names to replacement strings and defaults to
    ``html.entities.html5``.

    Keys without a trailing semicolon are removed: HTML5 allows some named
    character references without a trailing semicolon.  The semicolon itself
    is stripped from the names that are kept.
    """
    if entities5 is None:
        entities5 = html.entities.html5
    return sorted(
        (name[:-1], repl)
        for name, repl in entities5.items()
        if name.endswith(';')
    )


def build_tables(entities):
    """Pack ``(name, replacement)`` pairs into the two lookup tables.

    Returns ``(main_table, text_table)``.  ``text_table`` is a ``bytes``
    object holding, for every entity in order, the UTF-8 name immediately
    followed by the UTF-8 replacement.  ``main_table`` is a list with one
    packed integer per entity (see the bit layout at the top of the file).

    Raises ``ValueError`` when a field does not fit into its bit width.
    """
    main_table = []
    # bytearray keeps appends cheap; repeated `bytes +=` copies every time.
    text_table = bytearray()

    for ent, repl in entities:
        ent_bytes = ent.encode('UTF-8')
        repl_bytes = repl.encode('UTF-8')
        ent_size = len(ent_bytes)
        repl_size = len(repl_bytes)
        # The entry's offset is simply how much text precedes it.
        text_idx = len(text_table)

        if text_idx >= (1 << TEXT_IDX_BITS):
            raise ValueError("text index too large")
        if ent_size >= (1 << NAME_SIZE_BITS):
            raise ValueError("entity name too long")
        if repl_size >= (1 << REPL_SIZE_BITS):
            raise ValueError("entity replacement too long")

        main_table.append(
            text_idx
            | (ent_size << NAME_SIZE_SHIFT)
            | (repl_size << REPL_SIZE_SHIFT)
        )
        text_table += ent_bytes
        text_table += repl_bytes

    return main_table, bytes(text_table)


def format_rows(values, fmt, per_line):
    """Format integers as the rows of a C array initializer.

    Each value is rendered with the %-style format ``fmt``.  Rows hold up to
    ``per_line`` values separated by ", ", start with INDENT, and every row
    except the last ends with a comma.  Returns an empty string when
    ``values`` is empty; otherwise the result ends with a newline.
    """
    cells = [fmt % value for value in values]
    rows = [
        INDENT + ", ".join(cells[start:start + per_line])
        for start in range(0, len(cells), per_line)
    ]
    if not rows:
        return ""
    return ",\n".join(rows) + "\n"


def render(main_table, text_table):
    """Return the complete generated C source text for both tables."""
    return "".join([
        HEADER_TEMPLATE % (len(main_table), len(main_table)),
        format_rows(main_table, "0x%X", MAIN_VALUES_PER_LINE),
        TEXT_TEMPLATE % len(text_table),
        format_rows(text_table, "0x%02X", TEXT_VALUES_PER_LINE),
        FOOTER,
    ])


def main():
    """Build the tables from Python's html5 data and print them to stdout."""
    main_table, text_table = build_tables(collect_entities())
    print(render(main_table, text_table), end="")


if __name__ == "__main__":
    main()