diff options
Diffstat (limited to 'uvim/runtime/tools/unicode.vim')
| -rw-r--r-- | uvim/runtime/tools/unicode.vim | 475 |
1 files changed, 0 insertions, 475 deletions
" diff --git a/uvim/runtime/tools/unicode.vim b/uvim/runtime/tools/unicode.vim
" deleted file mode 100644
" index 4086b188f3..0000000000
" --- a/uvim/runtime/tools/unicode.vim
" +++ /dev/null
" @@ -1,475 +0,0 @@
"
" (What follows is the removed script, with the leading "-" diff markers
" stripped so that it reads as plain Vim script.)
"
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2025 Sep 21

" Split every line of the current buffer on ';' (surrounding white space is
" dropped) and append the resulting field lists to the list "props".
"   props       list that receives one list of fields per accepted line
"   nfields     number of fields every accepted line must have
"   skip_noise  when non-zero, lines starting with '#' and blank lines are
"               skipped instead of being parsed
" Stops at the first line with an unexpected number of fields, after
" reporting it; entries added before that line stay in "props".
func! s:ParseLinesInto(props, nfields, skip_noise)
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if a:skip_noise && (text =~ '^#' || text =~ '^\s*$')
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != a:nfields
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected ' . a:nfields
      return
    endif
    call add(a:props, fields)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line must have 15 fields; comment and blank lines are not skipped.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseLinesInto(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Comment and blank lines are skipped; other lines must have 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseLinesInto(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Comment and blank lines are skipped; other lines must have 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseLinesInto(s:widthprops, 2, 1)
endfunc

" Create a new window with a buffer called "bufname" and fill it with a C
" table: the declaration line "decl", the opening line "open", the lines in
" "rows" and the closing line "close".  The trailing comma of the last row is
" removed.  Afterwards the previous window is made current again.
" When "rows" is empty nothing is stripped, so the opening line stays intact.
func! s:PutTable(bufname, decl, open, rows, close)
  new
  exe "file " . a:bufname
  call setline(1, a:decl)
  call setline(2, a:open)
  call append('$', a:rows)
  if !empty(a:rows)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, a:close)
  wincmd p
endfunc

" Turn a list of [code point, mapped code point] pairs into convertStruct
" rows.  Consecutive pairs are merged into one row as long as the offset
" (mapped - code point) stays the same and the code points are evenly spaced.
" Returns the list of formatted rows.
func! s:ConvertRanges(pairs)
  let rows = []
  let first = -1
  let last = -1
  let step = 0
  let offset = -1
  for [n, nl] in a:pairs
    if first >= 0 && offset == nl - n && (step == 0 || n - last == step)
      " continue with same range.
      let step = n - last
      let last = n
    else
      if first >= 0
        " produce previous range
        call Range(rows, first, last, step, offset)
      endif
      let first = n
      let last = n
      let step = 0
      let offset = nl - n
    endif
  endfor
  if first >= 0
    call Range(rows, first, last, step, offset)
  endif
  return rows
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name   "Lower" or "Upper"; the table and the buffer are called "to{name}"
"   index  index of the UnicodeData.txt field that holds the mapping
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] != ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:PutTable("to" . a:name,
	\ "static convertStruct to" . a:name . "[] =", "{",
	\ s:ConvertRanges(pairs), "};")
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only entries with status 'C' or 'S' are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    " Use ==# so that the result does not depend on 'ignorecase'.
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:PutTable("foldCase",
	\ "static convertStruct foldCase[] =", "{",
	\ s:ConvertRanges(pairs), "};")
endfunc

" Append one convertStruct row to the list "ranges".
" A "step" of zero (a range with a single entry) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" NOTE(review): the leading white space inside the string literals of this
" and the following function is reproduced as found; confirm it matches the
" indentation wanted in mbyte.c.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    " The 'Mc' property was removed, it does take up space.
    if p[2] ==# 'Mn' || p[2] ==# 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
	" continue with same range.
	let end = n
      else
	if start >= 0
	  " produce previous range
	  call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
	endif
	let start = n
	let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  call s:PutTable("combining",
	\ " static struct interval combining[] =", " {", ranges, " };")
endfunc

" Store the finished interval [first, last] of a width table: a formatted row
" is appended to "ranges" and the pair is remembered in s:ambitable or
" s:doubletable, depending on "is_ambi".
func! s:AddWidthRange(ranges, indent, is_ambi, first, last)
  call add(a:ranges, printf("%s{0x%04x, 0x%04x},", a:indent, a:first, a:last))
  call add(a:is_ambi ? s:ambitable : s:doubletable, [a:first, a:last])
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.  The intervals are also appended to
" s:ambitable (pattern 'A') or s:doubletable (any other pattern), which must
" exist before calling.
"   pattern    regexp matched against the first character of the width class
"   tableName  name of the C table and of the buffer
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let ndata = len(s:dataprops)
  let is_ambi = a:pattern ==# 'A'
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  let spc = is_ambi ? ' ' : "\t"

  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
	echoerr "Cannot parse range: '" . p[0] . "' in width table"
	" Skip the entry, there are no usable bounds.
	continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table instead
    " of indexing past it.
    while dataidx < ndata && str2nr(s:dataprops[dataidx][0], 16) < n
      let dataidx += 1
    endwhile
    let in_data = dataidx < ndata
    let dn = in_data ? str2nr(s:dataprops[dataidx][0], 16) : -1
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    let use_it = n_last > n
    if !use_it
      " Past the end of the data table nothing is known about the character,
      " it is then not treated as composing.
      let category = in_data ? s:dataprops[dataidx][2] : ''
      let use_it = category !=# 'Mn' && category !=# 'Mc' && category !=# 'Me'
    endif

    if use_it
      if start < 0 || end + 1 != n
	if start >= 0
	  " produce previous range
	  call s:AddWidthRange(ranges, spc, is_ambi, start, end)
	endif
	let start = n
      endif
      " Either a new range was started or the current one continues.
      let end = n_last
    endif
  endfor
  if start >= 0
    call s:AddWidthRange(ranges, spc, is_ambi, start, end)
  endif

  " New buffer to put the result in.
  if is_ambi
    call s:PutTable(a:tableName,
	  \ "static struct interval " . a:tableName . "[] =", "{", ranges, "};")
  else
    call s:PutTable(a:tableName,
	  \ " static struct interval " . a:tableName . "[] =", " {", ranges, " };")
  endif
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict"
" Each line must start with a hexadecimal code point, optionally followed by
" ".." and a second code point; anything after the number(s) is ignored.
" Every code point in the (inclusive) range becomes a key with value 1.
" Empty lines are skipped.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    if empty(tokens)
      " Nothing to parse on an empty line.
      continue
    endif
    let first = str2nr(tokens[0], 16)
    let last = len(tokens) == 1 ? first : str2nr(tokens[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Self test for AddLinesToCharDict().  Returns 1 on failure, 0 on success.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Convert the keys of "chardict" (code points) into a sorted list of
" [low, high] pairs, where each pair covers a run of consecutive code points.
" Returns an empty list for an empty dictionary.
func! CharDictToPairList(chardict)
  let result = []
  if empty(a:chardict)
    return result
  endif
  " Dictionary keys are strings, make them numbers before sorting.
  let codes = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = codes[0]
  let high = low
  for code in codes[1:]
    if code > high + 1
      " There is a gap: the current run ends here.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc

" Self test for CharDictToPairList().  Returns 1 on failure, 0 on success.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Create a new window with a buffer called "name" holding a C table made of
" the declaration line "decl", the opening line "open", the lines in "rows"
" (the trailing comma of the last one is removed) and the closing line
" "close".  Afterwards the previous window is made current again.
func! s:PutEmojiTable(name, decl, open, rows, close)
  new
  exe 'file ' . a:name
  call setline(1, a:decl)
  call setline(2, a:open)
  call append('$', a:rows)
  if !empty(a:rows)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, a:close)
  wincmd p
endfunc

" Build the emoji width table in a new buffer.
" Reads the lines of the current buffer and creates two buffers: "emoji_all"
" and "emoji_wide".  Uses s:ambitable and s:doubletable, thus both must have
" been filled before this is called.
" NOTE(review): the leading white space inside the string literals is
" reproduced as found; confirm it matches the indentation wanted in mbyte.c.
func! BuildEmojiTable()
  " First make the table for all emojis.
  " Only lines that start with a digit 1-9 are used.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let pairlist = CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutEmojiTable('emoji_all',
	\ "static struct interval emoji_all[] =", "{", allranges, "};")

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for interval in s:ambitable + s:doubletable
    for nr in range(interval[0], interval[1])
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  call s:PutEmojiTable('emoji_wide',
	\ " static struct interval emoji_wide[] =", " {", wide_ranges, " };")
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif

if !exists("g:loaded_netrwPlugin")
  echomsg "Netrw not available, cannot download"
  finish
endif

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Turn the downloaded lines into s:dataprops.
call ParseDataToProps()

" Case conversion tables: toLower comes from field 13, toUpper from field 12.
for [s:case_name, s:case_field] in [["Lower", 13], ["Upper", 12]]
  call BuildCaseTable(s:case_name, s:case_field)
endfor
unlet s:case_name s:case_field

" The ranges of composing chars.
call BuildCombiningTable()

" Case folding: download (needs the netrw plugin), parse into s:foldprops and
" generate the foldCase table.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" East Asian width: download (needs the netrw plugin) and parse into
" s:widthprops.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()

" Both lists are filled while the width tables are generated: first the
" double width table, then the ambiguous width table.
let [s:doubletable, s:ambitable] = [[], []]
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file. Requires the netrw plugin.
" commented out, because it drops too many characters
"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
"
"" Build the emoji table. Ver. 1.0 - 6.0
"" Must come after the "ambiguous" and "doublewidth" tables
"call BuildEmojiTable()
