# gen_unicodedata.py
#
# Generates C range tables (kLoRanges.c, kEastAsianWidthRanges.c) from Python's unicodedata module.
  1. import unicodedata
  2. from tqdm import trange
  3. from typing import Literal
  4. info = []
  5. for i in trange(0x110000):
  6. char = chr(i)
  7. category = unicodedata.category(char)
  8. east_asian_width = unicodedata.east_asian_width(char)
  9. info.append((i, category, east_asian_width))
  10. def merge(index: Literal[1, 2], filter):
  11. # index = 1, category
  12. # index = 2, east_asian_width
  13. result: list[tuple[int, int, str]] = []
  14. last_value = None
  15. last_start = None
  16. for i in range(len(info)):
  17. value = info[i][index]
  18. if value != last_value:
  19. if last_value is not None:
  20. result.append((last_start, i - 1, last_value))
  21. last_value = value
  22. last_start = i
  23. if last_value is not None:
  24. result.append((last_start, len(info) - 1, last_value))
  25. return [x for x in result if filter(x[2])]
  26. df_category = merge(1, lambda x: x == 'Lo')
  27. df_east_asian_width = merge(2, lambda x: x != 'N')
  28. def to_c11(ranges, name, with_value=True):
  29. with open(f'{name}.c', 'wt', encoding='utf-8', newline='\n') as f:
  30. f.write(f'const static c11_u32_range {name}[] = {{\n')
  31. for start, end, value in ranges:
  32. if with_value:
  33. f.write(f' {{ {start}, {end}, "{value}\\0" }},\n')
  34. else:
  35. f.write(f' {{ {start}, {end} }},\n')
  36. f.write(f'}};\n')
  37. to_c11(df_category, 'kLoRanges', with_value=False)
  38. to_c11(df_east_asian_width, 'kEastAsianWidthRanges', with_value=True)