#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_recompose_table.h
#
# The generated table maps base character + combining mark pairs to their
# precomposed equivalents.
#
# Usage:
#   python3 gen_ucs_recompose_table.py         # Generate with common recomposition pairs
#   python3 gen_ucs_recompose_table.py --full  # Generate with all recomposition pairs

import unicodedata
import sys
import argparse
import textwrap
from pathlib import Path

# This script's file name (quoted in the generated header comment)
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_recompose_table.h"

common_recompose_description = "most commonly used Latin, Greek, and Cyrillic recomposition pairs only"
# Each entry is (base code point, combining mark code point, precomposed code point).
COMMON_RECOMPOSITION_PAIRS = [
    # Latin letters with accents - uppercase
    (0x0041, 0x0300, 0x00C0),  # A + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER A WITH GRAVE
    (0x0041, 0x0301, 0x00C1),  # A + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER A WITH ACUTE
    (0x0041, 0x0302, 0x00C2),  # A + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    (0x0041, 0x0303, 0x00C3),  # A + COMBINING TILDE = LATIN CAPITAL LETTER A WITH TILDE
    (0x0041, 0x0308, 0x00C4),  # A + COMBINING DIAERESIS = LATIN CAPITAL LETTER A WITH DIAERESIS
    (0x0041, 0x030A, 0x00C5),  # A + COMBINING RING ABOVE = LATIN CAPITAL LETTER A WITH RING ABOVE
    (0x0043, 0x0327, 0x00C7),  # C + COMBINING CEDILLA = LATIN CAPITAL LETTER C WITH CEDILLA
    (0x0045, 0x0300, 0x00C8),  # E + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER E WITH GRAVE
    (0x0045, 0x0301, 0x00C9),  # E + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER E WITH ACUTE
    (0x0045, 0x0302, 0x00CA),  # E + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    (0x0045, 0x0308, 0x00CB),  # E + COMBINING DIAERESIS = LATIN CAPITAL LETTER E WITH DIAERESIS
    (0x0049, 0x0300, 0x00CC),  # I + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER I WITH GRAVE
    (0x0049, 0x0301, 0x00CD),  # I + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER I WITH ACUTE
    (0x0049, 0x0302, 0x00CE),  # I + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    (0x0049, 0x0308, 0x00CF),  # I + COMBINING DIAERESIS = LATIN CAPITAL LETTER I WITH DIAERESIS
    (0x004E, 0x0303, 0x00D1),  # N + COMBINING TILDE = LATIN CAPITAL LETTER N WITH TILDE
    (0x004F, 0x0300, 0x00D2),  # O + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER O WITH GRAVE
    (0x004F, 0x0301, 0x00D3),  # O + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER O WITH ACUTE
    (0x004F, 0x0302, 0x00D4),  # O + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    (0x004F, 0x0303, 0x00D5),  # O + COMBINING TILDE = LATIN CAPITAL LETTER O WITH TILDE
    (0x004F, 0x0308, 0x00D6),  # O + COMBINING DIAERESIS = LATIN CAPITAL LETTER O WITH DIAERESIS
    (0x0055, 0x0300, 0x00D9),  # U + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER U WITH GRAVE
    (0x0055, 0x0301, 0x00DA),  # U + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER U WITH ACUTE
    (0x0055, 0x0302, 0x00DB),  # U + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    (0x0055, 0x0308, 0x00DC),  # U + COMBINING DIAERESIS = LATIN CAPITAL LETTER U WITH DIAERESIS
    (0x0059, 0x0301, 0x00DD),  # Y + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER Y WITH ACUTE

    # Latin letters with accents - lowercase
    (0x0061, 0x0300, 0x00E0),  # a + COMBINING GRAVE ACCENT = LATIN SMALL LETTER A WITH GRAVE
    (0x0061, 0x0301, 0x00E1),  # a + COMBINING ACUTE ACCENT = LATIN SMALL LETTER A WITH ACUTE
    (0x0061, 0x0302, 0x00E2),  # a + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER A WITH CIRCUMFLEX
    (0x0061, 0x0303, 0x00E3),  # a + COMBINING TILDE = LATIN SMALL LETTER A WITH TILDE
    (0x0061, 0x0308, 0x00E4),  # a + COMBINING DIAERESIS = LATIN SMALL LETTER A WITH DIAERESIS
    (0x0061, 0x030A, 0x00E5),  # a + COMBINING RING ABOVE = LATIN SMALL LETTER A WITH RING ABOVE
    (0x0063, 0x0327, 0x00E7),  # c + COMBINING CEDILLA = LATIN SMALL LETTER C WITH CEDILLA
    (0x0065, 0x0300, 0x00E8),  # e + COMBINING GRAVE ACCENT = LATIN SMALL LETTER E WITH GRAVE
    (0x0065, 0x0301, 0x00E9),  # e + COMBINING ACUTE ACCENT = LATIN SMALL LETTER E WITH ACUTE
    (0x0065, 0x0302, 0x00EA),  # e + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER E WITH CIRCUMFLEX
    (0x0065, 0x0308, 0x00EB),  # e + COMBINING DIAERESIS = LATIN SMALL LETTER E WITH DIAERESIS
    (0x0069, 0x0300, 0x00EC),  # i + COMBINING GRAVE ACCENT = LATIN SMALL LETTER I WITH GRAVE
    (0x0069, 0x0301, 0x00ED),  # i + COMBINING ACUTE ACCENT = LATIN SMALL LETTER I WITH ACUTE
    (0x0069, 0x0302, 0x00EE),  # i + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER I WITH CIRCUMFLEX
    (0x0069, 0x0308, 0x00EF),  # i + COMBINING DIAERESIS = LATIN SMALL LETTER I WITH DIAERESIS
    (0x006E, 0x0303, 0x00F1),  # n + COMBINING TILDE = LATIN SMALL LETTER N WITH TILDE
    (0x006F, 0x0300, 0x00F2),  # o + COMBINING GRAVE ACCENT = LATIN SMALL LETTER O WITH GRAVE
    (0x006F, 0x0301, 0x00F3),  # o + COMBINING ACUTE ACCENT = LATIN SMALL LETTER O WITH ACUTE
    (0x006F, 0x0302, 0x00F4),  # o + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER O WITH CIRCUMFLEX
    (0x006F, 0x0303, 0x00F5),  # o + COMBINING TILDE = LATIN SMALL LETTER O WITH TILDE
    (0x006F, 0x0308, 0x00F6),  # o + COMBINING DIAERESIS = LATIN SMALL LETTER O WITH DIAERESIS
    (0x0075, 0x0300, 0x00F9),  # u + COMBINING GRAVE ACCENT = LATIN SMALL LETTER U WITH GRAVE
    (0x0075, 0x0301, 0x00FA),  # u + COMBINING ACUTE ACCENT = LATIN SMALL LETTER U WITH ACUTE
    (0x0075, 0x0302, 0x00FB),  # u + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER U WITH CIRCUMFLEX
    (0x0075, 0x0308, 0x00FC),  # u + COMBINING DIAERESIS = LATIN SMALL LETTER U WITH DIAERESIS
    (0x0079, 0x0301, 0x00FD),  # y + COMBINING ACUTE ACCENT = LATIN SMALL LETTER Y WITH ACUTE
    (0x0079, 0x0308, 0x00FF),  # y + COMBINING DIAERESIS = LATIN SMALL LETTER Y WITH DIAERESIS

    # Common Greek characters
    (0x0391, 0x0301, 0x0386),  # Α + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ALPHA WITH TONOS
    (0x0395, 0x0301, 0x0388),  # Ε + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER EPSILON WITH TONOS
    (0x0397, 0x0301, 0x0389),  # Η + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ETA WITH TONOS
    (0x0399, 0x0301, 0x038A),  # Ι + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER IOTA WITH TONOS
    (0x039F, 0x0301, 0x038C),  # Ο + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMICRON WITH TONOS
    (0x03A5, 0x0301, 0x038E),  # Υ + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER UPSILON WITH TONOS
    (0x03A9, 0x0301, 0x038F),  # Ω + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMEGA WITH TONOS
    (0x03B1, 0x0301, 0x03AC),  # α + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ALPHA WITH TONOS
    (0x03B5, 0x0301, 0x03AD),  # ε + COMBINING ACUTE ACCENT = GREEK SMALL LETTER EPSILON WITH TONOS
    (0x03B7, 0x0301, 0x03AE),  # η + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ETA WITH TONOS
    (0x03B9, 0x0301, 0x03AF),  # ι + COMBINING ACUTE ACCENT = GREEK SMALL LETTER IOTA WITH TONOS
    (0x03BF, 0x0301, 0x03CC),  # ο + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMICRON WITH TONOS
    (0x03C5, 0x0301, 0x03CD),  # υ + COMBINING ACUTE ACCENT = GREEK SMALL LETTER UPSILON WITH TONOS
    (0x03C9, 0x0301, 0x03CE),  # ω + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMEGA WITH TONOS

    # Common Cyrillic characters
    (0x0418, 0x0306, 0x0419),  # И + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT I
    (0x0438, 0x0306, 0x0439),  # и + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT I
    (0x0423, 0x0306, 0x040E),  # У + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT U
    (0x0443, 0x0306, 0x045E),  # у + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT U
]

full_recompose_description = "all possible recomposition pairs from the Unicode BMP"


def collect_all_recomposition_pairs():
    """Collect all two-part canonical decompositions in the BMP as recomposition pairs.

    Returns:
        A sorted list of (base, combining, recomposed) code point tuples, one
        for every named BMP code point whose canonical decomposition consists
        of exactly two code points.
    """
    # Map to store recomposition pairs: (base, combining) -> recomposed
    recompose_map = {}

    # We limit to BMP (0x0000-0xFFFF) to keep our table smaller with uint16_t
    for cp in range(0, 0x10000):
        char = chr(cp)

        # Skip code points without a name (unassigned, controls, surrogates, ...)
        if not unicodedata.name(char, ''):
            continue

        # Compatibility decompositions carry a "<tag>" prefix; only canonical
        # ones are usable for recomposition.
        decomp = unicodedata.decomposition(char)
        if not decomp or '<' in decomp:
            continue

        # Only a simple "base + combining mark" decomposition is of interest
        parts = decomp.split()
        if len(parts) != 2:
            continue

        # Keep the error handling narrow: only the hex parsing may fail here.
        try:
            base, combining = (int(part, 16) for part in parts)
        except ValueError:
            continue

        # Only store if both are in BMP
        # NOTE(review): Unicode composition exclusions are not filtered out
        # here; confirm that recomposing such pairs is intended.
        if base < 0x10000 and combining < 0x10000:
            recompose_map[(base, combining)] = cp

    # Flatten into tuples, sorted by (base, combining) for binary search
    return sorted((base, combining, recomposed)
                  for (base, combining), recomposed in recompose_map.items())


def validate_common_pairs(full_list):
    """Validate that all common pairs are in the full list.

    Args:
        full_list: Iterable of (base, combining, recomposed) tuples, as
            returned by collect_all_recomposition_pairs().

    Raises:
        ValueError: If any common pair is missing or has a different recomposition
            value than what's in the full table.
    """
    full_pairs = {(base, combining): recomposed for base, combining, recomposed in full_list}
    for base, combining, recomposed in COMMON_RECOMPOSITION_PAIRS:
        full_recomposed = full_pairs.get((base, combining))
        if full_recomposed is None:
            error_msg = f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) not found in full data"
        elif full_recomposed != recomposed:
            error_msg = (f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) has different recomposition: "
                         f"0x{recomposed:04X} vs 0x{full_recomposed:04X}")
        else:
            continue
        # Diagnostics belong on stderr so they never mix with regular output.
        print(error_msg, file=sys.stderr)
        raise ValueError(error_msg)


def _entry_comment(base, combining, recomposed):
    """Return the C comment describing one table entry, using character names when known."""
    try:
        base_name = unicodedata.name(chr(base))
        combining_name = unicodedata.name(chr(combining))
        recomposed_name = unicodedata.name(chr(recomposed))
    except ValueError:
        # At least one code point has no name: fall back to U+XXXX notation
        return f"/* U+{base:04X} + U+{combining:04X} = U+{recomposed:04X} */"
    return f"/* {base_name} + {combining_name} = {recomposed_name} */"


def generate_recomposition_table(use_full_list=False, out_file=DEFAULT_OUT_FILE):
    """Generate the recomposition C table.

    Args:
        use_full_list: When true, emit every pair found by
            collect_all_recomposition_pairs(); otherwise emit only
            COMMON_RECOMPOSITION_PAIRS (validated against the full list).
        out_file: Path of the C header to write.

    Raises:
        ValueError: If a common pair fails validation (common mode only).
    """
    # Collect all recomposition pairs (needed for validation and entry counts)
    full_recompose_list = collect_all_recomposition_pairs()

    # Decide which list to use
    if use_full_list:
        print("Using full recomposition list...")
        recompose_list = full_recompose_list
        table_description = full_recompose_description
        alt_list = COMMON_RECOMPOSITION_PAIRS
        alt_description = common_recompose_description
    else:
        print("Using common recomposition list...")
        # Validate that all common pairs are in the full list
        validate_common_pairs(full_recompose_list)
        recompose_list = sorted(COMMON_RECOMPOSITION_PAIRS)
        table_description = common_recompose_description
        alt_list = full_recompose_list
        alt_description = full_recompose_description
    generation_mode = " --full" if use_full_list else ""
    alternative_mode = " --full" if not use_full_list else ""
    table_description_detail = f"{table_description} ({len(recompose_list)} entries)"
    alt_description_detail = f"{alt_description} ({len(alt_list)} entries)"

    # Calculate min/max values for boundary checks
    bases = [base for base, _, _ in recompose_list]
    marks = [combining for _, combining, _ in recompose_list]
    min_base, max_base = min(bases), max(bases)
    min_combining, max_combining = min(marks), max(marks)

    # Only the file name goes into the header, so that the generated output
    # does not depend on the directory it was written to.
    out_name = Path(out_file).name

    # Wrapped introduction paragraph of the header comment
    intro = textwrap.fill(
        f"This file contains a table with {table_description_detail}. "
        f"To generate a table with {alt_description_detail} instead, run:",
        width=75, initial_indent=" * ", subsequent_indent=" * ")

    # Explicit encoding keeps the output independent of the current locale
    with open(out_file, 'w', encoding="utf-8") as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_name} - Unicode character recomposition
 *
 * Auto-generated by {this_file}{generation_mode}
 *
 * Unicode Version: {unicodedata.unidata_version}
 *
{intro}
 *
 *   python3 {this_file}{alternative_mode}
 */

/*
 * Table of {table_description}
 * Sorted by base character and then combining mark for binary search
 */
static const struct ucs_recomposition ucs_recomposition_table[] = {{
""")

        for base, combining, recomposed in recompose_list:
            comment = _entry_comment(base, combining, recomposed)
            f.write(f"\t{{ 0x{base:04X}, 0x{combining:04X}, 0x{recomposed:04X} }}, {comment}\n")

        f.write(f"""\
}};

/*
 * Boundary values for quick rejection
 * These are calculated by analyzing the table during generation
 */
#define UCS_RECOMPOSE_MIN_BASE 0x{min_base:04X}
#define UCS_RECOMPOSE_MAX_BASE 0x{max_base:04X}
#define UCS_RECOMPOSE_MIN_MARK 0x{min_combining:04X}
#define UCS_RECOMPOSE_MAX_MARK 0x{max_combining:04X}
""")


def main():
    """Parse the command line and generate the requested table."""
    parser = argparse.ArgumentParser(description="Generate Unicode recomposition table")
    parser.add_argument("--full", action="store_true",
                        help="Generate a full recomposition table (default: common pairs only)")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    generate_recomposition_table(use_full_list=args.full, out_file=args.output_file)


if __name__ == "__main__":
    main()