#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py         # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE # Specify output file

import argparse
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

# unidecode is only needed when the table is actually generated. Deferring
# the failure lets "--help" work and lets main() report a missing module as
# a one-line error instead of a traceback.
try:
    from unidecode import unidecode
except ImportError:
    unidecode = None

# Try to get unidecode version. This is best effort: the version only ends
# up in a comment of the generated file, so any failure degrades to 'unknown'.
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    unidecode_version = 'unknown'

# This script's file name
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Define the range marker value
RANGE_MARKER = 0x00


def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points.

    Every named BMP code point from U+0080 to U+FFFF is passed through
    unidecode; only transliterations that are exactly one character long are
    kept. The manual overrides from get_special_overrides() are applied last
    and therefore win over unidecode's answer.

    Returns:
        A dict mapping code point (int) to the ordinal of its fallback
        character (int). A value of 0 marks a code point that must be
        excluded from the table (see organize_by_pages()).

    Raises:
        ImportError: if the unidecode module is not installed.
    """
    if unidecode is None:
        raise ImportError(
            "the 'unidecode' module is required to generate the fallback table")

    fallback_map = {}

    # Process BMP characters (0x0000 - 0xFFFF) to keep table size manageable
    for cp in range(0x0080, 0x10000):  # Skip ASCII range (0x00-0x7F)
        char = chr(cp)

        # Skip unassigned/control characters: they have no Unicode name.
        # With a default supplied, unicodedata.name() never raises here.
        if not unicodedata.name(char, ''):
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map


def get_special_overrides():
    """Get special case overrides that need different handling than unidecode
    provides... or doesn't provide at all.

    Returns:
        A dict mapping code point (int) to the ordinal of the fallback
        character (int). A value of 0 means "exclude this code point from the
        generated table"; such entries are dropped by organize_by_pages().
    """

    overrides = {}

    # Multi-character unidecode output
    # These map to single chars instead of unidecode's multiple-char mappings
    # In a terminal fallback context, we need a single character rather than multiple
    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
    overrides[0x0153] = ord('e')  # œ LATIN SMALL LETTER LIGATURE OE -> e (unidecode: "oe")
    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")

    # Comparison operators that unidecode renders as multiple characters
    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")

    # Unidecode returns an empty string for these
    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)

    # Quadrant block characters that unidecode doesn't map
    for cp in range(0x2596, 0x259F+1):
        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)

    # Directional arrows
    # These provide better semantic meaning than unidecode's mappings
    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")

    # Double arrows with their directional semantic mappings
    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v

    # Halfwidth arrows
    # These need the same treatment as their normal-width counterparts
    overrides[0xFFE9] = ord('<')  # ← HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0xFFEA] = ord('^')  # ↑ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0xFFEB] = ord('>')  # → HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0xFFEC] = ord('v')  # ↓ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")

    # Currency symbols - each mapped to a representative letter
    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E

    # Symbols mapped to letters
    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T

    # Degree-related symbols
    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F

    # Angle quotation marks
    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >

    # Operators with circular shape
    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .

    # Negated mathematical symbols (preserving the negation semantics)
    # Negated symbols mapped to exclamation mark (semantically "not")
    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)

    # Negated symbols mapped to hash sign (semantically "not equal")
    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)

    # Negated arrows - all mapped to exclamation mark
    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
        overrides[cp] = ord('!')  # Negated arrows -> ! (not)

    # Dashes and hyphens
    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
        overrides[cp] = ord('-')  # Dashes and hyphens -> -

    # Question mark punctuation
    for cp in (0x203D, 0x2047, 0x2048):
        overrides[cp] = ord('?')  # Question marks -> ?

    # Exclamation mark punctuation
    for cp in (0x203C, 0x2049):
        overrides[cp] = ord('!')  # Exclamation marks -> !

    # Asterisk-like symbols
    for cp in (0x2042, 0x2051, 0x2055):
        overrides[cp] = ord('*')

    # Other specific punctuation with unique mappings
    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK

    # Check marks
    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK

    # X marks - lowercase for regular, uppercase for heavy
    for cp in (0x2715, 0x2717):
        overrides[cp] = ord('x')  # Regular X marks -> x
    for cp in (0x2716, 0x2718):
        overrides[cp] = ord('X')  # Heavy X marks -> X

    # Stars and asterisk-like symbols mapped to '*'
    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2721, 0x2746+1):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2749, 0x274B+1):
        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
        overrides[cp] = ord('*')  # Star operators -> *

    # Special exclusions with fallback value of 0
    # These will be filtered out in organize_by_pages()

    # Exclude U+2028 (LINE SEPARATOR)
    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')

    # Full-width to ASCII mapping (covering all printable ASCII 33-126)
    # 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
    # Those are excluded here to reduce the table size.
    # It is more efficient to process them programmatically in
    # ucs.c:ucs_get_fallback().
    for cp in range(0xFF01, 0xFF5E + 1):
        overrides[cp] = 0  # Double-width ASCII characters

    return overrides


def organize_by_pages(fallback_map):
    """Organize the fallback mappings by their high byte (page).

    Args:
        fallback_map: dict mapping code point (int) to fallback ordinal (int).

    Returns:
        A defaultdict mapping page number (code point >> 8) to a list of
        (offset, fallback) tuples sorted by offset, where offset is the low
        byte of the code point. Entries whose fallback is 0 are left out.
    """
    # Group by high byte (page)
    page_groups = defaultdict(list)
    for code, fallback in fallback_map.items():
        # Skip characters with fallback value of 0 (excluded characters)
        if fallback == 0:
            continue

        page = code >> 8      # Get the high byte (page)
        offset = code & 0xFF  # Get the low byte (offset within page)
        page_groups[page].append((offset, fallback))

    # Sort each page's entries by offset
    for entries in page_groups.values():
        entries.sort()

    return page_groups


def compress_ranges(page_groups):
    """Compress consecutive entries with the same fallback character into ranges.

    A range is only compressed if it contains 3 or more consecutive entries.
    A compressed range is emitted as two entries: (start_offset, RANGE_MARKER)
    followed by (end_offset, fallback). Shorter runs are copied unchanged.

    Args:
        page_groups: mapping of page number to a list of (offset, fallback)
            tuples sorted by offset, as returned by organize_by_pages().

    Returns:
        A dict mapping page number to its compressed list of
        (offset, fallback) tuples.
    """

    compressed_pages = {}

    for page, entries in page_groups.items():
        compressed_entries = []
        i = 0
        while i < len(entries):
            start_offset, fallback = entries[i]

            # Look ahead to find consecutive entries with the same fallback
            j = i + 1
            while (j < len(entries) and
                   entries[j][0] == entries[j-1][0] + 1 and  # consecutive offsets
                   entries[j][1] == fallback):               # same fallback
                j += 1

            # Calculate the range end
            end_offset = entries[j-1][0]

            # If we found a range with 3 or more entries (worth compressing)
            if j - i >= 3:
                # Add a range entry
                compressed_entries.append((start_offset, RANGE_MARKER))
                compressed_entries.append((end_offset, fallback))
            else:
                # Add the individual entries as is
                compressed_entries.extend(entries[i:j])

            i = j

        compressed_pages[page] = compressed_entries

    return compressed_pages


def cp_name(cp):
    """Get the Unicode character name for a code point.

    Args:
        cp: the code point (int).

    Returns:
        The Unicode name, or "U+XXXX" when the code point has no name.
    """
    try:
        return unicodedata.name(chr(cp))
    except ValueError:
        # Raised by unicodedata.name() for code points without a name.
        return f"U+{cp:04X}"


def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables.

    Builds the fallback map, groups it by page, compresses ranges, prints a
    short summary to stdout and writes the resulting C tables to out_file.

    Args:
        out_file: path of the C header to write.

    Raises:
        ImportError: if the unidecode module is not installed.
        OSError: if out_file cannot be written.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Only the file name goes into the header comment so that the generated
    # content does not depend on the directory the script was invoked from.
    header_name = Path(out_file).name

    # Convert compressed_pages to a sorted list of (page, entries) tuples
    sorted_pages = sorted(compressed_pages.items())

    # Create output file. The encoding is explicit so the result does not
    # depend on the locale of the machine running the script.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {header_name} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Track the start index for each page
        start_index = 0

        # Write page descriptors
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            page_hex = f"0x{page:02X}"
            f.write(f"\t/* Entries for page {page_hex} */\n")

            for offset, fallback in entries:
                # Convert to hex for better readability
                offset_hex = f"0x{offset:02X}"
                fallback_hex = f"0x{fallback:02X}"

                # Handle comments
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ {offset_hex}, {fallback_hex} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")


def main():
    """Parse the command line and generate the fallback table file."""
    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    try:
        generate_fallback_tables(out_file=args.output_file)
    except ImportError as err:
        # Missing unidecode: report it as a plain error, not a traceback.
        sys.exit(f"{this_file}: {err}")


if __name__ == "__main__":
    main()