#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py         # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE # Specify output file

import argparse
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

try:
    from unidecode import unidecode
except ImportError:
    # Keep the module importable (e.g. for --help or for reusing the pure
    # helpers below); generate_fallback_map() reports the missing dependency.
    unidecode = None

# Try to get unidecode version (best effort: only used in the output header)
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    unidecode_version = 'unknown'

# This script's file name
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Define the range marker value
RANGE_MARKER = 0x00


def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points.

    Returns a dict mapping BMP code points (0x0080-0xFFFF) to the ordinal of
    their single-character fallback.  Entries coming from
    get_special_overrides() take precedence over unidecode's output; an
    override value of 0 marks a code point as excluded.

    Raises ImportError if the unidecode module is not installed.
    """
    if unidecode is None:
        raise ImportError(
            "the 'unidecode' Python module is required to generate the fallback table")

    fallback_map = {}

    # Process BMP characters (0x0000 - 0xFFFF) to keep table size manageable
    for cp in range(0x0080, 0x10000):  # Skip ASCII range (0x00-0x7F)
        char = chr(cp)

        # Skip unassigned/control characters (they have no Unicode name)
        if not unicodedata.name(char, ''):
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map


def get_special_overrides():
    """Get special case overrides that need different handling than unidecode
    provides... or doesn't provide at all.

    Returns a dict mapping code points to the ordinal of their fallback
    character.  A value of 0 means "exclude this code point from the table".
    """

    overrides = {}

    # Multi-character unidecode output
    # These map to single chars instead of unidecode's multiple-char mappings
    # In a terminal fallback context, we need a single character rather than multiple
    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
    overrides[0x0153] = ord('e')  # œ LATIN SMALL LIGATURE OE -> e (unidecode: "oe")
    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")

    # Comparison operators that unidecode renders as multiple characters
    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")

    # Unidecode returns an empty string for these
    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)

    # Quadrant block characters that unidecode doesn't map
    for cp in range(0x2596, 0x259F+1):
        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)

    # Directional arrows
    # These provide better semantic meaning than unidecode's mappings
    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")

    # Double arrows with their directional semantic mappings
    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v

    # Halfwidth arrows
    # These need the same treatment as their normal-width counterparts
    overrides[0xFFE9] = ord('<')  # ← HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0xFFEA] = ord('^')  # ↑ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0xFFEB] = ord('>')  # → HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0xFFEC] = ord('v')  # ↓ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")

    # Currency symbols - each mapped to a representative letter
    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E

    # Symbols mapped to letters
    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T

    # Degree-related symbols
    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F

    # Angle quotation marks
    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >

    # Operators with circular shape
    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .

    # Negated mathematical symbols (preserving the negation semantics)
    # Negated symbols mapped to exclamation mark (semantically "not")
    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)

    # Negated symbols mapped to hash sign (semantically "not equal")
    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)

    # Negated arrows - all mapped to exclamation mark
    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
        overrides[cp] = ord('!')  # Negated arrows -> ! (not)

    # Dashes and hyphens
    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
        overrides[cp] = ord('-')  # Dashes and hyphens -> -

    # Question mark punctuation
    for cp in (0x203D, 0x2047, 0x2048):
        overrides[cp] = ord('?')  # Question marks -> ?

    # Exclamation mark punctuation
    for cp in (0x203C, 0x2049):
        overrides[cp] = ord('!')  # Exclamation marks -> !

    # Asterisk-like symbols
    for cp in (0x2042, 0x2051, 0x2055):
        overrides[cp] = ord('*')

    # Other specific punctuation with unique mappings
    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK

    # Check marks
    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK

    # X marks - lowercase for regular, uppercase for heavy
    for cp in (0x2715, 0x2717):
        overrides[cp] = ord('x')  # Regular X marks -> x
    for cp in (0x2716, 0x2718):
        overrides[cp] = ord('X')  # Heavy X marks -> X

    # Stars and asterisk-like symbols mapped to '*'
    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2721, 0x2746+1):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2749, 0x274B+1):
        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
        overrides[cp] = ord('*')  # Star operators -> *

    # Special exclusions with fallback value of 0
    # These will be filtered out in organize_by_pages()

    # Exclude U+2028 (LINE SEPARATOR)
    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')

    # Full-width to ASCII mapping (covering all printable ASCII 33-126)
    # 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
    # Those are excluded here to reduce the table size.
    # It is more efficient to process them programmatically in
    # ucs.c:ucs_get_fallback().
    for cp in range(0xFF01, 0xFF5E + 1):
        overrides[cp] = 0  # Double-width ASCII characters

    return overrides


def organize_by_pages(fallback_map):
    """Organize the fallback mappings by their high byte (page).

    Takes a dict of code point -> fallback ordinal and returns a
    defaultdict(list) mapping each page (code point >> 8) to a list of
    (offset, fallback) tuples sorted by offset, where offset is the low byte
    of the code point.  Entries whose fallback is 0 are dropped.
    """
    # Group by high byte (page)
    page_groups = defaultdict(list)
    for code, fallback in fallback_map.items():
        # Skip characters with fallback value of 0 (excluded characters)
        if fallback == 0:
            continue

        page = code >> 8      # Get the high byte (page)
        offset = code & 0xFF  # Get the low byte (offset within page)
        page_groups[page].append((offset, fallback))

    # Sort each page's entries by offset
    for entries in page_groups.values():
        entries.sort()

    return page_groups


def compress_ranges(page_groups):
    """Compress consecutive entries with the same fallback character into ranges.
    A range is only compressed if it contains 3 or more consecutive entries.

    Takes a mapping of page -> sorted list of (offset, fallback) tuples and
    returns a dict with the same keys.  Each compressed run is replaced by two
    entries: (first_offset, RANGE_MARKER) followed by (last_offset, fallback).
    Shorter runs are copied through unchanged.
    """

    compressed_pages = {}

    for page, entries in page_groups.items():
        compressed_entries = []
        i = 0
        while i < len(entries):
            start_offset, fallback = entries[i]

            # Look ahead to find consecutive entries with the same fallback
            j = i + 1
            while (j < len(entries) and
                   entries[j][0] == entries[j-1][0] + 1 and  # consecutive offsets
                   entries[j][1] == fallback):               # same fallback
                j += 1

            # Calculate the range end
            end_offset = entries[j-1][0]

            # If we found a range with 3 or more entries (worth compressing)
            if j - i >= 3:
                # Add a range entry
                compressed_entries.append((start_offset, RANGE_MARKER))
                compressed_entries.append((end_offset, fallback))
            else:
                # Add the individual entries as is
                compressed_entries.extend(entries[i:j])

            i = j

        compressed_pages[page] = compressed_entries

    return compressed_pages


def cp_name(cp):
    """Get the Unicode character name for a code point.

    Falls back to the "U+XXXX" notation when the code point has no name.
    """
    try:
        return unicodedata.name(chr(cp))
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed code points
        return f"U+{cp:04X}"


def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables.

    Builds the fallback map, groups it by page, compresses runs into ranges
    and writes the resulting C tables to out_file.  Progress statistics are
    printed to standard output.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Create output file
    with open(out_file, 'w', encoding="utf-8") as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Convert compressed_pages to a sorted list of (page, entries) tuples
        sorted_pages = sorted(compressed_pages.items())

        # Track the start index for each page
        start_index = 0

        # Write page descriptors
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            page_hex = f"0x{page:02X}"
            f.write(f"\t/* Entries for page {page_hex} */\n")

            for offset, fallback in entries:
                # Convert to hex for better readability
                offset_hex = f"0x{offset:02X}"
                fallback_hex = f"0x{fallback:02X}"

                # Handle comments
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ {offset_hex}, {fallback_hex} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")


def main():
    """Parse the command line and generate the fallback table file."""
    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    generate_fallback_tables(out_file=args.output_file)


if __name__ == "__main__":
    sys.exit(main())