xref: /linux/drivers/tty/vt/gen_ucs_fallback_table.py (revision 378ec25aec5a8444879f8696d580c94950a1f1df)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py         # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE # Specify output file
125071ddc1SNicolas Pitre
135071ddc1SNicolas Pitreimport unicodedata
145071ddc1SNicolas Pitrefrom unidecode import unidecode
155071ddc1SNicolas Pitreimport sys
165071ddc1SNicolas Pitreimport argparse
175071ddc1SNicolas Pitrefrom collections import defaultdict
185071ddc1SNicolas Pitre
# Record the installed unidecode version so it can be reported in the
# generated header. This is best-effort: any ordinary failure (old Python
# without importlib.metadata, package metadata not found, ...) falls back to
# 'unknown'. Catch Exception rather than using a bare "except:" so that
# KeyboardInterrupt and SystemExit are not swallowed.
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    unidecode_version = 'unknown'

# This script's file name (base name only, without the directory)
from pathlib import Path
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Fallback value that marks the first entry of a compressed range.
# It is also emitted into the header as UCS_PAGE_ENTRY_RANGE_MARKER.
RANGE_MARKER = 0x00
355071ddc1SNicolas Pitre
def generate_fallback_map():
    """Build the code point -> fallback character map for the BMP.

    Every named code point from U+0080 to U+FFFF is transliterated with
    unidecode; a mapping is kept only when the transliteration is exactly
    one character long.  The manual overrides from get_special_overrides()
    are merged in last, so they replace or extend what unidecode produced.

    Returns:
        dict mapping an int code point to the int code of its fallback
        character.  Override entries whose value is 0 are still present here.
    """
    mapping = {}

    # ASCII (0x00-0x7F) needs no fallback; stop at the end of the BMP to
    # keep the table size manageable.
    for codepoint in range(0x0080, 0x10000):
        ch = chr(codepoint)

        # Unassigned and control characters have no name: leave them out.
        try:
            has_name = bool(unicodedata.name(ch, ''))
        except ValueError:
            has_name = False
        if not has_name:
            continue

        transliteration = unidecode(ch)

        # Multi-character or empty transliterations cannot be used as a
        # single-cell fallback.
        if len(transliteration) != 1:
            continue
        mapping[codepoint] = ord(transliteration)

    # Manual special cases take precedence over unidecode's answers.
    mapping.update(get_special_overrides())

    return mapping
625071ddc1SNicolas Pitre
def get_special_overrides():
    """Return the hand-maintained overrides layered on top of unidecode.

    These cover code points where unidecode's answer is unsuitable for a
    single-cell terminal fallback (several characters, an empty string, or a
    less meaningful character), plus code points that must be left out of the
    table altogether.

    Returns:
        dict mapping an int code point to the int code of its fallback
        character.  A value of 0 marks a code point to exclude; those are
        filtered out in organize_by_pages().
    """
    table = {}

    def put(char, *codepoints):
        """Map every given code point to the single character *char*."""
        value = ord(char)
        for codepoint in codepoints:
            table[codepoint] = value

    def exclude(*codepoints):
        """Mark every given code point as excluded (fallback value 0)."""
        for codepoint in codepoints:
            table[codepoint] = 0

    # Ligatures and sharp s: unidecode gives two characters, but a terminal
    # fallback must be exactly one.
    put('E', 0x00C6)  # LATIN CAPITAL LETTER AE (unidecode: "AE")
    put('e', 0x00E6)  # LATIN SMALL LETTER AE (unidecode: "ae")
    put('E', 0x0152)  # LATIN CAPITAL LIGATURE OE (unidecode: "OE")
    put('e', 0x0153)  # LATIN SMALL LIGATURE OE (unidecode: "oe")
    put('s', 0x00DF)  # LATIN SMALL LETTER SHARP S (unidecode: "ss")

    # Comparison operators rendered by unidecode as two characters
    put('<', 0x2264)  # LESS-THAN OR EQUAL TO (unidecode: "<=")
    put('>', 0x2265)  # GREATER-THAN OR EQUAL TO (unidecode: ">=")

    # unidecode returns an empty string for these
    put('#', 0x2260)  # NOT EQUAL TO
    put('#', *range(0x2596, 0x259F + 1))  # quadrant block characters

    # Directional arrows: the pointing direction is more useful than
    # unidecode's "-" and "|"
    put('>', 0x2192)  # RIGHTWARDS ARROW (unidecode: "-")
    put('<', 0x2190)  # LEFTWARDS ARROW (unidecode: "-")
    put('^', 0x2191)  # UPWARDS ARROW (unidecode: "|")
    put('v', 0x2193)  # DOWNWARDS ARROW (unidecode: "|")

    # Double arrows, same directional mapping
    put('<', 0x21D0)  # LEFTWARDS DOUBLE ARROW
    put('^', 0x21D1)  # UPWARDS DOUBLE ARROW
    put('>', 0x21D2)  # RIGHTWARDS DOUBLE ARROW
    put('v', 0x21D3)  # DOWNWARDS DOUBLE ARROW

    # Halfwidth arrows get the same treatment as the normal-width ones
    put('<', 0xFFE9)  # HALFWIDTH LEFTWARDS ARROW (unidecode: "-")
    put('^', 0xFFEA)  # HALFWIDTH UPWARDS ARROW (unidecode: "|")
    put('>', 0xFFEB)  # HALFWIDTH RIGHTWARDS ARROW (unidecode: "-")
    put('v', 0xFFEC)  # HALFWIDTH DOWNWARDS ARROW (unidecode: "|")

    # Currency symbols, each mapped to a representative letter
    put('c', 0x00A2)  # CENT SIGN
    put('L', 0x00A3)  # POUND SIGN
    put('Y', 0x00A5)  # YEN SIGN
    put('E', 0x20AC)  # EURO SIGN

    # Symbols mapped to letters
    put('S', 0x00A7)  # SECTION SIGN
    put('C', 0x00A9)  # COPYRIGHT SIGN
    put('R', 0x00AE)  # REGISTERED SIGN
    put('T', 0x2122)  # TRADE MARK SIGN

    # Degree-related symbols
    put('o', 0x00B0)  # DEGREE SIGN
    put('C', 0x2103)  # DEGREE CELSIUS
    put('F', 0x2109)  # DEGREE FAHRENHEIT

    # Angle quotation marks
    put('<', 0x00AB)  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    put('>', 0x00BB)  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK

    # Operators with a circular shape
    put('o', 0x2218)  # RING OPERATOR
    put('.', 0x2219)  # BULLET OPERATOR

    # Negated mathematical symbols -> '!' (semantically "not")
    put('!', 0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F,
        0x2280, 0x2281, 0x2284, 0x2285)

    # Negated equality-like symbols -> '#' (semantically "not equal")
    put('#', 0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D,
        0x228A, 0x228B)

    # Negated arrows -> '!'
    put('!', 0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF)

    # Dashes and hyphens
    put('-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052)

    # Question mark punctuation
    put('?', 0x203D, 0x2047, 0x2048)

    # Exclamation mark punctuation
    put('!', 0x203C, 0x2049)

    # Asterisk-like punctuation
    put('*', 0x2042, 0x2051, 0x2055)

    # Other punctuation with individual mappings
    put('"', 0x201E)  # DOUBLE LOW-9 QUOTATION MARK
    put('>', 0x2023)  # TRIANGULAR BULLET
    put('.', 0x2026)  # HORIZONTAL ELLIPSIS
    put('"', 0x2033)  # DOUBLE PRIME
    put('P', 0x204B)  # REVERSED PILCROW SIGN
    put('<', 0x204C)  # BLACK LEFTWARDS BULLET
    put('>', 0x204D)  # BLACK RIGHTWARDS BULLET
    put(';', 0x204F)  # REVERSED SEMICOLON
    put(':', 0x205B)  # FOUR DOT MARK

    # Check marks
    put('v', 0x2713)  # CHECK MARK
    put('V', 0x2714)  # HEAVY CHECK MARK

    # X marks: lowercase for regular, uppercase for heavy
    put('x', 0x2715, 0x2717)
    put('X', 0x2716, 0x2718)

    # Stars and asterisk-like symbols -> '*'
    put('*', 0x2605, 0x2606, 0x262A, 0x269D, 0x2698)
    put('*', *range(0x2721, 0x2746 + 1))
    put('*', *range(0x2749, 0x274B + 1))
    put('*', 0x229B, 0x22C6, 0x235F, 0x2363)  # star operators

    # Exclusions (fallback value 0), filtered out in organize_by_pages().

    # LINE SEPARATOR (unidecode: '\n')
    exclude(0x2028)

    # Full-width forms of printable ASCII: U+FF01 (!) to U+FF5E (~) correspond
    # to ASCII 33 (!) to 126 (~).  They are excluded here to reduce the table
    # size; it is more efficient to process them programmatically in
    # ucs.c:ucs_get_fallback().
    exclude(*range(0xFF01, 0xFF5E + 1))

    return table
2095071ddc1SNicolas Pitre
def organize_by_pages(fallback_map):
    """Group the fallback mappings by page (the high byte of the code point).

    Args:
        fallback_map: dict mapping an int code point to an int fallback
            character code.  Entries whose fallback is 0 are treated as
            exclusions and dropped.

    Returns:
        defaultdict(list) mapping a page number to its list of
        (offset, fallback) tuples, where offset is the low byte of the code
        point and each list is sorted in ascending order.
    """
    pages = defaultdict(list)

    for codepoint, replacement in fallback_map.items():
        # A fallback of 0 marks an excluded character: keep everything else.
        if replacement != 0:
            high_byte = codepoint >> 8
            low_byte = codepoint & 0xFF
            pages[high_byte].append((low_byte, replacement))

    # Put each page's entries in ascending offset order.
    for bucket in pages.values():
        bucket.sort()

    return pages
2285071ddc1SNicolas Pitre
def compress_ranges(page_groups):
    """Collapse runs of consecutive offsets sharing one fallback into ranges.

    A run of 3 or more entries with consecutive offsets and the same fallback
    is replaced by two entries: (first_offset, RANGE_MARKER) followed by
    (last_offset, fallback).  Shorter runs are kept entry by entry.

    Args:
        page_groups: mapping of page number to a list of (offset, fallback)
            tuples sorted by offset.

    Returns:
        dict mapping each page number to its compressed list of
        (offset, fallback) tuples.
    """
    packed_pages = {}

    for page, entries in page_groups.items():
        # First pass: split the page into maximal runs in which every offset
        # follows the previous one and every fallback matches the run's first.
        runs = []
        for entry in entries:
            offset, fallback = entry
            if runs:
                current = runs[-1]
                if current[-1][0] + 1 == offset and current[0][1] == fallback:
                    current.append(entry)
                    continue
            runs.append([entry])

        # Second pass: a run only pays off as a range from 3 entries up,
        # since a range itself costs 2 entries.
        packed = []
        for run in runs:
            if len(run) >= 3:
                packed.append((run[0][0], RANGE_MARKER))
                packed.append((run[-1][0], run[0][1]))
            else:
                packed.extend(run)

        packed_pages[page] = packed

    return packed_pages
2665071ddc1SNicolas Pitre
def cp_name(cp):
    """Get the Unicode character name for a code point.

    Args:
        cp: Integer code point.

    Returns:
        The name reported by unicodedata for the character, or the string
        "U+XXXX" (upper-case hex, zero-padded to at least 4 digits) when the
        code point has no name or is outside the range chr() accepts.
    """
    try:
        return unicodedata.name(chr(cp))
    except (ValueError, OverflowError):
        # ValueError: unnamed character, or cp rejected by chr().
        # OverflowError: cp too large for chr() to even range-check.
        # Only these are caught, so that KeyboardInterrupt and genuine
        # programming errors are not silently hidden.
        return f"U+{cp:04X}"
2735071ddc1SNicolas Pitre
def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables and write them as a C header.

    The map is built with generate_fallback_map(), grouped with
    organize_by_pages() and compressed with compress_ranges().  The header
    then receives two arrays: ucs_fallback_pages, holding one
    "{ page, count, start }" descriptor per page (start being the index of
    the page's first entry in the entries array), and ucs_fallback_entries,
    holding every "{ offset, fallback }" pair page by page.  The
    UCS_PAGE_ENTRY_RANGE_MARKER define is written last.  Three progress
    counts are printed to standard output along the way.

    Args:
        out_file: Path of the header file to create or overwrite.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Pages are emitted in ascending page-number order in both arrays.
    sorted_pages = sorted(compressed_pages.items())

    # Explicit encoding so the result does not depend on the locale.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Write page descriptors, tracking where each page's entries start
        # in the flat entries array.
        start_index = 0
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        for page, entries in sorted_pages:
            f.write(f"\t/* Entries for page 0x{page:02X} */\n")

            for offset, fallback in entries:
                # Rebuild the full code point so the comment can name it.
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    # First half of a range: the fallback is on the next entry.
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ 0x{offset:02X}, 0x{fallback:02X} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")
3535071ddc1SNicolas Pitre
if __name__ == "__main__":
    # Command-line entry point: the only option selects the output file.
    cli = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    cli.add_argument(
        "-o", "--output",
        dest="output_file",
        default=DEFAULT_OUT_FILE,
        help=f"Output file name (default: {DEFAULT_OUT_FILE})",
    )
    options = cli.parse_args()

    generate_fallback_tables(out_file=options.output_file)
361