xref: /linux/drivers/tty/vt/gen_ucs_recompose_table.py (revision 378ec25aec5a8444879f8696d580c94950a1f1df)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3#
4# Leverage Python's unicodedata module to generate ucs_recompose_table.h
5#
6# The generated table maps base character + combining mark pairs to their
7# precomposed equivalents.
8#
9# Usage:
10#   python3 gen_ucs_recompose_table.py         # Generate with common recomposition pairs
11#   python3 gen_ucs_recompose_table.py --full  # Generate with all recomposition pairs
12
13import unicodedata
14import sys
15import argparse
16import textwrap
17
# This script's file name (final path component only). It is embedded in the
# generated header's "Auto-generated by" line and in its usage hint.
from pathlib import Path
this_file = Path(__file__).name

# Default output file name, used when -o/--output is not given and as the
# default for generate_recomposition_table()'s out_file parameter.
DEFAULT_OUT_FILE = "ucs_recompose_table.h"
24
# Curated table emitted by default (i.e. without --full).
#
# Each entry is a (base, combining mark, precomposed) tuple of code points.
# Entries are grouped by script for readability and need not be in numeric
# order: generate_recomposition_table() sorts the list before writing it.
# When the default table is generated, validate_common_pairs() checks every
# entry against the pairs derived from unicodedata, so a typo here aborts
# generation instead of producing a wrong table.
#
# The description string below is interpolated into the generated header's
# comments.
common_recompose_description = "most commonly used Latin, Greek, and Cyrillic recomposition pairs only"
COMMON_RECOMPOSITION_PAIRS = [
    # Latin letters with accents - uppercase
    (0x0041, 0x0300, 0x00C0),  # A + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER A WITH GRAVE
    (0x0041, 0x0301, 0x00C1),  # A + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER A WITH ACUTE
    (0x0041, 0x0302, 0x00C2),  # A + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    (0x0041, 0x0303, 0x00C3),  # A + COMBINING TILDE = LATIN CAPITAL LETTER A WITH TILDE
    (0x0041, 0x0308, 0x00C4),  # A + COMBINING DIAERESIS = LATIN CAPITAL LETTER A WITH DIAERESIS
    (0x0041, 0x030A, 0x00C5),  # A + COMBINING RING ABOVE = LATIN CAPITAL LETTER A WITH RING ABOVE
    (0x0043, 0x0327, 0x00C7),  # C + COMBINING CEDILLA = LATIN CAPITAL LETTER C WITH CEDILLA
    (0x0045, 0x0300, 0x00C8),  # E + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER E WITH GRAVE
    (0x0045, 0x0301, 0x00C9),  # E + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER E WITH ACUTE
    (0x0045, 0x0302, 0x00CA),  # E + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    (0x0045, 0x0308, 0x00CB),  # E + COMBINING DIAERESIS = LATIN CAPITAL LETTER E WITH DIAERESIS
    (0x0049, 0x0300, 0x00CC),  # I + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER I WITH GRAVE
    (0x0049, 0x0301, 0x00CD),  # I + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER I WITH ACUTE
    (0x0049, 0x0302, 0x00CE),  # I + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    (0x0049, 0x0308, 0x00CF),  # I + COMBINING DIAERESIS = LATIN CAPITAL LETTER I WITH DIAERESIS
    (0x004E, 0x0303, 0x00D1),  # N + COMBINING TILDE = LATIN CAPITAL LETTER N WITH TILDE
    (0x004F, 0x0300, 0x00D2),  # O + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER O WITH GRAVE
    (0x004F, 0x0301, 0x00D3),  # O + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER O WITH ACUTE
    (0x004F, 0x0302, 0x00D4),  # O + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    (0x004F, 0x0303, 0x00D5),  # O + COMBINING TILDE = LATIN CAPITAL LETTER O WITH TILDE
    (0x004F, 0x0308, 0x00D6),  # O + COMBINING DIAERESIS = LATIN CAPITAL LETTER O WITH DIAERESIS
    (0x0055, 0x0300, 0x00D9),  # U + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER U WITH GRAVE
    (0x0055, 0x0301, 0x00DA),  # U + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER U WITH ACUTE
    (0x0055, 0x0302, 0x00DB),  # U + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    (0x0055, 0x0308, 0x00DC),  # U + COMBINING DIAERESIS = LATIN CAPITAL LETTER U WITH DIAERESIS
    (0x0059, 0x0301, 0x00DD),  # Y + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER Y WITH ACUTE

    # Latin letters with accents - lowercase
    (0x0061, 0x0300, 0x00E0),  # a + COMBINING GRAVE ACCENT = LATIN SMALL LETTER A WITH GRAVE
    (0x0061, 0x0301, 0x00E1),  # a + COMBINING ACUTE ACCENT = LATIN SMALL LETTER A WITH ACUTE
    (0x0061, 0x0302, 0x00E2),  # a + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER A WITH CIRCUMFLEX
    (0x0061, 0x0303, 0x00E3),  # a + COMBINING TILDE = LATIN SMALL LETTER A WITH TILDE
    (0x0061, 0x0308, 0x00E4),  # a + COMBINING DIAERESIS = LATIN SMALL LETTER A WITH DIAERESIS
    (0x0061, 0x030A, 0x00E5),  # a + COMBINING RING ABOVE = LATIN SMALL LETTER A WITH RING ABOVE
    (0x0063, 0x0327, 0x00E7),  # c + COMBINING CEDILLA = LATIN SMALL LETTER C WITH CEDILLA
    (0x0065, 0x0300, 0x00E8),  # e + COMBINING GRAVE ACCENT = LATIN SMALL LETTER E WITH GRAVE
    (0x0065, 0x0301, 0x00E9),  # e + COMBINING ACUTE ACCENT = LATIN SMALL LETTER E WITH ACUTE
    (0x0065, 0x0302, 0x00EA),  # e + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER E WITH CIRCUMFLEX
    (0x0065, 0x0308, 0x00EB),  # e + COMBINING DIAERESIS = LATIN SMALL LETTER E WITH DIAERESIS
    (0x0069, 0x0300, 0x00EC),  # i + COMBINING GRAVE ACCENT = LATIN SMALL LETTER I WITH GRAVE
    (0x0069, 0x0301, 0x00ED),  # i + COMBINING ACUTE ACCENT = LATIN SMALL LETTER I WITH ACUTE
    (0x0069, 0x0302, 0x00EE),  # i + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER I WITH CIRCUMFLEX
    (0x0069, 0x0308, 0x00EF),  # i + COMBINING DIAERESIS = LATIN SMALL LETTER I WITH DIAERESIS
    (0x006E, 0x0303, 0x00F1),  # n + COMBINING TILDE = LATIN SMALL LETTER N WITH TILDE
    (0x006F, 0x0300, 0x00F2),  # o + COMBINING GRAVE ACCENT = LATIN SMALL LETTER O WITH GRAVE
    (0x006F, 0x0301, 0x00F3),  # o + COMBINING ACUTE ACCENT = LATIN SMALL LETTER O WITH ACUTE
    (0x006F, 0x0302, 0x00F4),  # o + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER O WITH CIRCUMFLEX
    (0x006F, 0x0303, 0x00F5),  # o + COMBINING TILDE = LATIN SMALL LETTER O WITH TILDE
    (0x006F, 0x0308, 0x00F6),  # o + COMBINING DIAERESIS = LATIN SMALL LETTER O WITH DIAERESIS
    (0x0075, 0x0300, 0x00F9),  # u + COMBINING GRAVE ACCENT = LATIN SMALL LETTER U WITH GRAVE
    (0x0075, 0x0301, 0x00FA),  # u + COMBINING ACUTE ACCENT = LATIN SMALL LETTER U WITH ACUTE
    (0x0075, 0x0302, 0x00FB),  # u + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER U WITH CIRCUMFLEX
    (0x0075, 0x0308, 0x00FC),  # u + COMBINING DIAERESIS = LATIN SMALL LETTER U WITH DIAERESIS
    (0x0079, 0x0301, 0x00FD),  # y + COMBINING ACUTE ACCENT = LATIN SMALL LETTER Y WITH ACUTE
    (0x0079, 0x0308, 0x00FF),  # y + COMBINING DIAERESIS = LATIN SMALL LETTER Y WITH DIAERESIS

    # Common Greek characters
    (0x0391, 0x0301, 0x0386),  # Α + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ALPHA WITH TONOS
    (0x0395, 0x0301, 0x0388),  # Ε + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER EPSILON WITH TONOS
    (0x0397, 0x0301, 0x0389),  # Η + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ETA WITH TONOS
    (0x0399, 0x0301, 0x038A),  # Ι + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER IOTA WITH TONOS
    (0x039F, 0x0301, 0x038C),  # Ο + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMICRON WITH TONOS
    (0x03A5, 0x0301, 0x038E),  # Υ + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER UPSILON WITH TONOS
    (0x03A9, 0x0301, 0x038F),  # Ω + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMEGA WITH TONOS
    (0x03B1, 0x0301, 0x03AC),  # α + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ALPHA WITH TONOS
    (0x03B5, 0x0301, 0x03AD),  # ε + COMBINING ACUTE ACCENT = GREEK SMALL LETTER EPSILON WITH TONOS
    (0x03B7, 0x0301, 0x03AE),  # η + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ETA WITH TONOS
    (0x03B9, 0x0301, 0x03AF),  # ι + COMBINING ACUTE ACCENT = GREEK SMALL LETTER IOTA WITH TONOS
    (0x03BF, 0x0301, 0x03CC),  # ο + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMICRON WITH TONOS
    (0x03C5, 0x0301, 0x03CD),  # υ + COMBINING ACUTE ACCENT = GREEK SMALL LETTER UPSILON WITH TONOS
    (0x03C9, 0x0301, 0x03CE),  # ω + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMEGA WITH TONOS

    # Common Cyrillic characters
    (0x0418, 0x0306, 0x0419),  # И + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT I
    (0x0438, 0x0306, 0x0439),  # и + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT I
    (0x0423, 0x0306, 0x040E),  # У + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT U
    (0x0443, 0x0306, 0x045E),  # у + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT U
]
106
# Human-readable description of the table produced with --full; interpolated
# into the comments of the generated header.
full_recompose_description = "all possible recomposition pairs from the Unicode BMP"
def collect_all_recomposition_pairs():
    """Derive every (base, combining, recomposed) triple from unicodedata.

    Each named BMP code point whose canonical decomposition consists of
    exactly two code points contributes one triple, provided both parts
    are themselves in the BMP. Compatibility decompositions (those tagged
    with "<...>") and single-code-point decompositions are ignored.

    No Unicode composition-exclusion filtering is applied, so the result
    can contain pairs that NFC normalization itself would not compose.

    Returns:
        A list of (base, combining, recomposed) integer tuples sorted by
        base and then by combining mark.
    """
    # Keyed by (base, combining) so each pair maps to one recomposed value
    by_pair = {}

    # Only the BMP (0x0000-0xFFFF) is scanned so that every value of the
    # generated table fits in a uint16_t.
    for recomposed in range(0x10000):
        ch = chr(recomposed)

        # Unassigned and control characters have no name
        if not unicodedata.name(ch, ''):
            continue

        decomposition = unicodedata.decomposition(ch)
        # An empty string means no decomposition; '<' marks a compatibility one
        if not decomposition or '<' in decomposition:
            continue

        fields = decomposition.split()
        if len(fields) != 2:
            # Not a simple base + combining mark pair
            continue

        try:
            base, combining = (int(field, 16) for field in fields)
        except (ValueError, TypeError):
            continue

        if base >= 0x10000 or combining >= 0x10000:
            continue

        by_pair[(base, combining)] = recomposed

    # Tuples compare element-wise, giving the (base, combining) order that
    # a binary search over the table relies on.
    return sorted((base, combining, recomposed)
                  for (base, combining), recomposed in by_pair.items())
147
def validate_common_pairs(full_list):
    """Check COMMON_RECOMPOSITION_PAIRS against the Unicode-derived list.

    Args:
        full_list: Iterable of (base, combining, recomposed) tuples, as
            returned by collect_all_recomposition_pairs().

    Raises:
        ValueError: On the first common pair that is absent from full_list
        or whose recomposed value differs from the one in full_list. The
        same message is printed before the exception is raised.
    """
    expected = {}
    for full_base, full_combining, full_value in full_list:
        expected[(full_base, full_combining)] = full_value

    for base, combining, recomposed in COMMON_RECOMPOSITION_PAIRS:
        actual = expected.get((base, combining))
        if actual is None:
            error_msg = f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) not found in full data"
        elif actual != recomposed:
            error_msg = (f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) has different recomposition: "
                         f"0x{recomposed:04X} vs 0x{actual:04X}")
        else:
            # Entry agrees with the Unicode data
            continue

        print(error_msg)
        raise ValueError(error_msg)
167
def generate_recomposition_table(use_full_list=False, out_file=DEFAULT_OUT_FILE):
    """Generate the recomposition C table.

    Args:
        use_full_list: When true, emit every pair returned by
            collect_all_recomposition_pairs(). Otherwise emit
            COMMON_RECOMPOSITION_PAIRS after validating it against that list.
        out_file: Path of the header to write. Only its final component is
            named in the generated banner, so the file content does not
            depend on the directory the script is pointed at.

    Raises:
        ValueError: From validate_common_pairs() when a common pair
            disagrees with the Unicode data (common mode only).
    """

    # Always collected: it is either the table itself, or the reference used
    # to validate the common list and to report the alternative's size.
    full_recompose_list = collect_all_recomposition_pairs()

    # Decide which list to use
    if use_full_list:
        print("Using full recomposition list...")
        recompose_list = full_recompose_list
        table_description = full_recompose_description
        alt_list = COMMON_RECOMPOSITION_PAIRS
        alt_description = common_recompose_description
    else:
        print("Using common recomposition list...")
        # Validate that all common pairs are in the full list
        validate_common_pairs(full_recompose_list)
        # The generated header advertises the table as sorted for binary
        # search, and the hand-written list is not kept in numeric order.
        recompose_list = sorted(COMMON_RECOMPOSITION_PAIRS)
        table_description = common_recompose_description
        alt_list = full_recompose_list
        alt_description = full_recompose_description
    generation_mode = " --full" if use_full_list else ""
    alternative_mode = " --full" if not use_full_list else ""
    table_description_detail = f"{table_description} ({len(recompose_list)} entries)"
    alt_description_detail = f"{alt_description} ({len(alt_list)} entries)"

    # Calculate min/max values for boundary checks
    min_base = min(base for base, _, _ in recompose_list)
    max_base = max(base for base, _, _ in recompose_list)
    min_combining = min(combining for _, combining, _ in recompose_list)
    max_combining = max(combining for _, combining, _ in recompose_list)

    # Name the header by its file name only; out_file may be a full path
    header_name = Path(out_file).name

    # Banner paragraph, wrapped to fit inside the C block comment
    usage_note = textwrap.fill(
        f"This file contains a table with {table_description_detail}. " +
        f"To generate a table with {alt_description_detail} instead, run:",
        width=75, initial_indent=" * ", subsequent_indent=" * ")

    # Generate implementation file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {header_name} - Unicode character recomposition
 *
 * Auto-generated by {this_file}{generation_mode}
 *
 * Unicode Version: {unicodedata.unidata_version}
 *
{usage_note}
 *
 *   python3 {this_file}{alternative_mode}
 */

/*
 * Table of {table_description}
 * Sorted by base character and then combining mark for binary search
 */
static const struct ucs_recomposition ucs_recomposition_table[] = {{
""")

        for base, combining, recomposed in recompose_list:
            try:
                base_name = unicodedata.name(chr(base))
                combining_name = unicodedata.name(chr(combining))
                recomposed_name = unicodedata.name(chr(recomposed))
                comment = f"/* {base_name} + {combining_name} = {recomposed_name} */"
            except ValueError:
                # unicodedata.name() raises for unnamed code points; fall
                # back to plain U+XXXX notation in the comment.
                comment = f"/* U+{base:04X} + U+{combining:04X} = U+{recomposed:04X} */"
            f.write(f"\t{{ 0x{base:04X}, 0x{combining:04X}, 0x{recomposed:04X} }}, {comment}\n")

        f.write(f"""\
}};

/*
 * Boundary values for quick rejection
 * These are calculated by analyzing the table during generation
 */
#define UCS_RECOMPOSE_MIN_BASE  0x{min_base:04X}
#define UCS_RECOMPOSE_MAX_BASE  0x{max_base:04X}
#define UCS_RECOMPOSE_MIN_MARK  0x{min_combining:04X}
#define UCS_RECOMPOSE_MAX_MARK  0x{max_combining:04X}
""")
248
def _main():
    """Parse the command line and write the requested recomposition table."""
    cli = argparse.ArgumentParser(description="Generate Unicode recomposition table")
    cli.add_argument("--full", action="store_true",
                     help="Generate a full recomposition table (default: common pairs only)")
    cli.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                     help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    options = cli.parse_args()

    generate_recomposition_table(use_full_list=options.full, out_file=options.output_file)


if __name__ == "__main__":
    _main()
258