xref: /linux/drivers/tty/vt/gen_ucs_width_table.py (revision 378ec25aec5a8444879f8696d580c94950a1f1df)
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_width_table.h
5b11a0411SNicolas Pitre
6b11a0411SNicolas Pitreimport unicodedata
7b11a0411SNicolas Pitreimport sys
8*c2d2c5c0SNicolas Pitreimport argparse
9b11a0411SNicolas Pitre
# This script's file name (base name only); it is embedded in the
# "Auto-generated by" line of the generated header.
from pathlib import Path
this_file = Path(__file__).name

# Default output file name, used when -o/--output is not given on the
# command line and as the default for write_tables().
DEFAULT_OUT_FILE = "ucs_width_table.h"
16b11a0411SNicolas Pitre
17b11a0411SNicolas Pitre# --- Global Constants for Width Assignments ---
18b11a0411SNicolas Pitre
# Known zero-width characters
# create_width_tables() assigns width 0 to any of these code points that the
# general-category checks (combining marks, format characters) did not
# already classify.
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF   # ZERO WIDTH NO-BREAK SPACE (BOM)
)
27b11a0411SNicolas Pitre
# Zero-width emoji modifiers and components
# Each entry is an inclusive (first, last) code point range.
# NOTE: Some of these characters would normally be single-width according to
# East Asian Width properties, but we deliberately override them to be
# zero-width because they function as modifiers in emoji sequences.
# These ranges are applied before any other rule in create_width_tables(),
# so nothing later reclassifies them.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),    # Variation Selectors 1-16

    # Gender and hair style modifiers
    # These would be single-width by Unicode properties, but are zero-width
    # when part of emoji
    (0x2640, 0x2640),    # Female sign
    (0x2642, 0x2642),    # Male sign
    (0x26A7, 0x26A7),    # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, bald, white)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]
50b11a0411SNicolas Pitre
# Regional indicators (flag components)
# Inclusive (first, last) code point pair. create_width_tables() pre-assigns
# width 1 to this range before the general scan, so the scan skips it.
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z
53b11a0411SNicolas Pitre
# Double-width emoji ranges
#
# Many emoji characters are classified as single-width according to Unicode
# Standard Annex #11 East Asian Width property (N or Neutral), but we
# deliberately override them to be double-width. References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in the W3C CSS Text Module Level 3
#    specification (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
#
# Each entry is an inclusive (first, last) code point range. These ranges are
# applied last in create_width_tables() and force width 2 on every code point
# in them, except those already assigned width 0.
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]
81b11a0411SNicolas Pitre
def create_width_tables():
    """
    Build the zero-width and double-width code point range tables.

    Every code point from U+0000 to U+10FFFF is assigned a display width of
    0, 1 or 2. The rules are applied in this order:

    1. EMOJI_ZERO_WIDTH ranges are forced to width 0 and REGIONAL_INDICATORS
       to width 1. Code points set here are skipped by steps 2 and 3.
    2. Combining marks (general category M*), format characters (Cf) and
       the KNOWN_ZERO_WIDTH code points get width 0.
    3. Remaining code points get width 2 when their East Asian Width
       property is Fullwidth or Wide, and width 1 otherwise.
    4. Every code point inside EMOJI_RANGES that is not zero-width is
       forced to width 2.

    Returns:
        tuple: (zero_width_ranges, double_width_ranges), each a list of
        inclusive (start, end) code point tuples sorted in ascending order,
        with consecutive code points merged into a single range.
    """

    # Maps code points to width (0, 1, 2)
    width_map = {}

    # Mark emoji modifiers as zero-width
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Process the full Unicode range (Basic Multilingual Plane +
    # Supplementary Planes). chr() accepts every value below 0x110000, so no
    # exception handling is needed here.
    for cp in range(0x110000):
        # Skip code points already given an explicit override above
        if cp in width_map:
            continue

        char = chr(cp)
        category = unicodedata.category(char)

        # Combining marks, format characters and known zero-width characters.
        # Since we have no support for bidirectional text, all format
        # characters (category Cf) can be treated with width 0 (zero)
        # for simplicity, as they don't need to occupy visual space
        # in a non-bidirectional text environment.
        if category.startswith('M') or category == 'Cf' or cp in KNOWN_ZERO_WIDTH:
            width_map[cp] = 0
        # Use East Asian Width property: Fullwidth or Wide
        elif unicodedata.east_asian_width(char) in ('F', 'W'):
            width_map[cp] = 2
        # Narrow, Halfwidth, Neutral, Ambiguous and anything unknown
        else:
            width_map[cp] = 1

    # Process Emoji - generally double-width
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            # Don't override zero-width
            if width_map.get(cp) != 0:
                width_map[cp] = 2

    # Optimize to create range tables
    def ranges_optimize(width_data, target_width):
        """Collapse all code points of target_width into inclusive ranges."""
        points = sorted(cp for cp, width in width_data.items() if width == target_width)
        if not points:
            return []

        # Group consecutive code points into ranges
        ranges = []
        start = prev = points[0]

        for cp in points[1:]:
            if cp > prev + 1:
                # Gap found: close the current range and start a new one
                ranges.append((start, prev))
                start = cp
            prev = cp

        # Add the last range
        ranges.append((start, prev))
        return ranges

    # Extract ranges for each width
    zero_width_ranges = ranges_optimize(width_map, 0)
    double_width_ranges = ranges_optimize(width_map, 2)

    return zero_width_ranges, double_width_ranges
188b11a0411SNicolas Pitre
def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Four arrays are emitted, in this order: zero-width BMP, zero-width
    non-BMP, double-width BMP and double-width non-BMP. BMP arrays use
    struct ucs_interval16 with 4-digit hex values; non-BMP arrays use
    struct ucs_interval32 with 5-digit hex values. A range straddling
    U+FFFF/U+10000 is split between the two arrays.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE)
    """

    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
    def split_ranges_by_size(ranges):
        bmp_ranges = []
        non_bmp_ranges = []

        for start, end in ranges:
            if end <= 0xFFFF:
                bmp_ranges.append((start, end))
            elif start > 0xFFFF:
                non_bmp_ranges.append((start, end))
            else:
                # Split the range at 0xFFFF
                bmp_ranges.append((start, 0xFFFF))
                non_bmp_ranges.append((0x10000, end))

        return bmp_ranges, non_bmp_ranges

    # Function to generate code point description comments
    def get_code_point_comment(start, end):
        try:
            start_char_desc = unicodedata.name(chr(start))
            if start == end:
                return f"/* {start_char_desc} */"
            end_char_desc = unicodedata.name(chr(end))
            return f"/* {start_char_desc} - {end_char_desc} */"
        except (ValueError, OverflowError):
            # unicodedata.name() raises ValueError for unnamed code points;
            # fall back to the numeric form for the whole range.
            if start == end:
                return f"/* U+{start:04X} */"
            return f"/* U+{start:04X} - U+{end:04X} */"

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)

    bmp_scope = "BMP - Basic Multilingual Plane, U+0000 to U+FFFF"
    non_bmp_scope = "non-BMP, U+10000 and above"

    # One entry per emitted array:
    # (comment title, C struct type, C array name, ranges, hex digit count)
    tables = (
        (f"Zero-width character ranges ({bmp_scope})",
         "ucs_interval16", "ucs_zero_width_bmp_ranges", zero_width_bmp, 4),
        (f"Zero-width character ranges ({non_bmp_scope})",
         "ucs_interval32", "ucs_zero_width_non_bmp_ranges", zero_width_non_bmp, 5),
        (f"Double-width character ranges ({bmp_scope})",
         "ucs_interval16", "ucs_double_width_bmp_ranges", double_width_bmp, 4),
        (f"Double-width character ranges ({non_bmp_scope})",
         "ucs_interval32", "ucs_double_width_non_bmp_ranges", double_width_non_bmp, 5),
    )

    # Generate C tables
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */
""")

        for title, struct_name, table_name, ranges, digits in tables:
            # The leading newline separates each array from what precedes it
            f.write(f"\n/* {title} */\n")
            f.write(f"static const struct {struct_name} {table_name}[] = {{\n")
            for start, end in ranges:
                comment = get_code_point_comment(start, end)
                f.write(f"\t{{ 0x{start:0{digits}X}, 0x{end:0{digits}X} }}, {comment}\n")
            f.write("};\n")
289b11a0411SNicolas Pitre
if __name__ == "__main__":
    # Command line: a single optional -o/--output argument
    arg_parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    arg_parser.add_argument(
        "-o", "--output",
        dest="output_file",
        default=DEFAULT_OUT_FILE,
        help=f"Output file name (default: {DEFAULT_OUT_FILE})",
    )
    args = arg_parser.parse_args()

    # Build the tables and emit the header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Report how many ranges and code points each table covers
    print(f"Generated {args.output_file} with:")
    for label, ranges in (("zero-width", zero_width_ranges),
                          ("double-width", double_width_ranges)):
        # Ranges are inclusive, hence the +1 per range
        covered = sum(last - first + 1 for first, last in ranges)
        print(f"- {len(ranges)} {label} ranges covering ~{covered} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")
308