xref: /linux/drivers/tty/vt/gen_ucs_width_table.py (revision 255dc0ec0b79c354bff017f6d6202adaa092a1c9)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3#
4# Leverage Python's unicodedata module to generate ucs_width_table.h
5
6import unicodedata
7import sys
8import argparse
9
10# This script's file name
11from pathlib import Path
12this_file = Path(__file__).name
13
14# Default output file name
15DEFAULT_OUT_FILE = "ucs_width_table.h"
16
17# --- Global Constants for Width Assignments ---
18
19# Known zero-width characters
20KNOWN_ZERO_WIDTH = (
21    0x200B,  # ZERO WIDTH SPACE
22    0x200C,  # ZERO WIDTH NON-JOINER
23    0x200D,  # ZERO WIDTH JOINER
24    0x2060,  # WORD JOINER
25    0xFEFF   # ZERO WIDTH NO-BREAK SPACE (BOM)
26)
27
28# Zero-width emoji modifiers and components
29# NOTE: Some of these characters would normally be single-width according to
30# East Asian Width properties, but we deliberately override them to be
31# zero-width because they function as modifiers in emoji sequences.
32EMOJI_ZERO_WIDTH = [
33    # Skin tone modifiers
34    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)
35
36    # Variation selectors (note: VS16 is treated specially in vt.c)
37    (0xFE00, 0xFE0F),    # Variation Selectors 1-16
38
39    # Gender and hair style modifiers
40    # These would be single-width by Unicode properties, but are zero-width
41    # when part of emoji
42    (0x2640, 0x2640),    # Female sign
43    (0x2642, 0x2642),    # Male sign
44    (0x26A7, 0x26A7),    # Transgender symbol
45    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, white, bald)
46
47    # Tag characters
48    (0xE0020, 0xE007E),  # Tags
49]
50
51# Regional indicators (flag components)
52REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z
53
54# Double-width emoji ranges
55#
56# Many emoji characters are classified as single-width according to Unicode
57# Standard Annex #11 East Asian Width property (N or Neutral), but we
58# deliberately override them to be double-width. References:
59# 1. Unicode Technical Standard #51: Unicode Emoji
60#    (https://www.unicode.org/reports/tr51/)
61# 2. Principle of "emoji presentation" in WHATWG CSS Text specification
62#    (https://drafts.csswg.org/css-text-3/#character-properties)
63# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
64#    universally render emoji as double-width characters regardless of their
65#    Unicode EAW property
66# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
67#    Emoji width (https://www.w3.org/TR/jlreq/)
68EMOJI_RANGES = [
69    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
70    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
71    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
72    (0x1F600, 0x1F64F),  # Emoticons
73    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
74    (0x1F700, 0x1F77F),  # Alchemical Symbols
75    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
76    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
77    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
78    (0x1FA00, 0x1FA6F),  # Chess Symbols
79    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
80]
81
82def create_width_tables():
83    """
84    Creates Unicode character width tables and returns the data structures.
85
86    Returns:
87        tuple: (zero_width_ranges, double_width_ranges)
88    """
89
90    # Width data mapping
91    width_map = {}  # Maps code points to width (0, 1, 2)
92
93    # Mark emoji modifiers as zero-width
94    for start, end in EMOJI_ZERO_WIDTH:
95        for cp in range(start, end + 1):
96            width_map[cp] = 0
97
98    # Mark all regional indicators as single-width as they are usually paired
99    # providing a combined width of 2 when displayed together.
100    start, end = REGIONAL_INDICATORS
101    for cp in range(start, end + 1):
102        width_map[cp] = 1
103
104    # Process all assigned Unicode code points (Basic Multilingual Plane +
105    # Supplementary Planes) Range 0x0 to 0x10FFFF (the full Unicode range)
106    for block_start in range(0, 0x110000, 0x1000):
107        block_end = block_start + 0x1000
108        for cp in range(block_start, block_end):
109            try:
110                char = chr(cp)
111
112                # Skip if already processed
113                if cp in width_map:
114                    continue
115
116                # Check for combining marks and a format characters
117                category = unicodedata.category(char)
118
119                # Combining marks
120                if category.startswith('M'):
121                    width_map[cp] = 0
122                    continue
123
124                # Format characters
125                # Since we have no support for bidirectional text, all format
126                # characters (category Cf) can be treated with width 0 (zero)
127                # for simplicity, as they don't need to occupy visual space
128                # in a non-bidirectional text environment.
129                if category == 'Cf':
130                    width_map[cp] = 0
131                    continue
132
133                # Known zero-width characters
134                if cp in KNOWN_ZERO_WIDTH:
135                    width_map[cp] = 0
136                    continue
137
138                # Use East Asian Width property
139                eaw = unicodedata.east_asian_width(char)
140                if eaw in ('F', 'W'):  # Fullwidth or Wide
141                    width_map[cp] = 2
142                elif eaw in ('Na', 'H', 'N', 'A'):  # Narrow, Halfwidth, Neutral, Ambiguous
143                    width_map[cp] = 1
144                else:
145                    # Default to single-width for unknown
146                    width_map[cp] = 1
147
148            except (ValueError, OverflowError):
149                # Skip invalid code points
150                continue
151
152    # Process Emoji - generally double-width
153    for start, end in EMOJI_RANGES:
154        for cp in range(start, end + 1):
155            if cp not in width_map or width_map[cp] != 0:  # Don't override zero-width
156                try:
157                    char = chr(cp)
158                    width_map[cp] = 2
159                except (ValueError, OverflowError):
160                    continue
161
162    # Optimize to create range tables
163    def ranges_optimize(width_data, target_width):
164        points = sorted([cp for cp, width in width_data.items() if width == target_width])
165        if not points:
166            return []
167
168        # Group consecutive code points into ranges
169        ranges = []
170        start = points[0]
171        prev = start
172
173        for cp in points[1:]:
174            if cp > prev + 1:
175                ranges.append((start, prev))
176                start = cp
177            prev = cp
178
179        # Add the last range
180        ranges.append((start, prev))
181        return ranges
182
183    # Extract ranges for each width
184    zero_width_ranges = ranges_optimize(width_map, 0)
185    double_width_ranges = ranges_optimize(width_map, 2)
186
187    return zero_width_ranges, double_width_ranges
188
189def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
190    """
191    Write the generated tables to C header file.
192
193    The output uses a single sorted-by-`first` table per region (BMP and
194    non-BMP), with zero-width and double-width ranges merged together. The
195    non-BMP table also hosts the BMP double-width bitmap in spare bits of
196    `last`. See the encoding comment at the top of ucs.c for the layout.
197
198    Args:
199        zero_width_ranges: List of (start, end) ranges for zero-width characters
200        double_width_ranges: List of (start, end) ranges for double-width characters
201        out_file: Output file name (default: DEFAULT_OUT_FILE)
202    """
203
204    # Bits per BMP-bitmap chunk hosted in one non-BMP entry's `last` field.
205    # 8 bits makes `idx / BITS_PER_CHUNK` / `idx % BITS_PER_CHUNK` compile to
206    # a cheap shift+mask in the lookup. The chunk size is also emitted as
207    # UCS_NONBMP_BMP_BITS in the generated header so ucs.c stays in sync.
208    BITS_PER_CHUNK = 8
209
210    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
211    def split_ranges_by_size(ranges):
212        bmp_ranges = []
213        non_bmp_ranges = []
214
215        for start, end in ranges:
216            if end <= 0xFFFF:
217                bmp_ranges.append((start, end))
218            elif start > 0xFFFF:
219                non_bmp_ranges.append((start, end))
220            else:
221                # Split the range at 0xFFFF
222                bmp_ranges.append((start, 0xFFFF))
223                non_bmp_ranges.append((0x10000, end))
224
225        return bmp_ranges, non_bmp_ranges
226
227    # Split ranges into BMP and non-BMP
228    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
229    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)
230
231    # Merge zero- and double-width ranges per region, tagging each with its
232    # width, then sort by `first` so binary search works on the union.
233    bmp_entries = sorted(
234        [(s, e, 0) for s, e in zero_width_bmp] +
235        [(s, e, 2) for s, e in double_width_bmp],
236        key=lambda t: t[0])
237    nonbmp_entries = sorted(
238        [(s, e, 0) for s, e in zero_width_non_bmp] +
239        [(s, e, 2) for s, e in double_width_non_bmp],
240        key=lambda t: t[0])
241
242    # Build the BMP double-width bitmap: one bit per BMP entry (in sort
243    # order), set iff that entry is double-width. Pack into BITS_PER_CHUNK-
244    # wide chunks, with bit j of the chunk corresponding to entry
245    # (chunk_index * BITS_PER_CHUNK + j).
246    bmp_w2_bits = [1 if w == 2 else 0 for _, _, w in bmp_entries]
247    n_chunks = (len(bmp_w2_bits) + BITS_PER_CHUNK - 1) // BITS_PER_CHUNK
248
249    if n_chunks > len(nonbmp_entries):
250        raise RuntimeError(
251            f"BMP bitmap needs {n_chunks} host entries, "
252            f"but only {len(nonbmp_entries)} non-BMP entries are available")
253
254    chunks = []  # list of (base_index, end_index, packed_value)
255    for c in range(n_chunks):
256        base = c * BITS_PER_CHUNK
257        end_idx = min(base + BITS_PER_CHUNK - 1, len(bmp_w2_bits) - 1)
258        value = 0
259        for j in range(BITS_PER_CHUNK):
260            k = base + j
261            if k < len(bmp_w2_bits) and bmp_w2_bits[k]:
262                value |= 1 << j
263        chunks.append((base, end_idx, value))
264
265    # Function to generate code point description comments
266    def get_code_point_comment(start, end):
267        try:
268            start_char_desc = unicodedata.name(chr(start))
269            if start == end:
270                return f"/* {start_char_desc} */"
271            else:
272                end_char_desc = unicodedata.name(chr(end))
273                return f"/* {start_char_desc} - {end_char_desc} */"
274        except:
275            if start == end:
276                return f"/* U+{start:04X} */"
277            else:
278                return f"/* U+{start:04X} - U+{end:04X} */"
279
280    # Generate C tables
281    with open(out_file, 'w') as f:
282        f.write(f"""\
283/* SPDX-License-Identifier: GPL-2.0 */
284/*
285 * {out_file} - Unicode character width
286 *
287 * Auto-generated by {this_file}
288 *
289 * Unicode Version: {unicodedata.unidata_version}
290 *
291 * Zero-width and double-width ranges are merged into one sorted-by-`first`
292 * table per region. The non-BMP table additionally hosts the BMP
293 * double-width bitmap in the low {BITS_PER_CHUNK} bits of `last` of its
294 * first {n_chunks} entries (covering {len(bmp_w2_bits)} BMP entries).
295 * See ucs.c for the encoding details and the lookup code.
296 */
297
298/* Bits per BMP-bitmap chunk hosted in one non-BMP entry's `last` field. */
299#define UCS_NONBMP_BMP_BITS {BITS_PER_CHUNK}
300
301/* Combined zero- and double-width ranges
302 * (BMP - Basic Multilingual Plane, U+0000 to U+FFFF). */
303static const struct ucs_width16 ucs_bmp_ranges[] = {{
304""")
305
306        for s, e, w in bmp_entries:
307            macro = "BMP_0WIDTH" if w == 0 else "BMP_2WIDTH"
308            comment = get_code_point_comment(s, e)
309            f.write(f"\t{{ {macro}(0x{s:04X}, 0x{e:04X}) }}, {comment}\n")
310
311        f.write(f"""\
312}};
313
314/* Combined zero- and double-width ranges (non-BMP, U+10000 and above).
315 * The first {n_chunks} entries host the BMP double-width bitmap in the low
316 * {BITS_PER_CHUNK} bits of `last`. */
317static const struct ucs_width32 ucs_nonbmp_ranges[] = {{
318""")
319
320        for i, (s, e, w) in enumerate(nonbmp_entries):
321            macro = "RANGE_0WIDTH" if w == 0 else "RANGE_2WIDTH"
322            comment = get_code_point_comment(s, e)
323            if i < len(chunks):
324                base, end_idx, value = chunks[i]
325                f.write(
326                    f"\t{{ {macro}(0x{s:05X}, 0x{e:05X})   {comment}\n"
327                    f"\t  | BMP_2W_BITS(0b{value:0{BITS_PER_CHUNK}b}) }},"
328                    f" /* BMP entries [{base:>3}..{end_idx:>3}] */\n")
329            else:
330                f.write(f"\t{{ {macro}(0x{s:05X}, 0x{e:05X}) }}, {comment}\n")
331
332        f.write("};\n")
333
334if __name__ == "__main__":
335    # Parse command line arguments
336    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
337    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
338                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
339    args = parser.parse_args()
340
341    # Write tables to header file
342    zero_width_ranges, double_width_ranges = create_width_tables()
343    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)
344
345    # Print summary
346    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
347    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
348    n_zero = len(zero_width_ranges)
349    n_double = len(double_width_ranges)
350    print(f"Generated {args.output_file} with:")
351    print(f"- {n_zero} zero-width ranges covering ~{zero_width_count} code points")
352    print(f"- {n_double} double-width ranges covering ~{double_width_count} code points")
353    print(f"- {n_zero + n_double} merged ranges total")
354    print(f"- Unicode Version: {unicodedata.unidata_version}")
355