#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_width_table.h

import argparse
import sys
import unicodedata
from pathlib import Path

# This script's file name
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_width_table.h"

# --- Global Constants for Width Assignments ---

# Known zero-width characters
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF   # ZERO WIDTH NO-BREAK SPACE (BOM)
)

# Zero-width emoji modifiers and components
# NOTE: Some of these characters would normally be single-width according to
# East Asian Width properties, but we deliberately override them to be
# zero-width because they function as modifiers in emoji sequences.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),    # Variation Selectors 1-16

    # Gender and hair style modifiers
    # These would be single-width by Unicode properties, but are zero-width
    # when part of emoji
    (0x2640, 0x2640),    # Female sign
    (0x2642, 0x2642),    # Male sign
    (0x26A7, 0x26A7),    # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, bald, white)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]

# Regional indicators (flag components)
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z

# Double-width emoji ranges
#
# Many emoji characters are classified as single-width according to Unicode
# Standard Annex #11 East Asian Width property (N or Neutral), but we
# deliberately override them to be double-width. References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in WHATWG CSS Text specification
#    (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]


def _ranges_for_width(width_map, target_width):
    """
    Collapse the code points having a given width into inclusive ranges.

    Args:
        width_map: Dict mapping code points to their width (0, 1 or 2)
        target_width: The width value to extract

    Returns:
        list: Sorted (start, end) tuples, one per run of consecutive code
        points whose width equals target_width. Empty if there are none.
    """
    points = sorted(cp for cp, width in width_map.items() if width == target_width)
    if not points:
        return []

    ranges = []
    start = prev = points[0]
    for cp in points[1:]:
        # A gap in the sequence closes the current run and opens a new one
        if cp > prev + 1:
            ranges.append((start, prev))
            start = cp
        prev = cp

    # Add the last range
    ranges.append((start, prev))
    return ranges


def create_width_tables():
    """
    Creates Unicode character width tables and returns the data structures.

    Width assignment precedence (first match wins):
      1. EMOJI_ZERO_WIDTH ranges -> 0
      2. REGIONAL_INDICATORS     -> 1
      3. Combining marks (category M*), format characters (Cf) and
         KNOWN_ZERO_WIDTH -> 0
      4. East Asian Width F or W -> 2, anything else -> 1
    Finally every code point in EMOJI_RANGES that is not zero-width is
    forced to 2 (this overrides step 4, and would override step 2 if the
    ranges ever overlapped).

    Returns:
        tuple: (zero_width_ranges, double_width_ranges)
    """

    # Width data mapping
    width_map = {}  # Maps code points to width (0, 1, 2)

    # Mark emoji modifiers as zero-width
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Process the full Unicode range 0x0 to 0x10FFFF (Basic Multilingual
    # Plane + Supplementary Planes). chr() and the unicodedata lookups accept
    # every code point in this range, so no exception handling is needed.
    for cp in range(0x110000):
        # Skip if already processed
        if cp in width_map:
            continue

        char = chr(cp)
        category = unicodedata.category(char)

        # Combining marks are zero-width. So are format characters: since we
        # have no support for bidirectional text, all format characters
        # (category Cf) can be treated with width 0 (zero) for simplicity, as
        # they don't need to occupy visual space in a non-bidirectional text
        # environment. KNOWN_ZERO_WIDTH is checked last; presumably a safety
        # net should the category data for those characters ever differ.
        if category.startswith('M') or category == 'Cf' or cp in KNOWN_ZERO_WIDTH:
            width_map[cp] = 0
        # Use East Asian Width property: Fullwidth or Wide is double-width
        elif unicodedata.east_asian_width(char) in ('F', 'W'):
            width_map[cp] = 2
        # Narrow, Halfwidth, Neutral, Ambiguous and anything unknown
        else:
            width_map[cp] = 1

    # Process Emoji - generally double-width
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            if width_map.get(cp) != 0:  # Don't override zero-width
                width_map[cp] = 2

    # Extract ranges for each width
    zero_width_ranges = _ranges_for_width(width_map, 0)
    double_width_ranges = _ranges_for_width(width_map, 2)

    return zero_width_ranges, double_width_ranges


def _split_ranges_by_size(ranges):
    """
    Split ranges into BMP (16-bit) and non-BMP (above 16-bit) lists.

    A range straddling U+FFFF is cut in two so that each half fits its table.

    Args:
        ranges: List of inclusive (start, end) code point ranges

    Returns:
        tuple: (bmp_ranges, non_bmp_ranges)
    """
    bmp_ranges = []
    non_bmp_ranges = []

    for start, end in ranges:
        if end <= 0xFFFF:
            bmp_ranges.append((start, end))
        elif start > 0xFFFF:
            non_bmp_ranges.append((start, end))
        else:
            # Split the range at 0xFFFF
            bmp_ranges.append((start, 0xFFFF))
            non_bmp_ranges.append((0x10000, end))

    return bmp_ranges, non_bmp_ranges


def _code_point_comment(start, end):
    """
    Build the C comment describing a range of code points.

    Character names are used when both ends of the range have one;
    otherwise the comment falls back to U+XXXX notation for both ends.

    Args:
        start: First code point of the range
        end: Last code point of the range (inclusive)

    Returns:
        str: A complete "/* ... */" comment
    """
    try:
        start_char_desc = unicodedata.name(chr(start))
        if start == end:
            return f"/* {start_char_desc} */"
        end_char_desc = unicodedata.name(chr(end))
        return f"/* {start_char_desc} - {end_char_desc} */"
    except ValueError:
        # unicodedata.name() raises ValueError for code points without a name
        if start == end:
            return f"/* U+{start:04X} */"
        return f"/* U+{start:04X} - U+{end:04X} */"


def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Four tables are emitted, in this order: zero-width BMP, zero-width
    non-BMP, double-width BMP, double-width non-BMP. BMP tables use
    struct ucs_interval16 with 4 hex digits, non-BMP tables use
    struct ucs_interval32 with 5 hex digits.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE)
    """

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = _split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = _split_ranges_by_size(double_width_ranges)

    bmp_scope = "BMP - Basic Multilingual Plane, U+0000 to U+FFFF"
    non_bmp_scope = "non-BMP, U+10000 and above"

    # (description, struct name, table name, ranges, number of hex digits)
    tables = (
        (f"Zero-width character ranges ({bmp_scope})",
         "ucs_interval16", "ucs_zero_width_bmp_ranges", zero_width_bmp, 4),
        (f"Zero-width character ranges ({non_bmp_scope})",
         "ucs_interval32", "ucs_zero_width_non_bmp_ranges", zero_width_non_bmp, 5),
        (f"Double-width character ranges ({bmp_scope})",
         "ucs_interval16", "ucs_double_width_bmp_ranges", double_width_bmp, 4),
        (f"Double-width character ranges ({non_bmp_scope})",
         "ucs_interval32", "ucs_double_width_non_bmp_ranges", double_width_non_bmp, 5),
    )

    # Generate C tables. The encoding is explicit so the result does not
    # depend on the locale of whoever runs the script.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */
""")

        for description, struct_name, table_name, ranges, digits in tables:
            # Each table is preceded by a blank line separating it from the
            # file header or from the previous table.
            f.write(f"\n/* {description} */\n"
                    f"static const struct {struct_name} {table_name}[] = {{\n")
            for start, end in ranges:
                comment = _code_point_comment(start, end)
                f.write(f"\t{{ 0x{start:0{digits}X}, 0x{end:0{digits}X} }}, {comment}\n")
            f.write("};\n")


def main():
    """Parse the command line, generate the header and print a summary."""
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    # Write tables to header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Print summary
    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
    print(f"Generated {args.output_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")


if __name__ == "__main__":
    main()