#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_width_table.h

import unicodedata
import sys
import argparse
from pathlib import Path

# This script's file name
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_width_table.h"

# --- Global Constants for Width Assignments ---

# Known zero-width characters
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF   # ZERO WIDTH NO-BREAK SPACE (BOM)
)

# Zero-width emoji modifiers and components
# NOTE: Some of these characters would normally be single-width according to
# East Asian Width properties, but we deliberately override them to be
# zero-width because they function as modifiers in emoji sequences.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),    # Variation Selectors 1-16

    # Gender and hair style modifiers
    # These would be single-width by Unicode properties, but are zero-width
    # when part of emoji
    (0x2640, 0x2640),    # Female sign
    (0x2642, 0x2642),    # Male sign
    (0x26A7, 0x26A7),    # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, white, bald)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]

# Regional indicators (flag components)
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z

# Double-width emoji ranges
#
# Many emoji characters are classified as single-width according to Unicode
# Standard Annex #11 East Asian Width property (N or Neutral), but we
# deliberately override them to be double-width. References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in WHATWG CSS Text specification
#    (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]


def _property_width(cp):
    """Return the width (0, 1 or 2) of code point *cp* from Unicode properties.

    Only the general category and the East Asian Width property are
    consulted here; the emoji and regional-indicator overrides are applied
    by create_width_tables().
    """
    char = chr(cp)
    category = unicodedata.category(char)

    # Combining marks never advance the cursor.
    if category.startswith('M'):
        return 0

    # Format characters
    # Since we have no support for bidirectional text, all format
    # characters (category Cf) can be treated with width 0 (zero)
    # for simplicity, as they don't need to occupy visual space
    # in a non-bidirectional text environment.
    if category == 'Cf':
        return 0

    # Known zero-width characters
    if cp in KNOWN_ZERO_WIDTH:
        return 0

    # Fullwidth (F) and Wide (W) are double-width. Everything else (Narrow,
    # Halfwidth, Neutral, Ambiguous or anything unexpected) is single-width.
    if unicodedata.east_asian_width(char) in ('F', 'W'):
        return 2
    return 1


def _ranges_optimize(width_data, target_width):
    """Collapse the code points of *width_data* having *target_width* into
    a sorted list of inclusive (start, end) ranges of consecutive values."""
    points = sorted(cp for cp, width in width_data.items()
                    if width == target_width)
    if not points:
        return []

    ranges = []
    start = prev = points[0]
    for cp in points[1:]:
        # A gap closes the current run and opens a new one.
        if cp > prev + 1:
            ranges.append((start, prev))
            start = cp
        prev = cp

    # Add the last range
    ranges.append((start, prev))
    return ranges


def create_width_tables():
    """
    Creates Unicode character width tables and returns the data structures.

    Returns:
        tuple: (zero_width_ranges, double_width_ranges), each a sorted list
        of inclusive (start, end) code point ranges.
    """

    # Maps code points to width (0, 1, 2)
    width_map = {}

    # Mark emoji modifiers as zero-width
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Process the full Unicode range (0x0 to 0x10FFFF). Every value in this
    # range is accepted by chr() and by the unicodedata lookups, so no
    # exception handling is needed. Explicit overrides set above win.
    for cp in range(0x110000):
        if cp not in width_map:
            width_map[cp] = _property_width(cp)

    # Process Emoji - generally double-width, but never override an entry
    # that was deliberately made zero-width.
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            if width_map.get(cp) != 0:
                width_map[cp] = 2

    # Extract ranges for each width
    zero_width_ranges = _ranges_optimize(width_map, 0)
    double_width_ranges = _ranges_optimize(width_map, 2)

    return zero_width_ranges, double_width_ranges


def _split_ranges_by_size(ranges):
    """Split *ranges* into BMP (<= U+FFFF) and non-BMP (>= U+10000) lists.

    A range straddling the boundary is cut in two at U+FFFF / U+10000.
    """
    bmp_ranges = []
    non_bmp_ranges = []

    for start, end in ranges:
        if end <= 0xFFFF:
            bmp_ranges.append((start, end))
        elif start > 0xFFFF:
            non_bmp_ranges.append((start, end))
        else:
            # Split the range at 0xFFFF
            bmp_ranges.append((start, 0xFFFF))
            non_bmp_ranges.append((0x10000, end))

    return bmp_ranges, non_bmp_ranges


def _code_point_comment(start, end):
    """Return a C comment describing the range by its Unicode character
    names, falling back to U+XXXX notation when either end has no name."""
    try:
        start_char_desc = unicodedata.name(chr(start))
        if start == end:
            return f"/* {start_char_desc} */"
        end_char_desc = unicodedata.name(chr(end))
        return f"/* {start_char_desc} - {end_char_desc} */"
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed code points.
        if start == end:
            return f"/* U+{start:04X} */"
        return f"/* U+{start:04X} - U+{end:04X} */"


def _format_rows(ranges, digits):
    """Return the C initializer rows for *ranges*, one tab-indented line per
    range, with code points zero-padded to *digits* hex digits."""
    return "".join(
        f"\t{{ 0x{start:0{digits}X}, 0x{end:0{digits}X} }}, "
        f"{_code_point_comment(start, end)}\n"
        for start, end in ranges
    )


def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE)
    """

    # Split ranges into BMP (16-bit) and non-BMP (above 16-bit)
    zero_width_bmp, zero_width_non_bmp = _split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = _split_ranges_by_size(double_width_ranges)

    # Only the base name goes into the header comment so that the generated
    # file does not depend on the directory it was written to.
    header_name = Path(out_file).name

    # Generate C tables
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {header_name} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */

/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct ucs_interval16 ucs_zero_width_bmp_ranges[] = {{
""")
        f.write(_format_rows(zero_width_bmp, 4))

        f.write("""\
};

/* Zero-width character ranges (non-BMP, U+10000 and above) */
static const struct ucs_interval32 ucs_zero_width_non_bmp_ranges[] = {
""")
        f.write(_format_rows(zero_width_non_bmp, 5))

        f.write("""\
};

/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct ucs_interval16 ucs_double_width_bmp_ranges[] = {
""")
        f.write(_format_rows(double_width_bmp, 4))

        f.write("""\
};

/* Double-width character ranges (non-BMP, U+10000 and above) */
static const struct ucs_interval32 ucs_double_width_non_bmp_ranges[] = {
""")
        f.write(_format_rows(double_width_non_bmp, 5))

        f.write("};\n")


def main():
    """Parse the command line, generate the header and print a summary."""
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    # Write tables to header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Print summary
    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
    print(f"Generated {args.output_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")


if __name__ == "__main__":
    main()