1d713e089SXin LI#!/usr/bin/env perl 2a15691bfSXin LIuse strict; 3a15691bfSXin LI 4a15691bfSXin LImy $USAGE = <<__EOF__; 5a15691bfSXin LI usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6a15691bfSXin LI -n = take non-matching types 7a15691bfSXin LI -f = zero-based type field (default 2) 8a15691bfSXin LI__EOF__ 9a15691bfSXin LI 10a15691bfSXin LIuse Getopt::Std; 116f26c71dSXin LIuse vars qw( $opt_f $opt_n ); 126f26c71dSXin LI 13a15691bfSXin LImy $type_field = 2; 14a15691bfSXin LI 15b2ea2440SXin LI# Override Unicode tables for certain control chars 16b2ea2440SXin LI# that are expected to be found in normal text files. 17b2ea2440SXin LImy %force_space = ( 18b2ea2440SXin LI 0x08 => 1, # backspace 19b2ea2440SXin LI 0x09 => 1, # tab 20b2ea2440SXin LI 0x0a => 1, # newline 21b2ea2440SXin LI 0x0c => 1, # form feed 22b2ea2440SXin LI 0x0d => 1, # carriage return 23b2ea2440SXin LI); 24b2ea2440SXin LI 25*d5cb458bSXin LI# Override Unicode tables for certain modifier chars which act differently 26*d5cb458bSXin LI# on different terminals. Treat them as omittable. 27*d5cb458bSXin LImy @force_omit = ( 28*d5cb458bSXin LI [0xad, 0xad], # SOFT HYPHEN 29*d5cb458bSXin LI [0x200d, 0x200d], # ZERO WIDTH JOINER 30*d5cb458bSXin LI [0x1f3fb, 0x1f3ff], # EMOJI MODIFIER FITZPATRICK TYPE-[1-6] 31*d5cb458bSXin LI [0x1f9b0, 0x1f9b3], # EMOJI COMPONENT [RED,CURLY,BALD,WHITE] HAIR 32*d5cb458bSXin LI [0xfe00, 0xfe0f], # VARIATION SELECTOR-[1-16] 33*d5cb458bSXin LI [0xe0100, 0xe01ef], # VARIATION SELECTOR-[17-256] 34*d5cb458bSXin LI); 35*d5cb458bSXin LI 366f26c71dSXin LI# Hangul Jamo medial vowels and final consonants should be zero width. 376f26c71dSXin LImy @force_compose = ( 386f26c71dSXin LI [0x1160, 0x11ff], 396f26c71dSXin LI [0xd7b0, 0xd7c6], 406f26c71dSXin LI [0xd7cb, 0xd7fb] 416f26c71dSXin LI); 426f26c71dSXin LI 43f6b74a7dSXin LIexit (main() ? 0 : 1); 44a15691bfSXin LI 45a15691bfSXin LIsub main { 46a15691bfSXin LI my $args = join ' ', @ARGV; 47a15691bfSXin LI die $USAGE if not getopts('f:n'); 48a15691bfSXin LI $type_field = $opt_f if $opt_f; 496f26c71dSXin LI 50a15691bfSXin LI my %types; 51a15691bfSXin LI my $arg; 52a15691bfSXin LI while ($arg = shift @ARGV) { 53a15691bfSXin LI last if $arg eq '--'; 54a15691bfSXin LI $types{$arg} = 1; 55a15691bfSXin LI } 56a15691bfSXin LI my %out = ( 'types' => \%types ); 57a15691bfSXin LI 586f26c71dSXin LI my %force_compose; 596f26c71dSXin LI foreach my $comp (@force_compose) { 606f26c71dSXin LI my ($lo,$hi) = @$comp; 616f26c71dSXin LI for (my $ch = $lo; $ch <= $hi; ++$ch) { 626f26c71dSXin LI $force_compose{$ch} = 1; 636f26c71dSXin LI } 646f26c71dSXin LI } 65*d5cb458bSXin LI my %force_omit; 66*d5cb458bSXin LI foreach my $comp (@force_omit) { 67*d5cb458bSXin LI my ($lo,$hi) = @$comp; 68*d5cb458bSXin LI for (my $ch = $lo; $ch <= $hi; ++$ch) { 69*d5cb458bSXin LI $force_omit{$ch} = 1; 70*d5cb458bSXin LI } 71*d5cb458bSXin LI } 726f26c71dSXin LI 73252d6ddeSXin LI my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($ENV{SOURCE_DATE_EPOCH} // time()); 74252d6ddeSXin LI my @month = ( "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" ); 75252d6ddeSXin LI printf "/* Generated by \"%s %s\" on %s %2d %2d:%02d:%02d GMT %d */\n", 76252d6ddeSXin LI $0, $args, $month[$mon], $mday, $hour, $min, $sec, $year+1900; 776f26c71dSXin LI 78f6b74a7dSXin LI my $last_code = 0; 79d713e089SXin LI my $start_range = 0; 80a15691bfSXin LI while (<>) { 81a15691bfSXin LI chomp; 82a15691bfSXin LI s/#.*//; 83a15691bfSXin LI my @fields = split /;/; 84a15691bfSXin LI next if not @fields; 85f6b74a7dSXin LI my ($lo_code, $hi_code); 86f6b74a7dSXin LI my $codes = $fields[0]; 87f6b74a7dSXin LI if ($codes =~ /(\w+)\.\.(\w+)/) { 88f6b74a7dSXin LI $lo_code = hex $1; 89f6b74a7dSXin LI $hi_code = hex $2; 90f6b74a7dSXin LI } else { 916f26c71dSXin LI $lo_code = $hi_code = hex $codes; 92f6b74a7dSXin LI } 93d713e089SXin LI if ($fields[1] =~ /, First>$/) { 94d713e089SXin LI die "invalid Unicode data: First with range" if $hi_code != $lo_code; 95d713e089SXin LI $start_range = $lo_code; 96d713e089SXin LI next; 97d713e089SXin LI } 98d713e089SXin LI if ($fields[1] =~ /, Last>$/) { 99d713e089SXin LI die "invalid Unicode data: Last without First" if not $start_range; 100d713e089SXin LI $lo_code = $start_range; 101d713e089SXin LI $start_range = 0; 102d713e089SXin LI } elsif ($start_range) { 103d713e089SXin LI die "invalid Unicode data: First without Last"; 104d713e089SXin LI } 105a15691bfSXin LI my $type = $fields[$type_field]; 106a15691bfSXin LI $type =~ s/\s//g; 107f6b74a7dSXin LI for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 1086f26c71dSXin LI output(\%out, $last_code, 109*d5cb458bSXin LI $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : 110*d5cb458bSXin LI $force_omit{$last_code} ? 'Xx' : $type); 111a15691bfSXin LI } 112a15691bfSXin LI } 113f6b74a7dSXin LI output(\%out, $last_code); 114f6b74a7dSXin LI return 1; 115a15691bfSXin LI} 116a15691bfSXin LI 117a15691bfSXin LIsub output { 118a15691bfSXin LI my ($out, $code, $type) = @_; 119f6b74a7dSXin LI my $type_ok = ($type and ${${$out}{types}}{$type}); 120f6b74a7dSXin LI $type_ok = not $type_ok if $opt_n; 121f6b74a7dSXin LI my $prev_code = $$out{prev_code}; 122f6b74a7dSXin LI 123f6b74a7dSXin LI if (not $type_ok) { 124f6b74a7dSXin LI end_run($out, $prev_code); 125f6b74a7dSXin LI } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 126f6b74a7dSXin LI end_run($out, $prev_code); 127a15691bfSXin LI start_run($out, $code, $type); 128a15691bfSXin LI } 129f6b74a7dSXin LI $$out{prev_code} = $code; 130a15691bfSXin LI} 131a15691bfSXin LI 132a15691bfSXin LIsub start_run { 133a15691bfSXin LI my ($out, $code, $type) = @_; 134a15691bfSXin LI $$out{start_code} = $code; 135f6b74a7dSXin LI $$out{prev_code} = $code; 136f6b74a7dSXin LI $$out{run_type} = $type; 137a15691bfSXin LI $$out{in_run} = 1; 138a15691bfSXin LI} 139a15691bfSXin LI 140a15691bfSXin LIsub end_run { 141a15691bfSXin LI my ($out, $code) = @_; 142a15691bfSXin LI return if not $$out{in_run}; 143f6b74a7dSXin LI printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 144a15691bfSXin LI $$out{in_run} = 0; 145a15691bfSXin LI} 146