1#!/usr/bin/env perl 2use strict; 3 4my $USAGE = <<__EOF__; 5 usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6 -n = take non-matching types 7 -f = zero-based type field (default 2) 8__EOF__ 9 10use Getopt::Std; 11use vars qw( $opt_f $opt_n ); 12 13my $type_field = 2; 14 15# Override Unicode tables for certain control chars 16# that are expected to be found in normal text files. 17my %force_space = ( 18 0x08 => 1, # backspace 19 0x09 => 1, # tab 20 0x0a => 1, # newline 21 0x0c => 1, # form feed 22 0x0d => 1, # carriage return 23); 24 25# Override Unicode tables for certain modifier chars which act differently 26# on different terminals. Treat them as omittable. 27my @force_omit = ( 28 [0xad, 0xad], # SOFT HYPHEN 29 [0x200d, 0x200d], # ZERO WIDTH JOINER 30 [0x1f3fb, 0x1f3ff], # EMOJI MODIFIER FITZPATRICK TYPE-[1-6] 31 [0x1f9b0, 0x1f9b3], # EMOJI COMPONENT [RED,CURLY,BALD,WHITE] HAIR 32 [0xfe00, 0xfe0f], # VARIATION SELECTOR-[1-16] 33 [0xe0100, 0xe01ef], # VARIATION SELECTOR-[17-256] 34); 35 36# Hangul Jamo medial vowels and final consonants should be zero width. 37my @force_compose = ( 38 [0x1160, 0x11ff], 39 [0xd7b0, 0xd7c6], 40 [0xd7cb, 0xd7fb] 41); 42 43exit (main() ? 0 : 1); 44 45sub main { 46 my $args = join ' ', @ARGV; 47 die $USAGE if not getopts('f:n'); 48 $type_field = $opt_f if $opt_f; 49 50 my %types; 51 my $arg; 52 while ($arg = shift @ARGV) { 53 last if $arg eq '--'; 54 $types{$arg} = 1; 55 } 56 my %out = ( 'types' => \%types ); 57 58 my %force_compose; 59 foreach my $comp (@force_compose) { 60 my ($lo,$hi) = @$comp; 61 for (my $ch = $lo; $ch <= $hi; ++$ch) { 62 $force_compose{$ch} = 1; 63 } 64 } 65 my %force_omit; 66 foreach my $comp (@force_omit) { 67 my ($lo,$hi) = @$comp; 68 for (my $ch = $lo; $ch <= $hi; ++$ch) { 69 $force_omit{$ch} = 1; 70 } 71 } 72 73 my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($ENV{SOURCE_DATE_EPOCH} // time()); 74 my @month = ( "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" ); 75 printf "/* Generated by \"%s %s\" on %s %2d %2d:%02d:%02d GMT %d */\n", 76 $0, $args, $month[$mon], $mday, $hour, $min, $sec, $year+1900; 77 78 my $last_code = 0; 79 my $start_range = 0; 80 while (<>) { 81 chomp; 82 s/#.*//; 83 my @fields = split /;/; 84 next if not @fields; 85 my ($lo_code, $hi_code); 86 my $codes = $fields[0]; 87 if ($codes =~ /(\w+)\.\.(\w+)/) { 88 $lo_code = hex $1; 89 $hi_code = hex $2; 90 } else { 91 $lo_code = $hi_code = hex $codes; 92 } 93 if ($fields[1] =~ /, First>$/) { 94 die "invalid Unicode data: First with range" if $hi_code != $lo_code; 95 $start_range = $lo_code; 96 next; 97 } 98 if ($fields[1] =~ /, Last>$/) { 99 die "invalid Unicode data: Last without First" if not $start_range; 100 $lo_code = $start_range; 101 $start_range = 0; 102 } elsif ($start_range) { 103 die "invalid Unicode data: First without Last"; 104 } 105 my $type = $fields[$type_field]; 106 $type =~ s/\s//g; 107 for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 108 output(\%out, $last_code, 109 $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : 110 $force_omit{$last_code} ? 'Xx' : $type); 111 } 112 } 113 output(\%out, $last_code); 114 return 1; 115} 116 117sub output { 118 my ($out, $code, $type) = @_; 119 my $type_ok = ($type and ${${$out}{types}}{$type}); 120 $type_ok = not $type_ok if $opt_n; 121 my $prev_code = $$out{prev_code}; 122 123 if (not $type_ok) { 124 end_run($out, $prev_code); 125 } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 126 end_run($out, $prev_code); 127 start_run($out, $code, $type); 128 } 129 $$out{prev_code} = $code; 130} 131 132sub start_run { 133 my ($out, $code, $type) = @_; 134 $$out{start_code} = $code; 135 $$out{prev_code} = $code; 136 $$out{run_type} = $type; 137 $$out{in_run} = 1; 138} 139 140sub end_run { 141 my ($out, $code) = @_; 142 return if not $$out{in_run}; 143 printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 144 $$out{in_run} = 0; 145} 146