1a15691bfSXin LI#! /usr/bin/perl 2a15691bfSXin LIuse strict; 3a15691bfSXin LI 4a15691bfSXin LImy $USAGE = <<__EOF__; 5a15691bfSXin LI usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6a15691bfSXin LI -n = take non-matching types 7a15691bfSXin LI -f = zero-based type field (default 2) 8a15691bfSXin LI__EOF__ 9a15691bfSXin LI 10a15691bfSXin LIuse Getopt::Std; 11*6f26c71dSXin LIuse vars qw( $opt_f $opt_n ); 12*6f26c71dSXin LI 13a15691bfSXin LImy $type_field = 2; 14a15691bfSXin LI 15b2ea2440SXin LI# Override Unicode tables for certain control chars 16b2ea2440SXin LI# that are expected to be found in normal text files. 17b2ea2440SXin LImy %force_space = ( 18b2ea2440SXin LI 0x08 => 1, # backspace 19b2ea2440SXin LI 0x09 => 1, # tab 20b2ea2440SXin LI 0x0a => 1, # newline 21b2ea2440SXin LI 0x0c => 1, # form feed 22b2ea2440SXin LI 0x0d => 1, # carriage return 23b2ea2440SXin LI); 24b2ea2440SXin LI 25*6f26c71dSXin LI# Hangul Jamo medial vowels and final consonants should be zero width. 26*6f26c71dSXin LImy @force_compose = ( 27*6f26c71dSXin LI [0x1160, 0x11ff], 28*6f26c71dSXin LI [0xd7b0, 0xd7c6], 29*6f26c71dSXin LI [0xd7cb, 0xd7fb] 30*6f26c71dSXin LI); 31*6f26c71dSXin LI 32f6b74a7dSXin LIexit (main() ? 0 : 1); 33a15691bfSXin LI 34a15691bfSXin LIsub main { 35a15691bfSXin LI my $args = join ' ', @ARGV; 36a15691bfSXin LI die $USAGE if not getopts('f:n'); 37a15691bfSXin LI $type_field = $opt_f if $opt_f; 38*6f26c71dSXin LI 39a15691bfSXin LI my %types; 40a15691bfSXin LI my $arg; 41a15691bfSXin LI while ($arg = shift @ARGV) { 42a15691bfSXin LI last if $arg eq '--'; 43a15691bfSXin LI $types{$arg} = 1; 44a15691bfSXin LI } 45a15691bfSXin LI my %out = ( 'types' => \%types ); 46a15691bfSXin LI 47*6f26c71dSXin LI my %force_compose; 48*6f26c71dSXin LI foreach my $comp (@force_compose) { 49*6f26c71dSXin LI my ($lo,$hi) = @$comp; 50*6f26c71dSXin LI for (my $ch = $lo; $ch <= $hi; ++$ch) { 51*6f26c71dSXin LI $force_compose{$ch} = 1; 52*6f26c71dSXin LI } 53*6f26c71dSXin LI } 54*6f26c71dSXin LI 55*6f26c71dSXin LI my $date = `date`; 56*6f26c71dSXin LI chomp $date; 57*6f26c71dSXin LI print "/* Generated by \"$0 $args\" on $date */\n"; 58*6f26c71dSXin LI 59f6b74a7dSXin LI my $last_code = 0; 60a15691bfSXin LI while (<>) { 61a15691bfSXin LI chomp; 62a15691bfSXin LI s/#.*//; 63a15691bfSXin LI my @fields = split /;/; 64a15691bfSXin LI next if not @fields; 65f6b74a7dSXin LI my ($lo_code, $hi_code); 66f6b74a7dSXin LI my $codes = $fields[0]; 67f6b74a7dSXin LI if ($codes =~ /(\w+)\.\.(\w+)/) { 68f6b74a7dSXin LI $lo_code = hex $1; 69f6b74a7dSXin LI $hi_code = hex $2; 70f6b74a7dSXin LI } else { 71*6f26c71dSXin LI $lo_code = $hi_code = hex $codes; 72f6b74a7dSXin LI } 73a15691bfSXin LI my $type = $fields[$type_field]; 74a15691bfSXin LI $type =~ s/\s//g; 75f6b74a7dSXin LI for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 76*6f26c71dSXin LI output(\%out, $last_code, 77*6f26c71dSXin LI $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type); 78a15691bfSXin LI } 79a15691bfSXin LI } 80f6b74a7dSXin LI output(\%out, $last_code); 81f6b74a7dSXin LI return 1; 82a15691bfSXin LI} 83a15691bfSXin LI 84a15691bfSXin LIsub output { 85a15691bfSXin LI my ($out, $code, $type) = @_; 86f6b74a7dSXin LI my $type_ok = ($type and ${${$out}{types}}{$type}); 87f6b74a7dSXin LI $type_ok = not $type_ok if $opt_n; 88f6b74a7dSXin LI my $prev_code = $$out{prev_code}; 89f6b74a7dSXin LI 90f6b74a7dSXin LI if (not $type_ok) { 91f6b74a7dSXin LI end_run($out, $prev_code); 92f6b74a7dSXin LI } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 93f6b74a7dSXin LI end_run($out, $prev_code); 94a15691bfSXin LI start_run($out, $code, $type); 95a15691bfSXin LI } 96f6b74a7dSXin LI $$out{prev_code} = $code; 97a15691bfSXin LI} 98a15691bfSXin LI 99a15691bfSXin LIsub start_run { 100a15691bfSXin LI my ($out, $code, $type) = @_; 101a15691bfSXin LI $$out{start_code} = $code; 102f6b74a7dSXin LI $$out{prev_code} = $code; 103f6b74a7dSXin LI $$out{run_type} = $type; 104a15691bfSXin LI $$out{in_run} = 1; 105a15691bfSXin LI} 106a15691bfSXin LI 107a15691bfSXin LIsub end_run { 108a15691bfSXin LI my ($out, $code) = @_; 109a15691bfSXin LI return if not $$out{in_run}; 110f6b74a7dSXin LI printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 111a15691bfSXin LI $$out{in_run} = 0; 112a15691bfSXin LI} 113