1#!/usr/bin/env perl 2use strict; 3 4my $USAGE = <<__EOF__; 5 usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6 -n = take non-matching types 7 -f = zero-based type field (default 2) 8__EOF__ 9 10use Getopt::Std; 11use vars qw( $opt_f $opt_n ); 12 13my $type_field = 2; 14 15# Override Unicode tables for certain control chars 16# that are expected to be found in normal text files. 17my %force_space = ( 18 0x08 => 1, # backspace 19 0x09 => 1, # tab 20 0x0a => 1, # newline 21 0x0c => 1, # form feed 22 0x0d => 1, # carriage return 23); 24 25# Hangul Jamo medial vowels and final consonants should be zero width. 26my @force_compose = ( 27 [0x1160, 0x11ff], 28 [0xd7b0, 0xd7c6], 29 [0xd7cb, 0xd7fb] 30); 31 32exit (main() ? 0 : 1); 33 34sub main { 35 my $args = join ' ', @ARGV; 36 die $USAGE if not getopts('f:n'); 37 $type_field = $opt_f if $opt_f; 38 39 my %types; 40 my $arg; 41 while ($arg = shift @ARGV) { 42 last if $arg eq '--'; 43 $types{$arg} = 1; 44 } 45 my %out = ( 'types' => \%types ); 46 47 my %force_compose; 48 foreach my $comp (@force_compose) { 49 my ($lo,$hi) = @$comp; 50 for (my $ch = $lo; $ch <= $hi; ++$ch) { 51 $force_compose{$ch} = 1; 52 } 53 } 54 55 my $date = `date`; 56 chomp $date; 57 print "/* Generated by \"$0 $args\" on $date */\n"; 58 59 my $last_code = 0; 60 my $start_range = 0; 61 while (<>) { 62 chomp; 63 s/#.*//; 64 my @fields = split /;/; 65 next if not @fields; 66 my ($lo_code, $hi_code); 67 my $codes = $fields[0]; 68 if ($codes =~ /(\w+)\.\.(\w+)/) { 69 $lo_code = hex $1; 70 $hi_code = hex $2; 71 } else { 72 $lo_code = $hi_code = hex $codes; 73 } 74 if ($fields[1] =~ /, First>$/) { 75 die "invalid Unicode data: First with range" if $hi_code != $lo_code; 76 $start_range = $lo_code; 77 next; 78 } 79 if ($fields[1] =~ /, Last>$/) { 80 die "invalid Unicode data: Last without First" if not $start_range; 81 $lo_code = $start_range; 82 $start_range = 0; 83 } elsif ($start_range) { 84 die "invalid Unicode data: First without Last"; 85 } 86 my $type = $fields[$type_field]; 87 $type =~ s/\s//g; 88 for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 89 output(\%out, $last_code, 90 $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type); 91 } 92 } 93 output(\%out, $last_code); 94 return 1; 95} 96 97sub output { 98 my ($out, $code, $type) = @_; 99 my $type_ok = ($type and ${${$out}{types}}{$type}); 100 $type_ok = not $type_ok if $opt_n; 101 my $prev_code = $$out{prev_code}; 102 103 if (not $type_ok) { 104 end_run($out, $prev_code); 105 } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 106 end_run($out, $prev_code); 107 start_run($out, $code, $type); 108 } 109 $$out{prev_code} = $code; 110} 111 112sub start_run { 113 my ($out, $code, $type) = @_; 114 $$out{start_code} = $code; 115 $$out{prev_code} = $code; 116 $$out{run_type} = $type; 117 $$out{in_run} = 1; 118} 119 120sub end_run { 121 my ($out, $code) = @_; 122 return if not $$out{in_run}; 123 printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 124 $$out{in_run} = 0; 125} 126