1#! /usr/bin/perl 2use strict; 3 4my $USAGE = <<__EOF__; 5 usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt 6 -n = take non-matching types 7 -f = zero-based type field (default 2) 8__EOF__ 9 10use vars qw( $opt_f $opt_n ); 11use Getopt::Std; 12my $type_field = 2; 13 14# Override Unicode tables for certain control chars 15# that are expected to be found in normal text files. 16my %force_space = ( 17 0x08 => 1, # backspace 18 0x09 => 1, # tab 19 0x0a => 1, # newline 20 0x0c => 1, # form feed 21 0x0d => 1, # carriage return 22); 23 24exit (main() ? 0 : 1); 25 26sub main { 27 my $date = `date`; 28 chomp $date; 29 my $args = join ' ', @ARGV; 30 my $header = "/* Generated by \"$0 $args\" on $date */\n"; 31 32 die $USAGE if not getopts('f:n'); 33 $type_field = $opt_f if $opt_f; 34 my %types; 35 my $arg; 36 while ($arg = shift @ARGV) { 37 last if $arg eq '--'; 38 $types{$arg} = 1; 39 } 40 my %out = ( 'types' => \%types ); 41 42 print $header; 43 my $last_code = 0; 44 while (<>) { 45 chomp; 46 s/#.*//; 47 my @fields = split /;/; 48 next if not @fields; 49 my ($lo_code, $hi_code); 50 my $codes = $fields[0]; 51 if ($codes =~ /(\w+)\.\.(\w+)/) { 52 $lo_code = hex $1; 53 $hi_code = hex $2; 54 } else { 55 $lo_code = $hi_code = hex $fields[0]; 56 } 57 my $type = $fields[$type_field]; 58 $type =~ s/\s//g; 59 for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { 60 $type = 'Zs' if $force_space{$last_code}; 61 output(\%out, $last_code, $type); 62 } 63 } 64 output(\%out, $last_code); 65 return 1; 66} 67 68sub output { 69 my ($out, $code, $type) = @_; 70 my $type_ok = ($type and ${${$out}{types}}{$type}); 71 $type_ok = not $type_ok if $opt_n; 72 my $prev_code = $$out{prev_code}; 73 74 if (not $type_ok) { 75 end_run($out, $prev_code); 76 } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { 77 end_run($out, $prev_code); 78 start_run($out, $code, $type); 79 } 80 $$out{prev_code} = $code; 81} 82 83sub start_run { 84 my ($out, $code, $type) = @_; 85 $$out{start_code} = $code; 86 $$out{prev_code} = $code; 87 $$out{run_type} = $type; 88 $$out{in_run} = 1; 89} 90 91sub end_run { 92 my ($out, $code) = @_; 93 return if not $$out{in_run}; 94 printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; 95 $$out{in_run} = 0; 96} 97