xref: /freebsd/contrib/less/mkutable (revision 6f26c71d76bb795b30684affb3b57870a7926b26)
1a15691bfSXin LI#! /usr/bin/perl
2a15691bfSXin LIuse strict;
3a15691bfSXin LI
4a15691bfSXin LImy $USAGE = <<__EOF__;
5a15691bfSXin LI   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
6a15691bfSXin LI          -n = take non-matching types
7a15691bfSXin LI          -f = zero-based type field (default 2)
8a15691bfSXin LI__EOF__
9a15691bfSXin LI
10a15691bfSXin LIuse Getopt::Std;
11*6f26c71dSXin LIuse vars qw( $opt_f $opt_n );
12*6f26c71dSXin LI
13a15691bfSXin LImy $type_field = 2;
14a15691bfSXin LI
15b2ea2440SXin LI# Override Unicode tables for certain control chars
16b2ea2440SXin LI# that are expected to be found in normal text files.
17b2ea2440SXin LImy %force_space = (
18b2ea2440SXin LI    0x08 => 1, # backspace
19b2ea2440SXin LI    0x09 => 1, # tab
20b2ea2440SXin LI    0x0a => 1, # newline
21b2ea2440SXin LI    0x0c => 1, # form feed
22b2ea2440SXin LI    0x0d => 1, # carriage return
23b2ea2440SXin LI);
24b2ea2440SXin LI
25*6f26c71dSXin LI# Hangul Jamo medial vowels and final consonants should be zero width.
26*6f26c71dSXin LImy @force_compose = (
27*6f26c71dSXin LI    [0x1160, 0x11ff],
28*6f26c71dSXin LI    [0xd7b0, 0xd7c6],
29*6f26c71dSXin LI    [0xd7cb, 0xd7fb]
30*6f26c71dSXin LI);
31*6f26c71dSXin LI
32f6b74a7dSXin LIexit (main() ? 0 : 1);
33a15691bfSXin LI
34a15691bfSXin LIsub main {
35a15691bfSXin LI    my $args = join ' ', @ARGV;
36a15691bfSXin LI    die $USAGE if not getopts('f:n');
37a15691bfSXin LI    $type_field = $opt_f if $opt_f;
38*6f26c71dSXin LI
39a15691bfSXin LI    my %types;
40a15691bfSXin LI    my $arg;
41a15691bfSXin LI    while ($arg = shift @ARGV) {
42a15691bfSXin LI        last if $arg eq '--';
43a15691bfSXin LI        $types{$arg} = 1;
44a15691bfSXin LI    }
45a15691bfSXin LI    my %out = ( 'types' => \%types );
46a15691bfSXin LI
47*6f26c71dSXin LI    my %force_compose;
48*6f26c71dSXin LI    foreach my $comp (@force_compose) {
49*6f26c71dSXin LI        my ($lo,$hi) = @$comp;
50*6f26c71dSXin LI        for (my $ch = $lo; $ch <= $hi; ++$ch) {
51*6f26c71dSXin LI            $force_compose{$ch} = 1;
52*6f26c71dSXin LI        }
53*6f26c71dSXin LI    }
54*6f26c71dSXin LI
55*6f26c71dSXin LI    my $date = `date`;
56*6f26c71dSXin LI    chomp $date;
57*6f26c71dSXin LI    print "/* Generated by \"$0 $args\" on $date */\n";
58*6f26c71dSXin LI
59f6b74a7dSXin LI    my $last_code = 0;
60a15691bfSXin LI    while (<>) {
61a15691bfSXin LI        chomp;
62a15691bfSXin LI        s/#.*//;
63a15691bfSXin LI        my @fields = split /;/;
64a15691bfSXin LI        next if not @fields;
65f6b74a7dSXin LI        my ($lo_code, $hi_code);
66f6b74a7dSXin LI        my $codes = $fields[0];
67f6b74a7dSXin LI        if ($codes =~ /(\w+)\.\.(\w+)/) {
68f6b74a7dSXin LI            $lo_code = hex $1;
69f6b74a7dSXin LI            $hi_code = hex $2;
70f6b74a7dSXin LI        } else {
71*6f26c71dSXin LI            $lo_code = $hi_code = hex $codes;
72f6b74a7dSXin LI        }
73a15691bfSXin LI        my $type = $fields[$type_field];
74a15691bfSXin LI        $type =~ s/\s//g;
75f6b74a7dSXin LI        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
76*6f26c71dSXin LI            output(\%out, $last_code,
77*6f26c71dSXin LI                $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type);
78a15691bfSXin LI        }
79a15691bfSXin LI    }
80f6b74a7dSXin LI    output(\%out, $last_code);
81f6b74a7dSXin LI    return 1;
82a15691bfSXin LI}
83a15691bfSXin LI
84a15691bfSXin LIsub output {
85a15691bfSXin LI    my ($out, $code, $type) = @_;
86f6b74a7dSXin LI    my $type_ok = ($type and ${${$out}{types}}{$type});
87f6b74a7dSXin LI    $type_ok = not $type_ok if $opt_n;
88f6b74a7dSXin LI    my $prev_code = $$out{prev_code};
89f6b74a7dSXin LI
90f6b74a7dSXin LI    if (not $type_ok) {
91f6b74a7dSXin LI        end_run($out, $prev_code);
92f6b74a7dSXin LI    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
93f6b74a7dSXin LI        end_run($out, $prev_code);
94a15691bfSXin LI        start_run($out, $code, $type);
95a15691bfSXin LI    }
96f6b74a7dSXin LI    $$out{prev_code} = $code;
97a15691bfSXin LI}
98a15691bfSXin LI
99a15691bfSXin LIsub start_run {
100a15691bfSXin LI    my ($out, $code, $type) = @_;
101a15691bfSXin LI    $$out{start_code} = $code;
102f6b74a7dSXin LI    $$out{prev_code} = $code;
103f6b74a7dSXin LI    $$out{run_type} = $type;
104a15691bfSXin LI    $$out{in_run} = 1;
105a15691bfSXin LI}
106a15691bfSXin LI
107a15691bfSXin LIsub end_run {
108a15691bfSXin LI    my ($out, $code) = @_;
109a15691bfSXin LI    return if not $$out{in_run};
110f6b74a7dSXin LI    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
111a15691bfSXin LI    $$out{in_run} = 0;
112a15691bfSXin LI}
113