xref: /freebsd/contrib/less/mkutable (revision 9f23cbd6cae82fd77edfad7173432fa8dccd0a95)
1#!/usr/bin/env perl
2use strict;
3
4my $USAGE = <<__EOF__;
5   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
6          -n = take non-matching types
7          -f = zero-based type field (default 2)
8__EOF__
9
10use Getopt::Std;
11use vars qw( $opt_f $opt_n );
12
13my $type_field = 2;
14
15# Override Unicode tables for certain control chars
16# that are expected to be found in normal text files.
17my %force_space = (
18    0x08 => 1, # backspace
19    0x09 => 1, # tab
20    0x0a => 1, # newline
21    0x0c => 1, # form feed
22    0x0d => 1, # carriage return
23);
24
25# Hangul Jamo medial vowels and final consonants should be zero width.
26my @force_compose = (
27    [0x1160, 0x11ff],
28    [0xd7b0, 0xd7c6],
29    [0xd7cb, 0xd7fb]
30);
31
32exit (main() ? 0 : 1);
33
34sub main {
35    my $args = join ' ', @ARGV;
36    die $USAGE if not getopts('f:n');
37    $type_field = $opt_f if $opt_f;
38
39    my %types;
40    my $arg;
41    while ($arg = shift @ARGV) {
42        last if $arg eq '--';
43        $types{$arg} = 1;
44    }
45    my %out = ( 'types' => \%types );
46
47    my %force_compose;
48    foreach my $comp (@force_compose) {
49        my ($lo,$hi) = @$comp;
50        for (my $ch = $lo; $ch <= $hi; ++$ch) {
51            $force_compose{$ch} = 1;
52        }
53    }
54
55    my $date = `date`;
56    chomp $date;
57    print "/* Generated by \"$0 $args\" on $date */\n";
58
59    my $last_code = 0;
60    my $start_range = 0;
61    while (<>) {
62        chomp;
63        s/#.*//;
64        my @fields = split /;/;
65        next if not @fields;
66        my ($lo_code, $hi_code);
67        my $codes = $fields[0];
68        if ($codes =~ /(\w+)\.\.(\w+)/) {
69            $lo_code = hex $1;
70            $hi_code = hex $2;
71        } else {
72            $lo_code = $hi_code = hex $codes;
73        }
74        if ($fields[1] =~ /, First>$/) {
75            die "invalid Unicode data: First with range" if $hi_code != $lo_code;
76            $start_range = $lo_code;
77            next;
78        }
79        if ($fields[1] =~ /, Last>$/) {
80            die "invalid Unicode data: Last without First" if not $start_range;
81            $lo_code = $start_range;
82            $start_range = 0;
83        } elsif ($start_range) {
84            die "invalid Unicode data: First without Last";
85        }
86        my $type = $fields[$type_field];
87        $type =~ s/\s//g;
88        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
89            output(\%out, $last_code,
90                $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' : $type);
91        }
92    }
93    output(\%out, $last_code);
94    return 1;
95}
96
97sub output {
98    my ($out, $code, $type) = @_;
99    my $type_ok = ($type and ${${$out}{types}}{$type});
100    $type_ok = not $type_ok if $opt_n;
101    my $prev_code = $$out{prev_code};
102
103    if (not $type_ok) {
104        end_run($out, $prev_code);
105    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
106        end_run($out, $prev_code);
107        start_run($out, $code, $type);
108    }
109    $$out{prev_code} = $code;
110}
111
112sub start_run {
113    my ($out, $code, $type) = @_;
114    $$out{start_code} = $code;
115    $$out{prev_code} = $code;
116    $$out{run_type} = $type;
117    $$out{in_run} = 1;
118}
119
120sub end_run {
121    my ($out, $code) = @_;
122    return if not $$out{in_run};
123    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
124    $$out{in_run} = 0;
125}
126