xref: /freebsd/contrib/less/mkutable (revision d5cb458b4b58b0f0b3c058a32439f232fd5455ca)
1d713e089SXin LI#!/usr/bin/env perl
2a15691bfSXin LIuse strict;
3a15691bfSXin LI
4a15691bfSXin LImy $USAGE = <<__EOF__;
5a15691bfSXin LI   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
6a15691bfSXin LI          -n = take non-matching types
7a15691bfSXin LI          -f = zero-based type field (default 2)
8a15691bfSXin LI__EOF__
9a15691bfSXin LI
10a15691bfSXin LIuse Getopt::Std;
116f26c71dSXin LIuse vars qw( $opt_f $opt_n );
126f26c71dSXin LI
13a15691bfSXin LImy $type_field = 2;
14a15691bfSXin LI
15b2ea2440SXin LI# Override Unicode tables for certain control chars
16b2ea2440SXin LI# that are expected to be found in normal text files.
17b2ea2440SXin LImy %force_space = (
18b2ea2440SXin LI    0x08 => 1, # backspace
19b2ea2440SXin LI    0x09 => 1, # tab
20b2ea2440SXin LI    0x0a => 1, # newline
21b2ea2440SXin LI    0x0c => 1, # form feed
22b2ea2440SXin LI    0x0d => 1, # carriage return
23b2ea2440SXin LI);
24b2ea2440SXin LI
25*d5cb458bSXin LI# Override Unicode tables for certain modifier chars which act differently
26*d5cb458bSXin LI# on different terminals. Treat them as omittable.
27*d5cb458bSXin LImy @force_omit = (
28*d5cb458bSXin LI    [0xad,    0xad],    # SOFT HYPHEN
29*d5cb458bSXin LI    [0x200d,  0x200d],  # ZERO WIDTH JOINER
30*d5cb458bSXin LI    [0x1f3fb, 0x1f3ff], # EMOJI MODIFIER FITZPATRICK TYPE-[1-6]
31*d5cb458bSXin LI    [0x1f9b0, 0x1f9b3], # EMOJI COMPONENT [RED,CURLY,BALD,WHITE] HAIR
32*d5cb458bSXin LI    [0xfe00,  0xfe0f],  # VARIATION SELECTOR-[1-16]
33*d5cb458bSXin LI    [0xe0100, 0xe01ef], # VARIATION SELECTOR-[17-256]
34*d5cb458bSXin LI);
35*d5cb458bSXin LI
366f26c71dSXin LI# Hangul Jamo medial vowels and final consonants should be zero width.
376f26c71dSXin LImy @force_compose = (
386f26c71dSXin LI    [0x1160, 0x11ff],
396f26c71dSXin LI    [0xd7b0, 0xd7c6],
406f26c71dSXin LI    [0xd7cb, 0xd7fb]
416f26c71dSXin LI);
426f26c71dSXin LI
43f6b74a7dSXin LIexit (main() ? 0 : 1);
44a15691bfSXin LI
45a15691bfSXin LIsub main {
46a15691bfSXin LI    my $args = join ' ', @ARGV;
47a15691bfSXin LI    die $USAGE if not getopts('f:n');
48a15691bfSXin LI    $type_field = $opt_f if $opt_f;
496f26c71dSXin LI
50a15691bfSXin LI    my %types;
51a15691bfSXin LI    my $arg;
52a15691bfSXin LI    while ($arg = shift @ARGV) {
53a15691bfSXin LI        last if $arg eq '--';
54a15691bfSXin LI        $types{$arg} = 1;
55a15691bfSXin LI    }
56a15691bfSXin LI    my %out = ( 'types' => \%types );
57a15691bfSXin LI
586f26c71dSXin LI    my %force_compose;
596f26c71dSXin LI    foreach my $comp (@force_compose) {
606f26c71dSXin LI        my ($lo,$hi) = @$comp;
616f26c71dSXin LI        for (my $ch = $lo; $ch <= $hi; ++$ch) {
626f26c71dSXin LI            $force_compose{$ch} = 1;
636f26c71dSXin LI        }
646f26c71dSXin LI    }
65*d5cb458bSXin LI    my %force_omit;
66*d5cb458bSXin LI    foreach my $comp (@force_omit) {
67*d5cb458bSXin LI        my ($lo,$hi) = @$comp;
68*d5cb458bSXin LI        for (my $ch = $lo; $ch <= $hi; ++$ch) {
69*d5cb458bSXin LI            $force_omit{$ch} = 1;
70*d5cb458bSXin LI        }
71*d5cb458bSXin LI    }
726f26c71dSXin LI
73252d6ddeSXin LI    my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($ENV{SOURCE_DATE_EPOCH} // time());
74252d6ddeSXin LI    my @month = ( "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" );
75252d6ddeSXin LI    printf "/* Generated by \"%s %s\" on %s %2d %2d:%02d:%02d GMT %d */\n",
76252d6ddeSXin LI        $0, $args, $month[$mon], $mday, $hour, $min, $sec, $year+1900;
776f26c71dSXin LI
78f6b74a7dSXin LI    my $last_code = 0;
79d713e089SXin LI    my $start_range = 0;
80a15691bfSXin LI    while (<>) {
81a15691bfSXin LI        chomp;
82a15691bfSXin LI        s/#.*//;
83a15691bfSXin LI        my @fields = split /;/;
84a15691bfSXin LI        next if not @fields;
85f6b74a7dSXin LI        my ($lo_code, $hi_code);
86f6b74a7dSXin LI        my $codes = $fields[0];
87f6b74a7dSXin LI        if ($codes =~ /(\w+)\.\.(\w+)/) {
88f6b74a7dSXin LI            $lo_code = hex $1;
89f6b74a7dSXin LI            $hi_code = hex $2;
90f6b74a7dSXin LI        } else {
916f26c71dSXin LI            $lo_code = $hi_code = hex $codes;
92f6b74a7dSXin LI        }
93d713e089SXin LI        if ($fields[1] =~ /, First>$/) {
94d713e089SXin LI            die "invalid Unicode data: First with range" if $hi_code != $lo_code;
95d713e089SXin LI            $start_range = $lo_code;
96d713e089SXin LI            next;
97d713e089SXin LI        }
98d713e089SXin LI        if ($fields[1] =~ /, Last>$/) {
99d713e089SXin LI            die "invalid Unicode data: Last without First" if not $start_range;
100d713e089SXin LI            $lo_code = $start_range;
101d713e089SXin LI            $start_range = 0;
102d713e089SXin LI        } elsif ($start_range) {
103d713e089SXin LI            die "invalid Unicode data: First without Last";
104d713e089SXin LI        }
105a15691bfSXin LI        my $type = $fields[$type_field];
106a15691bfSXin LI        $type =~ s/\s//g;
107f6b74a7dSXin LI        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
1086f26c71dSXin LI            output(\%out, $last_code,
109*d5cb458bSXin LI                $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' :
110*d5cb458bSXin LI                $force_omit{$last_code} ? 'Xx' : $type);
111a15691bfSXin LI        }
112a15691bfSXin LI    }
113f6b74a7dSXin LI    output(\%out, $last_code);
114f6b74a7dSXin LI    return 1;
115a15691bfSXin LI}
116a15691bfSXin LI
117a15691bfSXin LIsub output {
118a15691bfSXin LI    my ($out, $code, $type) = @_;
119f6b74a7dSXin LI    my $type_ok = ($type and ${${$out}{types}}{$type});
120f6b74a7dSXin LI    $type_ok = not $type_ok if $opt_n;
121f6b74a7dSXin LI    my $prev_code = $$out{prev_code};
122f6b74a7dSXin LI
123f6b74a7dSXin LI    if (not $type_ok) {
124f6b74a7dSXin LI        end_run($out, $prev_code);
125f6b74a7dSXin LI    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
126f6b74a7dSXin LI        end_run($out, $prev_code);
127a15691bfSXin LI        start_run($out, $code, $type);
128a15691bfSXin LI    }
129f6b74a7dSXin LI    $$out{prev_code} = $code;
130a15691bfSXin LI}
131a15691bfSXin LI
132a15691bfSXin LIsub start_run {
133a15691bfSXin LI    my ($out, $code, $type) = @_;
134a15691bfSXin LI    $$out{start_code} = $code;
135f6b74a7dSXin LI    $$out{prev_code} = $code;
136f6b74a7dSXin LI    $$out{run_type} = $type;
137a15691bfSXin LI    $$out{in_run} = 1;
138a15691bfSXin LI}
139a15691bfSXin LI
140a15691bfSXin LIsub end_run {
141a15691bfSXin LI    my ($out, $code) = @_;
142a15691bfSXin LI    return if not $$out{in_run};
143f6b74a7dSXin LI    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
144a15691bfSXin LI    $$out{in_run} = 0;
145a15691bfSXin LI}
146