1#!/usr/local/bin/perl -wC 2 3# SPDX-License-Identifier: BSD-2-Clause 4# 5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org> 6# Copyright 2015 John Marino <draco@marino.st> 7# 8# Redistribution and use in source and binary forms, with or without 9# modification, are permitted provided that the following conditions 10# are met: 11# 1. Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# 2. Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# 17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27# SUCH DAMAGE. 28# 29 30use strict; 31use Getopt::Long; 32use Encode qw(encode decode); 33 34if ($#ARGV != 0) { 35 print "Usage: $0 --unidir=<unidir>\n"; 36 exit(1); 37} 38 39my $UNIDIR = undef; 40 41my $result = GetOptions ( 42 "unidir=s" => \$UNIDIR 43 ); 44 45my %utf8map = (); 46my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; 47 48get_utf8map("$UNIDIR/posix/UTF-8.cm"); 49generate_header (); 50parse_unidata ("$UNIDIR/UnicodeData.txt"); 51generate_footer (); 52 53############################ 54 55sub utf8to32 { 56 my @kl = split /\\x/, $_[0]; 57 58 shift @kl if ($kl[0] eq ''); 59 my $k = pack('H2' x scalar @kl, @kl); 60 my $ux = encode('UTF-32BE', decode('UTF-8', $k)); 61 my $u = uc(unpack('H*', $ux)); 62 # Remove BOM 63 $u =~ s/^0000FEFF//; 64 # Remove heading bytes of 0 65 while ($u =~ m/^0/ and length($u) > 4) { 66 $u =~ s/^0//; 67 } 68 69 return $u; 70} 71 72sub get_utf8map { 73 my $file = shift; 74 75 open(FIN, $file); 76 my @lines = <FIN>; 77 close(FIN); 78 chomp(@lines); 79 80 my $incharmap = 0; 81 foreach my $l (@lines) { 82 $l =~ s/\r//; 83 next if ($l =~ /^\#/); 84 next if ($l eq ""); 85 86 if ($l eq "CHARMAP") { 87 $incharmap = 1; 88 next; 89 } 90 91 next if (!$incharmap); 92 last if ($l eq "END CHARMAP"); 93 94 $l =~ /^(<[^\s]+>)\s+(.*)/; 95 my $k = utf8to32($2); # UTF-8 char code 96 my $v = $1; 97 98# print STDERR "register: $k - $v\n"; 99 $utf8map{$k} = $v; 100 } 101} 102 103sub generate_header { 104 open(FOUT, ">", "$outfilename") 105 or die ("can't write to $outfilename\n"); 106 print FOUT <<EOF; 107# Warning: Do not edit. This file is automatically generated from the 108# tools in /usr/src/tools/tools/locale. The data is obtained from the 109# CLDR project, obtained from http://cldr.unicode.org/ 110# ----------------------------------------------------------------------------- 111 112comment_char * 113escape_char / 114 115LC_CTYPE 116EOF 117} 118 119sub generate_footer { 120 print FOUT "\nEND LC_CTYPE\n"; 121 close (FOUT); 122} 123 124sub wctomb { 125 my $wc = hex(shift); 126 my $lead; 127 my $len; 128 my $ret = ""; 129 my $i; 130 131 if (($wc & ~0x7f) == 0) { 132 return sprintf "%02X", $wc; 133 } elsif (($wc & ~0x7ff) == 0) { 134 $lead = 0xc0; 135 $len = 2; 136 } elsif (($wc & ~0xffff) == 0) { 137 $lead = 0xe0; 138 $len = 3; 139 } elsif ($wc >= 0 && $wc <= 0x10ffff) { 140 $lead = 0xf0; 141 $len = 4; 142 } 143 144 for ($i = $len - 1; $i > 0; $i--) { 145 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; 146 $wc >>= 6; 147 } 148 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; 149 150 return $ret; 151} 152 153sub parse_unidata { 154 my $file = shift; 155 my %data = (); 156 157 open(FIN, $file); 158 my @lines = <FIN>; 159 close(FIN); 160 chomp(@lines); 161 162 foreach my $l (@lines) { 163 my @d = split(/;/, $l, -1); 164 my $mb = $d[0]; 165 my $cat; 166 167 # XXX There are code points present in UnicodeData.txt 168 # and missing from UTF-8.cm 169 next if !defined $utf8map{$mb}; 170 171 # Define the category 172 if ($d[2] =~ /^Lu/) { 173 $cat = "upper"; 174 } elsif ($d[2] =~ /^Ll/) { 175 $cat = "lower"; 176 } elsif ($d[2] =~ /^Nd/) { 177 $cat = "digit"; 178 } elsif ($d[2] =~ /^L/) { 179 $cat = "alpha"; 180 } elsif ($d[2] =~ /^P/) { 181 $cat = "punct"; 182 } elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ || 183 $d[2] =~ /^S/) { 184 $cat = "graph"; 185 } elsif ($d[2] =~ /^C/) { 186 $cat = "cntrl"; 187 } elsif ($d[2] =~ /^Z/) { 188 $cat = "space"; 189 } 190 $data{$cat}{$mb}{'wc'} = $d[0]; 191 192 # Check if it's a start or end of range 193 if ($d[1] =~ /First>$/) { 194 $data{$cat}{$mb}{'start'} = 1; 195 } elsif ($d[1] =~ /Last>$/) { 196 $data{$cat}{$mb}{'end'} = 1; 197 } 198 199 # Check if there's upper/lower mapping 200 if ($d[12] ne "") { 201 $data{'toupper'}{$mb} = $d[12]; 202 } elsif ($d[13] ne "") { 203 $data{'tolower'}{$mb} = $d[13]; 204 } 205 } 206 207 my $first; 208 my $inrange = 0; 209 210 # Now write out the categories 211 foreach my $cat (sort keys (%data)) { 212 print FOUT "$cat\t"; 213 $first = 1; 214 foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) { 215 if ($first == 1) { 216 $first = 0; 217 } elsif ($inrange == 1) { 218 # Safety belt 219 die "broken range end wc=$data{$cat}{$mb}{'wc'}" 220 if !defined $data{$cat}{$mb}{'end'}; 221 print FOUT ";...;"; 222 $inrange = 0; 223 } else { 224 print FOUT ";/\n\t"; 225 } 226 227 if ($cat eq "tolower" || $cat eq "toupper") { 228 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; 229 } else { 230 if (defined($data{$cat}{$mb}{'start'})) { 231 $inrange = 1; 232 } 233 print FOUT "$utf8map{$mb}"; 234 } 235 } 236 print FOUT "\n"; 237 } 238} 239