1#!/usr/bin/perl -wC 2 3# 4# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org> 5# Copyright 2015 John Marino <draco@marino.st> 6# 7# Redistribution and use in source and binary forms, with or without 8# modification, are permitted provided that the following conditions 9# are met: 10# 1. Redistributions of source code must retain the above copyright 11# notice, this list of conditions and the following disclaimer. 12# 2. Redistributions in binary form must reproduce the above copyright 13# notice, this list of conditions and the following disclaimer in the 14# documentation and/or other materials provided with the distribution. 15# 16# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26# SUCH DAMAGE. 27# 28 29use strict; 30use Getopt::Long; 31 32if ($#ARGV != 0) { 33 print "Usage: $0 --unidata=</path/to/UnicodeData.txt>\n"; 34 exit(1); 35} 36 37my $UNIDATA = undef; 38 39my $result = GetOptions ( 40 "unidata=s" => \$UNIDATA 41 ); 42 43my %utf8map = (); 44my $outfilename = "data/common.UTF-8.src"; 45 46get_utf8map("data/UTF-8.cm"); 47generate_header (); 48parse_unidata ("$UNIDATA"); 49generate_footer (); 50 51############################ 52 53sub get_utf8map { 54 my $file = shift; 55 56 open(FIN, $file); 57 my @lines = <FIN>; 58 close(FIN); 59 chomp(@lines); 60 61 my $incharmap = 0; 62 foreach my $l (@lines) { 63 $l =~ s/\r//; 64 next if ($l =~ /^\#/); 65 next if ($l eq ""); 66 67 if ($l eq "CHARMAP") { 68 $incharmap = 1; 69 next; 70 } 71 72 next if (!$incharmap); 73 last if ($l eq "END CHARMAP"); 74 75 $l =~ /^(<[^\s]+>)\s+(.*)/; 76 my $k = $2; 77 my $v = $1; 78 $k =~ s/\\x//g; # UTF-8 char code 79 $utf8map{$k} = $v; 80 } 81} 82 83sub generate_header { 84 open(FOUT, ">", "$outfilename") 85 or die ("can't write to $outfilename\n"); 86 print FOUT "LC_CTYPE\n\n"; 87} 88 89sub generate_footer { 90 print FOUT "\nEND LC_CTYPE\n"; 91 close (FOUT); 92} 93 94sub wctomb { 95 my $wc = hex(shift); 96 my $lead; 97 my $len; 98 my $ret = ""; 99 my $i; 100 101 if (($wc & ~0x7f) == 0) { 102 return sprintf "%02X", $wc; 103 } elsif (($wc & ~0x7ff) == 0) { 104 $lead = 0xc0; 105 $len = 2; 106 } elsif (($wc & ~0xffff) == 0) { 107 $lead = 0xe0; 108 $len = 3; 109 } elsif ($wc >= 0 && $wc <= 0x10ffff) { 110 $lead = 0xf0; 111 $len = 4; 112 } 113 114 for ($i = $len - 1; $i > 0; $i--) { 115 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; 116 $wc >>= 6; 117 } 118 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; 119 120 return $ret; 121} 122 123sub parse_unidata { 124 my $file = shift; 125 my %data = (); 126 127 open(FIN, $file); 128 my @lines = <FIN>; 129 close(FIN); 130 chomp(@lines); 131 132 foreach my $l (@lines) { 133 my @d = split(/;/, $l, -1); 134 my $mb = wctomb($d[0]); 135 my $cat; 136 137 # XXX There are code points present in UnicodeData.txt 138 # and missing from UTF-8.cm 139 next if !defined $utf8map{$mb}; 140 141 # Define the category 142 if ($d[2] =~ /^Lu/) { 143 $cat = "upper"; 144 } elsif ($d[2] =~ /^Ll/) { 145 $cat = "lower"; 146 } elsif ($d[2] =~ /^Nd/) { 147 $cat = "digit"; 148 } elsif ($d[2] =~ /^L/) { 149 $cat = "alpha"; 150 } elsif ($d[2] =~ /^P/) { 151 $cat = "punct"; 152 } elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) { 153 $cat = "graph"; 154 } elsif ($d[2] =~ /^C/) { 155 $cat = "cntrl"; 156 } elsif ($d[2] =~ /^Z/) { 157 $cat = "space"; 158 } 159 $data{$cat}{$mb}{'wc'} = $d[0]; 160 161 # Check if it's a start or end of range 162 if ($d[1] =~ /First>$/) { 163 $data{$cat}{$mb}{'start'} = 1; 164 } elsif ($d[1] =~ /Last>$/) { 165 $data{$cat}{$mb}{'end'} = 1; 166 } 167 168 # Check if there's upper/lower mapping 169 if ($d[12] ne "") { 170 $data{'toupper'}{$mb} = wctomb($d[12]); 171 } elsif ($d[13] ne "") { 172 $data{'tolower'}{$mb} = wctomb($d[13]); 173 } 174 } 175 176 my $first; 177 my $inrange = 0; 178 179 # Now write out the categories 180 foreach my $cat (sort keys (%data)) { 181 print FOUT "$cat\t"; 182 $first = 1; 183 foreach my $mb (sort keys (%{$data{$cat}})) { 184 if ($first == 1) { 185 $first = 0; 186 } elsif ($inrange == 1) { 187 # Safety belt 188 die "broken range end wc=$data{$cat}{$mb}{'wc'}" 189 if !defined $data{$cat}{$mb}{'end'}; 190 print FOUT ";...;"; 191 $inrange = 0; 192 } else { 193 print FOUT ";/\n\t"; 194 } 195 196 if ($cat eq "tolower" || $cat eq "toupper") { 197 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; 198 } else { 199 if (defined($data{$cat}{$mb}{'start'})) { 200 $inrange = 1; 201 } 202 print FOUT "$utf8map{$mb}"; 203 } 204 } 205 print FOUT "\n"; 206 } 207} 208