xref: /illumos-gate/usr/src/data/locale/tools/utf8-rollup.pl (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1#!/usr/bin/perl -wC
2
3#
4# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
5# Copyright 2015 John Marino <draco@marino.st>
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions
9# are met:
10# 1. Redistributions of source code must retain the above copyright
11#    notice, this list of conditions and the following disclaimer.
12# 2. Redistributions in binary form must reproduce the above copyright
13#    notice, this list of conditions and the following disclaimer in the
14#    documentation and/or other materials provided with the distribution.
15#
16# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26# SUCH DAMAGE.
27#
28
29use strict;
30use Getopt::Long;
31
32if ($#ARGV != 0) {
33	print "Usage: $0 --unidata=</path/to/UnicodeData.txt>\n";
34	exit(1);
35}
36
37my $UNIDATA = undef;
38
39my $result = GetOptions (
40		"unidata=s"	=> \$UNIDATA
41	    );
42
43my %utf8map = ();
44my $outfilename = "data/common.UTF-8.src";
45
46get_utf8map("data/UTF-8.cm");
47generate_header ();
48parse_unidata ("$UNIDATA");
49generate_footer ();
50
51############################
52
53sub get_utf8map {
54	my $file = shift;
55
56	open(FIN, $file);
57	my @lines = <FIN>;
58	close(FIN);
59	chomp(@lines);
60
61	my $incharmap = 0;
62	foreach my $l (@lines) {
63		$l =~ s/\r//;
64		next if ($l =~ /^\#/);
65		next if ($l eq "");
66
67		if ($l eq "CHARMAP") {
68			$incharmap = 1;
69			next;
70		}
71
72		next if (!$incharmap);
73		last if ($l eq "END CHARMAP");
74
75		$l =~ /^(<[^\s]+>)\s+(.*)/;
76		my $k = $2;
77		my $v = $1;
78		$k =~ s/\\x//g;		# UTF-8 char code
79		$utf8map{$k} = $v;
80	}
81}
82
83sub generate_header {
84	open(FOUT, ">", "$outfilename")
85		or die ("can't write to $outfilename\n");
86	print FOUT "LC_CTYPE\n\n";
87}
88
89sub generate_footer {
90	print FOUT "\nEND LC_CTYPE\n";
91	close (FOUT);
92}
93
94sub wctomb {
95	my $wc = hex(shift);
96	my $lead;
97	my $len;
98	my $ret = "";
99	my $i;
100
101	if (($wc & ~0x7f) == 0) {
102		return sprintf "%02X", $wc;
103	} elsif (($wc & ~0x7ff) == 0) {
104		$lead = 0xc0;
105		$len = 2;
106	} elsif (($wc & ~0xffff) == 0) {
107		$lead = 0xe0;
108		$len = 3;
109	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
110		$lead = 0xf0;
111		$len = 4;
112	}
113
114	for ($i = $len - 1; $i > 0; $i--) {
115		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
116		$wc >>= 6;
117	}
118	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
119
120	return $ret;
121}
122
123sub parse_unidata {
124	my $file = shift;
125	my %data = ();
126
127	open(FIN, $file);
128	my @lines = <FIN>;
129	close(FIN);
130	chomp(@lines);
131
132	foreach my $l (@lines) {
133		my @d = split(/;/, $l, -1);
134		my $mb = wctomb($d[0]);
135		my $cat;
136
137		# XXX There are code points present in UnicodeData.txt
138		# and missing from UTF-8.cm
139		next if !defined $utf8map{$mb};
140
141		# Define the category
142		if ($d[2] =~ /^Lu/) {
143			$cat = "upper";
144		} elsif ($d[2] =~ /^Ll/) {
145			$cat = "lower";
146		} elsif ($d[2] =~ /^Nd/) {
147			$cat = "digit";
148		} elsif ($d[2] =~ /^L/) {
149			$cat = "alpha";
150		} elsif ($d[2] =~ /^P/) {
151			$cat = "punct";
152		} elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
153		    $d[2] =~ /^S/) {
154			$cat = "graph";
155		} elsif ($d[2] =~ /^C/) {
156			$cat = "cntrl";
157		} elsif ($d[2] =~ /^Z/) {
158			$cat = "space";
159		}
160		$data{$cat}{$mb}{'wc'} = $d[0];
161
162		# Check if it's a start or end of range
163		if ($d[1] =~ /First>$/) {
164			$data{$cat}{$mb}{'start'} = 1;
165		} elsif ($d[1] =~ /Last>$/) {
166			$data{$cat}{$mb}{'end'} = 1;
167		}
168
169		# Check if there's upper/lower mapping
170		if ($d[12] ne "") {
171			$data{'toupper'}{$mb} = wctomb($d[12]);
172		} elsif ($d[13] ne "") {
173			$data{'tolower'}{$mb} = wctomb($d[13]);
174		}
175	}
176
177	my $first;
178	my $inrange = 0;
179
180	# Now write out the categories
181	foreach my $cat (sort keys (%data)) {
182		print FOUT "$cat\t";
183		$first = 1;
184	foreach my $mb (sort keys (%{$data{$cat}})) {
185		if ($first == 1) {
186			$first = 0;
187		} elsif ($inrange == 1) {
188			# Safety belt
189			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
190			    if !defined $data{$cat}{$mb}{'end'};
191			print FOUT ";...;";
192			$inrange = 0;
193		} else {
194			print FOUT ";/\n\t";
195		}
196
197		if ($cat eq "tolower" || $cat eq "toupper") {
198			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
199		} else {
200			if (defined($data{$cat}{$mb}{'start'})) {
201				$inrange = 1;
202			}
203			print FOUT "$utf8map{$mb}";
204		}
205	}
206		print FOUT "\n";
207	}
208}
209