xref: /freebsd/tools/tools/locale/tools/utf8-rollup.pl (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27# SUCH DAMAGE.
28#
29
30use strict;
31use Getopt::Long;
32use Encode qw(encode decode);
33
34if ($#ARGV != 0) {
35	print "Usage: $0 --unidir=<unidir>\n";
36	exit(1);
37}
38
39my $UNIDIR = undef;
40
41my $result = GetOptions (
42		"unidir=s"	=> \$UNIDIR
43	    );
44
45my %utf8map = ();
46my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
47
48get_utf8map("$UNIDIR/posix/UTF-8.cm");
49generate_header ();
50parse_unidata ("$UNIDIR/UnicodeData.txt");
51generate_footer ();
52
53############################
54
55sub utf8to32 {
56	my @kl = split /\\x/, $_[0];
57
58	shift @kl if ($kl[0] eq '');
59	my $k = pack('H2' x scalar @kl, @kl);
60	my $ux = encode('UTF-32BE', decode('UTF-8', $k));
61	my $u = uc(unpack('H*', $ux));
62	# Remove BOM
63	$u =~ s/^0000FEFF//;
64	# Remove heading bytes of 0
65	while ($u =~ m/^0/ and length($u) > 4) {
66		$u =~ s/^0//;
67	}
68
69	return $u;
70}
71
72sub get_utf8map {
73	my $file = shift;
74
75	open(FIN, $file);
76	my @lines = <FIN>;
77	close(FIN);
78	chomp(@lines);
79
80	my $incharmap = 0;
81	foreach my $l (@lines) {
82		$l =~ s/\r//;
83		next if ($l =~ /^\#/);
84		next if ($l eq "");
85
86		if ($l eq "CHARMAP") {
87			$incharmap = 1;
88			next;
89		}
90
91		next if (!$incharmap);
92		last if ($l eq "END CHARMAP");
93
94		$l =~ /^(<[^\s]+>)\s+(.*)/;
95		my $k = utf8to32($2);	# UTF-8 char code
96		my $v = $1;
97
98#		print STDERR "register: $k - $v\n";
99		$utf8map{$k} = $v;
100	}
101}
102
103sub generate_header {
104	open(FOUT, ">", "$outfilename")
105		or die ("can't write to $outfilename\n");
106	print FOUT <<EOF;
107# Warning: Do not edit. This file is automatically generated from the
108# tools in /usr/src/tools/tools/locale. The data is obtained from the
109# CLDR project, obtained from http://cldr.unicode.org/
110# -----------------------------------------------------------------------------
111
112comment_char *
113escape_char /
114
115LC_CTYPE
116EOF
117}
118
119sub generate_footer {
120	print FOUT "\nEND LC_CTYPE\n";
121	close (FOUT);
122}
123
124sub wctomb {
125	my $wc = hex(shift);
126	my $lead;
127	my $len;
128	my $ret = "";
129	my $i;
130
131	if (($wc & ~0x7f) == 0) {
132		return sprintf "%02X", $wc;
133	} elsif (($wc & ~0x7ff) == 0) {
134		$lead = 0xc0;
135		$len = 2;
136	} elsif (($wc & ~0xffff) == 0) {
137		$lead = 0xe0;
138		$len = 3;
139	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
140		$lead = 0xf0;
141		$len = 4;
142	}
143
144	for ($i = $len - 1; $i > 0; $i--) {
145		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
146		$wc >>= 6;
147	}
148	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
149
150	return $ret;
151}
152
153sub parse_unidata {
154	my $file = shift;
155	my %data = ();
156
157	open(FIN, $file);
158	my @lines = <FIN>;
159	close(FIN);
160	chomp(@lines);
161
162	foreach my $l (@lines) {
163		my @d = split(/;/, $l, -1);
164		my $mb = $d[0];
165		my $cat;
166
167		# XXX There are code points present in UnicodeData.txt
168		# and missing from UTF-8.cm
169		next if !defined $utf8map{$mb};
170
171		# Define the category
172		if ($d[2] =~ /^Lu/) {
173			$cat = "upper";
174		} elsif ($d[2] =~ /^Ll/) {
175			$cat = "lower";
176		} elsif ($d[2] =~ /^Nd/) {
177			$cat = "digit";
178		} elsif ($d[2] =~ /^L/) {
179			$cat = "alpha";
180		} elsif ($d[2] =~ /^P/) {
181			$cat = "punct";
182		} elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
183		    $d[2] =~ /^S/) {
184			$cat = "graph";
185		} elsif ($d[2] =~ /^C/) {
186			$cat = "cntrl";
187		} elsif ($d[2] =~ /^Z/) {
188			$cat = "space";
189		}
190		$data{$cat}{$mb}{'wc'} = $d[0];
191
192		# Check if it's a start or end of range
193		if ($d[1] =~ /First>$/) {
194			$data{$cat}{$mb}{'start'} = 1;
195		} elsif ($d[1] =~ /Last>$/) {
196			$data{$cat}{$mb}{'end'} = 1;
197		}
198
199		# Check if there's upper/lower mapping
200		if ($d[12] ne "") {
201			$data{'toupper'}{$mb} = $d[12];
202		} elsif ($d[13] ne "") {
203			$data{'tolower'}{$mb} = $d[13];
204		}
205	}
206
207	my $first;
208	my $inrange = 0;
209
210	# Now write out the categories
211	foreach my $cat (sort keys (%data)) {
212		print FOUT "$cat\t";
213		$first = 1;
214	foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) {
215		if ($first == 1) {
216			$first = 0;
217		} elsif ($inrange == 1) {
218			# Safety belt
219			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
220			    if !defined $data{$cat}{$mb}{'end'};
221			print FOUT ";...;";
222			$inrange = 0;
223		} else {
224			print FOUT ";/\n\t";
225		}
226
227		if ($cat eq "tolower" || $cat eq "toupper") {
228			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
229		} else {
230			if (defined($data{$cat}{$mb}{'start'})) {
231				$inrange = 1;
232			}
233			print FOUT "$utf8map{$mb}";
234		}
235	}
236		print FOUT "\n";
237	}
238}
239