xref: /freebsd/tools/tools/locale/tools/cldr2def.pl (revision 95f37aa3e51ce46821059b2c9dbb02fef5c4bec5)
1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17#
18# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28# SUCH DAMAGE.
29#
30
31use strict;
32use File::Copy;
33use XML::Parser;
34use Tie::IxHash;
35use Text::Iconv;
36#use Data::Dumper;
37use Getopt::Long;
38use Digest::SHA qw(sha1_hex);
39require "charmaps.pm";
40
# Command-line handling.  All three options are mandatory.
my $USAGE = "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";

# Cheap early check so that a bare invocation only prints the usage text.
if ($#ARGV < 2) {
	print $USAGE;
	exit(1);
}

# Encoding of the source files that are read from $UNIDIR/posix.
my $DEFENCODING = "UTF-8";

my $UNIDIR = undef;	# --unidir: holds posix/*.src, posix/*.cm, cldr-version
my $ETCDIR = undef;	# --etc: passed to get_xmldata(), fallback .src location
my $TYPE = undef;	# --type: which definition type to generate

# Filled in by transform_collation() from $UNIDIR/cldr-version.
my $CLDR_VERSION = undef;

my $result = GetOptions (
		"unidir=s"	=> \$UNIDIR,
		"etc=s"		=> \$ETCDIR,
		"type=s"	=> \$TYPE,
	    );

# The argument count alone does not prove that the options were given:
# reject unparsable command lines and missing options before any path is
# built from an undefined value.
if (!$result || !defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print $USAGE;
	exit(1);
}
59
# Shared state used by the subroutines below.
my (%convertors, %ucd, %values, %hashtable);

# Filled by get_languages() from the XML data found in $ETCDIR.
my (%languages, %translations, %alternativemonths);
get_languages();

# Character name to code tables, one per charmap file.
my %utfmap = (
	'UTF-8'		=> {},
	'UTF-32'	=> {},
);
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# Both hashes are tied to Tie::IxHash so that they keep insertion order.
my %keys;
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");
79
# Maps each supported --type value to a locale category name.
# make_makefile() writes the entry selected by $TYPE as FILESNAME into the
# generated Makefile.
my %FILESNAMES = (
	"monetdef"	=> "LC_MONETARY",
	"timedef"	=> "LC_TIME",
	"msgdef"	=> "LC_MESSAGES",
	"numericdef"	=> "LC_NUMERIC",
	"colldef"	=> "LC_COLLATE",
	"ctypedef"	=> "LC_CTYPE"
);
88
# Dispatch table for field specifications of the form
# "<name<source<type" (see the timedef keys).  print_fields() looks up
# "name" here and calls the subroutine with the collected value of the
# "source" key.
#
# The "data" slot is not a callback: print_fields() stores the current
# language (l), family (f), country (c), key (k) and encoding (e) in it
# before each call and clears it again afterwards.
my %callback = (
	mdorder => \&callback_mdorder,
	altmon => \&callback_altmon,
	cformat => \&callback_cformat,
	dformat => \&callback_dformat,
	dtformat => \&callback_dtformat,
	cbabmon => \&callback_abmon,
	cbampm => \&callback_ampm,
	data => undef,
);
99
# Text of the "# ..." comment that print_fields() emits above every field.
# Most fields are simply described by their own name; the remaining ones
# are listed explicitly.
my %DESC = (
	# numericdef, monetdef, msgdef and timedef fields named after themselves
	(map { $_ => $_ } qw(
		decimal_point thousands_sep grouping
		currency_symbol mon_decimal_point mon_thousands_sep
		mon_grouping positive_sign negative_sign
		int_frac_digits frac_digits
		p_cs_precedes p_sep_by_space n_cs_precedes n_sep_by_space
		p_sign_posn n_sign_posn
		int_p_cs_precedes int_n_cs_precedes
		int_p_sep_by_space int_n_sep_by_space
		int_p_sign_posn int_n_sign_posn
		yesexpr noexpr yesstr nostr
		c_fmt md_order
	)),

	# monetdef
	"int_curr_symbol" => "int_curr_symbol (last character always SPACE)",

	# timedef
	"abmon"		=> "Short month names",
	"mon"		=> "Long month names (as in a date)",
	"abday"		=> "Short weekday names",
	"day"		=> "Long weekday names",
	"t_fmt"		=> "X_fmt",
	"d_fmt"		=> "x_fmt",
	"am_pm"		=> "AM/PM",
	"d_t_fmt"	=> "date_fmt",
	"altmon"	=> "Long month names (without case ending)",
	"t_fmt_ampm"	=> "ampm_fmt",
);
151
# Main dispatch: run the generator that matches --type.
#
# For the field based types %keys lists the fields in output order (the
# hash is tied to Tie::IxHash).  Each value describes the field:
#   s   string		as   list of strings separated by ";"
#   i   integer		ai   list of integers
#   <callback<source<type   feed the value collected for "source" through
#			    $callback{callback} and then treat it as "type"
if ($TYPE eq "colldef") {
	transform_collation();
	make_makefile();
} elsif ($TYPE eq "ctypedef") {
	transform_ctypes();
	make_makefile();
} elsif ($TYPE eq "numericdef") {
	%keys = (
	    "decimal_point"	=> "s",
	    "thousands_sep"	=> "s",
	    "grouping"		=> "ai",
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "monetdef") {
	%keys = (
	    "int_curr_symbol"		=> "s",
	    "currency_symbol"		=> "s",
	    "mon_decimal_point"		=> "s",
	    "mon_thousands_sep"		=> "s",
	    "mon_grouping"		=> "ai",
	    "positive_sign"		=> "s",
	    "negative_sign"		=> "s",
	    "int_frac_digits"		=> "i",
	    "frac_digits"		=> "i",
	    "p_cs_precedes"		=> "i",
	    "p_sep_by_space"		=> "i",
	    "n_cs_precedes"		=> "i",
	    "n_sep_by_space"		=> "i",
	    "p_sign_posn"		=> "i",
	    "n_sign_posn"		=> "i",
	    "int_p_cs_precedes"		=> "i",
	    "int_n_cs_precedes"		=> "i",
	    "int_p_sep_by_space"	=> "i",
	    "int_n_sep_by_space"	=> "i",
	    "int_p_sign_posn"		=> "i",
	    "int_n_sign_posn"		=> "i"
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "msgdef") {
	%keys = (
	    "yesexpr"		=> "s",
	    "noexpr"		=> "s",
	    "yesstr"		=> "s",
	    "nostr"		=> "s"
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "timedef") {
	%keys = (
	    "abmon"		=> "<cbabmon<abmon<as",
	    "mon"		=> "as",
	    "abday"		=> "as",
	    "day"		=> "as",
	    "t_fmt"		=> "s",
	    "d_fmt"		=> "<dformat<d_fmt<s",
	    "c_fmt"		=> "<cformat<d_t_fmt<s",
	    "am_pm"		=> "<cbampm<am_pm<as",
	    "d_t_fmt"		=> "<dtformat<d_t_fmt<s",
	    "altmon"		=> "<altmon<mon<as",
	    "md_order"		=> "<mdorder<d_fmt<s",
	    "t_fmt_ampm"	=> "s",
	);
	get_fields();
	print_fields();
	make_makefile();
} else {
	# A mistyped --type used to be a silent no-op with exit status 0.
	my $given = defined $TYPE ? "'$TYPE'" : "(none)";
	die "Unknown --type $given; expected one of: " .
	    join(", ", sort keys(%FILESNAMES)) . "\n";
}
233
# Callback for am_pm: ru_RU gets a fixed AM/PM pair, converted with
# Text::Iconv for every encoding other than UTF-8.  All other locales keep
# the value that was passed in.
sub callback_ampm {
	my ($ampm) = @_;
	my $locale = $callback{data}{l} . "_" . $callback{data}{c};
	my $enc = $callback{data}{e};

	return $ampm if ($locale ne 'ru_RU');
	return 'дп;пп' if ($enc eq 'UTF-8');

	my $converter = Text::Iconv->new("utf-8", "$enc");
	return $converter->convert("дп;пп");
}
249
# Callback for c_fmt: derives the value from the d_t_fmt string passed in.
# Adds the weekday (%A) in several layouts, drops the time zone
# conversions and turns ".," into ".".
sub callback_cformat {
	my ($fmt) = @_;
	my $locale = $callback{data}{l} . "_" . $callback{data}{c};

	# Every substitution below operates on $fmt through $_.
	for ($fmt) {
		s/(> )(%p)/$1%A $2/ if ($locale eq 'ko_KR');
		s/\.,/\./;
		s/ %Z//;
		s/ %z//;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
265
# Callback for d_fmt: in purely numeric month/day combinations, replace
# %e with %d, whichever of the two comes first.
sub callback_dformat {
	my ($fmt) = @_;

	for ($fmt) {
		s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
		s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
	}
	return $fmt;
}
273
# Callback for d_t_fmt: applies the CJK specific adjustments first and
# then the generic weekday (%A) insertions.
sub callback_dtformat {
	my ($fmt) = @_;
	my $locale = $callback{data}{l} . "_" . $callback{data}{c};
	my $is_zh = ($locale eq 'zh_CN' || $locale eq 'zh_TW');

	for ($fmt) {
		if ($locale eq 'ja_JP') {
			s/(> )(%H)/$1%A $2/;
		} else {
			# Only the Chinese locales get the %_m month.
			s/%m/%_m/ if ($is_zh);
			s/(> )(%p)/$1%A $2/
			    if ($is_zh || $locale eq 'ko_KR');
		}
		s/\.,/\./;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
292
# Callback for md_order: reduces a date format to the order of its day
# and month letters, e.g. "dm" or "md".  Returns undef for undef input.
sub callback_mdorder {
	my ($fmt) = @_;

	return undef unless (defined $fmt);
	$fmt =~ tr/dem//cd;	# keep only the letters d, e and m
	$fmt =~ tr/e/d/;	# %e counts as a day, too
	return $fmt;
}
300
# Callback for altmon: if %alternativemonths has an entry for the current
# language/country, return its ";" separated names with surrounding white
# space removed; otherwise repeat the value of mon that was passed in.
sub callback_altmon {
	my ($months) = @_;
	my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};

	return $months unless (defined $alt);

	my @trimmed = map {
		my $name = $_;
		$name =~ s/^\s+//;
		$name =~ s/\s+$//;
		$name;
	} split(";", $alt);
	return join(";", @trimmed);
}
320
# Callback for abmon: for the listed CJK locales, prefix month names that
# start with a single digit with <space> so that columns line up (style
# established in FreeBSD in 2001).  Other locales are returned unchanged.
sub callback_abmon {
	my ($months) = @_;
	my $locale = $callback{data}{l} . "_" . $callback{data}{c};
	my %pad_locale = map { $_ => 1 } qw(ja_JP ko_KR zh_CN zh_HK zh_TW);

	return $months unless ($pad_locale{$locale});

	my @padded = map {
		my $name = $_;
		# 2..9 always, 1 only when it is not the start of 10, 11 or 12
		my $single_digit =
		    $name =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
		    ($name =~ /^"<one>/ && $name !~ /^"<one>(<zero>|<one>|<two>)/);
		$name =~ s/^"/"<space>/ if ($single_digit);
		$name;
	} split(";", $months);
	return join(";", @padded);
}
344
345############################
346
# Read the CHARMAP section of a charmap file into a hash.
#
#   $file - path of the .cm file
#   $db   - hash reference that receives name => code entries; the "\x"
#           markers are removed from the code, so "\xC3\x84" is stored
#           as "C384"
#
# Comment lines, empty lines and everything outside CHARMAP .. END CHARMAP
# are ignored.  If the file cannot be opened a warning is printed and $db
# is left untouched.
sub get_utfmap {
	my ($file, $db) = @_;

	my $fin;
	if (!open($fin, '<', $file)) {
		warn "Cannot open $file: $!\n";
		return;
	}

	my $incharmap = 0;
	while (my $l = <$fin>) {
		chomp($l);
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Only use the captures of a successful match; a line that
		# is not a "<name> code" mapping must not create an entry.
		next unless ($l =~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$v =~ s/\\x//g;		# UTF-8 char code
		$db->{$k} = $v;
	}
	close($fin);
	return;
}
385
# Join the "+" separated parts of the argument into one string, dropping a
# leading "0x"/"0X" from every part: "0x1B+0x24" becomes "1B24".
sub resolve_enc_addition {
	return join('', map {
		(my $part = $_) =~ s/^0[xX]//;
		$part;
	} split(/\+/, $_[0]));
}
395
# Load the XML data below $ETCDIR through get_xmldata() and copy its
# "L", "T" and "AM" sections into %languages, %translations and
# %alternativemonths.
sub get_languages {
	my %xml = get_xmldata($ETCDIR);
	my ($lang, $trans, $altmon) = @xml{qw(L T AM)};

	%languages = %{$lang};
	%translations = %{$trans};
	%alternativemonths = %{$altmon};
}
402
# Generate the LC_CTYPE sources in $TYPE.draft.
#
# For every language/family/country selected for $TYPE:
#  - the UTF-8 source is a verbatim copy of
#    $UNIDIR/posix/xx_Comm_C.UTF-8.src;
#  - every other encoding gets the comment_char/escape_char lines and the
#    LC_CTYPE section of the locale's own UTF-8 source (eucJP for ja_JP).
# A SHA-1 of each result is stored in %languages and %hashtable; it is used
# by make_makefile() to find identical files.
#
# Unreadable input files are reported on STDERR and skipped; failure to
# write an output file is fatal.
sub transform_ctypes {
	# Add the C.UTF-8
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		if (! -f $filename) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		my $fin;
		if (!open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $outname)
		    or die "Cannot create $outname: $!\n";
		print $fout @lines;
		close($fout) or die "Cannot write $outname: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			my $encin;
			if (!open($encin, '<', $filename)) {
				print STDERR "Cannot open $filename: $!\n";
				next;
			}
			@lines = ();
			while (my $line = <$encin>) {
				if (($line =~ /^comment_char\s/) ||
				    ($line =~ /^escape_char\s/)) {
					push @lines, $line;
				}
				if ($line =~ /^LC_CTYPE/ .. $line =~ /^END LC_CTYPE/) {
					push @lines, $line;
				}
			}
			close($encin);

			# The encoding is part of the hash so that equal
			# text in different encodings is not linked.
			my $uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

			$outname = "$TYPE.draft/$actfile.$enc.src";
			open(my $encout, '>', $outname)
			    or die "Cannot create $outname: $!\n";
			print $encout <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print $encout @lines;
			close($encout) or die "Cannot write $outname: $!\n";
		}
	}
	}
	}
}
479
480
# Generate the LC_COLLATE sources in $TYPE.draft.
#
# Reads the CLDR version from $UNIDIR/cldr-version into $CLDR_VERSION.
# Then, for every language/family/country selected for $TYPE:
#  - extracts comment_char/escape_char and the LC_COLLATE section of the
#    UTF-8 source (looked up in $UNIDIR/posix, then $ETCDIR, then under
#    the fallback name) into $TYPE.draft/<locale>.UTF-8.src;
#  - derives one file per additional encoding from that draft, dropping
#    lines whose characters have no entry in %convertors for the encoding.
# The SHA-1 of the UTF-8 text is recorded for all encodings of the locale.
#
# Unreadable locale sources are reported on STDERR and skipped; a missing
# cldr-version file or a failure on the draft files is fatal.
sub transform_collation {
	# Read the CLDR version
	open(my $verfh, '<', "$UNIDIR/cldr-version")
	    or die "Cannot open cldr-version: $!";
	read($verfh, $CLDR_VERSION, -s $verfh);
	close($verfh);
	$CLDR_VERSION =~ s/\s*$//;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		my $shex;
		while (my $line = <$fin>) {
			if (($line =~ /^comment_char\s/) ||
			    ($line =~ /^escape_char\s/)) {
				push @lines, $line;
			}
			if ($line =~ /^LC_COLLATE/ .. $line =~ /^END LC_COLLATE/) {
				$line =~ s/[ ]+/ /g;
				push @lines, $line;
			}
		}
		close($fin);
		$shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $utfdraft = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $utfdraft)
		    or die "Cannot create $utfdraft: $!\n";
		print $fout <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print $fout @lines;
		close($fout) or die "Cannot write $utfdraft: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);

			my $encdraft = "$TYPE.draft/$actfile.$enc.src";
			open(my $src, '<', $utfdraft)
			    or die "Cannot open $utfdraft: $!\n";
			open(my $dst, '>', $encdraft)
			    or die "Cannot create $encdraft: $!\n";
			my $order_start = 0;
			my $print_p = 0;
			#
			# %c_elem: collation elements
			#
			#   undef: not defined
			#   1: defined
			#   2: invalid in this encoding
			#
			my %c_elem = ();
			while (<$src>) {	# XXX: this loop should be refactored.
				chomp;
				$print_p = 1;
				if ($order_start) {
					$order_start = 0 if (m/^order_end/);
					if (m/^<([^>]+)>/) {
						if (not defined $c_elem{$1}) {
							# A plain character: keep the
							# line only if the encoding
							# has it.
							my $u32 = $utfmap{'UTF-32'}->{$1};
							die "order, $1\n" if (not defined $u32);
							if (not defined $convertors{$enc}{$u32}) {
								$print_p = 0;
							}
						} elsif ($c_elem{$1} == 2) {
							# Element marked as invalid
							# in this encoding.
							$print_p = 0;
						}
					}
				} elsif (m/^collating-element/) {
					my ($elem, $l);
					if (m/<([^>]+)> from (.+)/) {
						($elem, $l) = ($1, $2);
					}
					# The element is valid only if every
					# character it is made from exists in
					# the encoding.
					while ($print_p and
					    defined $l and
					    $l =~ m/<([^>]+)>/g) {
						my $u32 = $utfmap{'UTF-32'}->{$1};
						die "collating-element, $1\n" if (not defined $u32);
						if (not $convertors{$enc}{$u32}) {
							$print_p = 0;
							$c_elem{$elem} = 2;
						}
					}
					if ($print_p) {
						$c_elem{$elem} = 1;
					}
				} elsif (m/^collating-symbol <([^>]+)>/) {
					$c_elem{$1} = 1;
				} elsif (m/^order_start/) {
					$order_start = 1;
					# do nothing
				}
				print $dst $_, "\n" if ($print_p);
			}
			close($dst) or die "Cannot write $encdraft: $!\n";
			close($src);
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}
621
# Collect the raw values of all fields listed in %keys.
#
# For every language/family/country selected for $TYPE the UTF-8 source is
# located ($UNIDIR/posix, then $ETCDIR, then the fallback name) and, for
# each key, the first line starting with that key is stored in
# $values{$l}{$f}{$c}{$key}.  A line ending in "/" is continued on the
# next line.  Unreadable sources are reported on STDERR and skipped.
sub get_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		chomp(@lines);
		close($fin);

		foreach my $k (keys(%keys)) {
			# Continuation state belongs to one key only; a file
			# that ends inside a continued value must not make
			# the next key start in continuation mode.
			my $continue = 0;
			foreach my $raw (@lines) {
				# Work on a copy so that @lines stays intact
				# for the keys that are looked up later.
				my $line = $raw;
				$line =~ s/\r//;
				next if (!$continue && $line !~ /^\Q$k\E\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^\Q$k\E\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
					if (!defined $values{$l}{$f}{$c}{$k});

				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

				$values{$l}{$f}{$c}{$k} .= $line;

				last if (!$continue);
			}
		}
	}
	}
	}
}
687
# Convert a symbolic character name into its bytes in a given encoding.
#
#   $e - name of the target encoding
#   $s - character name as used between "<" and ">" in the source files
#
# Returns a string of one to four bytes.  Dies if the character cannot be
# converted or if the resulting code has an unsupported length.
sub decodecldr {
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utfmap{'UTF-8'}->{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		# hex - hex or string attr
		# unicode - unicode attr
		# ucc - ucc attr
		my $hex = $translations{$e}{$s}{hex};
		my $ucc = $utfmap{'UTF-32'}->{$s};
		my $ucc_attr = $translations{$e}{$s}{ucc};
		my $unicode = $translations{$e}{$s}{unicode};

		if (defined $hex) {		# hex is in local encoding
			$v = $hex;
		} elsif (defined $unicode) {	# unicode is in name
			$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
		} elsif (defined $ucc_attr) {	# ucc is in code point
			# The ucc attribute overrides the UTF-32 charmap.
			# normalize
			$ucc_attr = sprintf("%08X", hex($ucc_attr));
			$v = $convertors{$e}{$ucc_attr};
		} elsif (defined $ucc) {
			# normalize
			$ucc = sprintf("%08X", hex($ucc));
			$v = $convertors{$e}{$ucc};
		}
		die "Cannot convert $s in $e" if (!defined $v);
	}

	# Drop leading zeros and pad to whole bytes (two hex digits each).
	$v =~ s/^[0]+//g;
	$v = "0" . $v if (length($v) % 2);

	# One to four bytes are supported; four bytes cover the longest
	# UTF-8 sequences.
	my $len = length($v);
	die "Cannot convert $s in $e (length = $len)\n"
		if ($len < 2 || $len > 8);
	return pack("C*", map { hex($_) } unpack("(a2)*", $v));
}
746
# Return the %translations entry for character $v in encoding $enc, or
# undef if there is none.
sub translate {
	my ($enc, $v) = @_;

	if (defined $translations{$enc}{$v}) {
		return $translations{$enc}{$v};
	}
	return undef;
}
754
# Helper for print_fields(): strip the surrounding double quotes from
# $text and replace every <NAME> in it with the bytes that decodecldr()
# returns for $enc.  $k is only used in the error message.
# Returns the expanded text and a flag that is false if a name could not
# be converted.
sub _expand_charnames {
	my ($enc, $k, $text) = @_;
	my $okay = 1;

	$text =~ s/^"//;
	$text =~ s/"$//;
	while ($text =~ /^(.*?)<(.*?)>(.*)/) {
		# Save the captures before calling anything else.
		my ($p1, $cm, $p3) = ($1, $2, $3);

		my $rv = decodecldr($enc, $cm);
		if (!defined $rv) {
			print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
			$okay = 0;
			# $text is unchanged, so trying again would never end.
			last;
		}

		$text = $p1 . $rv . $p3;
	}
	return ($text, $okay);
}

# Write one definition file per locale and encoding into $TYPE.draft.
#
# Uses the values collected by get_fields() and the field list in %keys.
# Each field is preceded by a comment taken from %DESC.  The file is
# written as <locale>.<enc>.new and renamed to .src on success or to
# .failed if a character could not be converted.  A SHA-1 of the field
# text is stored in %languages and %hashtable for make_makefile().
# Failure to create, write or rename a file is fatal.
sub print_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			}

			my $draft = "$TYPE.draft/$file.$enc";
			open(my $fout, '>', "$draft.new")
			    or die "Cannot create $draft.new: $!\n";
			my $okay = 1;
			my $output = "";
			print $fout <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			foreach my $k (keys(%keys)) {
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
					if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					# "<name<source<type"
					my @a = split(/\</, substr($g, 1));
					my $rv = $callback{$a[0]}->(
					    $values{$l}{$f}{$c}{$a[1]});
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					$callback{data} = undef;
				}

				my $v = $values{$l}{$f}{$c}{$k};
				$v = "undef" if (!defined $v);

				if ($g eq "i" || $g eq "ai") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "s") {
					my ($text, $ok) =
					    _expand_charnames($enc, $k, $v);
					$okay = 0 if (!$ok);
					$output .= "$text\n";
					next;
				}
				if ($g eq "as") {
					# One output line per list element.
					foreach my $item (split(/;/, $v)) {
						my ($text, $ok) =
						    _expand_charnames($enc,
							$k, $item);
						$okay = 0 if (!$ok);
						$output .= "$text\n";
					}
					next;
				}

				die("$k is '$g'");

			}

			$languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
			$hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
			print $fout "$output# EOF\n";
			close($fout) or die "Cannot write $draft.new: $!\n";

			my $final = $okay ? "$draft.src" : "$draft.failed";
			rename("$draft.new", $final)
			    or die "Cannot rename $draft.new to $final: $!\n";
		}
	}
	}
	}
}
903
# Write $TYPE.draft/Makefile for the files generated for $TYPE.
#
# The Makefile contains
#  - the suffix rule that builds a locale file from its .src file
#    (localedef for colldef and ctypedef, a grep filter otherwise);
#  - SAME+= lines that link locales whose generated text has the same
#    hash in %hashtable, plus the nc_link and e_link legacy links;
#  - LOCALES+= lines for every remaining locale/encoding.
# Linked entries are set to undef in %languages so that they are not
# listed in LOCALES.  Failure to create or write the Makefile is fatal.
sub make_makefile {
	print "Creating Makefile for $TYPE\n";
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		# In future, we might want to try to put the CLDR version into
		# the .src files with some new syntax, instead of the makefile.
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"FILESDIR_\$t.LC_COLLATEPACKAGE=\tlocales\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}
	my $makefile = "$TYPE.draft/Makefile";
	open(my $fout, '>', $makefile)
	    or die "Cannot create $makefile: $!\n";
	print $fout <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=	locales
LOCALEDIR=	\${SHAREDIR}/locale
FILESNAME=	$FILESNAMES{$TYPE}
.SUFFIXES:	.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef") {
		print $fout <<EOF;
CLDR_VERSION=	"${CLDR_VERSION}"

EOF
	}

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print $fout <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print $fout <<EOF;
.src.${SRCOUT2}:
	$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	# Every hash with more than one member describes identical files:
	# the first member after sorting becomes the real file, all others
	# are linked to it.
	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

				} keys(%{$hashtable{$hash}});
		} else {
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				       $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		}
		if ($#files > 0) {
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				# lang_family_country.encoding
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print $fout "SAME+=\t\t$link $file\n";
				# Linked, so keep it out of LOCALES below.
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print $fout "LOCALES+=\t$file.$e\n";
		}

		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print $fout "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		if (defined $languages{$l}{$f}{e_link}) {
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print $fout "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print $fout <<EOF;

FILES=		\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=	\${FILES}

.for f t in \${SAME}
DIRS+=		LOCALEDIR_\$t
LOCALEDIR_\$t=	\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=	locales
SYMLINKS+=	../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
FILESDIR_\${f}.${SRCOUT2}PACKAGE= locales
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close($fout) or die "Cannot write $makefile: $!\n";
}
1114