#!/usr/local/bin/perl -wC

# SPDX-License-Identifier: BSD-2-Clause
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

use strict;
use File::Copy;
use XML::Parser;
use Tie::IxHash;
use Text::Iconv;
#use Data::Dumper;
use Getopt::Long;
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";

if ($#ARGV < 2) {
	print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
	exit(1);
}

my $DEFENCODING = "UTF-8";

my $UNIDIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;

my $CLDR_VERSION = undef;

my $result = GetOptions (
		"unidir=s"	=> \$UNIDIR,
		"etc=s"		=> \$ETCDIR,
		"type=s"	=> \$TYPE,
	    );

# All three options are used unconditionally below; refuse to run with
# any of them missing instead of continuing with undefined paths.
if (!defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
	exit(1);
}

my %convertors = ();

my %ucd = ();
my %values = ();
my %hashtable = ();
my %languages = ();
my %translations = ();
my %alternativemonths = ();
get_languages();

my %utfmap = ();
$utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# Both hashes keep insertion order: %keys fixes the order in which
# fields are emitted, %hashtable the order of the SAME+= lines.
my %keys = ();
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");

# --type value => name written as FILESNAME in the generated Makefile.
my %FILESNAMES = (
	"monetdef"	=> "LC_MONETARY",
	"timedef"	=> "LC_TIME",
	"msgdef"	=> "LC_MESSAGES",
	"numericdef"	=> "LC_NUMERIC",
	"colldef"	=> "LC_COLLATE",
	"ctypedef"	=> "LC_CTYPE"
);

# A mistyped --type used to match none of the branches below and exit
# successfully without producing anything.
die "Unknown type '$TYPE'\n" if (!exists $FILESNAMES{$TYPE});

# Callbacks referenced by name from the "<callback<source<format" field
# descriptors in %keys.  {data} carries the locale being processed
# (l, f, c, e, k) for the duration of one callback invocation.
my %callback = (
	mdorder => \&callback_mdorder,
	altmon => \&callback_altmon,
	cformat => \&callback_cformat,
	dformat => \&callback_dformat,
	dtformat => \&callback_dtformat,
	cbabmon => \&callback_abmon,
	cbampm => \&callback_ampm,
	data => undef,
);

# Field name => description written as a comment above the field value.
my %DESC = (

	# numericdef
	"decimal_point"	=> "decimal_point",
	"thousands_sep"	=> "thousands_sep",
	"grouping"	=> "grouping",

	# monetdef
	"int_curr_symbol"	=> "int_curr_symbol (last character always " .
				   "SPACE)",
	"currency_symbol"	=> "currency_symbol",
	"mon_decimal_point"	=> "mon_decimal_point",
	"mon_thousands_sep"	=> "mon_thousands_sep",
	"mon_grouping"		=> "mon_grouping",
	"positive_sign"		=> "positive_sign",
	"negative_sign"		=> "negative_sign",
	"int_frac_digits"	=> "int_frac_digits",
	"frac_digits"		=> "frac_digits",
	"p_cs_precedes"		=> "p_cs_precedes",
	"p_sep_by_space"	=> "p_sep_by_space",
	"n_cs_precedes"		=> "n_cs_precedes",
	"n_sep_by_space"	=> "n_sep_by_space",
	"p_sign_posn"		=> "p_sign_posn",
	"n_sign_posn"		=> "n_sign_posn",
	"int_p_cs_precedes"	=> "int_p_cs_precedes",
	"int_n_cs_precedes"	=> "int_n_cs_precedes",
	"int_p_sep_by_space"	=> "int_p_sep_by_space",
	"int_n_sep_by_space"	=> "int_n_sep_by_space",
	"int_p_sign_posn"	=> "int_p_sign_posn",
	"int_n_sign_posn"	=> "int_n_sign_posn",

	# msgdef
	"yesexpr"	=> "yesexpr",
	"noexpr"	=> "noexpr",
	"yesstr"	=> "yesstr",
	"nostr"		=> "nostr",

	# timedef
	"abmon"		=> "Short month names",
	"mon"		=> "Long month names (as in a date)",
	"abday"		=> "Short weekday names",
	"day"		=> "Long weekday names",
	"t_fmt"		=> "X_fmt",
	"d_fmt"		=> "x_fmt",
	"c_fmt"		=> "c_fmt",
	"am_pm"		=> "AM/PM",
	"d_t_fmt"	=> "date_fmt",
	"altmon"	=> "Long month names (without case ending)",
	"md_order"	=> "md_order",
	"t_fmt_ampm"	=> "ampm_fmt",
);

#
# Field descriptors used in %keys:
#   "s"   string            "as"  ';'-separated list of strings
#   "i"   integer           "ai"  list of integers
#   "<callback<source<fmt"  run %callback{callback} on the value of
#                           field "source", then emit it with format fmt
#

if ($TYPE eq "colldef") {
	transform_collation();
	make_makefile();
}

if ($TYPE eq "ctypedef") {
	transform_ctypes();
	make_makefile();
}

if ($TYPE eq "numericdef") {
	%keys = (
	    "decimal_point"	=> "s",
	    "thousands_sep"	=> "s",
	    "grouping"		=> "ai",
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "monetdef") {
	%keys = (
	    "int_curr_symbol"	=> "s",
	    "currency_symbol"	=> "s",
	    "mon_decimal_point"	=> "s",
	    "mon_thousands_sep"	=> "s",
	    "mon_grouping"	=> "ai",
	    "positive_sign"	=> "s",
	    "negative_sign"	=> "s",
	    "int_frac_digits"	=> "i",
	    "frac_digits"	=> "i",
	    "p_cs_precedes"	=> "i",
	    "p_sep_by_space"	=> "i",
	    "n_cs_precedes"	=> "i",
	    "n_sep_by_space"	=> "i",
	    "p_sign_posn"	=> "i",
	    "n_sign_posn"	=> "i",
	    "int_p_cs_precedes"	=> "i",
	    "int_n_cs_precedes"	=> "i",
	    "int_p_sep_by_space" => "i",
	    "int_n_sep_by_space" => "i",
	    "int_p_sign_posn"	=> "i",
	    "int_n_sign_posn"	=> "i"
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "msgdef") {
	%keys = (
	    "yesexpr"		=> "s",
	    "noexpr"		=> "s",
	    "yesstr"		=> "s",
	    "nostr"		=> "s"
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "timedef") {
	%keys = (
	    "abmon"		=> "<cbabmon<abmon<as",
	    "mon"		=> "as",
	    "abday"		=> "as",
	    "day"		=> "as",
	    "t_fmt"		=> "s",
	    "d_fmt"		=> "<dformat<d_fmt<s",
	    "c_fmt"		=> "<cformat<d_t_fmt<s",
	    "am_pm"		=> "<cbampm<am_pm<as",
	    "d_t_fmt"		=> "<dtformat<d_t_fmt<s",
	    "altmon"		=> "<altmon<mon<as",
	    "md_order"		=> "<mdorder<d_fmt<s",
	    "t_fmt_ampm"	=> "s",
	);
	get_fields();
	print_fields();
	make_makefile();
}

# am_pm callback: for ru_RU replace the value with a fixed AM/PM pair,
# converted from UTF-8 to the target encoding when that is not UTF-8.
# Every other locale gets its value back unchanged.
sub callback_ampm {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};
	my $enc = $callback{data}{e};

	if ($nl eq 'ru_RU') {
		if ($enc eq 'UTF-8') {
			$s = 'дп;пп';
		} else {
			my $converter = Text::Iconv->new("utf-8", "$enc");
			$s = $converter->convert("дп;пп");
		}
	}
	return $s;
}

# c_fmt callback: derive c_fmt from the d_t_fmt value by removing the
# time zone conversions (%Z, %z), collapsing ".," to "." and inserting a
# weekday (%A) in front of a few known leading date patterns.
sub callback_cformat {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ko_KR') {
		$s =~ s/(> )(%p)/$1%A $2/;
	}
	$s =~ s/\.,/\./;
	$s =~ s/ %Z//;
	$s =~ s/ %z//;
	$s =~ s/^"%e\./%A %e/;
	$s =~ s/^"(%B %e, )/"%A, $1/;
	$s =~ s/^"(%e %B )/"%A $1/;
	return $s;
}

# d_fmt callback: when %e sits next to %m with a solidus, '-' or '.'
# between them, replace %e with %d.
sub callback_dformat {
	my $s = shift;

	$s =~ s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
	$s =~ s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
	return $s;
}

# d_t_fmt callback: insert a weekday (%A) for ja_JP/ko_KR/zh_CN/zh_TW
# and in front of a few known leading date patterns; zh_CN and zh_TW
# additionally get %m rewritten to %_m.
sub callback_dtformat {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ja_JP') {
		$s =~ s/(> )(%H)/$1%A $2/;
	} elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') {
		if ($nl ne 'ko_KR') {
			$s =~ s/%m/%_m/;
		}
		$s =~ s/(> )(%p)/$1%A $2/;
	}
	$s =~ s/\.,/\./;
	$s =~ s/^"%e\./%A %e/;
	$s =~ s/^"(%B %e, )/"%A, $1/;
	$s =~ s/^"(%e %B )/"%A $1/;
	return $s;
}

# md_order callback: reduce a d_fmt value to the order of its day and
# month letters (e.g. "md" or "dm"); 'e' counts as 'd'.
# Returns undef when there is no d_fmt value.
sub callback_mdorder {
	my $s = shift;
	return undef if (!defined $s);
	$s =~ s/[^dem]//g;
	$s =~ s/e/d/g;
	return $s;
}

# altmon callback: if the language/country is known in %alternativemonths
# then return that list (each name trimmed), otherwise repeat mon.
sub callback_altmon {
	my $s = shift;

	if (defined $alternativemonths{$callback{data}{l}}{$callback{data}{c}}) {
		my @altnames = split(";", $alternativemonths{$callback{data}{l}}{$callback{data}{c}});
		my @cleaned;
		foreach (@altnames)
		{
			$_ =~ s/^\s+//;
			$_ =~ s/\s+$//;
			push @cleaned, $_;
		}
		return join(";", @cleaned);
	}

	return $s;
}

# abmon callback: for specified CJK locales, pad result with a space to
# enable columns to line up (style established in FreeBSD in 2001).
# Names starting with <two>..<nine>, or with <one> not followed by
# <zero>/<one>/<two>, get a <space> inserted after the opening quote.
sub callback_abmon {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' ||
	    $nl eq 'zh_HK' || $nl eq 'zh_TW') {
		my @monthnames = split(";", $s);
		my @cleaned;
		foreach (@monthnames)
		{
			if ($_ =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
			   ($_ =~ /^"<one>/ && $_ !~ /^"<one>(<zero>|<one>|<two>)/))
			{
				$_ =~ s/^"/"<space>/;
			}
			push @cleaned, $_;
		}
		return join(";", @cleaned);
	}
	return $s;
}

############################

# Load the CHARMAP section of a POSIX charmap file into the hash
# referenced by $db, as symbolic name => hex code with every "\x"
# removed.  Comment lines, blank lines and everything outside
# CHARMAP .. END CHARMAP are ignored.
#
# An unreadable file is reported and leaves $db untouched (the caller
# then fails later, when a symbol cannot be resolved).  Lines that do
# not look like "<name> value" are skipped rather than being stored
# under the captures left over from the previous line.
sub get_utfmap {
	my ($file, $db) = @_;

	my $fin;
	if (!open($fin, '<', $file)) {
		warn "Cannot open $file: $!\n";
		return;
	}

	my $incharmap = 0;
	while (my $line = <$fin>) {
		chomp($line);
		next if ($line =~ /^\#/);
		next if ($line eq "");

		if ($line eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($line eq "END CHARMAP");

		next if ($line !~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$v =~ s/\\x//g;		# UTF-8 char code
		$db->{$k} = $v;
	}
	close($fin);
	return;
}

# Concatenate the '+'-separated hex codes in the argument, dropping a
# leading 0x/0X from each part ("0x1B+0x24" => "1B24").
sub resolve_enc_addition {
	my $ret = '';

	foreach my $t (split(/\+/, $_[0])) {
		$t =~ s/^0[xX]//;
		$ret .= $t;
	}
	return $ret;
}

# Fill %languages, %translations and %alternativemonths from the data
# returned by get_xmldata() (provided by charmaps.pm).
sub get_languages {
	my %data = get_xmldata($ETCDIR);
	%languages = %{$data{L}};
	%translations = %{$data{T}};
	%alternativemonths = %{$data{AM}};
}

# Generate the LC_CTYPE draft sources.  For every locale the common
# xx_Comm_C.UTF-8.src is copied as the UTF-8 source; for every other
# encoding the LC_CTYPE section (plus comment_char/escape_char) of the
# locale's own source is extracted.  The SHA-1 of each result is
# recorded in %languages and %hashtable so that make_makefile() can
# emit identical locales as links.
sub transform_ctypes {
	# Add the C.UTF-8
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		my $fin;
		if (! -f $filename || !open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $outname)
		    or die "Cannot create $outname: $!\n";
		print {$fout} @lines;
		close($fout) or die "Cannot write $outname: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename || !open($fin, '<', $filename)) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			@lines = ();
			while (<$fin>) {
				if ((/^comment_char\s/) || (/^escape_char\s/)) {
					push @lines, $_;
				}
				if (/^LC_CTYPE/../^END LC_CTYPE/) {
					push @lines, $_;
				}
			}
			close($fin);

			# The encoding is mixed into the hash so that the
			# same section in two encodings is never linked.
			my $uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

			$outname = "$TYPE.draft/$actfile.$enc.src";
			open($fout, '>', $outname)
			    or die "Cannot create $outname: $!\n";
			print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print {$fout} @lines;
			close($fout) or die "Cannot write $outname: $!\n";
		}
	}
	}
	}
}


# Generate the LC_COLLATE draft sources.  The LC_COLLATE section (plus
# comment_char/escape_char) of each locale's UTF-8 source is extracted;
# for every other encoding that draft is filtered so that order lines
# and collating elements using characters without a mapping in
# %convertors are left out.  Also reads the CLDR version for the
# Makefile.
sub transform_collation {
	# Read the CLDR version
	open(my $vin, '<', "$UNIDIR/cldr-version")
	    or die "Cannot open cldr-version";
	read($vin, $CLDR_VERSION, -s $vin);
	close($vin);
	$CLDR_VERSION =~ s/\s*$//;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		my $actfile = $file;

		# Look in the CLDR tree first, then in etc/, then try the
		# configured fallback locale.
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		while (<$fin>) {
			if ((/^comment_char\s/) || (/^escape_char\s/)) {
				push @lines, $_;
			}
			if (/^LC_COLLATE/../^END LC_COLLATE/) {
				$_ =~ s/[ ]+/ /g;
				push @lines, $_;
			}
		}
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $draft = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $draft)
		    or die "Cannot create $draft: $!\n";
		print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print {$fout} @lines;
		close($fout) or die "Cannot write $draft: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);

			my $encname = "$TYPE.draft/$actfile.$enc.src";
			open(my $din, '<', $draft)
			    or die "Cannot open $draft: $!\n";
			open(my $eout, '>', $encname)
			    or die "Cannot create $encname: $!\n";
			my $order_start = 0;
			my $print_p = 0;
			#
			# %c_elem: collation elements
			#
			# undef: not defined
			#     1: defined
			#     2: invalid in this encoding
			#
			my %c_elem = ();
			while (<$din>) { # XXX: this loop should be refactored.
				chomp;
				$print_p = 1;
				if ($order_start) {
					$order_start = 0 if (m/^order_end/);
					if (m/^<([^>]+)>/) {
						my $sym = $1;
						if (not defined $c_elem{$sym}) {
							my $u32 = $utfmap{'UTF-32'}->{$sym};
							die "order, $sym\n" if (not defined $u32);
							if (not defined $convertors{$enc}{$u32}) {
								$print_p = 0;
							}
						} elsif ($c_elem{$sym} == 2) {
							$print_p = 0;
						}
					}
				} elsif (m/^collating-element/) {
					my ($elem, $from);
					if (m/<([^>]+)> from (.+)/) {
						($elem, $from) = ($1, $2);
					}
					# The element is invalid as soon as one of
					# its characters has no mapping.
					while ($print_p and
					    defined $from and
					    $from =~ m/<([^>]+)>/g) {
						my $u32 = $utfmap{'UTF-32'}->{$1};
						die "collating-element, $1\n" if (not defined $u32);
						if (not $convertors{$enc}{$u32}) {
							$print_p = 0;
							$c_elem{$elem} = 2;
						}
					}
					if ($print_p) {
						$c_elem{$elem} = 1;
					}
				} elsif (m/^collating-symbol <([^>]+)>/) {
					$c_elem{$1} = 1;
				} elsif (m/^order_start/) {
					$order_start = 1;
				}
				print {$eout} $_, "\n" if ($print_p);
			}
			close($eout) or die "Cannot write $encname: $!\n";
			close($din);
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}

# Read the raw value of every field listed in %keys from each locale's
# UTF-8 source into $values{lang}{family}{country}{field}.  A value
# ending in '/' continues on the next line.
#
# NOTE: the substitutions below deliberately act on the elements of
# @lines themselves (foreach aliases them), so a matched line has its
# field name stripped for the fields processed after it.
sub get_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		# Look in the CLDR tree first, then in etc/, then try the
		# configured fallback locale.
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, '<', $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		chomp(@lines);
		close($fin);

		my $continue = 0;
		foreach my $k (keys(%keys)) {
			foreach my $line (@lines) {
				$line =~ s/\r//;
				next if (!$continue && $line !~ /^$k\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^$k\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
					if (!defined $values{$l}{$f}{$c}{$k});

				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

				$values{$l}{$f}{$c}{$k} .= $line;

				last if (!$continue);
			}
		}
	}
	}
	}
}

# Convert the symbolic character name $s into its byte sequence in
# encoding $e.  Dies when the name cannot be resolved or when the
# resulting code is longer than three bytes.
sub decodecldr {
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utfmap{'UTF-8'}->{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		# hex - hex or string attr
		# unicode - unicode attr
		# ucc - ucc attr
		my $hex = $translations{$e}{$s}{hex};
		my $ucc = $utfmap{'UTF-32'}->{$s};
		my $ucc_attr = $translations{$e}{$s}{ucc};
		my $unicode = $translations{$e}{$s}{unicode};

		if (defined $hex) {		# hex is in local encoding
			$v = $hex;
		} elsif (defined $unicode) {	# unicode is in name
			$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
		} elsif (defined $ucc_attr) {	# ucc is in code point
			# A ucc attribute overrides the UTF-32 charmap entry.
			# normalize
			$ucc_attr = sprintf("%08X", hex($ucc_attr));
			$v = $convertors{$e}{$ucc_attr};
		} elsif (defined $ucc) {
			# normalize
			$ucc = sprintf("%08X", hex($ucc));
			$v = $convertors{$e}{$ucc};
		}
		die "Cannot convert $s in $e" if (!defined $v);
	}

	# XXX: length = 8 is not supported yet.
	$v =~ s/^[0]+//g;
	$v = "0" . $v if (length($v) % 2);
	return pack("C", hex($v)) if (length($v) == 2);
	return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
		if (length($v) == 4);
	return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
	    hex(substr($v, 4, 2))) if (length($v) == 6);
	die "Cannot convert $s in $e (length = " . length($v) . ")\n";
}

# Return the %translations entry for symbol $v in encoding $enc, or
# undef when there is none.
sub translate {
	my $enc = shift;
	my $v = shift;

	return $translations{$enc}{$v} if (defined $translations{$enc}{$v});
	return undef;
}

# Strip the surrounding double quotes from $v and replace every
# <symbol> in it, left to right, with its byte sequence in $enc.
# Returns (converted string, success flag).  On a failed conversion the
# failure is reported for field $k and expansion stops; retrying the
# same unchanged string would never terminate.
sub _expand_symbols {
	my ($enc, $k, $v) = @_;
	my $ok = 1;

	$v =~ s/^"//;
	$v =~ s/"$//;
	while ($v =~ /^(.*?)<(.*?)>(.*)/) {
		my ($p1, $cm, $p3) = ($1, $2, $3);

		my $rv = decodecldr($enc, $cm);
		if (!defined $rv) {
			print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
			$ok = 0;
			last;
		}

		$v = $p1 . $rv . $p3;
	}
	return ($v, $ok);
}

# Write one <locale>.<encoding>.src draft per locale and encoding from
# the values collected by get_fields().  Output goes to a .new file
# first, which is renamed to .src on success or .failed when a symbol
# could not be converted.  The SHA-1 of the generated body is recorded
# in %languages and %hashtable for make_makefile().
sub print_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			}

			my $newname = "$TYPE.draft/$file.$enc.new";
			open(my $fout, '>', $newname)
			    or die "Cannot create $newname: $!\n";
			my $okay = 1;
			my $output = "";
			print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			foreach my $k (keys(%keys)) {
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
					if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					# descriptor: callback<source field<format
					my @a = split(/\</, substr($g, 1));
					my $rv =
					    $callback{$a[0]}->($values{$l}{$f}{$c}{$a[1]});
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					$callback{data} = undef;
				}

				my $v = $values{$l}{$f}{$c}{$k};
				$v = "undef" if (!defined $v);

				if ($g eq "i" || $g eq "ai") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "s") {
					my ($str, $ok) = _expand_symbols($enc, $k, $v);
					$okay = 0 if (!$ok);
					$output .= "$str\n";
					next;
				}
				if ($g eq "as") {
					foreach my $item (split(/;/, $v)) {
						my ($str, $ok) =
						    _expand_symbols($enc, $k, $item);
						$okay = 0 if (!$ok);
						$output .= "$str\n";
					}
					next;
				}

				die("$k is '$g'");
			}

			$languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
			$hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
			print {$fout} "$output# EOF\n";
			close($fout) or die "Cannot write $newname: $!\n";

			if ($okay) {
				rename("$TYPE.draft/$file.$enc.new",
				    "$TYPE.draft/$file.$enc.src");
			} else {
				rename("$TYPE.draft/$file.$enc.new",
				    "$TYPE.draft/$file.$enc.failed");
			}
		}
	}
	}
	}
}

# Write $TYPE.draft/Makefile: the suffix rule that builds the locale
# files, a SAME+= pair for every locale whose generated content is
# identical to another one (taken from %hashtable), a LOCALES+= entry
# for every remaining locale, and the legacy links configured through
# nc_link and e_link.
sub make_makefile {
	print "Creating Makefile for $TYPE\n";
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		# In future, we might want to try to put the CLDR version into
		# the .src files with some new syntax, instead of the makefile.
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"FILESDIR_\$t.LC_COLLATEPACKAGE=\tlocales\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}

	my $mkname = "$TYPE.draft/Makefile";
	open(my $fout, '>', $mkname) or die "Cannot create $mkname: $!\n";

	# Tabs in the generated text are written as \t so that they cannot
	# be lost when this file is edited; make requires the TAB in front
	# of the recipe line.
	print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=\tlocales
LOCALEDIR=\t\${SHAREDIR}/locale
FILESNAME=\t$FILESNAMES{$TYPE}
.SUFFIXES:\t.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef") {
		print {$fout} <<EOF;
CLDR_VERSION=\t"${CLDR_VERSION}"

EOF
	}

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print {$fout} <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print {$fout} <<EOF;
.src.${SRCOUT2}:
\t$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
			} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				# Both sides are tested with the same anchored
				# patterns so the comparison is symmetric.
				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /^ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

			} keys(%{$hashtable{$hash}});
		} else {
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				    $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
			} keys(%{$hashtable{$hash}});
		}
		if ($#files > 0) {
			# The first file is the real one; all the others
			# become links to it and are dropped from LOCALES.
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print {$fout} "SAME+=\t\t$link $file\n";
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			# undef marks a locale already emitted as SAME above
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print {$fout} "LOCALES+=\t$file.$e\n";
		}

		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		if (defined $languages{$l}{$f}{e_link}) {
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print {$fout} <<EOF;

FILES=\t\t\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=\t\${FILES}

.for f t in \${SAME}
DIRS+=\t\tLOCALEDIR_\$t
LOCALEDIR_\$t=\t\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=\tlocales
SYMLINKS+=\t../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
FILESDIR_\${f}.${SRCOUT2}PACKAGE=\tlocales
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close($fout) or die "Cannot write $mkname: $!\n";
}