#! /usr/perl5/bin/perl
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy is of the CDDL is also available via the Internet
# at http://www.illumos.org/license/CDDL.
#

#
# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
#

# This converts MAPPING files to localedef character maps
# suitable for use with the UTF-8 derived localedef data.

#
# ucs_to_utf8(ucs)
#
# Encode one UCS code point (a number) as UTF-8 and return the encoding
# as a string of escapes, one "\xHH" per byte with uppercase hex digits;
# for example 0x20AC yields "\xE2\x82\xAC".
#
# The original 1- to 6-byte UTF-8 forms are produced.  Anything larger
# than 0x3ffffff is encoded using the 6-byte form.
#
sub ucs_to_utf8
{
	my $ucs = shift;

	# ASCII is a single byte with no lead/continuation structure.
	return (sprintf("\\x%02X", $ucs)) if ($ucs <= 0x7f);

	# Largest code point that fits each multi-byte form, paired with the
	# marker bits of that form's lead byte.  The position in the table
	# (plus one) is the number of continuation bytes.
	my @forms = (
		[ 0x7ff,     0xc0 ],	# 2 bytes: 110xxxxx
		[ 0xffff,    0xe0 ],	# 3 bytes: 1110xxxx
		[ 0x1fffff,  0xf0 ],	# 4 bytes: 11110xxx
		[ 0x3ffffff, 0xf8 ],	# 5 bytes: 111110xx
	);

	# The 6-byte form (lead byte 1111110x) is the catch-all.  Its marker
	# is 0xfc; using the 5-byte marker 0xf8 here would yield a sequence
	# whose lead byte announces the wrong length.
	my ($ntrail, $lead) = (5, 0xfc);
	for my $i (0 .. $#forms) {
		if ($ucs <= $forms[$i][0]) {
			($ntrail, $lead) = ($i + 1, $forms[$i][1]);
			last;
		}
	}

	# Peel off six payload bits at a time, least significant first,
	# prepending each continuation byte (10xxxxxx) to the result.
	my $utf8 = "";
	for (1 .. $ntrail) {
		$utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
		$ucs >>= 6;
	}

	# Whatever bits remain belong in the lead byte.
	return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}

# UTF-8 escape string => symbolic name(s); multiple names for the same
# encoding are joined with newlines.
my %unames;
# Symbolic name => UTF-8 escape string.
my %uvalues;

#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
#
# load_utf8_cm(file)
#
# Read a UTF-8 character map and fill in %unames (encoding => name(s))
# and %uvalues (name => encoding).  Comment lines, blank lines and the
# CHARMAP / END CHARMAP markers are skipped; on every other line the
# first whitespace-separated field is taken as the name and the second
# as its encoding.  Dies if the file cannot be opened.
#
sub load_utf8_cm
{
	my $file = shift;

	open(my $fh, '<', $file) || die "open $file: $!\n";

	while (my $line = <$fh>) {
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		next if ($line =~ /^\s*CHARMAP\s*$/);
		next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
		chomp($line);

		my ($name, $utf8val) = split(/\s+/, $line);
		# A line with a name but no encoding has nothing to record.
		next if (!defined($utf8val));

		# Several names may share one encoding; keep them all,
		# newline separated, so the writer can emit each of them.
		if (defined($unames{$utf8val})) {
			$unames{$utf8val} .= "\n" . $name;
		} else {
			$unames{$utf8val} = $name;
		}
		$uvalues{$name} = $utf8val;
	}
	close($fh);
}

# Legacy code value (as written in the mapping file) => UTF-8 escape
# string(s); multiple targets are joined with single spaces.
my %map;

#
# load_map(file)
#
# Read a mapping file and fill in %map.  On each data line the first
# field is the legacy code value and the second is the UCS code point in
# hex; the code point is converted with ucs_to_utf8().  Comment and blank
# lines are skipped.  Dies if the file cannot be opened.
#
sub load_map
{
	my $file = shift;

	open(my $fh, '<', $file) || die "open $file: $!\n";

	while (my $line = <$fh>) {
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		chomp($line);

		my ($val, $ucs) = split(/\s+/, $line);

		# An entry with no code point (nothing, or only a trailing
		# comment, after the legacy value) has no mapping.  Feeding
		# it to hex() would yield 0 and silently map the value to
		# NUL, so skip it.
		next if (!defined($ucs) || $ucs =~ /^#/);

		# Drop a leading "\x" and any zero padding; hex() copes
		# with a "0x" prefix on its own.
		$ucs =~ s/^\\x[0]*//;
		my $utf8 = ucs_to_utf8(hex($ucs));

		if (defined($map{$val})) {
			$map{$val} .= " " . $utf8;
		} else {
			$map{$val} = $utf8;
		}
	}
	close($fh);
}

#
# mb_str(val)
#
# Convert a legacy code value given as a hex string into escapes, one
# "\xhh" per byte (lowercase hex digits), most significant byte first.
# Zero is returned as the single byte "\x00".
#
sub mb_str
{
	my $val = hex(shift);

	# The loop below emits nothing for zero, so handle it explicitly.
	return ("\\x00") if ($val == 0);

	my $str = "";
	while ($val) {
		$str = sprintf("\\x%02x", $val & 0xff) . $str;
		$val >>= 8;
	}
	return ($str);
}

my $mf = shift(@ARGV);
die "usage: $0 mapping-file\n" if (!defined($mf));

load_utf8_cm("UTF-8.cm");
load_map($mf);


print("CHARMAP\n");
foreach my $val (sort(keys(%map))) {
	foreach my $utf8 (split(/ /, $map{$val})) {
		my $ref = $unames{$utf8};
		# No symbolic name is known for this encoding; nothing to emit.
		next if (!defined($ref));

		foreach my $name (sort(split(/\n/, $ref))) {
			# Pad with tabs to the column after a 64-character
			# name field.  Always emit at least one tab: for
			# names longer than the field the computed count is
			# zero or negative, which would otherwise glue the
			# name to the value or never terminate a countdown.
			my $nt = int((64 - length($name) + 7) / 8);
			$nt = 1 if ($nt < 1);
			print($name . ("\t" x $nt) . mb_str($val) . "\n");
		}
	}
}
print "END CHARMAP\n";