xref: /titanic_44/usr/src/cmd/localedef/data/convert_map.pl (revision 5aec55eb0591d2fcdd38d7dd5408a6ff3456e596)
#! /usr/perl5/bin/perl
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet
# at http://www.illumos.org/license/CDDL.
#

#
# Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
#

# This converts MAPPING files to localedef character maps
# suitable for use with the UTF-8 derived localedef data.

#
# Convert a numeric code point into its UTF-8 encoding, rendered in
# charmap notation: one "\xHH" escape (upper-case hex digits) per byte.
#
# The original (pre-RFC 3629) UTF-8 scheme is used, so sequences of up to
# six bytes are produced:
#
#	<= 0x7f		1 byte		lead 0x00
#	<= 0x7ff	2 bytes		lead 0xc0
#	<= 0xffff	3 bytes		lead 0xe0
#	<= 0x1fffff	4 bytes		lead 0xf0
#	<= 0x3ffffff	5 bytes		lead 0xf8
#	otherwise	6 bytes		lead 0xfc
#
# Argument: the code point as a number.
# Returns:  the escaped byte string, e.g. 0x20ac yields "\xE2\x82\xAC".
#
sub ucs_to_utf8
{
    my $ucs = shift;
    my $lead;		# marker bits OR-ed into the first byte
    my $ntrail;		# number of continuation bytes that follow it

    if ($ucs <= 0x7f) {
        ($lead, $ntrail) = (0x00, 0);
    } elsif ($ucs <= 0x7ff) {
        ($lead, $ntrail) = (0xc0, 1);
    } elsif ($ucs <= 0xffff) {
        ($lead, $ntrail) = (0xe0, 2);
    } elsif ($ucs <= 0x1fffff) {
        ($lead, $ntrail) = (0xf0, 3);
    } elsif ($ucs <= 0x03ffffff) {
        ($lead, $ntrail) = (0xf8, 4);
    } else {
        # A six-byte sequence starts with 1111110x, i.e. 0xfc; using the
        # five-byte marker 0xf8 here would yield a malformed sequence.
        ($lead, $ntrail) = (0xfc, 5);
    }

    # Emit the continuation bytes from least to most significant, each
    # carrying six payload bits, prepending so the result reads in order.
    my $utf8 = "";
    foreach my $i (1 .. $ntrail) {
        $utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
        $ucs >>= 6;
    }

    # Whatever bits remain belong in the lead byte.
    return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}

# UTF-8 byte string (as "\xHH..." text) -> symbolic name(s); when several
# names share one encoding they are joined with newlines.
my %unames;
# Symbolic name -> UTF-8 byte string (as "\xHH..." text).
my %uvalues;

#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
#
# Reads the character map named by the single argument and records every
# "<name> <encoding>" pair in %unames and %uvalues.  Comment lines, blank
# lines and the CHARMAP / END CHARMAP markers are skipped, as are lines
# that do not carry both a name and an encoding.
#
# Dies, naming the file and the system error, if the file cannot be opened.
#
sub load_utf8_cm
{
    my $file = shift;

    # Three-argument open: the file name is never interpreted as a mode.
    open(my $fh, '<', $file) or die "open $file: $!";

    while (my $line = <$fh>) {
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\s*CHARMAP\s*$/);
        next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
        chomp($line);

        # split ' ' ignores leading whitespace, so an indented line does
        # not produce an empty first field.
        my ($name, $utf8val) = split(' ', $line);
        next unless (defined($utf8val));

        if (defined($unames{$utf8val})) {
            $unames{$utf8val} .= "\n" . $name;
        } else {
            $unames{$utf8val} = $name;
        }
        $uvalues{$name} = $utf8val;
    }
    close($fh);
}

# Legacy code value (first field of the mapping file, kept as text) ->
# space-separated list of UTF-8 encodings in "\xHH..." notation.
my %map;

#
# Read a MAPPING file: each data line holds a legacy code value followed by
# the code point it maps to, in hexadecimal.  The code point is converted
# with ucs_to_utf8() and accumulated in %map under the legacy value; a
# legacy value that appears on several lines collects all of its encodings.
#
# Comment lines and blank lines are skipped.  So are entries that have no
# code point field, or have a trailing comment where the code point should
# be; hex() would otherwise turn those into a bogus mapping to U+0000.
#
# Dies, naming the file and the system error, if the file cannot be opened.
#
sub load_map
{
    my $file = shift;

    # Three-argument open: the file name is never interpreted as a mode.
    open(my $fh, '<', $file) or die "open $file: $!";

    while (my $line = <$fh>) {
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        chomp($line);

        my ($val, $code) = split(' ', $line);
        next unless (defined($code));
        next if ($code =~ /^#/);

        # Drop a leading "\x" and any zero padding before parsing; hex()
        # itself copes with a leading "0x".
        $code =~ s/^\\x[0]*//;
        my $utf8 = ucs_to_utf8(hex($code));

        if (defined($map{$val})) {
            $map{$val} .= " " . $utf8;
        } else {
            $map{$val} = $utf8;
        }
    }
    close($fh);
}

#
# Render a hexadecimal code value as charmap byte escapes: one "\xhh"
# (lower-case hex digits) per byte, most significant byte first.  Leading
# zero bytes are not emitted, except that a value of zero gives "\x00".
#
sub mb_str
{
    my $code = hex(shift);
    my @escapes;

    # The body always runs once, which is what produces "\x00" for zero.
    do {
        unshift(@escapes, sprintf("\\x%02x", $code & 0xff));
        $code >>= 8;
    } while ($code);

    return (join("", @escapes));
}

#
# Driver: load the UTF-8 character map from "UTF-8.cm" in the current
# directory, load the MAPPING file named on the command line, then write a
# localedef CHARMAP on standard output that gives, for every symbolic name
# of every mapped character, the legacy byte sequence.
#
my $mf = shift(@ARGV);
defined($mf) or die "usage: $0 <mapping-file>\n";

load_utf8_cm("UTF-8.cm");
load_map($mf);

print("CHARMAP\n");
foreach my $val (sort(keys(%map))) {
    foreach my $utf8 (split(/ /, $map{$val})) {
        # Encodings with no entry in the UTF-8 character map have no
        # symbolic name to print, so they produce no output.
        my $ref = $unames{$utf8};
        next unless (defined($ref));

        foreach my $name (sort(split(/\n/, $ref))) {
            # Pad with tabs so the value lands at column 64 (8-column tab
            # stops).  Names of 64 characters or more still need one tab
            # to separate the name from the value; without the clamp the
            # count would be zero or negative.
            my $nt = int((64 - length($name) + 7) / 8);
            $nt = 1 if ($nt < 1);

            print($name . ("\t" x $nt) . mb_str($val) . "\n");
        }
    }
}
print("END CHARMAP\n");