14bff34e3Sthurlow /*
24bff34e3Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
34bff34e3Sthurlow *
44bff34e3Sthurlow * @APPLE_LICENSE_HEADER_START@
54bff34e3Sthurlow *
64bff34e3Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights
74bff34e3Sthurlow * Reserved. This file contains Original Code and/or Modifications of
84bff34e3Sthurlow * Original Code as defined in and that are subject to the Apple Public
94bff34e3Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file
104bff34e3Sthurlow * except in compliance with the License. Please obtain a copy of the
114bff34e3Sthurlow * License at http://www.apple.com/publicsource and read it before using
124bff34e3Sthurlow * this file.
134bff34e3Sthurlow *
144bff34e3Sthurlow * The Original Code and all software distributed under the License are
154bff34e3Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
164bff34e3Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
174bff34e3Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
184bff34e3Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
194bff34e3Sthurlow * License for the specific language governing rights and limitations
204bff34e3Sthurlow * under the License."
214bff34e3Sthurlow *
224bff34e3Sthurlow * @APPLE_LICENSE_HEADER_END@
234bff34e3Sthurlow */
249c9af259SGordon Ross /* CSTYLED */
259c9af259SGordon Ross /*
269c9af259SGordon Ross * @(#)charsets.c *
274bff34e3Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved
284bff34e3Sthurlow *
294bff34e3Sthurlow *
304bff34e3Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit
314bff34e3Sthurlow * little-endian Unicode, and various Windows
324bff34e3Sthurlow * code pages.
334bff34e3Sthurlow *
344bff34e3Sthurlow * MODIFICATION HISTORY:
354bff34e3Sthurlow * 28-Nov-2004 Guy Harris New today
364bff34e3Sthurlow */
374bff34e3Sthurlow
384bff34e3Sthurlow #include <stdlib.h>
394bff34e3Sthurlow #include <stdio.h>
404bff34e3Sthurlow #include <string.h>
414bff34e3Sthurlow #include <ctype.h>
429c9af259SGordon Ross #include <errno.h>
434bff34e3Sthurlow #include <iconv.h>
444bff34e3Sthurlow #include <langinfo.h>
454bff34e3Sthurlow #include <strings.h>
46*613a2f6bSGordon Ross #include <libintl.h>
474bff34e3Sthurlow
48*613a2f6bSGordon Ross #include <sys/isa_defs.h>
494bff34e3Sthurlow #include <netsmb/smb_lib.h>
504bff34e3Sthurlow #include <netsmb/mchain.h>
514bff34e3Sthurlow
524bff34e3Sthurlow #include "charsets.h"
534bff34e3Sthurlow
544bff34e3Sthurlow /*
554bff34e3Sthurlow * On Solaris, we will need to do some rewriting to use our iconv
564bff34e3Sthurlow * routines for the conversions. For now, we're effectively
574bff34e3Sthurlow * stubbing out code, leaving the details of what happens on
584bff34e3Sthurlow * Darwin in case it's useful as a guide later.
594bff34e3Sthurlow */
604bff34e3Sthurlow
/*
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'.  Any other character
 * (including the terminating NUL) yields 16, so callers detect an
 * invalid digit by testing for a result greater than 15.
 *
 * The ranges are tested directly instead of going through <ctype.h>:
 * handing a plain (possibly negative) char to isdigit() and friends is
 * undefined behaviour, and their answers depend on the current locale.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return (10 + (unsigned)(u - 'a'));
	if (u >= 'A' && u <= 'F')
		return (10 + (unsigned)(u - 'A'));
	return (16);
}
724bff34e3Sthurlow
734bff34e3Sthurlow
/*
 * Removes the "%" escape sequences from a URL component.
 * See IETF RFC 2396.
 *
 * The string is decoded in place and the same pointer is returned; a
 * NULL argument is returned unchanged.  A '%' that is not followed by
 * two hexadecimal digits is copied through untouched.  Decoded bytes
 * are never rescanned, so "%2541" becomes "%41", not "A".  Note that
 * "%00" decodes to a NUL byte, which ends the string as far as the
 * caller's C string functions are concerned.
 */
char *
unpercent(char *component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (component);

	/*
	 * Compact the string in a single pass: "src" reads ahead while
	 * "dst" trails behind writing the decoded output.  The output is
	 * never longer than the input, so dst never overtakes src, and no
	 * overlapping copy of the whole tail is needed per escape.
	 */
	for (src = dst = component; *src != '\0'; ) {
		/*
		 * The && chain stops at the first non-hex character, so a
		 * '%' at the very end of the string never reads past the
		 * terminating NUL.
		 */
		if (src[0] == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = (char)(hi * 16 + lo);
			src += 3;
		} else {
			*dst++ = *src++;	/* ordinary or invalid escape */
		}
	}
	*dst = '\0';

	return (component);
}
1034bff34e3Sthurlow
1049c9af259SGordon Ross /* BEGIN CSTYLED */
1054bff34e3Sthurlow #ifdef NOTPORTED
1064bff34e3Sthurlow static CFStringEncoding
get_windows_encoding_equivalent(void)1074bff34e3Sthurlow get_windows_encoding_equivalent( void )
1084bff34e3Sthurlow {
1094bff34e3Sthurlow
1104bff34e3Sthurlow CFStringEncoding encoding;
1114bff34e3Sthurlow uint32_t index,region;
1124bff34e3Sthurlow
1134bff34e3Sthurlow /* important! use root ID so you can read the config file! */
1144bff34e3Sthurlow seteuid(eff_uid);
1154bff34e3Sthurlow __CFStringGetInstallationEncodingAndRegion(&index,®ion);
1164bff34e3Sthurlow seteuid(real_uid);
1174bff34e3Sthurlow
1184bff34e3Sthurlow switch ( index )
1194bff34e3Sthurlow {
1204bff34e3Sthurlow case kCFStringEncodingMacRoman:
1214bff34e3Sthurlow if (region) /* anything nonzero is not US */
1224bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1;
1234bff34e3Sthurlow else /* US region */
1244bff34e3Sthurlow encoding = kCFStringEncodingDOSLatinUS;
1254bff34e3Sthurlow break;
1264bff34e3Sthurlow
1274bff34e3Sthurlow case kCFStringEncodingMacJapanese:
1284bff34e3Sthurlow encoding = kCFStringEncodingDOSJapanese;
1294bff34e3Sthurlow break;
1304bff34e3Sthurlow
1314bff34e3Sthurlow case kCFStringEncodingMacChineseTrad:
1324bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseTrad;
1334bff34e3Sthurlow break;
1344bff34e3Sthurlow
1354bff34e3Sthurlow case kCFStringEncodingMacKorean:
1364bff34e3Sthurlow encoding = kCFStringEncodingDOSKorean;
1374bff34e3Sthurlow break;
1384bff34e3Sthurlow
1394bff34e3Sthurlow case kCFStringEncodingMacArabic:
1404bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic;
1414bff34e3Sthurlow break;
1424bff34e3Sthurlow
1434bff34e3Sthurlow case kCFStringEncodingMacHebrew:
1444bff34e3Sthurlow encoding = kCFStringEncodingDOSHebrew;
1454bff34e3Sthurlow break;
1464bff34e3Sthurlow
1474bff34e3Sthurlow case kCFStringEncodingMacGreek:
1484bff34e3Sthurlow encoding = kCFStringEncodingDOSGreek;
1494bff34e3Sthurlow break;
1504bff34e3Sthurlow
1514bff34e3Sthurlow case kCFStringEncodingMacCyrillic:
1524bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic;
1534bff34e3Sthurlow break;
1544bff34e3Sthurlow
1554bff34e3Sthurlow case kCFStringEncodingMacThai:
1564bff34e3Sthurlow encoding = kCFStringEncodingDOSThai;
1574bff34e3Sthurlow break;
1584bff34e3Sthurlow
1594bff34e3Sthurlow case kCFStringEncodingMacChineseSimp:
1604bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseSimplif;
1614bff34e3Sthurlow break;
1624bff34e3Sthurlow
1634bff34e3Sthurlow case kCFStringEncodingMacCentralEurRoman:
1644bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2;
1654bff34e3Sthurlow break;
1664bff34e3Sthurlow
1674bff34e3Sthurlow case kCFStringEncodingMacTurkish:
1684bff34e3Sthurlow encoding = kCFStringEncodingDOSTurkish;
1694bff34e3Sthurlow break;
1704bff34e3Sthurlow
1714bff34e3Sthurlow case kCFStringEncodingMacCroatian:
1724bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2;
1734bff34e3Sthurlow break;
1744bff34e3Sthurlow
1754bff34e3Sthurlow case kCFStringEncodingMacIcelandic:
1764bff34e3Sthurlow encoding = kCFStringEncodingDOSIcelandic;
1774bff34e3Sthurlow break;
1784bff34e3Sthurlow
1794bff34e3Sthurlow case kCFStringEncodingMacRomanian:
1804bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2;
1814bff34e3Sthurlow break;
1824bff34e3Sthurlow
1834bff34e3Sthurlow case kCFStringEncodingMacFarsi:
1844bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic;
1854bff34e3Sthurlow break;
1864bff34e3Sthurlow
1874bff34e3Sthurlow case kCFStringEncodingMacUkrainian:
1884bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic;
1894bff34e3Sthurlow break;
1904bff34e3Sthurlow
1914bff34e3Sthurlow default:
1924bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1;
1934bff34e3Sthurlow break;
1944bff34e3Sthurlow }
1954bff34e3Sthurlow
1964bff34e3Sthurlow return encoding;
1974bff34e3Sthurlow }
1984bff34e3Sthurlow #endif /* NOTPORTED */
1994bff34e3Sthurlow
/*
 * Convert a string from the Windows code page to UTF-8.
 *
 * Returns a heap-allocated string owned by the caller (release it with
 * free()), or NULL if the argument is NULL, memory cannot be allocated
 * or, in the NOTPORTED build, the conversion fails.
 *
 * Unless NOTPORTED is defined no conversion is performed: the result
 * is simply a copy of the input.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (windows_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, windows_string,
	    get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return (NULL);
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		free(result);	/* don't leak the buffer on failure */
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	size_t size;
	char *copy;

	if (windows_string == NULL)
		return (NULL);

	/* Plain duplicate (what strdup() does), terminator included. */
	size = strlen(windows_string) + 1;
	copy = malloc(size);
	if (copy != NULL)
		(void) memcpy(copy, windows_string, size);
	return (copy);
#endif /* NOTPORTED */
}
2494bff34e3Sthurlow
/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * Returns a heap-allocated string owned by the caller (release it with
 * free()), or NULL if the argument is NULL, memory cannot be allocated
 * or, in the NOTPORTED build, the conversion fails.
 *
 * Unless NOTPORTED is defined no conversion is performed: the result
 * is simply a copy of the input.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFStringEncoding wincs;
	CFIndex maxlen;
	char *result;

	if (utf8_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return (NULL);
	}

	/* Look the target encoding up once; each lookup toggles the euid. */
	wincs = get_windows_encoding_equivalent();

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    wincs) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen, wincs)) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		free(result);	/* don't leak the buffer on failure */
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	size_t size;
	char *copy;

	if (utf8_string == NULL)
		return (NULL);

	/* Plain duplicate (what strdup() does), terminator included. */
	size = strlen(utf8_string) + 1;
	copy = malloc(size);
	if (copy != NULL)
		(void) memcpy(copy, utf8_string, size);
	return (copy);
#endif /* NOTPORTED */
}
2929c9af259SGordon Ross /* END CSTYLED */
2934bff34e3Sthurlow
2944bff34e3Sthurlow /*
2959c9af259SGordon Ross * We replaced these routines for Solaris:
2969c9af259SGordon Ross * convert_leunicode_to_utf8
2979c9af259SGordon Ross * convert_unicode_to_utf8
2989c9af259SGordon Ross * convert_utf8_to_leunicode
2999c9af259SGordon Ross * with new code in: utf_str.c
3004bff34e3Sthurlow */
301