xref: /titanic_51/usr/src/lib/libsmbfs/smb/charsets.c (revision 9c9af2590af49bb395bc8d2eace0f2d4ea16d165)
14bff34e3Sthurlow /*
24bff34e3Sthurlow  * Copyright (c) 2001 Apple Computer, Inc. All rights reserved.
34bff34e3Sthurlow  *
44bff34e3Sthurlow  * @APPLE_LICENSE_HEADER_START@
54bff34e3Sthurlow  *
64bff34e3Sthurlow  * "Portions Copyright (c) 1999 Apple Computer, Inc.  All Rights
74bff34e3Sthurlow  * Reserved.  This file contains Original Code and/or Modifications of
84bff34e3Sthurlow  * Original Code as defined in and that are subject to the Apple Public
94bff34e3Sthurlow  * Source License Version 1.0 (the 'License').  You may not use this file
104bff34e3Sthurlow  * except in compliance with the License.  Please obtain a copy of the
114bff34e3Sthurlow  * License at http://www.apple.com/publicsource and read it before using
124bff34e3Sthurlow  * this file.
134bff34e3Sthurlow  *
144bff34e3Sthurlow  * The Original Code and all software distributed under the License are
154bff34e3Sthurlow  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
164bff34e3Sthurlow  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
174bff34e3Sthurlow  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
184bff34e3Sthurlow  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
194bff34e3Sthurlow  * License for the specific language governing rights and limitations
204bff34e3Sthurlow  * under the License."
214bff34e3Sthurlow  *
224bff34e3Sthurlow  * @APPLE_LICENSE_HEADER_END@
234bff34e3Sthurlow  */
24*9c9af259SGordon Ross /* CSTYLED */
25*9c9af259SGordon Ross /*
26*9c9af259SGordon Ross  *      @(#)charsets.c      *
274bff34e3Sthurlow  *      (c) 2004   Apple Computer, Inc.  All Rights Reserved
284bff34e3Sthurlow  *
294bff34e3Sthurlow  *
304bff34e3Sthurlow  *      charsets.c -- Routines converting between UTF-8, 16-bit
314bff34e3Sthurlow  *			little-endian Unicode, and various Windows
324bff34e3Sthurlow  *			code pages.
334bff34e3Sthurlow  *
344bff34e3Sthurlow  *      MODIFICATION HISTORY:
354bff34e3Sthurlow  *       28-Nov-2004     Guy Harris	New today
364bff34e3Sthurlow  */
374bff34e3Sthurlow 
384bff34e3Sthurlow #include <stdlib.h>
394bff34e3Sthurlow #include <stdio.h>
404bff34e3Sthurlow #include <string.h>
414bff34e3Sthurlow #include <ctype.h>
42*9c9af259SGordon Ross #include <errno.h>
434bff34e3Sthurlow #include <iconv.h>
444bff34e3Sthurlow #include <langinfo.h>
454bff34e3Sthurlow #include <strings.h>
464bff34e3Sthurlow 
474bff34e3Sthurlow #include <netsmb/smb_lib.h>
484bff34e3Sthurlow #include <netsmb/mchain.h>
494bff34e3Sthurlow 
504bff34e3Sthurlow #include "charsets.h"
514bff34e3Sthurlow 
/*
 * On Solaris, we will need to do some rewriting to use our iconv
 * routines for the conversions.  For now, we're effectively
 * stubbing out the code, leaving the details of what happens on
 * Darwin in case they are useful as a guide later.
 */
584bff34e3Sthurlow 
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', and 16 for every
 * other character (including NUL), so a caller rejects a bad digit
 * by testing for a result greater than 15.
 *
 * The character is classified with explicit range comparisons rather
 * than the <ctype.h> predicates.  Passing a plain char holding a
 * negative value (any byte >= 0x80 where char is signed) to
 * isdigit()/islower()/isupper() is undefined behavior, and
 * islower()/isupper() additionally depend on the current locale.
 */
static unsigned
xtoi(char u)
{
	if (u >= '0' && u <= '9')
		return ((unsigned)(u - '0'));
	if (u >= 'a' && u <= 'f')
		return (10 + (unsigned)(u - 'a'));
	if (u >= 'A' && u <= 'F')
		return (10 + (unsigned)(u - 'A'));
	return (16);
}
704bff34e3Sthurlow 
714bff34e3Sthurlow 
/*
 * Removes the "%" escape sequences from a URL component, in place.
 * See IETF RFC 2396.
 *
 * Each "%XY" in which X and Y are both hexadecimal digits is replaced
 * by the single byte with value 0xXY.  A '%' that is not followed by
 * two hex digits is left in the string unchanged.  Decoded bytes are
 * not rescanned, so "%2541" yields "%41" rather than "A".
 *
 * The string is compacted in a single pass with separate read and
 * write positions.  The output is never longer than the input, so the
 * write position cannot overtake the read position.  (Shifting the
 * whole tail down after every escape would make the cost quadratic in
 * the number of escapes.)
 *
 * Returns its argument.  A NULL argument is returned untouched.
 */
char *
unpercent(char *component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (component);

	src = dst = component;
	while (*src != '\0') {
		/*
		 * xtoi() yields a value above 15 for the terminating
		 * NUL, so the short-circuit evaluation below never
		 * reads past the end of the string when '%' is the
		 * last or next-to-last byte.
		 */
		if (src[0] == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = (char)(hi * 16 + lo);
			src += 3;
		} else {
			/* ordinary byte, or an invalid escape: keep it */
			*dst++ = *src++;
		}
	}
	*dst = '\0';
	return (component);
}
1014bff34e3Sthurlow 
102*9c9af259SGordon Ross /* BEGIN CSTYLED */
#ifdef NOTPORTED
/*
 * Pick the DOS/Windows CFStringEncoding that corresponds to the
 * installation encoding (and, for Mac Roman, the region) reported by
 * __CFStringGetInstallationEncodingAndRegion().  Any encoding not
 * listed below maps to DOS Latin-1.
 *
 * Only compiled when NOTPORTED is defined.
 */
static CFStringEncoding
get_windows_encoding_equivalent(void)
{
	uint32_t mac_encoding, mac_region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&mac_encoding, &mac_region);
	seteuid(real_uid);

	switch (mac_encoding) {
	case kCFStringEncodingMacRoman:
		/* region zero is US; anything nonzero is not */
		if (mac_region)
			return (kCFStringEncodingDOSLatin1);
		return (kCFStringEncodingDOSLatinUS);

	case kCFStringEncodingMacJapanese:
		return (kCFStringEncodingDOSJapanese);

	case kCFStringEncodingMacChineseTrad:
		return (kCFStringEncodingDOSChineseTrad);

	case kCFStringEncodingMacKorean:
		return (kCFStringEncodingDOSKorean);

	case kCFStringEncodingMacArabic:
	case kCFStringEncodingMacFarsi:
		return (kCFStringEncodingDOSArabic);

	case kCFStringEncodingMacHebrew:
		return (kCFStringEncodingDOSHebrew);

	case kCFStringEncodingMacGreek:
		return (kCFStringEncodingDOSGreek);

	case kCFStringEncodingMacCyrillic:
	case kCFStringEncodingMacUkrainian:
		return (kCFStringEncodingDOSCyrillic);

	case kCFStringEncodingMacThai:
		return (kCFStringEncodingDOSThai);

	case kCFStringEncodingMacChineseSimp:
		return (kCFStringEncodingDOSChineseSimplif);

	case kCFStringEncodingMacCentralEurRoman:
	case kCFStringEncodingMacCroatian:
	case kCFStringEncodingMacRomanian:
		return (kCFStringEncodingDOSLatin2);

	case kCFStringEncodingMacTurkish:
		return (kCFStringEncodingDOSTurkish);

	case kCFStringEncodingMacIcelandic:
		return (kCFStringEncodingDOSIcelandic);

	default:
		return (kCFStringEncodingDOSLatin1);
	}
}
#endif /* NOTPORTED */
1974bff34e3Sthurlow 
/*
 * Convert a string from the Windows code page to UTF-8.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if windows_string is NULL or the result cannot be produced.
 *
 * Unless NOTPORTED is defined, no conversion is performed: the input
 * is simply duplicated unchanged.  The NOTPORTED branch is the
 * CoreFoundation-based implementation.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_wincs_to_utf8(const char *windows_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (windows_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, windows_string,
		get_windows_encoding_equivalent());
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1,
		    windows_string);

		/* kCFStringEncodingMacRoman should always succeed */
		s = CFStringCreateWithCString(NULL, windows_string,
		    kCFStringEncodingMacRoman);
		if (s == NULL) {
			smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping",
			    -1, windows_string);
			return (NULL);
		}
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    kCFStringEncodingUTF8) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1,
		    windows_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) {
		smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping",
		    -1, windows_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	/* strdup(NULL) is undefined; report failure instead */
	if (windows_string == NULL)
		return (NULL);
	return (strdup(windows_string));
#endif /* NOTPORTED */
}
2474bff34e3Sthurlow 
/*
 * Convert a UTF-8 string to the Windows code page.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if utf8_string is NULL or the result cannot be produced.
 *
 * Unless NOTPORTED is defined, no conversion is performed: the input
 * is simply duplicated unchanged.  The NOTPORTED branch is the
 * CoreFoundation-based implementation.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
convert_utf8_to_wincs(const char *utf8_string)
{
#ifdef NOTPORTED
	CFStringRef s;
	CFIndex maxlen;
	char *result;

	if (utf8_string == NULL)
		return (NULL);

	s = CFStringCreateWithCString(NULL, utf8_string,
	    kCFStringEncodingUTF8);
	if (s == NULL) {
		smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1,
		    utf8_string);
		return (NULL);
	}

	maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s),
	    get_windows_encoding_equivalent()) + 1;
	result = malloc(maxlen);
	if (result == NULL) {
		smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1,
		    utf8_string);
		CFRelease(s);
		return (NULL);
	}
	if (!CFStringGetCString(s, result, maxlen,
	    get_windows_encoding_equivalent())) {
		smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping",
		    -1, utf8_string);
		/* don't leak the output buffer on a failed conversion */
		free(result);
		CFRelease(s);
		return (NULL);
	}
	CFRelease(s);
	return (result);
#else /* NOTPORTED */
	/* strdup(NULL) is undefined; report failure instead */
	if (utf8_string == NULL)
		return (NULL);
	return (strdup(utf8_string));
#endif /* NOTPORTED */
}
290*9c9af259SGordon Ross /* END CSTYLED */
2914bff34e3Sthurlow 
/*
 * We replaced these routines for Solaris:
 *	convert_leunicode_to_utf8
 *	convert_unicode_to_utf8
 *	convert_utf8_to_leunicode
 * with new code in: utf_str.c
 */
299