14bff34e3Sthurlow /* 24bff34e3Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 34bff34e3Sthurlow * 44bff34e3Sthurlow * @APPLE_LICENSE_HEADER_START@ 54bff34e3Sthurlow * 64bff34e3Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 74bff34e3Sthurlow * Reserved. This file contains Original Code and/or Modifications of 84bff34e3Sthurlow * Original Code as defined in and that are subject to the Apple Public 94bff34e3Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file 104bff34e3Sthurlow * except in compliance with the License. Please obtain a copy of the 114bff34e3Sthurlow * License at http://www.apple.com/publicsource and read it before using 124bff34e3Sthurlow * this file. 134bff34e3Sthurlow * 144bff34e3Sthurlow * The Original Code and all software distributed under the License are 154bff34e3Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 164bff34e3Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 174bff34e3Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 184bff34e3Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 194bff34e3Sthurlow * License for the specific language governing rights and limitations 204bff34e3Sthurlow * under the License." 214bff34e3Sthurlow * 224bff34e3Sthurlow * @APPLE_LICENSE_HEADER_END@ 234bff34e3Sthurlow */ 24*9c9af259SGordon Ross /* CSTYLED */ 25*9c9af259SGordon Ross /* 26*9c9af259SGordon Ross * @(#)charsets.c * 274bff34e3Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved 284bff34e3Sthurlow * 294bff34e3Sthurlow * 304bff34e3Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit 314bff34e3Sthurlow * little-endian Unicode, and various Windows 324bff34e3Sthurlow * code pages. 334bff34e3Sthurlow * 344bff34e3Sthurlow * MODIFICATION HISTORY: 354bff34e3Sthurlow * 28-Nov-2004 Guy Harris New today 364bff34e3Sthurlow */ 374bff34e3Sthurlow 384bff34e3Sthurlow #include <stdlib.h> 394bff34e3Sthurlow #include <stdio.h> 404bff34e3Sthurlow #include <string.h> 414bff34e3Sthurlow #include <ctype.h> 42*9c9af259SGordon Ross #include <errno.h> 434bff34e3Sthurlow #include <iconv.h> 444bff34e3Sthurlow #include <langinfo.h> 454bff34e3Sthurlow #include <strings.h> 464bff34e3Sthurlow 474bff34e3Sthurlow #include <netsmb/smb_lib.h> 484bff34e3Sthurlow #include <netsmb/mchain.h> 494bff34e3Sthurlow 504bff34e3Sthurlow #include "charsets.h" 514bff34e3Sthurlow 524bff34e3Sthurlow /* 534bff34e3Sthurlow * On Solaris, we will need to do some rewriting to use our iconv 544bff34e3Sthurlow * routines for the conversions. For now, we're effectively 554bff34e3Sthurlow * stubbing out code, leaving the details of what happens on 564bff34e3Sthurlow * Darwin in case it's useful as a guide later. 574bff34e3Sthurlow */ 584bff34e3Sthurlow 594bff34e3Sthurlow static unsigned 604bff34e3Sthurlow xtoi(char u) 614bff34e3Sthurlow { 624bff34e3Sthurlow if (isdigit(u)) 634bff34e3Sthurlow return (u - '0'); 644bff34e3Sthurlow else if (islower(u)) 654bff34e3Sthurlow return (10 + u - 'a'); 664bff34e3Sthurlow else if (isupper(u)) 674bff34e3Sthurlow return (10 + u - 'A'); 684bff34e3Sthurlow return (16); 694bff34e3Sthurlow } 704bff34e3Sthurlow 714bff34e3Sthurlow 72*9c9af259SGordon Ross /* 73*9c9af259SGordon Ross * Removes the "%" escape sequences from a URL component. 744bff34e3Sthurlow * See IETF RFC 2396. 754bff34e3Sthurlow */ 764bff34e3Sthurlow char * 774bff34e3Sthurlow unpercent(char *component) 784bff34e3Sthurlow { 794bff34e3Sthurlow char c, *s; 804bff34e3Sthurlow unsigned hi, lo; 814bff34e3Sthurlow 82*9c9af259SGordon Ross if (component == NULL) 83*9c9af259SGordon Ross return (component); 84*9c9af259SGordon Ross 854bff34e3Sthurlow for (s = component; (c = *s) != 0; s++) { 864bff34e3Sthurlow if (c != '%') 874bff34e3Sthurlow continue; 884bff34e3Sthurlow if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15) 894bff34e3Sthurlow continue; /* ignore invalid escapes */ 904bff34e3Sthurlow s[0] = hi*16 + lo; 914bff34e3Sthurlow /* 924bff34e3Sthurlow * This was strcpy(s + 1, s + 3); 934bff34e3Sthurlow * But nowadays leftward overlapping copies are 944bff34e3Sthurlow * officially undefined in C. Ours seems to 954bff34e3Sthurlow * work or not depending upon alignment. 964bff34e3Sthurlow */ 974bff34e3Sthurlow memmove(s+1, s+3, strlen(s+3) + 1); 984bff34e3Sthurlow } 994bff34e3Sthurlow return (component); 1004bff34e3Sthurlow } 1014bff34e3Sthurlow 102*9c9af259SGordon Ross /* BEGIN CSTYLED */ 1034bff34e3Sthurlow #ifdef NOTPORTED 1044bff34e3Sthurlow static CFStringEncoding 1054bff34e3Sthurlow get_windows_encoding_equivalent( void ) 1064bff34e3Sthurlow { 1074bff34e3Sthurlow 1084bff34e3Sthurlow CFStringEncoding encoding; 1094bff34e3Sthurlow uint32_t index,region; 1104bff34e3Sthurlow 1114bff34e3Sthurlow /* important! use root ID so you can read the config file! */ 1124bff34e3Sthurlow seteuid(eff_uid); 1134bff34e3Sthurlow __CFStringGetInstallationEncodingAndRegion(&index,®ion); 1144bff34e3Sthurlow seteuid(real_uid); 1154bff34e3Sthurlow 1164bff34e3Sthurlow switch ( index ) 1174bff34e3Sthurlow { 1184bff34e3Sthurlow case kCFStringEncodingMacRoman: 1194bff34e3Sthurlow if (region) /* anything nonzero is not US */ 1204bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1; 1214bff34e3Sthurlow else /* US region */ 1224bff34e3Sthurlow encoding = kCFStringEncodingDOSLatinUS; 1234bff34e3Sthurlow break; 1244bff34e3Sthurlow 1254bff34e3Sthurlow case kCFStringEncodingMacJapanese: 1264bff34e3Sthurlow encoding = kCFStringEncodingDOSJapanese; 1274bff34e3Sthurlow break; 1284bff34e3Sthurlow 1294bff34e3Sthurlow case kCFStringEncodingMacChineseTrad: 1304bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseTrad; 1314bff34e3Sthurlow break; 1324bff34e3Sthurlow 1334bff34e3Sthurlow case kCFStringEncodingMacKorean: 1344bff34e3Sthurlow encoding = kCFStringEncodingDOSKorean; 1354bff34e3Sthurlow break; 1364bff34e3Sthurlow 1374bff34e3Sthurlow case kCFStringEncodingMacArabic: 1384bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic; 1394bff34e3Sthurlow break; 1404bff34e3Sthurlow 1414bff34e3Sthurlow case kCFStringEncodingMacHebrew: 1424bff34e3Sthurlow encoding = kCFStringEncodingDOSHebrew; 1434bff34e3Sthurlow break; 1444bff34e3Sthurlow 1454bff34e3Sthurlow case kCFStringEncodingMacGreek: 1464bff34e3Sthurlow encoding = kCFStringEncodingDOSGreek; 1474bff34e3Sthurlow break; 1484bff34e3Sthurlow 1494bff34e3Sthurlow case kCFStringEncodingMacCyrillic: 1504bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1514bff34e3Sthurlow break; 1524bff34e3Sthurlow 1534bff34e3Sthurlow case kCFStringEncodingMacThai: 1544bff34e3Sthurlow encoding = kCFStringEncodingDOSThai; 1554bff34e3Sthurlow break; 1564bff34e3Sthurlow 1574bff34e3Sthurlow case kCFStringEncodingMacChineseSimp: 1584bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseSimplif; 1594bff34e3Sthurlow break; 1604bff34e3Sthurlow 1614bff34e3Sthurlow case kCFStringEncodingMacCentralEurRoman: 1624bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1634bff34e3Sthurlow break; 1644bff34e3Sthurlow 1654bff34e3Sthurlow case kCFStringEncodingMacTurkish: 1664bff34e3Sthurlow encoding = kCFStringEncodingDOSTurkish; 1674bff34e3Sthurlow break; 1684bff34e3Sthurlow 1694bff34e3Sthurlow case kCFStringEncodingMacCroatian: 1704bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1714bff34e3Sthurlow break; 1724bff34e3Sthurlow 1734bff34e3Sthurlow case kCFStringEncodingMacIcelandic: 1744bff34e3Sthurlow encoding = kCFStringEncodingDOSIcelandic; 1754bff34e3Sthurlow break; 1764bff34e3Sthurlow 1774bff34e3Sthurlow case kCFStringEncodingMacRomanian: 1784bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1794bff34e3Sthurlow break; 1804bff34e3Sthurlow 1814bff34e3Sthurlow case kCFStringEncodingMacFarsi: 1824bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic; 1834bff34e3Sthurlow break; 1844bff34e3Sthurlow 1854bff34e3Sthurlow case kCFStringEncodingMacUkrainian: 1864bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1874bff34e3Sthurlow break; 1884bff34e3Sthurlow 1894bff34e3Sthurlow default: 1904bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1; 1914bff34e3Sthurlow break; 1924bff34e3Sthurlow } 1934bff34e3Sthurlow 1944bff34e3Sthurlow return encoding; 1954bff34e3Sthurlow } 1964bff34e3Sthurlow #endif /* NOTPORTED */ 1974bff34e3Sthurlow 1984bff34e3Sthurlow /* 1994bff34e3Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2004bff34e3Sthurlow * conversions. 2014bff34e3Sthurlow */ 2024bff34e3Sthurlow char * 2034bff34e3Sthurlow convert_wincs_to_utf8(const char *windows_string) 2044bff34e3Sthurlow { 2054bff34e3Sthurlow #ifdef NOTPORTED 2064bff34e3Sthurlow CFStringRef s; 2074bff34e3Sthurlow CFIndex maxlen; 2084bff34e3Sthurlow char *result; 2094bff34e3Sthurlow 2104bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2114bff34e3Sthurlow get_windows_encoding_equivalent()); 2124bff34e3Sthurlow if (s == NULL) { 2134bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 2144bff34e3Sthurlow windows_string); 2154bff34e3Sthurlow 2164bff34e3Sthurlow /* kCFStringEncodingMacRoman should always succeed */ 2174bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2184bff34e3Sthurlow kCFStringEncodingMacRoman); 2194bff34e3Sthurlow if (s == NULL) { 2204bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 2214bff34e3Sthurlow -1, windows_string); 2224bff34e3Sthurlow return NULL; 2234bff34e3Sthurlow } 2244bff34e3Sthurlow } 2254bff34e3Sthurlow 2264bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2274bff34e3Sthurlow kCFStringEncodingUTF8) + 1; 2284bff34e3Sthurlow result = malloc(maxlen); 2294bff34e3Sthurlow if (result == NULL) { 2304bff34e3Sthurlow smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 2314bff34e3Sthurlow windows_string); 2324bff34e3Sthurlow CFRelease(s); 2334bff34e3Sthurlow return NULL; 2344bff34e3Sthurlow } 2354bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 2364bff34e3Sthurlow smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 2374bff34e3Sthurlow -1, windows_string); 2384bff34e3Sthurlow CFRelease(s); 2394bff34e3Sthurlow return NULL; 2404bff34e3Sthurlow } 2414bff34e3Sthurlow CFRelease(s); 2424bff34e3Sthurlow return result; 2434bff34e3Sthurlow #else /* NOTPORTED */ 244*9c9af259SGordon Ross return (strdup((char*)windows_string)); 2454bff34e3Sthurlow #endif /* NOTPORTED */ 2464bff34e3Sthurlow } 2474bff34e3Sthurlow 2484bff34e3Sthurlow /* 2494bff34e3Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2504bff34e3Sthurlow * conversions. 2514bff34e3Sthurlow */ 2524bff34e3Sthurlow char * 2534bff34e3Sthurlow convert_utf8_to_wincs(const char *utf8_string) 2544bff34e3Sthurlow { 2554bff34e3Sthurlow #ifdef NOTPORTED 2564bff34e3Sthurlow CFStringRef s; 2574bff34e3Sthurlow CFIndex maxlen; 2584bff34e3Sthurlow char *result; 2594bff34e3Sthurlow 2604bff34e3Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 2614bff34e3Sthurlow kCFStringEncodingUTF8); 2624bff34e3Sthurlow if (s == NULL) { 2634bff34e3Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 2644bff34e3Sthurlow utf8_string); 2654bff34e3Sthurlow return NULL; 2664bff34e3Sthurlow } 2674bff34e3Sthurlow 2684bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2694bff34e3Sthurlow get_windows_encoding_equivalent()) + 1; 2704bff34e3Sthurlow result = malloc(maxlen); 2714bff34e3Sthurlow if (result == NULL) { 2724bff34e3Sthurlow smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 2734bff34e3Sthurlow utf8_string); 2744bff34e3Sthurlow CFRelease(s); 2754bff34e3Sthurlow return NULL; 2764bff34e3Sthurlow } 2774bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, 2784bff34e3Sthurlow get_windows_encoding_equivalent())) { 2794bff34e3Sthurlow smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 2804bff34e3Sthurlow -1, utf8_string); 2814bff34e3Sthurlow CFRelease(s); 2824bff34e3Sthurlow return NULL; 2834bff34e3Sthurlow } 2844bff34e3Sthurlow CFRelease(s); 2854bff34e3Sthurlow return result; 2864bff34e3Sthurlow #else /* NOTPORTED */ 287*9c9af259SGordon Ross return (strdup((char*)utf8_string)); 2884bff34e3Sthurlow #endif /* NOTPORTED */ 2894bff34e3Sthurlow } 290*9c9af259SGordon Ross /* END CSTYLED */ 2914bff34e3Sthurlow 2924bff34e3Sthurlow /* 293*9c9af259SGordon Ross * We replaced these routines for Solaris: 294*9c9af259SGordon Ross * convert_leunicode_to_utf8 295*9c9af259SGordon Ross * convert_unicode_to_utf8 296*9c9af259SGordon Ross * convert_utf8_to_leunicode 297*9c9af259SGordon Ross * with new code in: utf_str.c 2984bff34e3Sthurlow */ 299