14bff34e3Sthurlow /* 24bff34e3Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 34bff34e3Sthurlow * 44bff34e3Sthurlow * @APPLE_LICENSE_HEADER_START@ 54bff34e3Sthurlow * 64bff34e3Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 74bff34e3Sthurlow * Reserved. This file contains Original Code and/or Modifications of 84bff34e3Sthurlow * Original Code as defined in and that are subject to the Apple Public 94bff34e3Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file 104bff34e3Sthurlow * except in compliance with the License. Please obtain a copy of the 114bff34e3Sthurlow * License at http://www.apple.com/publicsource and read it before using 124bff34e3Sthurlow * this file. 134bff34e3Sthurlow * 144bff34e3Sthurlow * The Original Code and all software distributed under the License are 154bff34e3Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 164bff34e3Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 174bff34e3Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 184bff34e3Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 194bff34e3Sthurlow * License for the specific language governing rights and limitations 204bff34e3Sthurlow * under the License." 214bff34e3Sthurlow * 224bff34e3Sthurlow * @APPLE_LICENSE_HEADER_END@ 234bff34e3Sthurlow */ 249c9af259SGordon Ross /* CSTYLED */ 259c9af259SGordon Ross /* 269c9af259SGordon Ross * @(#)charsets.c * 274bff34e3Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved 284bff34e3Sthurlow * 294bff34e3Sthurlow * 304bff34e3Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit 314bff34e3Sthurlow * little-endian Unicode, and various Windows 324bff34e3Sthurlow * code pages. 334bff34e3Sthurlow * 344bff34e3Sthurlow * MODIFICATION HISTORY: 354bff34e3Sthurlow * 28-Nov-2004 Guy Harris New today 364bff34e3Sthurlow */ 374bff34e3Sthurlow 384bff34e3Sthurlow #include <stdlib.h> 394bff34e3Sthurlow #include <stdio.h> 404bff34e3Sthurlow #include <string.h> 414bff34e3Sthurlow #include <ctype.h> 429c9af259SGordon Ross #include <errno.h> 434bff34e3Sthurlow #include <iconv.h> 444bff34e3Sthurlow #include <langinfo.h> 454bff34e3Sthurlow #include <strings.h> 46*613a2f6bSGordon Ross #include <libintl.h> 474bff34e3Sthurlow 48*613a2f6bSGordon Ross #include <sys/isa_defs.h> 494bff34e3Sthurlow #include <netsmb/smb_lib.h> 504bff34e3Sthurlow #include <netsmb/mchain.h> 514bff34e3Sthurlow 524bff34e3Sthurlow #include "charsets.h" 534bff34e3Sthurlow 544bff34e3Sthurlow /* 554bff34e3Sthurlow * On Solaris, we will need to do some rewriting to use our iconv 564bff34e3Sthurlow * routines for the conversions. For now, we're effectively 574bff34e3Sthurlow * stubbing out code, leaving the details of what happens on 584bff34e3Sthurlow * Darwin in case it's useful as a guide later. 594bff34e3Sthurlow */ 604bff34e3Sthurlow 614bff34e3Sthurlow static unsigned 624bff34e3Sthurlow xtoi(char u) 634bff34e3Sthurlow { 644bff34e3Sthurlow if (isdigit(u)) 654bff34e3Sthurlow return (u - '0'); 664bff34e3Sthurlow else if (islower(u)) 674bff34e3Sthurlow return (10 + u - 'a'); 684bff34e3Sthurlow else if (isupper(u)) 694bff34e3Sthurlow return (10 + u - 'A'); 704bff34e3Sthurlow return (16); 714bff34e3Sthurlow } 724bff34e3Sthurlow 734bff34e3Sthurlow 749c9af259SGordon Ross /* 759c9af259SGordon Ross * Removes the "%" escape sequences from a URL component. 764bff34e3Sthurlow * See IETF RFC 2396. 774bff34e3Sthurlow */ 784bff34e3Sthurlow char * 794bff34e3Sthurlow unpercent(char *component) 804bff34e3Sthurlow { 814bff34e3Sthurlow char c, *s; 824bff34e3Sthurlow unsigned hi, lo; 834bff34e3Sthurlow 849c9af259SGordon Ross if (component == NULL) 859c9af259SGordon Ross return (component); 869c9af259SGordon Ross 874bff34e3Sthurlow for (s = component; (c = *s) != 0; s++) { 884bff34e3Sthurlow if (c != '%') 894bff34e3Sthurlow continue; 904bff34e3Sthurlow if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15) 914bff34e3Sthurlow continue; /* ignore invalid escapes */ 924bff34e3Sthurlow s[0] = hi*16 + lo; 934bff34e3Sthurlow /* 944bff34e3Sthurlow * This was strcpy(s + 1, s + 3); 954bff34e3Sthurlow * But nowadays leftward overlapping copies are 964bff34e3Sthurlow * officially undefined in C. Ours seems to 974bff34e3Sthurlow * work or not depending upon alignment. 984bff34e3Sthurlow */ 994bff34e3Sthurlow memmove(s+1, s+3, strlen(s+3) + 1); 1004bff34e3Sthurlow } 1014bff34e3Sthurlow return (component); 1024bff34e3Sthurlow } 1034bff34e3Sthurlow 1049c9af259SGordon Ross /* BEGIN CSTYLED */ 1054bff34e3Sthurlow #ifdef NOTPORTED 1064bff34e3Sthurlow static CFStringEncoding 1074bff34e3Sthurlow get_windows_encoding_equivalent( void ) 1084bff34e3Sthurlow { 1094bff34e3Sthurlow 1104bff34e3Sthurlow CFStringEncoding encoding; 1114bff34e3Sthurlow uint32_t index,region; 1124bff34e3Sthurlow 1134bff34e3Sthurlow /* important! use root ID so you can read the config file! */ 1144bff34e3Sthurlow seteuid(eff_uid); 1154bff34e3Sthurlow __CFStringGetInstallationEncodingAndRegion(&index,®ion); 1164bff34e3Sthurlow seteuid(real_uid); 1174bff34e3Sthurlow 1184bff34e3Sthurlow switch ( index ) 1194bff34e3Sthurlow { 1204bff34e3Sthurlow case kCFStringEncodingMacRoman: 1214bff34e3Sthurlow if (region) /* anything nonzero is not US */ 1224bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1; 1234bff34e3Sthurlow else /* US region */ 1244bff34e3Sthurlow encoding = kCFStringEncodingDOSLatinUS; 1254bff34e3Sthurlow break; 1264bff34e3Sthurlow 1274bff34e3Sthurlow case kCFStringEncodingMacJapanese: 1284bff34e3Sthurlow encoding = kCFStringEncodingDOSJapanese; 1294bff34e3Sthurlow break; 1304bff34e3Sthurlow 1314bff34e3Sthurlow case kCFStringEncodingMacChineseTrad: 1324bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseTrad; 1334bff34e3Sthurlow break; 1344bff34e3Sthurlow 1354bff34e3Sthurlow case kCFStringEncodingMacKorean: 1364bff34e3Sthurlow encoding = kCFStringEncodingDOSKorean; 1374bff34e3Sthurlow break; 1384bff34e3Sthurlow 1394bff34e3Sthurlow case kCFStringEncodingMacArabic: 1404bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic; 1414bff34e3Sthurlow break; 1424bff34e3Sthurlow 1434bff34e3Sthurlow case kCFStringEncodingMacHebrew: 1444bff34e3Sthurlow encoding = kCFStringEncodingDOSHebrew; 1454bff34e3Sthurlow break; 1464bff34e3Sthurlow 1474bff34e3Sthurlow case kCFStringEncodingMacGreek: 1484bff34e3Sthurlow encoding = kCFStringEncodingDOSGreek; 1494bff34e3Sthurlow break; 1504bff34e3Sthurlow 1514bff34e3Sthurlow case kCFStringEncodingMacCyrillic: 1524bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1534bff34e3Sthurlow break; 1544bff34e3Sthurlow 1554bff34e3Sthurlow case kCFStringEncodingMacThai: 1564bff34e3Sthurlow encoding = kCFStringEncodingDOSThai; 1574bff34e3Sthurlow break; 1584bff34e3Sthurlow 1594bff34e3Sthurlow case kCFStringEncodingMacChineseSimp: 1604bff34e3Sthurlow encoding = kCFStringEncodingDOSChineseSimplif; 1614bff34e3Sthurlow break; 1624bff34e3Sthurlow 1634bff34e3Sthurlow case kCFStringEncodingMacCentralEurRoman: 1644bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1654bff34e3Sthurlow break; 1664bff34e3Sthurlow 1674bff34e3Sthurlow case kCFStringEncodingMacTurkish: 1684bff34e3Sthurlow encoding = kCFStringEncodingDOSTurkish; 1694bff34e3Sthurlow break; 1704bff34e3Sthurlow 1714bff34e3Sthurlow case kCFStringEncodingMacCroatian: 1724bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1734bff34e3Sthurlow break; 1744bff34e3Sthurlow 1754bff34e3Sthurlow case kCFStringEncodingMacIcelandic: 1764bff34e3Sthurlow encoding = kCFStringEncodingDOSIcelandic; 1774bff34e3Sthurlow break; 1784bff34e3Sthurlow 1794bff34e3Sthurlow case kCFStringEncodingMacRomanian: 1804bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin2; 1814bff34e3Sthurlow break; 1824bff34e3Sthurlow 1834bff34e3Sthurlow case kCFStringEncodingMacFarsi: 1844bff34e3Sthurlow encoding = kCFStringEncodingDOSArabic; 1854bff34e3Sthurlow break; 1864bff34e3Sthurlow 1874bff34e3Sthurlow case kCFStringEncodingMacUkrainian: 1884bff34e3Sthurlow encoding = kCFStringEncodingDOSCyrillic; 1894bff34e3Sthurlow break; 1904bff34e3Sthurlow 1914bff34e3Sthurlow default: 1924bff34e3Sthurlow encoding = kCFStringEncodingDOSLatin1; 1934bff34e3Sthurlow break; 1944bff34e3Sthurlow } 1954bff34e3Sthurlow 1964bff34e3Sthurlow return encoding; 1974bff34e3Sthurlow } 1984bff34e3Sthurlow #endif /* NOTPORTED */ 1994bff34e3Sthurlow 2004bff34e3Sthurlow /* 2014bff34e3Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2024bff34e3Sthurlow * conversions. 2034bff34e3Sthurlow */ 2044bff34e3Sthurlow char * 2054bff34e3Sthurlow convert_wincs_to_utf8(const char *windows_string) 2064bff34e3Sthurlow { 2074bff34e3Sthurlow #ifdef NOTPORTED 2084bff34e3Sthurlow CFStringRef s; 2094bff34e3Sthurlow CFIndex maxlen; 2104bff34e3Sthurlow char *result; 2114bff34e3Sthurlow 2124bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2134bff34e3Sthurlow get_windows_encoding_equivalent()); 2144bff34e3Sthurlow if (s == NULL) { 2154bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 2164bff34e3Sthurlow windows_string); 2174bff34e3Sthurlow 2184bff34e3Sthurlow /* kCFStringEncodingMacRoman should always succeed */ 2194bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 2204bff34e3Sthurlow kCFStringEncodingMacRoman); 2214bff34e3Sthurlow if (s == NULL) { 2224bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 2234bff34e3Sthurlow -1, windows_string); 2244bff34e3Sthurlow return NULL; 2254bff34e3Sthurlow } 2264bff34e3Sthurlow } 2274bff34e3Sthurlow 2284bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2294bff34e3Sthurlow kCFStringEncodingUTF8) + 1; 2304bff34e3Sthurlow result = malloc(maxlen); 2314bff34e3Sthurlow if (result == NULL) { 2324bff34e3Sthurlow smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 2334bff34e3Sthurlow windows_string); 2344bff34e3Sthurlow CFRelease(s); 2354bff34e3Sthurlow return NULL; 2364bff34e3Sthurlow } 2374bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 2384bff34e3Sthurlow smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 2394bff34e3Sthurlow -1, windows_string); 2404bff34e3Sthurlow CFRelease(s); 2414bff34e3Sthurlow return NULL; 2424bff34e3Sthurlow } 2434bff34e3Sthurlow CFRelease(s); 2444bff34e3Sthurlow return result; 2454bff34e3Sthurlow #else /* NOTPORTED */ 2469c9af259SGordon Ross return (strdup((char*)windows_string)); 2474bff34e3Sthurlow #endif /* NOTPORTED */ 2484bff34e3Sthurlow } 2494bff34e3Sthurlow 2504bff34e3Sthurlow /* 2514bff34e3Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 2524bff34e3Sthurlow * conversions. 2534bff34e3Sthurlow */ 2544bff34e3Sthurlow char * 2554bff34e3Sthurlow convert_utf8_to_wincs(const char *utf8_string) 2564bff34e3Sthurlow { 2574bff34e3Sthurlow #ifdef NOTPORTED 2584bff34e3Sthurlow CFStringRef s; 2594bff34e3Sthurlow CFIndex maxlen; 2604bff34e3Sthurlow char *result; 2614bff34e3Sthurlow 2624bff34e3Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 2634bff34e3Sthurlow kCFStringEncodingUTF8); 2644bff34e3Sthurlow if (s == NULL) { 2654bff34e3Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 2664bff34e3Sthurlow utf8_string); 2674bff34e3Sthurlow return NULL; 2684bff34e3Sthurlow } 2694bff34e3Sthurlow 2704bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 2714bff34e3Sthurlow get_windows_encoding_equivalent()) + 1; 2724bff34e3Sthurlow result = malloc(maxlen); 2734bff34e3Sthurlow if (result == NULL) { 2744bff34e3Sthurlow smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 2754bff34e3Sthurlow utf8_string); 2764bff34e3Sthurlow CFRelease(s); 2774bff34e3Sthurlow return NULL; 2784bff34e3Sthurlow } 2794bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, 2804bff34e3Sthurlow get_windows_encoding_equivalent())) { 2814bff34e3Sthurlow smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 2824bff34e3Sthurlow -1, utf8_string); 2834bff34e3Sthurlow CFRelease(s); 2844bff34e3Sthurlow return NULL; 2854bff34e3Sthurlow } 2864bff34e3Sthurlow CFRelease(s); 2874bff34e3Sthurlow return result; 2884bff34e3Sthurlow #else /* NOTPORTED */ 2899c9af259SGordon Ross return (strdup((char*)utf8_string)); 2904bff34e3Sthurlow #endif /* NOTPORTED */ 2914bff34e3Sthurlow } 2929c9af259SGordon Ross /* END CSTYLED */ 2934bff34e3Sthurlow 2944bff34e3Sthurlow /* 2959c9af259SGordon Ross * We replaced these routines for Solaris: 2969c9af259SGordon Ross * convert_leunicode_to_utf8 2979c9af259SGordon Ross * convert_unicode_to_utf8 2989c9af259SGordon Ross * convert_utf8_to_leunicode 2999c9af259SGordon Ross * with new code in: utf_str.c 3004bff34e3Sthurlow */ 301