1*4bff34e3Sthurlow /* 2*4bff34e3Sthurlow * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 3*4bff34e3Sthurlow * 4*4bff34e3Sthurlow * @APPLE_LICENSE_HEADER_START@ 5*4bff34e3Sthurlow * 6*4bff34e3Sthurlow * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 7*4bff34e3Sthurlow * Reserved. This file contains Original Code and/or Modifications of 8*4bff34e3Sthurlow * Original Code as defined in and that are subject to the Apple Public 9*4bff34e3Sthurlow * Source License Version 1.0 (the 'License'). You may not use this file 10*4bff34e3Sthurlow * except in compliance with the License. Please obtain a copy of the 11*4bff34e3Sthurlow * License at http://www.apple.com/publicsource and read it before using 12*4bff34e3Sthurlow * this file. 13*4bff34e3Sthurlow * 14*4bff34e3Sthurlow * The Original Code and all software distributed under the License are 15*4bff34e3Sthurlow * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 16*4bff34e3Sthurlow * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 17*4bff34e3Sthurlow * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 18*4bff34e3Sthurlow * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 19*4bff34e3Sthurlow * License for the specific language governing rights and limitations 20*4bff34e3Sthurlow * under the License." 21*4bff34e3Sthurlow * 22*4bff34e3Sthurlow * @APPLE_LICENSE_HEADER_END@ 23*4bff34e3Sthurlow */ 24*4bff34e3Sthurlow /* @(#)charsets.c * 25*4bff34e3Sthurlow * (c) 2004 Apple Computer, Inc. All Rights Reserved 26*4bff34e3Sthurlow * 27*4bff34e3Sthurlow * 28*4bff34e3Sthurlow * charsets.c -- Routines converting between UTF-8, 16-bit 29*4bff34e3Sthurlow * little-endian Unicode, and various Windows 30*4bff34e3Sthurlow * code pages. 
31*4bff34e3Sthurlow * 32*4bff34e3Sthurlow * MODIFICATION HISTORY: 33*4bff34e3Sthurlow * 28-Nov-2004 Guy Harris New today 34*4bff34e3Sthurlow */ 35*4bff34e3Sthurlow 36*4bff34e3Sthurlow #pragma ident "%Z%%M% %I% %E% SMI" 37*4bff34e3Sthurlow 38*4bff34e3Sthurlow #include <stdlib.h> 39*4bff34e3Sthurlow #include <stdio.h> 40*4bff34e3Sthurlow #include <string.h> 41*4bff34e3Sthurlow #include <ctype.h> 42*4bff34e3Sthurlow #include <iconv.h> 43*4bff34e3Sthurlow #include <langinfo.h> 44*4bff34e3Sthurlow #include <strings.h> 45*4bff34e3Sthurlow 46*4bff34e3Sthurlow #ifdef NOTPORTED 47*4bff34e3Sthurlow #include <CoreFoundation/CoreFoundation.h> 48*4bff34e3Sthurlow #include <CoreFoundation/CFStringDefaultEncoding.h> 49*4bff34e3Sthurlow #include <CoreFoundation/CFStringEncodingConverter.h> 50*4bff34e3Sthurlow #include <sys/mchain.h> 51*4bff34e3Sthurlow #endif /* NOTPORTED */ 52*4bff34e3Sthurlow 53*4bff34e3Sthurlow #include <netsmb/smb_lib.h> 54*4bff34e3Sthurlow #include <netsmb/mchain.h> 55*4bff34e3Sthurlow 56*4bff34e3Sthurlow #include "charsets.h" 57*4bff34e3Sthurlow 58*4bff34e3Sthurlow #ifdef NOTPORTED 59*4bff34e3Sthurlow extern uid_t real_uid,eff_uid; 60*4bff34e3Sthurlow #endif /* NOTPORTED */ 61*4bff34e3Sthurlow 62*4bff34e3Sthurlow /* 63*4bff34e3Sthurlow * On Solaris, we will need to do some rewriting to use our iconv 64*4bff34e3Sthurlow * routines for the conversions. For now, we're effectively 65*4bff34e3Sthurlow * stubbing out code, leaving the details of what happens on 66*4bff34e3Sthurlow * Darwin in case it's useful as a guide later. 
/*
 * URL "%XX" escape decoding, plus (Darwin only) selection of the Windows
 * code page that corresponds to the system's installation encoding.
 */

/*
 * Return the value (0-15) of the hexadecimal digit 'u', or 16 if 'u'
 * is not a hexadecimal digit.
 */
static unsigned
xtoi(char u)
{
	/*
	 * The <ctype.h> classifiers are only defined for values
	 * representable as unsigned char (or EOF), so never hand them
	 * a possibly-negative plain char.
	 */
	unsigned char c = (unsigned char)u;

	if (isdigit(c))
		return (c - '0');
	if (isxdigit(c))
		return (10 + tolower(c) - 'a');
	return (16);
}


/*
 * Removes the "%" escape sequences from a URL component.
 * See IETF RFC 2396.
 *
 * The component is decoded in place: every "%XX" where both X are
 * hexadecimal digits is replaced by the single byte with that value.
 * Malformed or truncated escapes are left untouched.  The output of a
 * decoded escape is never re-scanned, so "%2541" yields "%41".
 *
 * Returns 'component' (which may be NULL, in which case nothing is done).
 */
char *
unpercent(char * component)
{
	char *src, *dst;
	unsigned hi, lo;

	if (component == NULL)
		return (NULL);

	/*
	 * Single pass with separate read and write positions.  The
	 * decoded text is never longer than the input, so 'dst' never
	 * overtakes 'src'.  The && chain stops at the first non-hex
	 * character, so the terminating NUL is never read past.
	 */
	for (src = dst = component; *src != '\0'; ) {
		if (src[0] == '%' &&
		    (hi = xtoi(src[1])) <= 15 &&
		    (lo = xtoi(src[2])) <= 15) {
			*dst++ = (char)(hi * 16 + lo);
			src += 3;
		} else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	return (component);
}

#ifdef NOTPORTED
/*
 * Map the installation encoding reported by CoreFoundation to the
 * DOS/Windows code page used for it.  Encodings without an entry in
 * the switch below fall back to DOS Latin 1.
 */
static CFStringEncoding
get_windows_encoding_equivalent( void )
{

	CFStringEncoding encoding;
	uint32_t index,region;

	/* important! use root ID so you can read the config file! */
	seteuid(eff_uid);
	__CFStringGetInstallationEncodingAndRegion(&index,&region);
	seteuid(real_uid);

	switch ( index )
	{
	case	kCFStringEncodingMacRoman:
		if (region) /* anything nonzero is not US */
			encoding = kCFStringEncodingDOSLatin1;
		else /* US region */
			encoding = kCFStringEncodingDOSLatinUS;
		break;

	case	kCFStringEncodingMacJapanese:
		encoding = kCFStringEncodingDOSJapanese;
		break;

	case	kCFStringEncodingMacChineseTrad:
		encoding = kCFStringEncodingDOSChineseTrad;
		break;

	case	kCFStringEncodingMacKorean:
		encoding = kCFStringEncodingDOSKorean;
		break;

	case	kCFStringEncodingMacArabic:
		encoding = kCFStringEncodingDOSArabic;
		break;

	case	kCFStringEncodingMacHebrew:
		encoding = kCFStringEncodingDOSHebrew;
		break;

	case	kCFStringEncodingMacGreek:
		encoding = kCFStringEncodingDOSGreek;
		break;

	case	kCFStringEncodingMacCyrillic:
		encoding = kCFStringEncodingDOSCyrillic;
		break;

	case	kCFStringEncodingMacThai:
		encoding = kCFStringEncodingDOSThai;
		break;

	case	kCFStringEncodingMacChineseSimp:
		encoding = kCFStringEncodingDOSChineseSimplif;
		break;

	case	kCFStringEncodingMacCentralEurRoman:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacTurkish:
		encoding = kCFStringEncodingDOSTurkish;
		break;

	case	kCFStringEncodingMacCroatian:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacIcelandic:
		encoding = kCFStringEncodingDOSIcelandic;
		break;

	case	kCFStringEncodingMacRomanian:
		encoding = kCFStringEncodingDOSLatin2;
		break;

	case	kCFStringEncodingMacFarsi:
		encoding = kCFStringEncodingDOSArabic;
		break;

	case	kCFStringEncodingMacUkrainian:
		encoding = kCFStringEncodingDOSCyrillic;
		break;

	default:
		encoding = kCFStringEncodingDOSLatin1;
		break;
	}

	return encoding;
}
#endif /* NOTPORTED */

/*
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
207*4bff34e3Sthurlow */ 208*4bff34e3Sthurlow char * 209*4bff34e3Sthurlow convert_wincs_to_utf8(const char *windows_string) 210*4bff34e3Sthurlow { 211*4bff34e3Sthurlow #ifdef NOTPORTED 212*4bff34e3Sthurlow CFStringRef s; 213*4bff34e3Sthurlow CFIndex maxlen; 214*4bff34e3Sthurlow char *result; 215*4bff34e3Sthurlow 216*4bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 217*4bff34e3Sthurlow get_windows_encoding_equivalent()); 218*4bff34e3Sthurlow if (s == NULL) { 219*4bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 220*4bff34e3Sthurlow windows_string); 221*4bff34e3Sthurlow 222*4bff34e3Sthurlow /* kCFStringEncodingMacRoman should always succeed */ 223*4bff34e3Sthurlow s = CFStringCreateWithCString(NULL, windows_string, 224*4bff34e3Sthurlow kCFStringEncodingMacRoman); 225*4bff34e3Sthurlow if (s == NULL) { 226*4bff34e3Sthurlow smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 227*4bff34e3Sthurlow -1, windows_string); 228*4bff34e3Sthurlow return NULL; 229*4bff34e3Sthurlow } 230*4bff34e3Sthurlow } 231*4bff34e3Sthurlow 232*4bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 233*4bff34e3Sthurlow kCFStringEncodingUTF8) + 1; 234*4bff34e3Sthurlow result = malloc(maxlen); 235*4bff34e3Sthurlow if (result == NULL) { 236*4bff34e3Sthurlow smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 237*4bff34e3Sthurlow windows_string); 238*4bff34e3Sthurlow CFRelease(s); 239*4bff34e3Sthurlow return NULL; 240*4bff34e3Sthurlow } 241*4bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 242*4bff34e3Sthurlow smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 243*4bff34e3Sthurlow -1, windows_string); 244*4bff34e3Sthurlow CFRelease(s); 245*4bff34e3Sthurlow return NULL; 246*4bff34e3Sthurlow } 247*4bff34e3Sthurlow CFRelease(s); 248*4bff34e3Sthurlow return 
result; 249*4bff34e3Sthurlow #else /* NOTPORTED */ 250*4bff34e3Sthurlow return ((char*)windows_string); 251*4bff34e3Sthurlow #endif /* NOTPORTED */ 252*4bff34e3Sthurlow } 253*4bff34e3Sthurlow 254*4bff34e3Sthurlow /* 255*4bff34e3Sthurlow * XXX - NLS, or CF? We should probably use the same routine for all 256*4bff34e3Sthurlow * conversions. 257*4bff34e3Sthurlow */ 258*4bff34e3Sthurlow char * 259*4bff34e3Sthurlow convert_utf8_to_wincs(const char *utf8_string) 260*4bff34e3Sthurlow { 261*4bff34e3Sthurlow #ifdef NOTPORTED 262*4bff34e3Sthurlow CFStringRef s; 263*4bff34e3Sthurlow CFIndex maxlen; 264*4bff34e3Sthurlow char *result; 265*4bff34e3Sthurlow 266*4bff34e3Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 267*4bff34e3Sthurlow kCFStringEncodingUTF8); 268*4bff34e3Sthurlow if (s == NULL) { 269*4bff34e3Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 270*4bff34e3Sthurlow utf8_string); 271*4bff34e3Sthurlow return NULL; 272*4bff34e3Sthurlow } 273*4bff34e3Sthurlow 274*4bff34e3Sthurlow maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 275*4bff34e3Sthurlow get_windows_encoding_equivalent()) + 1; 276*4bff34e3Sthurlow result = malloc(maxlen); 277*4bff34e3Sthurlow if (result == NULL) { 278*4bff34e3Sthurlow smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 279*4bff34e3Sthurlow utf8_string); 280*4bff34e3Sthurlow CFRelease(s); 281*4bff34e3Sthurlow return NULL; 282*4bff34e3Sthurlow } 283*4bff34e3Sthurlow if (!CFStringGetCString(s, result, maxlen, 284*4bff34e3Sthurlow get_windows_encoding_equivalent())) { 285*4bff34e3Sthurlow smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 286*4bff34e3Sthurlow -1, utf8_string); 287*4bff34e3Sthurlow CFRelease(s); 288*4bff34e3Sthurlow return NULL; 289*4bff34e3Sthurlow } 290*4bff34e3Sthurlow CFRelease(s); 291*4bff34e3Sthurlow return result; 292*4bff34e3Sthurlow #else /* NOTPORTED */ 293*4bff34e3Sthurlow return 
((char*)utf8_string); 294*4bff34e3Sthurlow #endif /* NOTPORTED */ 295*4bff34e3Sthurlow } 296*4bff34e3Sthurlow 297*4bff34e3Sthurlow /* 298*4bff34e3Sthurlow * Convert little-endian Unicode string to UTF-8. 299*4bff34e3Sthurlow * Converts the Unicode string to host byte order in place. 300*4bff34e3Sthurlow */ 301*4bff34e3Sthurlow char * 302*4bff34e3Sthurlow convert_leunicode_to_utf8(unsigned short *unicode_string) 303*4bff34e3Sthurlow { 304*4bff34e3Sthurlow unsigned short *unicode_charp, unicode_char; 305*4bff34e3Sthurlow int len = 0; 306*4bff34e3Sthurlow 307*4bff34e3Sthurlow for (unicode_charp = unicode_string; 308*4bff34e3Sthurlow (unicode_char = *unicode_charp) != 0; 309*4bff34e3Sthurlow unicode_charp++) { 310*4bff34e3Sthurlow *unicode_charp = letohs(unicode_char); 311*4bff34e3Sthurlow len = len + 2; 312*4bff34e3Sthurlow } 313*4bff34e3Sthurlow return (convert_unicode_to_utf8(unicode_string, len)); 314*4bff34e3Sthurlow } 315*4bff34e3Sthurlow 316*4bff34e3Sthurlow char * 317*4bff34e3Sthurlow convert_unicode_to_utf8(unsigned short *unicode_string, int len) 318*4bff34e3Sthurlow { 319*4bff34e3Sthurlow iconv_t cd; 320*4bff34e3Sthurlow char from[BUFSIZ], to[BUFSIZ]; 321*4bff34e3Sthurlow char *tptr = NULL; 322*4bff34e3Sthurlow const char *fptr; 323*4bff34e3Sthurlow size_t ileft, oleft, ret; 324*4bff34e3Sthurlow 325*4bff34e3Sthurlow cd = iconv_open("UTF-8", "UTF-16"); 326*4bff34e3Sthurlow if (cd != (iconv_t)-1) { 327*4bff34e3Sthurlow ileft = len; 328*4bff34e3Sthurlow bcopy((char *)unicode_string, from, ileft); 329*4bff34e3Sthurlow fptr = from; 330*4bff34e3Sthurlow oleft = BUFSIZ; 331*4bff34e3Sthurlow tptr = to; 332*4bff34e3Sthurlow ret = iconv(cd, &fptr, &ileft, &tptr, &oleft); 333*4bff34e3Sthurlow if (ret != (size_t)-1) { 334*4bff34e3Sthurlow to[BUFSIZ-oleft] = '\0'; 335*4bff34e3Sthurlow tptr = to; 336*4bff34e3Sthurlow } else { 337*4bff34e3Sthurlow tptr = NULL; 338*4bff34e3Sthurlow } 339*4bff34e3Sthurlow (void) iconv_close(cd); 340*4bff34e3Sthurlow } 341*4bff34e3Sthurlow 
return (tptr); 342*4bff34e3Sthurlow } 343*4bff34e3Sthurlow 344*4bff34e3Sthurlow /* 345*4bff34e3Sthurlow * Convert UTF-8 string to little-endian Unicode. 346*4bff34e3Sthurlow */ 347*4bff34e3Sthurlow unsigned short * 348*4bff34e3Sthurlow convert_utf8_to_leunicode(const char *utf8_string) 349*4bff34e3Sthurlow { 350*4bff34e3Sthurlow #ifdef NOTPORTED 351*4bff34e3Sthurlow CFStringRef s; 352*4bff34e3Sthurlow CFIndex maxlen; 353*4bff34e3Sthurlow unsigned short *result; 354*4bff34e3Sthurlow CFRange range; 355*4bff34e3Sthurlow int i; 356*4bff34e3Sthurlow 357*4bff34e3Sthurlow s = CFStringCreateWithCString(NULL, utf8_string, 358*4bff34e3Sthurlow kCFStringEncodingUTF8); 359*4bff34e3Sthurlow if (s == NULL) { 360*4bff34e3Sthurlow smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 361*4bff34e3Sthurlow utf8_string); 362*4bff34e3Sthurlow return NULL; 363*4bff34e3Sthurlow } 364*4bff34e3Sthurlow 365*4bff34e3Sthurlow maxlen = CFStringGetLength(s); 366*4bff34e3Sthurlow result = malloc(2*(maxlen + 1)); 367*4bff34e3Sthurlow if (result == NULL) { 368*4bff34e3Sthurlow smb_error("Couldn't allocate buffer for Unicode string for \"%s\" - skipping", -1, 369*4bff34e3Sthurlow utf8_string); 370*4bff34e3Sthurlow CFRelease(s); 371*4bff34e3Sthurlow return NULL; 372*4bff34e3Sthurlow } 373*4bff34e3Sthurlow range.location = 0; 374*4bff34e3Sthurlow range.length = maxlen; 375*4bff34e3Sthurlow CFStringGetCharacters(s, range, result); 376*4bff34e3Sthurlow for (i = 0; i < maxlen; i++) 377*4bff34e3Sthurlow result[i] = CFSwapInt16HostToLittle(result[i]); 378*4bff34e3Sthurlow result[maxlen] = 0; 379*4bff34e3Sthurlow CFRelease(s); 380*4bff34e3Sthurlow return result; 381*4bff34e3Sthurlow #else /* NOTPORTED */ 382*4bff34e3Sthurlow /* LINTED */ /* XXX Really need to fix this! */ 383*4bff34e3Sthurlow return ((ushort_t *)utf8_string); /* XXX */ 384*4bff34e3Sthurlow #endif /* NOTPORTED */ 385*4bff34e3Sthurlow } 386