1 /* 2 * Copyright (c) 2001 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_LICENSE_HEADER_START@ 5 * 6 * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights 7 * Reserved. This file contains Original Code and/or Modifications of 8 * Original Code as defined in and that are subject to the Apple Public 9 * Source License Version 1.0 (the 'License'). You may not use this file 10 * except in compliance with the License. Please obtain a copy of the 11 * License at http://www.apple.com/publicsource and read it before using 12 * this file. 13 * 14 * The Original Code and all software distributed under the License are 15 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 16 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 17 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the 19 * License for the specific language governing rights and limitations 20 * under the License." 21 * 22 * @APPLE_LICENSE_HEADER_END@ 23 */ 24 /* @(#)charsets.c * 25 * (c) 2004 Apple Computer, Inc. All Rights Reserved 26 * 27 * 28 * charsets.c -- Routines converting between UTF-8, 16-bit 29 * little-endian Unicode, and various Windows 30 * code pages. 31 * 32 * MODIFICATION HISTORY: 33 * 28-Nov-2004 Guy Harris New today 34 */ 35 36 #pragma ident "%Z%%M% %I% %E% SMI" 37 38 #include <stdlib.h> 39 #include <stdio.h> 40 #include <string.h> 41 #include <ctype.h> 42 #include <iconv.h> 43 #include <langinfo.h> 44 #include <strings.h> 45 46 #ifdef NOTPORTED 47 #include <CoreFoundation/CoreFoundation.h> 48 #include <CoreFoundation/CFStringDefaultEncoding.h> 49 #include <CoreFoundation/CFStringEncodingConverter.h> 50 #include <sys/mchain.h> 51 #endif /* NOTPORTED */ 52 53 #include <netsmb/smb_lib.h> 54 #include <netsmb/mchain.h> 55 56 #include "charsets.h" 57 58 #ifdef NOTPORTED 59 extern uid_t real_uid,eff_uid; 60 #endif /* NOTPORTED */ 61 62 /* 63 * On Solaris, we will need to do some rewriting to use our iconv 64 * routines for the conversions. For now, we're effectively 65 * stubbing out code, leaving the details of what happens on 66 * Darwin in case it's useful as a guide later. 67 */ 68 69 static unsigned 70 xtoi(char u) 71 { 72 if (isdigit(u)) 73 return (u - '0'); 74 else if (islower(u)) 75 return (10 + u - 'a'); 76 else if (isupper(u)) 77 return (10 + u - 'A'); 78 return (16); 79 } 80 81 82 /* Removes the "%" escape sequences from a URL component. 83 * See IETF RFC 2396. 84 */ 85 char * 86 unpercent(char * component) 87 { 88 char c, *s; 89 unsigned hi, lo; 90 91 if (component) 92 for (s = component; (c = *s) != 0; s++) { 93 if (c != '%') 94 continue; 95 if ((hi = xtoi(s[1])) > 15 || (lo = xtoi(s[2])) > 15) 96 continue; /* ignore invalid escapes */ 97 s[0] = hi*16 + lo; 98 /* 99 * This was strcpy(s + 1, s + 3); 100 * But nowadays leftward overlapping copies are 101 * officially undefined in C. Ours seems to 102 * work or not depending upon alignment. 103 */ 104 memmove(s+1, s+3, strlen(s+3) + 1); 105 } 106 return (component); 107 } 108 109 #ifdef NOTPORTED 110 static CFStringEncoding 111 get_windows_encoding_equivalent( void ) 112 { 113 114 CFStringEncoding encoding; 115 uint32_t index,region; 116 117 /* important! use root ID so you can read the config file! */ 118 seteuid(eff_uid); 119 __CFStringGetInstallationEncodingAndRegion(&index,®ion); 120 seteuid(real_uid); 121 122 switch ( index ) 123 { 124 case kCFStringEncodingMacRoman: 125 if (region) /* anything nonzero is not US */ 126 encoding = kCFStringEncodingDOSLatin1; 127 else /* US region */ 128 encoding = kCFStringEncodingDOSLatinUS; 129 break; 130 131 case kCFStringEncodingMacJapanese: 132 encoding = kCFStringEncodingDOSJapanese; 133 break; 134 135 case kCFStringEncodingMacChineseTrad: 136 encoding = kCFStringEncodingDOSChineseTrad; 137 break; 138 139 case kCFStringEncodingMacKorean: 140 encoding = kCFStringEncodingDOSKorean; 141 break; 142 143 case kCFStringEncodingMacArabic: 144 encoding = kCFStringEncodingDOSArabic; 145 break; 146 147 case kCFStringEncodingMacHebrew: 148 encoding = kCFStringEncodingDOSHebrew; 149 break; 150 151 case kCFStringEncodingMacGreek: 152 encoding = kCFStringEncodingDOSGreek; 153 break; 154 155 case kCFStringEncodingMacCyrillic: 156 encoding = kCFStringEncodingDOSCyrillic; 157 break; 158 159 case kCFStringEncodingMacThai: 160 encoding = kCFStringEncodingDOSThai; 161 break; 162 163 case kCFStringEncodingMacChineseSimp: 164 encoding = kCFStringEncodingDOSChineseSimplif; 165 break; 166 167 case kCFStringEncodingMacCentralEurRoman: 168 encoding = kCFStringEncodingDOSLatin2; 169 break; 170 171 case kCFStringEncodingMacTurkish: 172 encoding = kCFStringEncodingDOSTurkish; 173 break; 174 175 case kCFStringEncodingMacCroatian: 176 encoding = kCFStringEncodingDOSLatin2; 177 break; 178 179 case kCFStringEncodingMacIcelandic: 180 encoding = kCFStringEncodingDOSIcelandic; 181 break; 182 183 case kCFStringEncodingMacRomanian: 184 encoding = kCFStringEncodingDOSLatin2; 185 break; 186 187 case kCFStringEncodingMacFarsi: 188 encoding = kCFStringEncodingDOSArabic; 189 break; 190 191 case kCFStringEncodingMacUkrainian: 192 encoding = kCFStringEncodingDOSCyrillic; 193 break; 194 195 default: 196 encoding = kCFStringEncodingDOSLatin1; 197 break; 198 } 199 200 return encoding; 201 } 202 #endif /* NOTPORTED */ 203 204 /* 205 * XXX - NLS, or CF? We should probably use the same routine for all 206 * conversions. 207 */ 208 char * 209 convert_wincs_to_utf8(const char *windows_string) 210 { 211 #ifdef NOTPORTED 212 CFStringRef s; 213 CFIndex maxlen; 214 char *result; 215 216 s = CFStringCreateWithCString(NULL, windows_string, 217 get_windows_encoding_equivalent()); 218 if (s == NULL) { 219 smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" ", -1, 220 windows_string); 221 222 /* kCFStringEncodingMacRoman should always succeed */ 223 s = CFStringCreateWithCString(NULL, windows_string, 224 kCFStringEncodingMacRoman); 225 if (s == NULL) { 226 smb_error("CFStringCreateWithCString for Windows code page failed on \"%s\" with kCFStringEncodingMacRoman - skipping", 227 -1, windows_string); 228 return NULL; 229 } 230 } 231 232 maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 233 kCFStringEncodingUTF8) + 1; 234 result = malloc(maxlen); 235 if (result == NULL) { 236 smb_error("Couldn't allocate buffer for UTF-8 string for \"%s\" - skipping", -1, 237 windows_string); 238 CFRelease(s); 239 return NULL; 240 } 241 if (!CFStringGetCString(s, result, maxlen, kCFStringEncodingUTF8)) { 242 smb_error("CFStringGetCString for UTF-8 failed on \"%s\" - skipping", 243 -1, windows_string); 244 CFRelease(s); 245 return NULL; 246 } 247 CFRelease(s); 248 return result; 249 #else /* NOTPORTED */ 250 return ((char*)windows_string); 251 #endif /* NOTPORTED */ 252 } 253 254 /* 255 * XXX - NLS, or CF? We should probably use the same routine for all 256 * conversions. 257 */ 258 char * 259 convert_utf8_to_wincs(const char *utf8_string) 260 { 261 #ifdef NOTPORTED 262 CFStringRef s; 263 CFIndex maxlen; 264 char *result; 265 266 s = CFStringCreateWithCString(NULL, utf8_string, 267 kCFStringEncodingUTF8); 268 if (s == NULL) { 269 smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 270 utf8_string); 271 return NULL; 272 } 273 274 maxlen = CFStringGetMaximumSizeForEncoding(CFStringGetLength(s), 275 get_windows_encoding_equivalent()) + 1; 276 result = malloc(maxlen); 277 if (result == NULL) { 278 smb_error("Couldn't allocate buffer for Windows code page string for \"%s\" - skipping", -1, 279 utf8_string); 280 CFRelease(s); 281 return NULL; 282 } 283 if (!CFStringGetCString(s, result, maxlen, 284 get_windows_encoding_equivalent())) { 285 smb_error("CFStringGetCString for Windows code page failed on \"%s\" - skipping", 286 -1, utf8_string); 287 CFRelease(s); 288 return NULL; 289 } 290 CFRelease(s); 291 return result; 292 #else /* NOTPORTED */ 293 return ((char*)utf8_string); 294 #endif /* NOTPORTED */ 295 } 296 297 /* 298 * Convert little-endian Unicode string to UTF-8. 299 * Converts the Unicode string to host byte order in place. 300 */ 301 char * 302 convert_leunicode_to_utf8(unsigned short *unicode_string) 303 { 304 unsigned short *unicode_charp, unicode_char; 305 int len = 0; 306 307 for (unicode_charp = unicode_string; 308 (unicode_char = *unicode_charp) != 0; 309 unicode_charp++) { 310 *unicode_charp = letohs(unicode_char); 311 len = len + 2; 312 } 313 return (convert_unicode_to_utf8(unicode_string, len)); 314 } 315 316 char * 317 convert_unicode_to_utf8(unsigned short *unicode_string, int len) 318 { 319 iconv_t cd; 320 char from[BUFSIZ], to[BUFSIZ]; 321 char *tptr = NULL; 322 const char *fptr; 323 size_t ileft, oleft, ret; 324 325 cd = iconv_open("UTF-8", "UTF-16"); 326 if (cd != (iconv_t)-1) { 327 ileft = len; 328 bcopy((char *)unicode_string, from, ileft); 329 fptr = from; 330 oleft = BUFSIZ; 331 tptr = to; 332 ret = iconv(cd, &fptr, &ileft, &tptr, &oleft); 333 if (ret != (size_t)-1) { 334 to[BUFSIZ-oleft] = '\0'; 335 tptr = to; 336 } else { 337 tptr = NULL; 338 } 339 (void) iconv_close(cd); 340 } 341 return (tptr); 342 } 343 344 /* 345 * Convert UTF-8 string to little-endian Unicode. 346 */ 347 unsigned short * 348 convert_utf8_to_leunicode(const char *utf8_string) 349 { 350 #ifdef NOTPORTED 351 CFStringRef s; 352 CFIndex maxlen; 353 unsigned short *result; 354 CFRange range; 355 int i; 356 357 s = CFStringCreateWithCString(NULL, utf8_string, 358 kCFStringEncodingUTF8); 359 if (s == NULL) { 360 smb_error("CFStringCreateWithCString for UTF-8 failed on \"%s\"", -1, 361 utf8_string); 362 return NULL; 363 } 364 365 maxlen = CFStringGetLength(s); 366 result = malloc(2*(maxlen + 1)); 367 if (result == NULL) { 368 smb_error("Couldn't allocate buffer for Unicode string for \"%s\" - skipping", -1, 369 utf8_string); 370 CFRelease(s); 371 return NULL; 372 } 373 range.location = 0; 374 range.length = maxlen; 375 CFStringGetCharacters(s, range, result); 376 for (i = 0; i < maxlen; i++) 377 result[i] = CFSwapInt16HostToLittle(result[i]); 378 result[maxlen] = 0; 379 CFRelease(s); 380 return result; 381 #else /* NOTPORTED */ 382 /* LINTED */ /* XXX Really need to fix this! */ 383 return ((ushort_t *)utf8_string); /* XXX */ 384 #endif /* NOTPORTED */ 385 } 386