1 /*- 2 * Copyright (c) 2003-2007 Tim Kientzle 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "archive_platform.h" 27 __FBSDID("$FreeBSD$"); 28 29 /* 30 * Basic resizable string support, to simplify manipulating arbitrary-sized 31 * strings while minimizing heap activity. 32 */ 33 34 #ifdef HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #ifdef HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #ifdef HAVE_WCHAR_H 41 #include <wchar.h> 42 #endif 43 #if defined(_WIN32) && !defined(__CYGWIN__) 44 #include <windows.h> 45 #endif 46 47 #include "archive_private.h" 48 #include "archive_string.h" 49 50 struct archive_string * 51 __archive_string_append(struct archive_string *as, const char *p, size_t s) 52 { 53 if (__archive_string_ensure(as, as->length + s + 1) == NULL) 54 __archive_errx(1, "Out of memory"); 55 memcpy(as->s + as->length, p, s); 56 as->s[as->length + s] = 0; 57 as->length += s; 58 return (as); 59 } 60 61 void 62 __archive_string_copy(struct archive_string *dest, struct archive_string *src) 63 { 64 if (src->length == 0) 65 dest->length = 0; 66 else { 67 if (__archive_string_ensure(dest, src->length + 1) == NULL) 68 __archive_errx(1, "Out of memory"); 69 memcpy(dest->s, src->s, src->length); 70 dest->length = src->length; 71 dest->s[dest->length] = 0; 72 } 73 } 74 75 void 76 __archive_string_concat(struct archive_string *dest, struct archive_string *src) 77 { 78 if (src->length > 0) { 79 if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL) 80 __archive_errx(1, "Out of memory"); 81 memcpy(dest->s + dest->length, src->s, src->length); 82 dest->length += src->length; 83 dest->s[dest->length] = 0; 84 } 85 } 86 87 void 88 __archive_string_free(struct archive_string *as) 89 { 90 as->length = 0; 91 as->buffer_length = 0; 92 if (as->s != NULL) { 93 free(as->s); 94 as->s = NULL; 95 } 96 } 97 98 /* Returns NULL on any allocation failure. */ 99 struct archive_string * 100 __archive_string_ensure(struct archive_string *as, size_t s) 101 { 102 /* If buffer is already big enough, don't reallocate. */ 103 if (as->s && (s <= as->buffer_length)) 104 return (as); 105 106 /* 107 * Growing the buffer at least exponentially ensures that 108 * append operations are always linear in the number of 109 * characters appended. Using a smaller growth rate for 110 * larger buffers reduces memory waste somewhat at the cost of 111 * a larger constant factor. 112 */ 113 if (as->buffer_length < 32) 114 /* Start with a minimum 32-character buffer. */ 115 as->buffer_length = 32; 116 else if (as->buffer_length < 8192) 117 /* Buffers under 8k are doubled for speed. */ 118 as->buffer_length += as->buffer_length; 119 else { 120 /* Buffers 8k and over grow by at least 25% each time. */ 121 size_t old_length = as->buffer_length; 122 as->buffer_length += as->buffer_length / 4; 123 /* Be safe: If size wraps, release buffer and return NULL. */ 124 if (as->buffer_length < old_length) { 125 free(as->s); 126 as->s = NULL; 127 return (NULL); 128 } 129 } 130 /* 131 * The computation above is a lower limit to how much we'll 132 * grow the buffer. In any case, we have to grow it enough to 133 * hold the request. 134 */ 135 if (as->buffer_length < s) 136 as->buffer_length = s; 137 /* Now we can reallocate the buffer. */ 138 as->s = (char *)realloc(as->s, as->buffer_length); 139 if (as->s == NULL) 140 return (NULL); 141 return (as); 142 } 143 144 struct archive_string * 145 __archive_strncat(struct archive_string *as, const void *_p, size_t n) 146 { 147 size_t s; 148 const char *p, *pp; 149 150 p = (const char *)_p; 151 152 /* Like strlen(p), except won't examine positions beyond p[n]. */ 153 s = 0; 154 pp = p; 155 while (s < n && *pp) { 156 pp++; 157 s++; 158 } 159 return (__archive_string_append(as, p, s)); 160 } 161 162 struct archive_string * 163 __archive_strappend_char(struct archive_string *as, char c) 164 { 165 return (__archive_string_append(as, &c, 1)); 166 } 167 168 /* 169 * Translates a wide character string into UTF-8 and appends 170 * to the archive_string. Note: returns NULL if conversion fails, 171 * but still leaves a best-effort conversion in the argument as. 172 */ 173 struct archive_string * 174 __archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w) 175 { 176 char *p; 177 unsigned wc; 178 char buff[256]; 179 struct archive_string *return_val = as; 180 181 /* 182 * Convert one wide char at a time into 'buff', whenever that 183 * fills, append it to the string. 184 */ 185 p = buff; 186 while (*w != L'\0') { 187 /* Flush the buffer when we have <=16 bytes free. */ 188 /* (No encoding has a single character >16 bytes.) */ 189 if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - 16)) { 190 *p = '\0'; 191 archive_strcat(as, buff); 192 p = buff; 193 } 194 wc = *w++; 195 /* If this is a surrogate pair, assemble the full code point.*/ 196 /* Note: wc must not be wchar_t here, because the full code 197 * point can be more than 16 bits! */ 198 if (wc >= 0xD800 && wc <= 0xDBff 199 && *w >= 0xDC00 && *w <= 0xDFFF) { 200 wc -= 0xD800; 201 wc *= 0x400; 202 wc += (*w - 0xDC00); 203 wc += 0x10000; 204 ++w; 205 } 206 /* Translate code point to UTF8 */ 207 if (wc <= 0x7f) { 208 *p++ = (char)wc; 209 } else if (wc <= 0x7ff) { 210 *p++ = 0xc0 | ((wc >> 6) & 0x1f); 211 *p++ = 0x80 | (wc & 0x3f); 212 } else if (wc <= 0xffff) { 213 *p++ = 0xe0 | ((wc >> 12) & 0x0f); 214 *p++ = 0x80 | ((wc >> 6) & 0x3f); 215 *p++ = 0x80 | (wc & 0x3f); 216 } else if (wc <= 0x1fffff) { 217 *p++ = 0xf0 | ((wc >> 18) & 0x07); 218 *p++ = 0x80 | ((wc >> 12) & 0x3f); 219 *p++ = 0x80 | ((wc >> 6) & 0x3f); 220 *p++ = 0x80 | (wc & 0x3f); 221 } else { 222 /* Unicode has no codes larger than 0x1fffff. */ 223 /* TODO: use \uXXXX escape here instead of ? */ 224 *p++ = '?'; 225 return_val = NULL; 226 } 227 } 228 *p = '\0'; 229 archive_strcat(as, buff); 230 return (return_val); 231 } 232 233 static int 234 utf8_to_unicode(int *pwc, const char *s, size_t n) 235 { 236 int ch; 237 238 /* 239 * Decode 1-4 bytes depending on the value of the first byte. 240 */ 241 ch = (unsigned char)*s; 242 if (ch == 0) { 243 return (0); /* Standard: return 0 for end-of-string. */ 244 } 245 if ((ch & 0x80) == 0) { 246 *pwc = ch & 0x7f; 247 return (1); 248 } 249 if ((ch & 0xe0) == 0xc0) { 250 if (n < 2) 251 return (-1); 252 if ((s[1] & 0xc0) != 0x80) return (-1); 253 *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 254 return (2); 255 } 256 if ((ch & 0xf0) == 0xe0) { 257 if (n < 3) 258 return (-1); 259 if ((s[1] & 0xc0) != 0x80) return (-1); 260 if ((s[2] & 0xc0) != 0x80) return (-1); 261 *pwc = ((ch & 0x0f) << 12) 262 | ((s[1] & 0x3f) << 6) 263 | (s[2] & 0x3f); 264 return (3); 265 } 266 if ((ch & 0xf8) == 0xf0) { 267 if (n < 4) 268 return (-1); 269 if ((s[1] & 0xc0) != 0x80) return (-1); 270 if ((s[2] & 0xc0) != 0x80) return (-1); 271 if ((s[3] & 0xc0) != 0x80) return (-1); 272 *pwc = ((ch & 0x07) << 18) 273 | ((s[1] & 0x3f) << 12) 274 | ((s[2] & 0x3f) << 6) 275 | (s[3] & 0x3f); 276 return (4); 277 } 278 /* Invalid first byte. */ 279 return (-1); 280 } 281 282 /* 283 * Return a wide-character Unicode string by converting this archive_string 284 * from UTF-8. We assume that systems with 16-bit wchar_t always use 285 * UTF16 and systems with 32-bit wchar_t can accept UCS4. 286 */ 287 wchar_t * 288 __archive_string_utf8_w(struct archive_string *as) 289 { 290 wchar_t *ws, *dest; 291 int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */ 292 const char *src; 293 int n; 294 295 ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t)); 296 if (ws == NULL) 297 __archive_errx(1, "Out of memory"); 298 dest = ws; 299 src = as->s; 300 while (*src != '\0') { 301 n = utf8_to_unicode(&wc, src, 8); 302 if (n == 0) 303 break; 304 if (n < 0) { 305 free(ws); 306 return (NULL); 307 } 308 src += n; 309 if (wc >= 0xDC00 && wc <= 0xDBFF) { 310 /* This is a leading surrogate; some idiot 311 * has translated UTF16 to UTF8 without combining 312 * surrogates; rebuild the full code point before 313 * continuing. */ 314 n = utf8_to_unicode(&wc2, src, 8); 315 if (n < 0) { 316 free(ws); 317 return (NULL); 318 } 319 if (n == 0) /* Ignore the leading surrogate */ 320 break; 321 if (wc2 < 0xDC00 || wc2 > 0xDFFF) { 322 /* If the second character isn't a 323 * trailing surrogate, then someone 324 * has really screwed up and this is 325 * invalid. */ 326 free(ws); 327 return (NULL); 328 } else { 329 src += n; 330 wc -= 0xD800; 331 wc *= 0x400; 332 wc += wc2 - 0xDC00; 333 wc += 0x10000; 334 } 335 } 336 if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) { 337 /* We have a code point that won't fit into a 338 * wchar_t; convert it to a surrogate pair. */ 339 wc -= 0x10000; 340 *dest++ = ((wc >> 10) & 0x3ff) + 0xD800; 341 *dest++ = (wc & 0x3ff) + 0xDC00; 342 } else 343 *dest++ = wc; 344 } 345 *dest = L'\0'; 346 return (ws); 347 } 348 349 #if defined(_WIN32) && !defined(__CYGWIN__) 350 351 /* 352 * Translates a wide character string into current locale character set 353 * and appends to the archive_string. Note: returns NULL if conversion 354 * fails. 355 * 356 * Win32 builds use WideCharToMultiByte from the Windows API. 357 * (Maybe Cygwin should too? WideCharToMultiByte will know a 358 * lot more about local character encodings than the wcrtomb() 359 * wrapper is going to know.) 360 */ 361 struct archive_string * 362 __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w) 363 { 364 char *p; 365 int l, wl; 366 BOOL useDefaultChar = FALSE; 367 368 wl = (int)wcslen(w); 369 l = wl * 4 + 4; 370 p = malloc(l); 371 if (p == NULL) 372 __archive_errx(1, "Out of memory"); 373 /* To check a useDefaultChar is to simulate error handling of 374 * the my_wcstombs() which is running on non Windows system with 375 * wctomb(). 376 * And to set NULL for last argument is necessary when a codepage 377 * is not CP_ACP(current locale). 378 */ 379 l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar); 380 if (l == 0) { 381 free(p); 382 return (NULL); 383 } 384 __archive_string_append(as, p, l); 385 free(p); 386 return (as); 387 } 388 389 #else 390 391 /* 392 * Translates a wide character string into current locale character set 393 * and appends to the archive_string. Note: returns NULL if conversion 394 * fails. 395 * 396 * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 397 * one character at a time. If a non-Windows platform doesn't have 398 * either of these, fall back to the built-in UTF8 conversion. 399 */ 400 struct archive_string * 401 __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w) 402 { 403 #if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB) 404 /* If there's no built-in locale support, fall back to UTF8 always. */ 405 return __archive_strappend_w_utf8(as, w); 406 #else 407 /* We cannot use the standard wcstombs() here because it 408 * cannot tell us how big the output buffer should be. So 409 * I've built a loop around wcrtomb() or wctomb() that 410 * converts a character at a time and resizes the string as 411 * needed. We prefer wcrtomb() when it's available because 412 * it's thread-safe. */ 413 int n; 414 char *p; 415 char buff[256]; 416 #if HAVE_WCRTOMB 417 mbstate_t shift_state; 418 419 memset(&shift_state, 0, sizeof(shift_state)); 420 #else 421 /* Clear the shift state before starting. */ 422 wctomb(NULL, L'\0'); 423 #endif 424 425 /* 426 * Convert one wide char at a time into 'buff', whenever that 427 * fills, append it to the string. 428 */ 429 p = buff; 430 while (*w != L'\0') { 431 /* Flush the buffer when we have <=16 bytes free. */ 432 /* (No encoding has a single character >16 bytes.) */ 433 if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - MB_CUR_MAX)) { 434 *p = '\0'; 435 archive_strcat(as, buff); 436 p = buff; 437 } 438 #if HAVE_WCRTOMB 439 n = wcrtomb(p, *w++, &shift_state); 440 #else 441 n = wctomb(p, *w++); 442 #endif 443 if (n == -1) 444 return (NULL); 445 p += n; 446 } 447 *p = '\0'; 448 archive_strcat(as, buff); 449 return (as); 450 #endif 451 } 452 453 #endif /* _WIN32 && ! __CYGWIN__ */ 454