1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===------------------------------------------------------------------------=*/ 8 /* 9 * Copyright © 1991-2015 Unicode, Inc. All rights reserved. 10 * Distributed under the Terms of Use in 11 * http://www.unicode.org/copyright.html. 12 * 13 * Permission is hereby granted, free of charge, to any person obtaining 14 * a copy of the Unicode data files and any associated documentation 15 * (the "Data Files") or Unicode software and any associated documentation 16 * (the "Software") to deal in the Data Files or Software 17 * without restriction, including without limitation the rights to use, 18 * copy, modify, merge, publish, distribute, and/or sell copies of 19 * the Data Files or Software, and to permit persons to whom the Data Files 20 * or Software are furnished to do so, provided that 21 * (a) this copyright and permission notice appear with all copies 22 * of the Data Files or Software, 23 * (b) this copyright and permission notice appear in associated 24 * documentation, and 25 * (c) there is clear notice in each modified Data File or in the Software 26 * as well as in the documentation associated with the Data File(s) or 27 * Software that the data or software has been modified. 28 * 29 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 30 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 31 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 32 * NONINFRINGEMENT OF THIRD PARTY RIGHTS. 33 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 34 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 35 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 36 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 37 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 38 * PERFORMANCE OF THE DATA FILES OR SOFTWARE. 39 * 40 * Except as contained in this notice, the name of a copyright holder 41 * shall not be used in advertising or otherwise to promote the sale, 42 * use or other dealings in these Data Files or Software without prior 43 * written authorization of the copyright holder. 44 */ 45 46 /* --------------------------------------------------------------------- 47 48 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 49 Author: Mark E. Davis, 1994. 50 Rev History: Rick McGowan, fixes & updates May 2001. 51 Sept 2001: fixed const & error conditions per 52 mods suggested by S. Parent & A. Lillich. 53 June 2002: Tim Dodd added detection and handling of incomplete 54 source sequences, enhanced error detection, added casts 55 to eliminate compiler warnings. 56 July 2003: slight mods to back out aggressive FFFE detection. 57 Jan 2004: updated switches in from-UTF8 conversions. 58 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 59 60 See the header file "ConvertUTF.h" for complete documentation. 61 62 ------------------------------------------------------------------------ */ 63 64 #include "llvm/Support/ConvertUTF.h" 65 #ifdef CVTUTF_DEBUG 66 #include <stdio.h> 67 #endif 68 #include <assert.h> 69 70 /* 71 * This code extensively uses fall-through switches. 72 * Keep the compiler from warning about that. 73 */ 74 #if defined(__clang__) && defined(__has_warning) 75 # if __has_warning("-Wimplicit-fallthrough") 76 # define ConvertUTF_DISABLE_WARNINGS \ 77 _Pragma("clang diagnostic push") \ 78 _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"") 79 # define ConvertUTF_RESTORE_WARNINGS \ 80 _Pragma("clang diagnostic pop") 81 # endif 82 #elif defined(__GNUC__) && __GNUC__ > 6 83 # define ConvertUTF_DISABLE_WARNINGS \ 84 _Pragma("GCC diagnostic push") \ 85 _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") 86 # define ConvertUTF_RESTORE_WARNINGS \ 87 _Pragma("GCC diagnostic pop") 88 #endif 89 #ifndef ConvertUTF_DISABLE_WARNINGS 90 # define ConvertUTF_DISABLE_WARNINGS 91 #endif 92 #ifndef ConvertUTF_RESTORE_WARNINGS 93 # define ConvertUTF_RESTORE_WARNINGS 94 #endif 95 96 ConvertUTF_DISABLE_WARNINGS 97 98 namespace llvm { 99 100 static const int halfShift = 10; /* used for shifting by 10 bits */ 101 102 static const UTF32 halfBase = 0x0010000UL; 103 static const UTF32 halfMask = 0x3FFUL; 104 105 #define UNI_SUR_HIGH_START (UTF32)0xD800 106 #define UNI_SUR_HIGH_END (UTF32)0xDBFF 107 #define UNI_SUR_LOW_START (UTF32)0xDC00 108 #define UNI_SUR_LOW_END (UTF32)0xDFFF 109 110 /* --------------------------------------------------------------------- */ 111 112 /* 113 * Index into the table below with the first byte of a UTF-8 sequence to 114 * get the number of trailing bytes that are supposed to follow it. 115 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 116 * left as-is for anyone who may want to do such conversion, which was 117 * allowed in earlier algorithms. 118 */ 119 static const char trailingBytesForUTF8[256] = { 120 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 121 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 122 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 123 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 124 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 125 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 126 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 127 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 128 }; 129 130 /* 131 * Magic values subtracted from a buffer value during UTF8 conversion. 132 * This table contains as many values as there might be trailing bytes 133 * in a UTF-8 sequence. 134 */ 135 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 136 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 137 138 /* 139 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 140 * into the first byte, depending on how many bytes follow. There are 141 * as many entries in this table as there are UTF-8 sequence types. 142 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 143 * for *legal* UTF-8 will be 4 or fewer bytes total. 144 */ 145 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 146 147 /* --------------------------------------------------------------------- */ 148 149 /* The interface converts a whole buffer to avoid function-call overhead. 150 * Constants have been gathered. Loops & conditionals have been removed as 151 * much as possible for efficiency, in favor of drop-through switches. 152 * (See "Note A" at the bottom of the file for equivalent code.) 153 * If your compiler supports it, the "isLegalUTF8" call can be turned 154 * into an inline function. 155 */ 156 157 158 /* --------------------------------------------------------------------- */ 159 160 ConversionResult ConvertUTF32toUTF16 ( 161 const UTF32** sourceStart, const UTF32* sourceEnd, 162 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 163 ConversionResult result = conversionOK; 164 const UTF32* source = *sourceStart; 165 UTF16* target = *targetStart; 166 while (source < sourceEnd) { 167 UTF32 ch; 168 if (target >= targetEnd) { 169 result = targetExhausted; break; 170 } 171 ch = *source++; 172 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 173 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 174 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 175 if (flags == strictConversion) { 176 --source; /* return to the illegal value itself */ 177 result = sourceIllegal; 178 break; 179 } else { 180 *target++ = UNI_REPLACEMENT_CHAR; 181 } 182 } else { 183 *target++ = (UTF16)ch; /* normal case */ 184 } 185 } else if (ch > UNI_MAX_LEGAL_UTF32) { 186 if (flags == strictConversion) { 187 result = sourceIllegal; 188 } else { 189 *target++ = UNI_REPLACEMENT_CHAR; 190 } 191 } else { 192 /* target is a character in range 0xFFFF - 0x10FFFF. */ 193 if (target + 1 >= targetEnd) { 194 --source; /* Back up source pointer! */ 195 result = targetExhausted; break; 196 } 197 ch -= halfBase; 198 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 199 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 200 } 201 } 202 *sourceStart = source; 203 *targetStart = target; 204 return result; 205 } 206 207 /* --------------------------------------------------------------------- */ 208 209 ConversionResult ConvertUTF16toUTF32 ( 210 const UTF16** sourceStart, const UTF16* sourceEnd, 211 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 212 ConversionResult result = conversionOK; 213 const UTF16* source = *sourceStart; 214 UTF32* target = *targetStart; 215 UTF32 ch, ch2; 216 while (source < sourceEnd) { 217 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 218 ch = *source++; 219 /* If we have a surrogate pair, convert to UTF32 first. */ 220 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 221 /* If the 16 bits following the high surrogate are in the source buffer... */ 222 if (source < sourceEnd) { 223 ch2 = *source; 224 /* If it's a low surrogate, convert to UTF32. */ 225 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 226 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 227 + (ch2 - UNI_SUR_LOW_START) + halfBase; 228 ++source; 229 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 230 --source; /* return to the illegal value itself */ 231 result = sourceIllegal; 232 break; 233 } 234 } else { /* We don't have the 16 bits following the high surrogate. */ 235 --source; /* return to the high surrogate */ 236 result = sourceExhausted; 237 break; 238 } 239 } else if (flags == strictConversion) { 240 /* UTF-16 surrogate values are illegal in UTF-32 */ 241 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 242 --source; /* return to the illegal value itself */ 243 result = sourceIllegal; 244 break; 245 } 246 } 247 if (target >= targetEnd) { 248 source = oldSource; /* Back up source pointer! */ 249 result = targetExhausted; break; 250 } 251 *target++ = ch; 252 } 253 *sourceStart = source; 254 *targetStart = target; 255 #ifdef CVTUTF_DEBUG 256 if (result == sourceIllegal) { 257 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 258 fflush(stderr); 259 } 260 #endif 261 return result; 262 } 263 ConversionResult ConvertUTF16toUTF8 ( 264 const UTF16** sourceStart, const UTF16* sourceEnd, 265 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 266 ConversionResult result = conversionOK; 267 const UTF16* source = *sourceStart; 268 UTF8* target = *targetStart; 269 while (source < sourceEnd) { 270 UTF32 ch; 271 unsigned short bytesToWrite = 0; 272 const UTF32 byteMask = 0xBF; 273 const UTF32 byteMark = 0x80; 274 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 275 ch = *source++; 276 /* If we have a surrogate pair, convert to UTF32 first. */ 277 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 278 /* If the 16 bits following the high surrogate are in the source buffer... */ 279 if (source < sourceEnd) { 280 UTF32 ch2 = *source; 281 /* If it's a low surrogate, convert to UTF32. */ 282 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 283 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 284 + (ch2 - UNI_SUR_LOW_START) + halfBase; 285 ++source; 286 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 287 --source; /* return to the illegal value itself */ 288 result = sourceIllegal; 289 break; 290 } 291 } else { /* We don't have the 16 bits following the high surrogate. */ 292 --source; /* return to the high surrogate */ 293 result = sourceExhausted; 294 break; 295 } 296 } else if (flags == strictConversion) { 297 /* UTF-16 surrogate values are illegal in UTF-32 */ 298 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 299 --source; /* return to the illegal value itself */ 300 result = sourceIllegal; 301 break; 302 } 303 } 304 /* Figure out how many bytes the result will require */ 305 if (ch < (UTF32)0x80) { bytesToWrite = 1; 306 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 307 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 308 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 309 } else { bytesToWrite = 3; 310 ch = UNI_REPLACEMENT_CHAR; 311 } 312 313 target += bytesToWrite; 314 if (target > targetEnd) { 315 source = oldSource; /* Back up source pointer! */ 316 target -= bytesToWrite; result = targetExhausted; break; 317 } 318 switch (bytesToWrite) { /* note: everything falls through. */ 319 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 320 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 321 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 322 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 323 } 324 target += bytesToWrite; 325 } 326 *sourceStart = source; 327 *targetStart = target; 328 return result; 329 } 330 331 /* --------------------------------------------------------------------- */ 332 333 ConversionResult ConvertUTF32toUTF8 ( 334 const UTF32** sourceStart, const UTF32* sourceEnd, 335 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 336 ConversionResult result = conversionOK; 337 const UTF32* source = *sourceStart; 338 UTF8* target = *targetStart; 339 while (source < sourceEnd) { 340 UTF32 ch; 341 unsigned short bytesToWrite = 0; 342 const UTF32 byteMask = 0xBF; 343 const UTF32 byteMark = 0x80; 344 ch = *source++; 345 if (flags == strictConversion ) { 346 /* UTF-16 surrogate values are illegal in UTF-32 */ 347 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 348 --source; /* return to the illegal value itself */ 349 result = sourceIllegal; 350 break; 351 } 352 } 353 /* 354 * Figure out how many bytes the result will require. Turn any 355 * illegally large UTF32 things (> Plane 17) into replacement chars. 356 */ 357 if (ch < (UTF32)0x80) { bytesToWrite = 1; 358 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 359 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 360 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 361 } else { bytesToWrite = 3; 362 ch = UNI_REPLACEMENT_CHAR; 363 result = sourceIllegal; 364 } 365 366 target += bytesToWrite; 367 if (target > targetEnd) { 368 --source; /* Back up source pointer! */ 369 target -= bytesToWrite; result = targetExhausted; break; 370 } 371 switch (bytesToWrite) { /* note: everything falls through. */ 372 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 373 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 374 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 375 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 376 } 377 target += bytesToWrite; 378 } 379 *sourceStart = source; 380 *targetStart = target; 381 return result; 382 } 383 384 /* --------------------------------------------------------------------- */ 385 386 /* 387 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 388 * This must be called with the length pre-determined by the first byte. 389 * If not calling this from ConvertUTF8to*, then the length can be set by: 390 * length = trailingBytesForUTF8[*source]+1; 391 * and the sequence is illegal right away if there aren't that many bytes 392 * available. 393 * If presented with a length > 4, this returns false. The Unicode 394 * definition of UTF-8 goes up to 4-byte sequences. 395 */ 396 397 static Boolean isLegalUTF8(const UTF8 *source, int length) { 398 UTF8 a; 399 const UTF8 *srcptr = source+length; 400 switch (length) { 401 default: return false; 402 /* Everything else falls through when "true"... */ 403 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 404 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 405 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 406 407 switch (*source) { 408 /* no fall-through in this inner switch */ 409 case 0xE0: if (a < 0xA0) return false; break; 410 case 0xED: if (a > 0x9F) return false; break; 411 case 0xF0: if (a < 0x90) return false; break; 412 case 0xF4: if (a > 0x8F) return false; break; 413 default: if (a < 0x80) return false; 414 } 415 416 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 417 } 418 if (*source > 0xF4) return false; 419 return true; 420 } 421 422 /* --------------------------------------------------------------------- */ 423 424 /* 425 * Exported function to return whether a UTF-8 sequence is legal or not. 426 * This is not used here; it's just exported. 427 */ 428 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 429 int length = trailingBytesForUTF8[*source]+1; 430 if (length > sourceEnd - source) { 431 return false; 432 } 433 return isLegalUTF8(source, length); 434 } 435 436 /* 437 * Exported function to return the size of the first utf-8 code unit sequence, 438 * Or 0 if the sequence is not valid; 439 */ 440 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { 441 int length = trailingBytesForUTF8[*source] + 1; 442 return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length 443 : 0; 444 } 445 446 /* --------------------------------------------------------------------- */ 447 448 static unsigned 449 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, 450 const UTF8 *sourceEnd) { 451 UTF8 b1, b2, b3; 452 453 assert(!isLegalUTF8Sequence(source, sourceEnd)); 454 455 /* 456 * Unicode 6.3.0, D93b: 457 * 458 * Maximal subpart of an ill-formed subsequence: The longest code unit 459 * subsequence starting at an unconvertible offset that is either: 460 * a. the initial subsequence of a well-formed code unit sequence, or 461 * b. a subsequence of length one. 462 */ 463 464 if (source == sourceEnd) 465 return 0; 466 467 /* 468 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 469 * Byte Sequences. 470 */ 471 472 b1 = *source; 473 ++source; 474 if (b1 >= 0xC2 && b1 <= 0xDF) { 475 /* 476 * First byte is valid, but we know that this code unit sequence is 477 * invalid, so the maximal subpart has to end after the first byte. 478 */ 479 return 1; 480 } 481 482 if (source == sourceEnd) 483 return 1; 484 485 b2 = *source; 486 ++source; 487 488 if (b1 == 0xE0) { 489 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; 490 } 491 if (b1 >= 0xE1 && b1 <= 0xEC) { 492 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 493 } 494 if (b1 == 0xED) { 495 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; 496 } 497 if (b1 >= 0xEE && b1 <= 0xEF) { 498 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 499 } 500 if (b1 == 0xF0) { 501 if (b2 >= 0x90 && b2 <= 0xBF) { 502 if (source == sourceEnd) 503 return 2; 504 505 b3 = *source; 506 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 507 } 508 return 1; 509 } 510 if (b1 >= 0xF1 && b1 <= 0xF3) { 511 if (b2 >= 0x80 && b2 <= 0xBF) { 512 if (source == sourceEnd) 513 return 2; 514 515 b3 = *source; 516 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 517 } 518 return 1; 519 } 520 if (b1 == 0xF4) { 521 if (b2 >= 0x80 && b2 <= 0x8F) { 522 if (source == sourceEnd) 523 return 2; 524 525 b3 = *source; 526 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 527 } 528 return 1; 529 } 530 531 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); 532 /* 533 * There are no valid sequences that start with these bytes. Maximal subpart 534 * is defined to have length 1 in these cases. 535 */ 536 return 1; 537 } 538 539 /* --------------------------------------------------------------------- */ 540 541 /* 542 * Exported function to return the total number of bytes in a codepoint 543 * represented in UTF-8, given the value of the first byte. 544 */ 545 unsigned getNumBytesForUTF8(UTF8 first) { 546 return trailingBytesForUTF8[first] + 1; 547 } 548 549 /* --------------------------------------------------------------------- */ 550 551 /* 552 * Exported function to return whether a UTF-8 string is legal or not. 553 * This is not used here; it's just exported. 554 */ 555 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { 556 while (*source != sourceEnd) { 557 int length = trailingBytesForUTF8[**source] + 1; 558 if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) 559 return false; 560 *source += length; 561 } 562 return true; 563 } 564 565 /* --------------------------------------------------------------------- */ 566 567 ConversionResult ConvertUTF8toUTF16 ( 568 const UTF8** sourceStart, const UTF8* sourceEnd, 569 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 570 ConversionResult result = conversionOK; 571 const UTF8* source = *sourceStart; 572 UTF16* target = *targetStart; 573 while (source < sourceEnd) { 574 UTF32 ch = 0; 575 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 576 if (extraBytesToRead >= sourceEnd - source) { 577 result = sourceExhausted; break; 578 } 579 /* Do this check whether lenient or strict */ 580 if (!isLegalUTF8(source, extraBytesToRead+1)) { 581 result = sourceIllegal; 582 break; 583 } 584 /* 585 * The cases all fall through. See "Note A" below. 586 */ 587 switch (extraBytesToRead) { 588 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 589 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 590 case 3: ch += *source++; ch <<= 6; 591 case 2: ch += *source++; ch <<= 6; 592 case 1: ch += *source++; ch <<= 6; 593 case 0: ch += *source++; 594 } 595 ch -= offsetsFromUTF8[extraBytesToRead]; 596 597 if (target >= targetEnd) { 598 source -= (extraBytesToRead+1); /* Back up source pointer! */ 599 result = targetExhausted; break; 600 } 601 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 602 /* UTF-16 surrogate values are illegal in UTF-32 */ 603 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 604 if (flags == strictConversion) { 605 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 606 result = sourceIllegal; 607 break; 608 } else { 609 *target++ = UNI_REPLACEMENT_CHAR; 610 } 611 } else { 612 *target++ = (UTF16)ch; /* normal case */ 613 } 614 } else if (ch > UNI_MAX_UTF16) { 615 if (flags == strictConversion) { 616 result = sourceIllegal; 617 source -= (extraBytesToRead+1); /* return to the start */ 618 break; /* Bail out; shouldn't continue */ 619 } else { 620 *target++ = UNI_REPLACEMENT_CHAR; 621 } 622 } else { 623 /* target is a character in range 0xFFFF - 0x10FFFF. */ 624 if (target + 1 >= targetEnd) { 625 source -= (extraBytesToRead+1); /* Back up source pointer! */ 626 result = targetExhausted; break; 627 } 628 ch -= halfBase; 629 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 630 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 631 } 632 } 633 *sourceStart = source; 634 *targetStart = target; 635 return result; 636 } 637 638 /* --------------------------------------------------------------------- */ 639 640 static ConversionResult ConvertUTF8toUTF32Impl( 641 const UTF8** sourceStart, const UTF8* sourceEnd, 642 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, 643 Boolean InputIsPartial) { 644 ConversionResult result = conversionOK; 645 const UTF8* source = *sourceStart; 646 UTF32* target = *targetStart; 647 while (source < sourceEnd) { 648 UTF32 ch = 0; 649 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 650 if (extraBytesToRead >= sourceEnd - source) { 651 if (flags == strictConversion || InputIsPartial) { 652 result = sourceExhausted; 653 break; 654 } else { 655 result = sourceIllegal; 656 657 /* 658 * Replace the maximal subpart of ill-formed sequence with 659 * replacement character. 660 */ 661 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 662 sourceEnd); 663 *target++ = UNI_REPLACEMENT_CHAR; 664 continue; 665 } 666 } 667 if (target >= targetEnd) { 668 result = targetExhausted; break; 669 } 670 671 /* Do this check whether lenient or strict */ 672 if (!isLegalUTF8(source, extraBytesToRead+1)) { 673 result = sourceIllegal; 674 if (flags == strictConversion) { 675 /* Abort conversion. */ 676 break; 677 } else { 678 /* 679 * Replace the maximal subpart of ill-formed sequence with 680 * replacement character. 681 */ 682 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 683 sourceEnd); 684 *target++ = UNI_REPLACEMENT_CHAR; 685 continue; 686 } 687 } 688 /* 689 * The cases all fall through. See "Note A" below. 690 */ 691 switch (extraBytesToRead) { 692 case 5: ch += *source++; ch <<= 6; 693 case 4: ch += *source++; ch <<= 6; 694 case 3: ch += *source++; ch <<= 6; 695 case 2: ch += *source++; ch <<= 6; 696 case 1: ch += *source++; ch <<= 6; 697 case 0: ch += *source++; 698 } 699 ch -= offsetsFromUTF8[extraBytesToRead]; 700 701 if (ch <= UNI_MAX_LEGAL_UTF32) { 702 /* 703 * UTF-16 surrogate values are illegal in UTF-32, and anything 704 * over Plane 17 (> 0x10FFFF) is illegal. 705 */ 706 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 707 if (flags == strictConversion) { 708 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 709 result = sourceIllegal; 710 break; 711 } else { 712 *target++ = UNI_REPLACEMENT_CHAR; 713 } 714 } else { 715 *target++ = ch; 716 } 717 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 718 result = sourceIllegal; 719 *target++ = UNI_REPLACEMENT_CHAR; 720 } 721 } 722 *sourceStart = source; 723 *targetStart = target; 724 return result; 725 } 726 727 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, 728 const UTF8 *sourceEnd, 729 UTF32 **targetStart, 730 UTF32 *targetEnd, 731 ConversionFlags flags) { 732 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 733 flags, /*InputIsPartial=*/true); 734 } 735 736 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, 737 const UTF8 *sourceEnd, UTF32 **targetStart, 738 UTF32 *targetEnd, ConversionFlags flags) { 739 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 740 flags, /*InputIsPartial=*/false); 741 } 742 743 /* --------------------------------------------------------------------- 744 745 Note A. 746 The fall-through switches in UTF-8 reading code save a 747 temp variable, some decrements & conditionals. The switches 748 are equivalent to the following loop: 749 { 750 int tmpBytesToRead = extraBytesToRead+1; 751 do { 752 ch += *source++; 753 --tmpBytesToRead; 754 if (tmpBytesToRead) ch <<= 6; 755 } while (tmpBytesToRead > 0); 756 } 757 In UTF-8 writing code, the switches on "bytesToWrite" are 758 similarly unrolled loops. 759 760 --------------------------------------------------------------------- */ 761 762 } // namespace llvm 763 764 ConvertUTF_RESTORE_WARNINGS 765