Lines Matching +full:8 +full:- +full:ch

1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===------------------------------------------------------------------------=*/
9 * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
46 /* ---------------------------------------------------------------------
48 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
57 Jan 2004: updated switches in from-UTF8 conversions.
58 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
62 ------------------------------------------------------------------------ */
71 * This code extensively uses fall-through switches.
75 # if __has_warning("-Wimplicit-fallthrough")
78 _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
85 _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
110 /* --------------------------------------------------------------------- */
113 * Index into the table below with the first byte of a UTF-8 sequence to
115 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
116 * left as-is for anyone who may want to do such conversion, which was
133 * in a UTF-8 sequence.
139 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
141 * as many entries in this table as there are UTF-8 sequence types.
143 * for *legal* UTF-8 will be 4 or fewer bytes total.
147 /* --------------------------------------------------------------------- */
149 /* The interface converts a whole buffer to avoid function-call overhead.
151 * much as possible for efficiency, in favor of drop-through switches.
158 /* --------------------------------------------------------------------- */
167 UTF32 ch; in ConvertUTF32toUTF16() local
171 ch = *source++; in ConvertUTF32toUTF16()
172 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ in ConvertUTF32toUTF16()
173 … /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ in ConvertUTF32toUTF16()
174 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF32toUTF16()
176 --source; /* return to the illegal value itself */ in ConvertUTF32toUTF16()
183 *target++ = (UTF16)ch; /* normal case */ in ConvertUTF32toUTF16()
185 } else if (ch > UNI_MAX_LEGAL_UTF32) { in ConvertUTF32toUTF16()
192 /* target is a character in range 0xFFFF - 0x10FFFF. */ in ConvertUTF32toUTF16()
194 --source; /* Back up source pointer! */ in ConvertUTF32toUTF16()
197 ch -= halfBase; in ConvertUTF32toUTF16()
198 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); in ConvertUTF32toUTF16()
199 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); in ConvertUTF32toUTF16()
207 /* --------------------------------------------------------------------- */
215 UTF32 ch, ch2; in ConvertUTF16toUTF32() local
218 ch = *source++; in ConvertUTF16toUTF32()
220 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { in ConvertUTF16toUTF32()
226 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) in ConvertUTF16toUTF32()
227 + (ch2 - UNI_SUR_LOW_START) + halfBase; in ConvertUTF16toUTF32()
230 --source; /* return to the illegal value itself */ in ConvertUTF16toUTF32()
235 --source; /* return to the high surrogate */ in ConvertUTF16toUTF32()
240 /* UTF-16 surrogate values are illegal in UTF-32 */ in ConvertUTF16toUTF32()
241 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF16toUTF32()
242 --source; /* return to the illegal value itself */ in ConvertUTF16toUTF32()
251 *target++ = ch; in ConvertUTF16toUTF32()
257 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); in ConvertUTF16toUTF32()
270 UTF32 ch; in ConvertUTF16toUTF8() local
275 ch = *source++; in ConvertUTF16toUTF8()
277 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { in ConvertUTF16toUTF8()
283 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) in ConvertUTF16toUTF8()
284 + (ch2 - UNI_SUR_LOW_START) + halfBase; in ConvertUTF16toUTF8()
287 --source; /* return to the illegal value itself */ in ConvertUTF16toUTF8()
292 --source; /* return to the high surrogate */ in ConvertUTF16toUTF8()
297 /* UTF-16 surrogate values are illegal in UTF-32 */ in ConvertUTF16toUTF8()
298 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF16toUTF8()
299 --source; /* return to the illegal value itself */ in ConvertUTF16toUTF8()
305 if (ch < (UTF32)0x80) { bytesToWrite = 1; in ConvertUTF16toUTF8()
306 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; in ConvertUTF16toUTF8()
307 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; in ConvertUTF16toUTF8()
308 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; in ConvertUTF16toUTF8()
310 ch = UNI_REPLACEMENT_CHAR; in ConvertUTF16toUTF8()
316 target -= bytesToWrite; result = targetExhausted; break; in ConvertUTF16toUTF8()
319 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF16toUTF8()
320 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF16toUTF8()
321 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF16toUTF8()
322 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); in ConvertUTF16toUTF8()
331 /* --------------------------------------------------------------------- */
340 UTF32 ch; in ConvertUTF32toUTF8() local
344 ch = *source++; in ConvertUTF32toUTF8()
346 /* UTF-16 surrogate values are illegal in UTF-32 */ in ConvertUTF32toUTF8()
347 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF32toUTF8()
348 --source; /* return to the illegal value itself */ in ConvertUTF32toUTF8()
357 if (ch < (UTF32)0x80) { bytesToWrite = 1; in ConvertUTF32toUTF8()
358 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; in ConvertUTF32toUTF8()
359 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; in ConvertUTF32toUTF8()
360 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; in ConvertUTF32toUTF8()
362 ch = UNI_REPLACEMENT_CHAR; in ConvertUTF32toUTF8()
368 --source; /* Back up source pointer! */ in ConvertUTF32toUTF8()
369 target -= bytesToWrite; result = targetExhausted; break; in ConvertUTF32toUTF8()
372 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF32toUTF8()
373 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF32toUTF8()
374 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; in ConvertUTF32toUTF8()
375 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); in ConvertUTF32toUTF8()
384 /* --------------------------------------------------------------------- */
387 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
388 * This must be called with the length pre-determined by the first byte.
394 * definition of UTF-8 goes up to 4-byte sequences.
403 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; in isLegalUTF8()
404 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; in isLegalUTF8()
405 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; in isLegalUTF8()
408 /* no fall-through in this inner switch */ in isLegalUTF8()
422 /* --------------------------------------------------------------------- */
425 * Exported function to return whether a UTF-8 sequence is legal or not.
430 if (length > sourceEnd - source) { in isLegalUTF8Sequence()
437 * Exported function to return the size of the first utf-8 code unit sequence,
442 return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length in getUTF8SequenceSize()
446 /* --------------------------------------------------------------------- */
458 * Maximal subpart of an ill-formed subsequence: The longest code unit in findMaximalSubpartOfIllFormedUTF8Sequence()
460 * a. the initial subsequence of a well-formed code unit sequence, or in findMaximalSubpartOfIllFormedUTF8Sequence()
468 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 in findMaximalSubpartOfIllFormedUTF8Sequence()
539 /* --------------------------------------------------------------------- */
543 * represented in UTF-8, given the value of the first byte.
549 /* --------------------------------------------------------------------- */
552 * Exported function to return whether a UTF-8 string is legal or not.
558 if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) in isLegalUTF8String()
565 /* --------------------------------------------------------------------- */
574 UTF32 ch = 0; in ConvertUTF8toUTF16() local
576 if (extraBytesToRead >= sourceEnd - source) { in ConvertUTF8toUTF16()
588 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ in ConvertUTF8toUTF16()
589 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ in ConvertUTF8toUTF16()
590 case 3: ch += *source++; ch <<= 6; in ConvertUTF8toUTF16()
591 case 2: ch += *source++; ch <<= 6; in ConvertUTF8toUTF16()
592 case 1: ch += *source++; ch <<= 6; in ConvertUTF8toUTF16()
593 case 0: ch += *source++; in ConvertUTF8toUTF16()
595 ch -= offsetsFromUTF8[extraBytesToRead]; in ConvertUTF8toUTF16()
598 source -= (extraBytesToRead+1); /* Back up source pointer! */ in ConvertUTF8toUTF16()
601 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ in ConvertUTF8toUTF16()
602 /* UTF-16 surrogate values are illegal in UTF-32 */ in ConvertUTF8toUTF16()
603 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF8toUTF16()
605 source -= (extraBytesToRead+1); /* return to the illegal value itself */ in ConvertUTF8toUTF16()
612 *target++ = (UTF16)ch; /* normal case */ in ConvertUTF8toUTF16()
614 } else if (ch > UNI_MAX_UTF16) { in ConvertUTF8toUTF16()
617 source -= (extraBytesToRead+1); /* return to the start */ in ConvertUTF8toUTF16()
623 /* target is a character in range 0xFFFF - 0x10FFFF. */ in ConvertUTF8toUTF16()
625 source -= (extraBytesToRead+1); /* Back up source pointer! */ in ConvertUTF8toUTF16()
628 ch -= halfBase; in ConvertUTF8toUTF16()
629 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); in ConvertUTF8toUTF16()
630 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); in ConvertUTF8toUTF16()
638 /* --------------------------------------------------------------------- */
648 UTF32 ch = 0; in ConvertUTF8toUTF32Impl() local
650 if (extraBytesToRead >= sourceEnd - source) { in ConvertUTF8toUTF32Impl()
658 * Replace the maximal subpart of ill-formed sequence with in ConvertUTF8toUTF32Impl()
679 * Replace the maximal subpart of ill-formed sequence with in ConvertUTF8toUTF32Impl()
692 case 5: ch += *source++; ch <<= 6; in ConvertUTF8toUTF32Impl()
693 case 4: ch += *source++; ch <<= 6; in ConvertUTF8toUTF32Impl()
694 case 3: ch += *source++; ch <<= 6; in ConvertUTF8toUTF32Impl()
695 case 2: ch += *source++; ch <<= 6; in ConvertUTF8toUTF32Impl()
696 case 1: ch += *source++; ch <<= 6; in ConvertUTF8toUTF32Impl()
697 case 0: ch += *source++; in ConvertUTF8toUTF32Impl()
699 ch -= offsetsFromUTF8[extraBytesToRead]; in ConvertUTF8toUTF32Impl()
701 if (ch <= UNI_MAX_LEGAL_UTF32) { in ConvertUTF8toUTF32Impl()
703 * UTF-16 surrogate values are illegal in UTF-32, and anything in ConvertUTF8toUTF32Impl()
706 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { in ConvertUTF8toUTF32Impl()
708 source -= (extraBytesToRead+1); /* return to the illegal value itself */ in ConvertUTF8toUTF32Impl()
715 *target++ = ch; in ConvertUTF8toUTF32Impl()
717 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ in ConvertUTF8toUTF32Impl()
743 /* ---------------------------------------------------------------------
746 The fall-through switches in UTF-8 reading code save a
752 ch += *source++;
753 --tmpBytesToRead;
754 if (tmpBytesToRead) ch <<= 6;
757 In UTF-8 writing code, the switches on "bytesToWrite" are
760 --------------------------------------------------------------------- */