ConvertUTF.cpp - OpenGrok cross reference for /freebsd/contrib/llvm-project/llvm/lib/Support/ConvertUTF.cpp

Lines Matching +full:8 +full:- +full:ch
1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  *===------------------------------------------------------------------------=*/
9  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
46 /* ---------------------------------------------------------------------
48     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
57     Jan 2004: updated switches in from-UTF8 conversions.
58     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
62 ------------------------------------------------------------------------ */
71  * This code extensively uses fall-through switches.
75 # if __has_warning("-Wimplicit-fallthrough")
78     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
85    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
110 /* --------------------------------------------------------------------- */
113  * Index into the table below with the first byte of a UTF-8 sequence to
115  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
116  * left as-is for anyone who may want to do such conversion, which was
133  * in a UTF-8 sequence.
139  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
141  * as many entries in this table as there are UTF-8 sequence types.
143  * for *legal* UTF-8 will be 4 or fewer bytes total.
147 /* --------------------------------------------------------------------- */
149 /* The interface converts a whole buffer to avoid function-call overhead.
151  * much as possible for efficiency, in favor of drop-through switches.
158 /* --------------------------------------------------------------------- */
167         UTF32 ch;  in ConvertUTF32toUTF16()  local
171         ch = *source++;  in ConvertUTF32toUTF16()
172         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */  in ConvertUTF32toUTF16()
173 …     /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */  in ConvertUTF32toUTF16()
174             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF32toUTF16()
176                     --source; /* return to the illegal value itself */  in ConvertUTF32toUTF16()
183                 *target++ = (UTF16)ch; /* normal case */  in ConvertUTF32toUTF16()
185         } else if (ch > UNI_MAX_LEGAL_UTF32) {  in ConvertUTF32toUTF16()
192             /* target is a character in range 0xFFFF - 0x10FFFF. */  in ConvertUTF32toUTF16()
194                 --source; /* Back up source pointer! */  in ConvertUTF32toUTF16()
197             ch -= halfBase;  in ConvertUTF32toUTF16()
198             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);  in ConvertUTF32toUTF16()
199             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);  in ConvertUTF32toUTF16()
207 /* --------------------------------------------------------------------- */
215     UTF32 ch, ch2;  in ConvertUTF16toUTF32()  local
218         ch = *source++;  in ConvertUTF16toUTF32()
220         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {  in ConvertUTF16toUTF32()
226                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)  in ConvertUTF16toUTF32()
227                         + (ch2 - UNI_SUR_LOW_START) + halfBase;  in ConvertUTF16toUTF32()
230                     --source; /* return to the illegal value itself */  in ConvertUTF16toUTF32()
235                 --source; /* return to the high surrogate */  in ConvertUTF16toUTF32()
240             /* UTF-16 surrogate values are illegal in UTF-32 */  in ConvertUTF16toUTF32()
241             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF16toUTF32()
242                 --source; /* return to the illegal value itself */  in ConvertUTF16toUTF32()
251         *target++ = ch;  in ConvertUTF16toUTF32()
257     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);  in ConvertUTF16toUTF32()
270         UTF32 ch;  in ConvertUTF16toUTF8()  local
275         ch = *source++;  in ConvertUTF16toUTF8()
277         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {  in ConvertUTF16toUTF8()
283                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)  in ConvertUTF16toUTF8()
284                         + (ch2 - UNI_SUR_LOW_START) + halfBase;  in ConvertUTF16toUTF8()
287                     --source; /* return to the illegal value itself */  in ConvertUTF16toUTF8()
292                 --source; /* return to the high surrogate */  in ConvertUTF16toUTF8()
297             /* UTF-16 surrogate values are illegal in UTF-32 */  in ConvertUTF16toUTF8()
298             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF16toUTF8()
299                 --source; /* return to the illegal value itself */  in ConvertUTF16toUTF8()
305         if (ch < (UTF32)0x80) {      bytesToWrite = 1;  in ConvertUTF16toUTF8()
306         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;  in ConvertUTF16toUTF8()
307         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;  in ConvertUTF16toUTF8()
308         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;  in ConvertUTF16toUTF8()
310                                             ch = UNI_REPLACEMENT_CHAR;  in ConvertUTF16toUTF8()
316             target -= bytesToWrite; result = targetExhausted; break;  in ConvertUTF16toUTF8()
319             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF16toUTF8()
320             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF16toUTF8()
321             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF16toUTF8()
322             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);  in ConvertUTF16toUTF8()
331 /* --------------------------------------------------------------------- */
340         UTF32 ch;  in ConvertUTF32toUTF8()  local
344         ch = *source++;  in ConvertUTF32toUTF8()
346             /* UTF-16 surrogate values are illegal in UTF-32 */  in ConvertUTF32toUTF8()
347             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF32toUTF8()
348                 --source; /* return to the illegal value itself */  in ConvertUTF32toUTF8()
357         if (ch < (UTF32)0x80) {      bytesToWrite = 1;  in ConvertUTF32toUTF8()
358         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;  in ConvertUTF32toUTF8()
359         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;  in ConvertUTF32toUTF8()
360         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;  in ConvertUTF32toUTF8()
362                                             ch = UNI_REPLACEMENT_CHAR;  in ConvertUTF32toUTF8()
368             --source; /* Back up source pointer! */  in ConvertUTF32toUTF8()
369             target -= bytesToWrite; result = targetExhausted; break;  in ConvertUTF32toUTF8()
372             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF32toUTF8()
373             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF32toUTF8()
374             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;  in ConvertUTF32toUTF8()
375             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);  in ConvertUTF32toUTF8()
384 /* --------------------------------------------------------------------- */
387  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
388  * This must be called with the length pre-determined by the first byte.
394  * definition of UTF-8 goes up to 4-byte sequences.
403     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  in isLegalUTF8()
404     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  in isLegalUTF8()
405     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  in isLegalUTF8()
408             /* no fall-through in this inner switch */  in isLegalUTF8()
422 /* --------------------------------------------------------------------- */
425  * Exported function to return whether a UTF-8 sequence is legal or not.
430     if (length > sourceEnd - source) {  in isLegalUTF8Sequence()
437  * Exported function to return the size of the first utf-8 code unit sequence,
442   return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length  in getUTF8SequenceSize()
446 /* --------------------------------------------------------------------- */
458    *   Maximal subpart of an ill-formed subsequence: The longest code unit  in findMaximalSubpartOfIllFormedUTF8Sequence()
460    *   a. the initial subsequence of a well-formed code unit sequence, or  in findMaximalSubpartOfIllFormedUTF8Sequence()
468    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8  in findMaximalSubpartOfIllFormedUTF8Sequence()
539 /* --------------------------------------------------------------------- */
543  * represented in UTF-8, given the value of the first byte.
549 /* --------------------------------------------------------------------- */
552  * Exported function to return whether a UTF-8 string is legal or not.
558         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))  in isLegalUTF8String()
565 /* --------------------------------------------------------------------- */
574         UTF32 ch = 0;  in ConvertUTF8toUTF16()  local
576         if (extraBytesToRead >= sourceEnd - source) {  in ConvertUTF8toUTF16()
588             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */  in ConvertUTF8toUTF16()
589             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */  in ConvertUTF8toUTF16()
590             case 3: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF16()
591             case 2: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF16()
592             case 1: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF16()
593             case 0: ch += *source++;  in ConvertUTF8toUTF16()
595         ch -= offsetsFromUTF8[extraBytesToRead];  in ConvertUTF8toUTF16()
598             source -= (extraBytesToRead+1); /* Back up source pointer! */  in ConvertUTF8toUTF16()
601         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */  in ConvertUTF8toUTF16()
602             /* UTF-16 surrogate values are illegal in UTF-32 */  in ConvertUTF8toUTF16()
603             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF8toUTF16()
605                     source -= (extraBytesToRead+1); /* return to the illegal value itself */  in ConvertUTF8toUTF16()
612                 *target++ = (UTF16)ch; /* normal case */  in ConvertUTF8toUTF16()
614         } else if (ch > UNI_MAX_UTF16) {  in ConvertUTF8toUTF16()
617                 source -= (extraBytesToRead+1); /* return to the start */  in ConvertUTF8toUTF16()
623             /* target is a character in range 0xFFFF - 0x10FFFF. */  in ConvertUTF8toUTF16()
625                 source -= (extraBytesToRead+1); /* Back up source pointer! */  in ConvertUTF8toUTF16()
628             ch -= halfBase;  in ConvertUTF8toUTF16()
629             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);  in ConvertUTF8toUTF16()
630             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);  in ConvertUTF8toUTF16()
638 /* --------------------------------------------------------------------- */
648         UTF32 ch = 0;  in ConvertUTF8toUTF32Impl()  local
650         if (extraBytesToRead >= sourceEnd - source) {  in ConvertUTF8toUTF32Impl()
658                  * Replace the maximal subpart of ill-formed sequence with  in ConvertUTF8toUTF32Impl()
679                  * Replace the maximal subpart of ill-formed sequence with  in ConvertUTF8toUTF32Impl()
692             case 5: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF32Impl()
693             case 4: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF32Impl()
694             case 3: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF32Impl()
695             case 2: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF32Impl()
696             case 1: ch += *source++; ch <<= 6;  in ConvertUTF8toUTF32Impl()
697             case 0: ch += *source++;  in ConvertUTF8toUTF32Impl()
699         ch -= offsetsFromUTF8[extraBytesToRead];  in ConvertUTF8toUTF32Impl()
701         if (ch <= UNI_MAX_LEGAL_UTF32) {  in ConvertUTF8toUTF32Impl()
703              * UTF-16 surrogate values are illegal in UTF-32, and anything  in ConvertUTF8toUTF32Impl()
706             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {  in ConvertUTF8toUTF32Impl()
708                     source -= (extraBytesToRead+1); /* return to the illegal value itself */  in ConvertUTF8toUTF32Impl()
715                 *target++ = ch;  in ConvertUTF8toUTF32Impl()
717         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */  in ConvertUTF8toUTF32Impl()
743 /* ---------------------------------------------------------------------
746     The fall-through switches in UTF-8 reading code save a
752                 ch += *source++;
753                 --tmpBytesToRead;
754                 if (tmpBytesToRead) ch <<= 6;
757     In UTF-8 writing code, the switches on "bytesToWrite" are
760    --------------------------------------------------------------------- */