1 /* 2 * Copyright (c) 2020 Proofpoint, Inc. and its suppliers. 3 * All rights reserved. 4 * 5 * By using this file, you agree to the terms and conditions set 6 * forth in the LICENSE file which can be found at the top level of 7 * the sendmail distribution. 8 * 9 */ 10 11 #include <sm/gen.h> 12 #include <sm/sendmail.h> 13 #include <sm/ixlen.h> 14 15 #if USE_EAI 16 17 /* 18 ** legal utf-8 byte sequence 19 ** http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 20 ** 21 ** Code Points 1st 2s 3s 4s 22 ** U+0000..U+007F 00..7F 23 ** U+0080..U+07FF C2..DF 80..BF 24 ** U+0800..U+0FFF E0 A0..BF 80..BF 25 ** U+1000..U+CFFF E1..EC 80..BF 80..BF 26 ** U+D000..U+D7FF ED 80..9F 80..BF 27 ** U+E000..U+FFFF EE..EF 80..BF 80..BF 28 ** U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 29 ** U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 30 ** U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 31 */ 32 33 /* 34 ** based on 35 ** https://github.com/lemire/fastvalidate-utf-8.git 36 ** which is distributed under an MIT license (besides others). 37 */ 38 39 bool 40 utf8_valid(b, length) 41 const char *b; 42 size_t length; 43 { 44 const unsigned char *bytes; 45 size_t index; 46 47 bytes = (const unsigned char *)b; 48 index = 0; 49 while (true) 50 { 51 unsigned char byte1; 52 53 do { /* fast ASCII Path */ 54 if (index >= length) 55 return true; 56 byte1 = bytes[index++]; 57 } while (byte1 < 0x80); 58 if (byte1 < 0xE0) 59 { 60 /* Two-byte form. */ 61 if (index == length) 62 return false; 63 if (byte1 < 0xC2 || bytes[index++] > 0xBF) 64 return false; 65 } 66 else if (byte1 < 0xF0) 67 { 68 /* Three-byte form. */ 69 if (index + 1 >= length) 70 return false; 71 unsigned char byte2 = bytes[index++]; 72 if (byte2 > 0xBF 73 /* Overlong? 5 most significant bits must not all be zero. */ 74 || (byte1 == 0xE0 && byte2 < 0xA0) 75 /* Check for illegal surrogate codepoints. */ 76 || (byte1 == 0xED && 0xA0 <= byte2) 77 /* Third byte trailing-byte test. */ 78 || bytes[index++] > 0xBF) 79 return false; 80 } 81 else 82 { 83 84 /* Four-byte form. */ 85 if (index + 2 >= length) 86 return false; 87 int byte2 = bytes[index++]; 88 if (byte2 > 0xBF 89 /* Check that 1 <= plane <= 16. Tricky optimized form of: */ 90 /* if (byte1 > (byte) 0xF4 */ 91 /* || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 */ 92 /* || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) */ 93 || (((byte1 << 28) + (byte2 - 0x90)) >> 30) != 0 94 /* Third byte trailing-byte test */ 95 || bytes[index++] > 0xBF 96 /* Fourth byte trailing-byte test */ 97 || bytes[index++] > 0xBF) 98 return false; 99 } 100 } 101 /* NOTREACHED */ 102 return false; 103 } 104 #endif /* USE_EAI */ 105