/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>

/*
 * Conversion flags.  Every flag occupies its own bit so that flags can be
 * OR-ed together and tested independently.  The two converters in this
 * file only consult UNICODE_UTF8_LATIN1_FALLBACK.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x04

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst/dst_len	output buffer and its capacity in 16-bit units; dst may be
 *		a null pointer to only compute the required length.
 * flags	if UNICODE_UTF8_LATIN1_FALLBACK is set, a byte >= 0xa0 that
 *		is not followed by a continuation byte is taken to be an
 *		ISO 8859-1 character instead of being reported as an error.
 * errp		if not null, receives the number of invalid sequences that
 *		were skipped (stray continuation bytes, truncated or
 *		overlong sequences, encoded surrogates, values > U+10FFFF).
 *
 * Returns the number of UTF-16 units the converted text needs.  Units that
 * do not fit into dst are counted but not stored, so the return value may
 * exceed dst_len.  No byte at or beyond src + src_len is ever read.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
    const char *src, size_t src_len,
    int flags, int *errp)
{
	const unsigned char *s = (const unsigned char *)src;
	size_t spos = 0, dpos = 0;
	int error = 0;

#define IS_CONT(b)	(((b) & 0xc0) == 0x80)
/* Store one UTF-16 unit if there is room; always count it. */
#define PUT_UNIT(u)	do {						\
		if (dst && dpos < dst_len)				\
			dst[dpos] = (uint16_t)(u);			\
		dpos++;							\
	} while (0)

	while (spos < src_len) {
		const unsigned char lead = s[spos];
		/* bytes left including the lead byte; always >= 1 here */
		const size_t avail = src_len - spos;
		uint32_t cp;

		if (lead < 0x80) {
			PUT_UNIT(lead);
			spos++;
			continue;
		}

		if ((flags & UNICODE_UTF8_LATIN1_FALLBACK) && lead >= 0xa0
		    && (avail < 2 || !IS_CONT(s[spos + 1]))) {
			/* not valid UTF-8, assume ISO 8859-1 */
			PUT_UNIT(lead);
			spos++;
			continue;
		}

		if (lead < 0xc0 || lead >= 0xf5) {
			/*
			 * continuation byte without lead byte
			 * or lead byte for codepoint above 0x10ffff
			 */
			error++;
			spos++;
			continue;
		}

		if (lead < 0xe0) {
			/* two-byte sequence: 110xxxxx 10xxxxxx */
			if (avail < 2 || !IS_CONT(s[spos + 1])) {
				spos++;
				error++;
				continue;
			}
			cp = ((uint32_t)(lead & 0x1f) << 6)
			    | (s[spos + 1] & 0x3f);
			spos += 2;
			if (cp < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
			PUT_UNIT(cp);
		} else if (lead < 0xf0) {
			/* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (avail < 3
			    || !IS_CONT(s[spos + 1]) || !IS_CONT(s[spos + 2])) {
				spos++;
				error++;
				continue;
			}
			cp = ((uint32_t)(lead & 0x0f) << 12)
			    | ((uint32_t)(s[spos + 1] & 0x3f) << 6)
			    | (s[spos + 2] & 0x3f);
			spos += 3;
			if (cp < 0x800 || (cp & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
			PUT_UNIT(cp);
		} else {
			/* four-byte sequence, becomes a surrogate pair */
			if (avail < 4 || !IS_CONT(s[spos + 1])
			    || !IS_CONT(s[spos + 2]) || !IS_CONT(s[spos + 3])) {
				spos++;
				error++;
				continue;
			}
			cp = ((uint32_t)(lead & 0x07) << 18)
			    | ((uint32_t)(s[spos + 1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos + 2] & 0x3f) << 6)
			    | (s[spos + 3] & 0x3f);
			spos += 4;
			if (cp < 0x10000 || cp > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cp -= 0x10000;
			PUT_UNIT(0xd800 | (cp >> 10));
			PUT_UNIT(0xdc00 | (cp & 0x3ff));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef PUT_UNIT
#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst/dst_len	output buffer and its capacity in bytes; dst may be a null
 *		pointer to only compute the required length.
 * flags	currently unused.
 * errp		if not null, receives the number of unpaired surrogates
 *		that were skipped.
 *
 * Returns the number of bytes the converted text needs.  As soon as one
 * character does not fit completely into dst, nothing further is stored
 * (characters are never written partially) but counting continues, so the
 * return value may exceed dst_len.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
    const uint16_t *src, size_t src_len,
    int flags, int *errp)
{
	char *out = dst;
	size_t spos, dpos = 0;
	int error = 0;

	(void)flags;

	for (spos = 0; spos < src_len; spos++) {
		uint32_t cp = src[spos];
		unsigned char buf[4];
		size_t n, i;

		if ((cp & 0xfc00) == 0xd800) {
			/* first surrogate, needs a second one right after */
			if (spos + 1 >= src_len
			    || (src[spos + 1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			cp = (((cp & 0x3ff) << 10)
			    | (uint32_t)(src[spos + 1] & 0x3ff)) + 0x10000;
			spos++;
		} else if ((cp & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
			continue;
		}

		if (cp < 0x80) {
			buf[0] = (unsigned char)cp;
			n = 1;
		} else if (cp < 0x800) {
			buf[0] = (unsigned char)(0xc0 | (cp >> 6));
			buf[1] = (unsigned char)(0x80 | (cp & 0x3f));
			n = 2;
		} else if (cp < 0x10000) {
			buf[0] = (unsigned char)(0xe0 | (cp >> 12));
			buf[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
			buf[2] = (unsigned char)(0x80 | (cp & 0x3f));
			n = 3;
		} else {
			buf[0] = (unsigned char)(0xf0 | (cp >> 18));
			buf[1] = (unsigned char)(0x80 | ((cp >> 12) & 0x3f));
			buf[2] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
			buf[3] = (unsigned char)(0x80 | (cp & 0x3f));
			n = 4;
		}

		/*
		 * While out is non-null every stored character fit, hence
		 * dpos <= dst_len and the subtraction cannot wrap.
		 */
		if (out && dst_len - dpos < n)
			out = 0;

		for (i = 0; i < n; i++) {
			if (out)
				out[dpos] = (char)buf[i];
			dpos++;
		}
	}

	if (errp)
		*errp = error;

	return dpos;
}