/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/types.h>

/*
 * Values for the `flags' argument of the conversion functions below.
 * Only UNICODE_UTF8_LATIN1_FALLBACK is examined by the code in this file
 * (by utf8_to_utf16()).
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, i.e. it shares its
 * bits with the other two flags, so `flags & UNICODE_UTF8_LATIN1_FALLBACK'
 * is also non-zero when only UNICODE_DECOMPOSE or UNICODE_PRECOMPOSE is
 * passed.  The value is left untouched to keep the interface stable;
 * confirm whether 0x04 was intended.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer; may be NULL, in which case nothing is stored
 *          and only the length is computed
 * dst_len  capacity of dst in 16-bit units; units that do not fit are
 *          counted but not stored
 * src      input bytes (no terminator is required or looked for)
 * src_len  number of input bytes
 * flags    if any bit of UNICODE_UTF8_LATIN1_FALLBACK is set, a byte
 *          >= 0xa0 that is not followed by a continuation byte is taken
 *          as an ISO 8859-1 character instead of being reported
 * errp     if not NULL, receives the number of malformed sequences
 *
 * Malformed input (stray continuation bytes, truncated sequences, overlong
 * encodings, encoded surrogates, code points above U+10FFFF) is skipped
 * and counted in *errp.
 *
 * Returns the number of UTF-16 units the complete conversion produces,
 * which may be larger than dst_len.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos, avail;
	int error;
	uint16_t c;

#define IS_CONT(b)	(((b) & 0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/*
		 * Bytes left including the lead byte (always >= 1).
		 * Comparing against this instead of `src_len - k' avoids
		 * both unsigned underflow and reads past the input.
		 */
		avail = src_len - spos;

		if (s[spos] < 0x80)
			c = s[spos++];
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
			 && (avail < 2 || !IS_CONT(s[spos + 1]))
			 && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two byte sequence: 110xxxxx 10xxxxxx */
			if (avail < 2 || !IS_CONT(s[spos + 1])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x1f) << 6) | (s[spos + 1] & 0x3f);
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (avail < 3
			    || !IS_CONT(s[spos + 1]) || !IS_CONT(s[spos + 2])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x0f) << 12) | ((s[spos + 1] & 0x3f) << 6)
			    | (s[spos + 2] & 0x3f);
			spos += 3;
			/* 0xf800 selects exactly the range 0xd800..0xdfff */
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four byte sequence, becomes a UTF-16 surrogate pair */

			if (avail < 4 || !IS_CONT(s[spos + 1])
			    || !IS_CONT(s[spos + 2]) || !IS_CONT(s[spos + 3])) {
				spos++;
				error++;
				continue;
			}
			/* lead byte is 11110xxx: three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos + 1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos + 2] & 0x3f) << 6)
			    | (uint32_t)(s[spos + 3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cc -= 0x10000;
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low surrogate carries only the low ten bits */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst      output buffer; may be NULL, in which case nothing is stored
 *          and only the length is computed
 * dst_len  capacity of dst in bytes
 * src      input code units
 * src_len  number of input code units
 * flags    not examined by this function
 * errp     if not NULL, receives the number of unpaired surrogates
 *
 * An encoded character is stored only if all of its bytes fit; as soon as
 * one does not fit, nothing further is stored, but counting continues.
 * Unpaired surrogates are skipped and counted in *errp.  No terminator is
 * appended.
 *
 * Returns the number of bytes the complete conversion produces, which may
 * be larger than dst_len.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

	/* Stop storing once the next `l' bytes would not fit in dst. */
#define CHECK_LENGTH(l)							\
	do {								\
		if (dst != NULL &&					\
		    (dpos > dst_len || dst_len - dpos < (size_t)(l)))	\
			dst = NULL;					\
	} while (0)
	/* Store one byte if still storing; always count it. */
#define ADD_BYTE(b)							\
	do {								\
		if (dst != NULL)					\
			dst[dpos] = (char)(b);				\
		dpos++;							\
	} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first surrogate */
			if (src_len - spos < 2
			    || (src[spos + 1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine the pair before stepping over its first half */
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos + 1] & 0x3ff)) + 0x10000;
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}