/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/types.h>

#include <stddef.h>
#include <stdint.h>

/*
 * Flag bits accepted by the conversion functions.
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, i.e. it overlaps
 * UNICODE_DECOMPOSE | UNICODE_PRECOMPOSE, and utf8_to_utf16() tests it with
 * a plain `&'.  Passing either of the other two flags therefore also enables
 * the Latin-1 fallback.  The value is kept as is because it is part of the
 * public interface; confirm with callers before changing it to 0x04.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src to UTF-16 code units.
 *
 * dst/dst_len:	output buffer and its capacity in 16-bit units.  Units that
 *		do not fit (or all units, if dst is NULL) are counted but not
 *		stored.
 * flags:	if (flags & UNICODE_UTF8_LATIN1_FALLBACK) is non-zero, a byte
 *		>= 0xa0 that is not followed by a continuation byte is taken
 *		as an ISO 8859-1 character instead of being reported as an
 *		error.
 * errp:	if non-NULL, receives the number of malformed sequences that
 *		were skipped (truncated sequences, stray continuation bytes,
 *		overlong encodings, encoded surrogates, values > 0x10ffff).
 *
 * Returns the number of UTF-16 units the complete conversion produces, which
 * may be larger than dst_len.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/* bytes left in the input, counting s[spos]; always >= 1 */
		size_t avail = src_len - spos;

		if (s[spos] < 0x80)
			c = s[spos++];
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
			 && (avail < 2 || !IS_CONT(s[spos+1]))
			 && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two byte sequence; avail is checked before s[spos+1]
			   is touched so truncated input is never overread */
			if (avail < 2 || !IS_CONT(s[spos+1])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x1f) << 6)
				       | (s[spos+1] & 0x3f));
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three byte sequence */
			if (avail < 3
			    || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x0f) << 12)
				       | ((s[spos+1] & 0x3f) << 6)
				       | (s[spos+2] & 0x3f));
			spos += 3;
			/* 0xf800 selects exactly the range 0xd800..0xdfff */
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four byte sequence, becomes a UTF-16 surrogate pair */

			if (avail < 4 || !IS_CONT(s[spos+1])
			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
				spos++;
				error++;
				continue;
			}
			/* the lead byte carries three payload bits (0x07) */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (uint32_t)(s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cc -= 0x10000;
			/* high surrogate: top ten of the remaining 20 bits */
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low surrogate: bottom ten bits, stored below */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src to UTF-8.
 *
 * dst/dst_len:	output buffer and its capacity in bytes.  A character is
 *		only stored if its whole encoding fits; as soon as one does
 *		not fit (or if dst is NULL) nothing further is stored and the
 *		remaining output is merely counted.
 * flags:	not used by this function.
 * errp:	if non-NULL, receives the number of unpaired surrogates that
 *		were skipped.
 *
 * Returns the number of bytes the complete conversion produces, which may be
 * larger than dst_len.  No terminating NUL is added.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

/* Stop storing once the next l bytes would not fit; avoids dst_len-(l)
   wrapping around when dst_len < l. */
#define CHECK_LENGTH(l)							\
	do {								\
		if (dst_len < (size_t)(l)				\
		    || dpos > dst_len - (size_t)(l))			\
			dst = NULL;					\
	} while (0)
#define ADD_BYTE(b)							\
	do {								\
		if (dst != NULL)					\
			dst[dpos] = (char)(b);				\
		dpos++;							\
	} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos]>>6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first (high) surrogate */
			if (spos + 1 >= src_len
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine both halves before stepping over the pair */
			c = (((uint32_t)(src[spos] & 0x3ff) << 10)
			     | (uint32_t)(src[spos+1] & 0x3ff)) + 0x10000;
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c>>18));
			ADD_BYTE(0x80 | ((c>>12) & 0x3f));
			ADD_BYTE(0x80 | ((c>>6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos]>>12));
			ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}