1*e72055b7SXin LI /* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2*e72055b7SXin LI
3*e72055b7SXin LI /*-
4*e72055b7SXin LI * Copyright (c) 2007 The NetBSD Foundation, Inc.
5*e72055b7SXin LI * All rights reserved.
6*e72055b7SXin LI *
7*e72055b7SXin LI * This code is derived from software contributed to The NetBSD Foundation
8*e72055b7SXin LI * by Dieter Baron.
9*e72055b7SXin LI *
10*e72055b7SXin LI * Redistribution and use in source and binary forms, with or without
11*e72055b7SXin LI * modification, are permitted provided that the following conditions
12*e72055b7SXin LI * are met:
13*e72055b7SXin LI * 1. Redistributions of source code must retain the above copyright
14*e72055b7SXin LI * notice, this list of conditions and the following disclaimer.
15*e72055b7SXin LI * 2. Redistributions in binary form must reproduce the above copyright
16*e72055b7SXin LI * notice, this list of conditions and the following disclaimer in the
17*e72055b7SXin LI * documentation and/or other materials provided with the distribution.
18*e72055b7SXin LI *
19*e72055b7SXin LI * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20*e72055b7SXin LI * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21*e72055b7SXin LI * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22*e72055b7SXin LI * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23*e72055b7SXin LI * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*e72055b7SXin LI * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*e72055b7SXin LI * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*e72055b7SXin LI * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*e72055b7SXin LI * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*e72055b7SXin LI * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*e72055b7SXin LI * POSSIBILITY OF SUCH DAMAGE.
30*e72055b7SXin LI */
31*e72055b7SXin LI
32*e72055b7SXin LI #include <sys/types.h>
33*e72055b7SXin LI
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
/*
 * NOTE(review): 0x03 shares bits with both flags above, so the
 * "flags & UNICODE_UTF8_LATIN1_FALLBACK" test in utf8_to_utf16() also
 * fires when only one of those flags is passed.  The value is left
 * unchanged to stay compatible with existing callers; confirm whether
 * 0x04 was intended.
 */
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer; may be NULL to only compute the length
 * dst_len  capacity of dst in 16-bit units
 * src      input bytes (no terminator is required or looked for)
 * src_len  number of input bytes
 * flags    only UNICODE_UTF8_LATIN1_FALLBACK is examined: a byte >= 0xa0
 *          that is not followed by a continuation byte is then taken
 *          as an ISO 8859-1 character instead of being an error
 * errp     if non-NULL, receives the number of malformed sequences
 *
 * Returns the number of code units the complete conversion produces.
 * Units that do not fit into dst are counted but not stored, so a
 * return value larger than dst_len indicates truncation.
 *
 * Malformed input (stray continuation bytes, truncated sequences,
 * overlong encodings, encoded surrogates, values above U+10FFFF) is
 * skipped and counted in *errp.  No byte at or beyond src + src_len is
 * ever read.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)
/* true if fewer than n bytes remain; spos < src_len, so no underflow */
#define IS_SHORT(n)	(src_len - spos < (size_t)(n))

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		if (s[spos] < 0x80)
			c = s[spos++];
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
			 && (IS_SHORT(2) || !IS_CONT(s[spos+1]))
			 && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two byte sequence: 110xxxxx 10xxxxxx */
			if (IS_SHORT(2) || !IS_CONT(s[spos+1])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x1f) << 6)
			    | (s[spos+1] & 0x3f));
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (IS_SHORT(3)
			    || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x0f) << 12)
			    | ((s[spos+1] & 0x3f) << 6)
			    | (s[spos+2] & 0x3f));
			spos += 3;
			/* 0xf800 selects exactly the range U+D800..U+DFFF */
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four byte sequence, becomes a UTF-16 surrogate pair */

			if (IS_SHORT(4) || !IS_CONT(s[spos+1])
			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
				spos++;
				error++;
				continue;
			}
			/* the lead byte 11110xxx carries three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (uint32_t)(s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cc -= 0x10000;
			/* high surrogate: upper ten bits */
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low surrogate: lower ten bits, stored below */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef IS_SHORT
#undef IS_CONT
}
139*e72055b7SXin LI
140*e72055b7SXin LI
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst      output buffer; may be NULL to only compute the length
 * dst_len  capacity of dst in bytes
 * src      input code units
 * src_len  number of input code units
 * flags    not examined by this function
 * errp     if non-NULL, receives the number of unpaired surrogates
 *
 * Returns the number of bytes the complete conversion produces.  As
 * soon as one character does not fit into the space left in dst,
 * storing stops for the remainder of the input while counting goes on,
 * so a return value larger than dst_len indicates truncation.  No
 * terminating NUL is written.
 *
 * Unpaired surrogates are skipped and counted in *errp.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

/* stop storing once the next l bytes would not fit; avoids underflow */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
		dst = NULL;						\
} while (0)
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first (high) surrogate */
			if (src_len - spos < 2
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine the pair before stepping over its second half */
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
200