xref: /freebsd/sys/dev/hyperv/utilities/unicode.h (revision e72055b7feba695a760d45f01f0f8268b1cb4a74)
1*e72055b7SXin LI /* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2*e72055b7SXin LI 
3*e72055b7SXin LI /*-
4*e72055b7SXin LI  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5*e72055b7SXin LI  * All rights reserved.
6*e72055b7SXin LI  *
7*e72055b7SXin LI  * This code is derived from software contributed to The NetBSD Foundation
8*e72055b7SXin LI  * by Dieter Baron.
9*e72055b7SXin LI  *
10*e72055b7SXin LI  * Redistribution and use in source and binary forms, with or without
11*e72055b7SXin LI  * modification, are permitted provided that the following conditions
12*e72055b7SXin LI  * are met:
13*e72055b7SXin LI  * 1. Redistributions of source code must retain the above copyright
14*e72055b7SXin LI  *    notice, this list of conditions and the following disclaimer.
15*e72055b7SXin LI  * 2. Redistributions in binary form must reproduce the above copyright
16*e72055b7SXin LI  *    notice, this list of conditions and the following disclaimer in the
17*e72055b7SXin LI  *    documentation and/or other materials provided with the distribution.
18*e72055b7SXin LI  *
19*e72055b7SXin LI  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20*e72055b7SXin LI  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21*e72055b7SXin LI  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22*e72055b7SXin LI  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23*e72055b7SXin LI  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*e72055b7SXin LI  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*e72055b7SXin LI  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*e72055b7SXin LI  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*e72055b7SXin LI  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*e72055b7SXin LI  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*e72055b7SXin LI  * POSSIBILITY OF SUCH DAMAGE.
30*e72055b7SXin LI  *
31*e72055b7SXin LI  * $FreeBSD$
32*e72055b7SXin LI  */
33*e72055b7SXin LI 
34*e72055b7SXin LI #include <sys/types.h>
35*e72055b7SXin LI 
/*
 * Conversion flags.
 *
 * Only UNICODE_UTF8_LATIN1_FALLBACK is examined by the converters in this
 * file.  Its value (0x03) shares bits with both other flags and it is tested
 * with a plain bitwise AND, so the Latin-1 fallback is enabled whenever any
 * of the three flags is passed.
 * NOTE(review): this looks accidental, but the values are part of the
 * caller-visible interface and are therefore left untouched -- confirm.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *dst, size_t dst_len,
		     const char *src, size_t src_len,
		     int flags, int *errp);
size_t utf16_to_utf8(char *dst, size_t dst_len,
		     const uint16_t *src, size_t src_len,
		     int flags, int *errp);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer; may be NULL to only compute the required length
 * dst_len  capacity of dst in 16-bit code units
 * src      input bytes (not required to be NUL-terminated)
 * src_len  number of input bytes
 * flags    if (flags & UNICODE_UTF8_LATIN1_FALLBACK) is non-zero, a byte
 *          >= 0xa0 that is not followed by a continuation byte is taken
 *          as an ISO 8859-1 character instead of being counted as an error
 * errp     if non-NULL, receives the number of invalid sequences skipped
 *
 * Returns the number of code units the complete conversion produces.  This
 * may exceed dst_len; units that do not fit are counted but never stored.
 * Invalid input (stray continuation bytes, truncated or overlong sequences,
 * encoded surrogates, code points above U+10FFFF) is skipped and counted.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
    const unsigned char *s;
    size_t spos, dpos, avail;
    int error;
    uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

    error = 0;
    s = (const unsigned char *)src;
    spos = dpos = 0;
    while (spos < src_len) {
	/*
	 * Bytes left including the lead byte.  Comparing against this
	 * instead of "src_len - k" avoids unsigned wrap-around on short
	 * inputs and keeps every s[spos+k] access inside the buffer.
	 */
	avail = src_len - spos;

	if (s[spos] < 0x80)
	    c = s[spos++];
	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		 && (avail < 2 || !IS_CONT(s[spos+1]))
		 && s[spos] >= 0xa0) {
	    /* not valid UTF-8, assume ISO 8859-1 */
	    c = s[spos++];
	}
	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
	    /* continuation byte without lead byte
	       or lead byte for codepoint above 0x10ffff */
	    error++;
	    spos++;
	    continue;
	}
	else if (s[spos] < 0xe0) {
	    /* two-byte sequence: 110xxxxx 10xxxxxx */
	    if (avail < 2 || !IS_CONT(s[spos+1])) {
		spos++;
		error++;
		continue;
	    }
	    c = (uint16_t)(((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f));
	    spos += 2;
	    if (c < 0x80) {
		/* overlong encoding */
		error++;
		continue;
	    }
	}
	else if (s[spos] < 0xf0) {
	    /* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
	    if (avail < 3
		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
		spos++;
		error++;
		continue;
	    }
	    c = (uint16_t)(((s[spos] & 0x0f) << 12)
		| ((s[spos+1] & 0x3f) << 6) | (s[spos+2] & 0x3f));
	    spos += 3;
	    /* 0xf800 selects exactly the surrogate range D800..DFFF */
	    if (c < 0x800 || (c & 0xf800) == 0xd800) {
		/* overlong encoding or encoded surrogate */
		error++;
		continue;
	    }
	}
	else {
	    uint32_t cc;
	    /* four-byte sequence, emitted as a UTF-16 surrogate pair */

	    if (avail < 4 || !IS_CONT(s[spos+1])
		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
		spos++;
		error++;
		continue;
	    }
	    /* the lead byte 11110xxx carries three payload bits */
	    cc = ((uint32_t)(s[spos] & 0x07) << 18)
		 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
		 | ((uint32_t)(s[spos+2] & 0x3f) << 6)
		 | (uint32_t)(s[spos+3] & 0x3f);
	    spos += 4;
	    if (cc < 0x10000 || cc > 0x10ffff) {
		/* overlong encoding or beyond the Unicode range */
		error++;
		continue;
	    }
	    cc -= 0x10000;
	    if (dst && dpos < dst_len)
		dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
	    dpos++;
	    /* the low surrogate holds only the bottom ten bits */
	    c = (uint16_t)(0xdc00 | (cc & 0x3ff));
	}

	if (dst && dpos < dst_len)
	    dst[dpos] = c;
	dpos++;
    }

    if (errp)
	*errp = error;

    return dpos;

#undef IS_CONT
}
141*e72055b7SXin LI 
142*e72055b7SXin LI 
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst      output buffer; may be NULL to only compute the required length
 * dst_len  capacity of dst in bytes
 * src      input code units
 * src_len  number of input code units
 * flags    accepted for symmetry with utf8_to_utf16(); not examined here
 * errp     if non-NULL, receives the number of unpaired surrogates skipped
 *
 * Returns the number of bytes the complete conversion produces.  This may
 * exceed dst_len.  A character is either stored completely or not at all;
 * after the first character that does not fit, nothing more is stored and
 * the remaining output is only counted.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    size_t spos, dpos;
    int error;

/*
 * Stop storing once fewer than l bytes remain.  Written as a comparison of
 * the remaining space so that dst_len < l cannot wrap around.
 */
#define CHECK_LENGTH(l)	do {						\
	if (dst != NULL && (dpos > dst_len || dst_len - dpos < (size_t)(l))) \
	    dst = NULL;							\
    } while (0)
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
	    dst[dpos] = (char)(b);					\
	dpos++;								\
    } while (0)

    (void)flags;

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
	if (src[spos] < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(src[spos]);
	}
	else if (src[spos] < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (src[spos] >> 6));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xd800) {
	    uint32_t c;
	    /* first (high) surrogate; the next unit must be a low one */
	    if (spos == src_len - 1 || (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    CHECK_LENGTH(4);
	    c = (((uint32_t)(src[spos] & 0x3ff) << 10)
		 | (uint32_t)(src[spos+1] & 0x3ff)) + 0x10000;
	    /* consume the low surrogate; the loop steps past the pair */
	    spos++;
	    ADD_BYTE(0xf0 | (c >> 18));
	    ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
	    ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (src[spos] >> 12));
	    ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
202