xref: /freebsd/sys/dev/hyperv/utilities/unicode.h (revision 95ee2897e98f5d444f26ed2334cc7c439f9c16c6)
1*e72055b7SXin LI /* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2*e72055b7SXin LI 
3*e72055b7SXin LI /*-
4*e72055b7SXin LI  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5*e72055b7SXin LI  * All rights reserved.
6*e72055b7SXin LI  *
7*e72055b7SXin LI  * This code is derived from software contributed to The NetBSD Foundation
8*e72055b7SXin LI  * by Dieter Baron.
9*e72055b7SXin LI  *
10*e72055b7SXin LI  * Redistribution and use in source and binary forms, with or without
11*e72055b7SXin LI  * modification, are permitted provided that the following conditions
12*e72055b7SXin LI  * are met:
13*e72055b7SXin LI  * 1. Redistributions of source code must retain the above copyright
14*e72055b7SXin LI  *    notice, this list of conditions and the following disclaimer.
15*e72055b7SXin LI  * 2. Redistributions in binary form must reproduce the above copyright
16*e72055b7SXin LI  *    notice, this list of conditions and the following disclaimer in the
17*e72055b7SXin LI  *    documentation and/or other materials provided with the distribution.
18*e72055b7SXin LI  *
19*e72055b7SXin LI  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20*e72055b7SXin LI  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21*e72055b7SXin LI  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22*e72055b7SXin LI  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23*e72055b7SXin LI  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*e72055b7SXin LI  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*e72055b7SXin LI  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*e72055b7SXin LI  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*e72055b7SXin LI  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*e72055b7SXin LI  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*e72055b7SXin LI  * POSSIBILITY OF SUCH DAMAGE.
30*e72055b7SXin LI  */
31*e72055b7SXin LI 
32*e72055b7SXin LI #include <sys/types.h>
33*e72055b7SXin LI 
/*
 * Flag bits for the "flags" argument of the converters below.
 *
 * Only UNICODE_UTF8_LATIN1_FALLBACK is consulted, and only by
 * utf8_to_utf16(); the other two are not referenced by the code in this
 * file.
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, i.e. it shares bits
 * with both UNICODE_DECOMPOSE and UNICODE_PRECOMPOSE, so the test
 * "flags & UNICODE_UTF8_LATIN1_FALLBACK" is also true when only one of
 * those is passed.  The value is left unchanged here because callers may
 * depend on it -- confirm before turning it into a distinct bit.
 */
#define UNICODE_DECOMPOSE               0x01
#define UNICODE_PRECOMPOSE              0x02
#define UNICODE_UTF8_LATIN1_FALLBACK    0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst      output buffer, may be NULL to only compute the required size
 * dst_len  capacity of dst in 16-bit units
 * src      input bytes (need not be NUL terminated)
 * src_len  number of input bytes
 * flags    if any UNICODE_UTF8_LATIN1_FALLBACK bit is set, a byte >= 0xa0
 *          that is not followed by a continuation byte is taken as an
 *          ISO 8859-1 character instead of being counted as an error
 * errp     if non-NULL, receives the number of malformed sequences skipped
 *
 * Returns the number of UTF-16 units the whole input converts to.  This can
 * exceed dst_len; units that do not fit are counted but not stored.  No
 * terminator is appended.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
              const char *src, size_t src_len,
              int flags, int *errp)
{
    const unsigned char *s;
    size_t spos, dpos, avail;
    int error;
    uint16_t c;

#define IS_CONT(c)      (((c)&0xc0) == 0x80)

    error = 0;
    s = (const unsigned char *)src;
    spos = dpos = 0;
    while (spos < src_len) {
        /* bytes left including the current one; always >= 1 here */
        avail = src_len - spos;

        if (s[spos] < 0x80) {
            c = s[spos++];
        }
        else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
                 && (avail < 2 || !IS_CONT(s[spos+1]))
                 && s[spos] >= 0xa0) {
            /* not valid UTF-8, assume ISO 8859-1 */
            c = s[spos++];
        }
        else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
            /* continuation byte without lead byte
               or lead byte for codepoint above 0x10ffff */
            error++;
            spos++;
            continue;
        }
        else if (s[spos] < 0xe0) {
            /* two byte sequence */
            if (avail < 2 || !IS_CONT(s[spos+1])) {
                spos++;
                error++;
                continue;
            }
            c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
            spos += 2;
            if (c < 0x80) {
                /* overlong encoding */
                error++;
                continue;
            }
        }
        else if (s[spos] < 0xf0) {
            /* three byte sequence */
            if (avail < 3
                || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
                spos++;
                error++;
                continue;
            }
            c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
                | (s[spos+2] & 0x3f);
            spos += 3;
            if (c < 0x800 || (c & 0xf800) == 0xd800) {
                /* overlong encoding or encoded surrogate (d800..dfff) */
                error++;
                continue;
            }
        }
        else {
            uint32_t cc;
            /* four byte sequence, becomes a UTF-16 surrogate pair */

            if (avail < 4 || !IS_CONT(s[spos+1])
                || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
                spos++;
                error++;
                continue;
            }
            /* lead byte 11110xxx carries three payload bits */
            cc = ((uint32_t)(s[spos] & 0x07) << 18)
                 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
                 | ((uint32_t)(s[spos+2] & 0x3f) << 6)
                 | (uint32_t)(s[spos+3] & 0x3f);
            spos += 4;
            if (cc < 0x10000 || cc > 0x10ffff) {
                /* overlong encoding or beyond the Unicode range */
                error++;
                continue;
            }
            cc -= 0x10000;
            /* high surrogate: top 10 bits */
            if (dst && dpos < dst_len)
                dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
            dpos++;
            /* low surrogate: bottom 10 bits, stored by the common tail */
            c = (uint16_t)(0xdc00 | (cc & 0x3ff));
        }

        if (dst && dpos < dst_len)
            dst[dpos] = c;
        dpos++;
    }

    if (errp)
        *errp = error;

    return dpos;

#undef IS_CONT
}
139*e72055b7SXin LI 
140*e72055b7SXin LI 
/*
 * Convert src_len UTF-16 code units at src into UTF-8 bytes.
 *
 * dst      output buffer, may be NULL to only compute the required size
 * dst_len  capacity of dst in bytes
 * src      input code units (need not be terminated)
 * src_len  number of input code units
 * flags    not used by this function
 * errp     if non-NULL, receives the number of unpaired surrogates skipped
 *
 * Returns the number of bytes the whole input converts to.  This can exceed
 * dst_len: as soon as one encoded character does not fit completely, nothing
 * further is stored, but counting continues.  No terminator is appended.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
              const uint16_t *src, size_t src_len,
              int flags, int *errp)
{
    size_t spos, dpos;
    int error;

    (void)flags;

/*
 * Stop storing once fewer than l bytes of room remain.  While dst is
 * non-NULL, dpos <= dst_len holds, so the subtraction cannot wrap.
 */
#define CHECK_LENGTH(l) \
    do { \
        if (dst != NULL && dst_len - dpos < (size_t)(l)) \
            dst = NULL; \
    } while (0)
#define ADD_BYTE(b) \
    do { \
        if (dst != NULL) \
            dst[dpos] = (char)(b); \
        dpos++; \
    } while (0)

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
        if (src[spos] < 0x80) {
            CHECK_LENGTH(1);
            ADD_BYTE(src[spos]);
        }
        else if (src[spos] < 0x800) {
            CHECK_LENGTH(2);
            ADD_BYTE(0xc0 | (src[spos] >> 6));
            ADD_BYTE(0x80 | (src[spos] & 0x3f));
        }
        else if ((src[spos] & 0xfc00) == 0xd800) {
            uint32_t c;
            /* first (high) surrogate, d800..dbff */
            if (src_len - spos < 2
                || (src[spos+1] & 0xfc00) != 0xdc00) {
                /* no second surrogate present */
                error++;
                continue;
            }
            c = ((((uint32_t)src[spos] & 0x3ff) << 10)
                 | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
            /* consume the low surrogate as well */
            spos++;
            CHECK_LENGTH(4);
            ADD_BYTE(0xf0 | (c >> 18));
            ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
            ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
            ADD_BYTE(0x80 | (c & 0x3f));
        }
        else if ((src[spos] & 0xfc00) == 0xdc00) {
            /* second surrogate without preceding first surrogate */
            error++;
        }
        else {
            CHECK_LENGTH(3);
            ADD_BYTE(0xe0 | (src[spos] >> 12));
            ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
            ADD_BYTE(0x80 | (src[spos] & 0x3f));
        }
    }

    if (errp)
        *errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
200