xref: /titanic_52/usr/src/lib/libsmbfs/smb/utf_str.c (revision 209e49b2ff611e7d61ff58e13756ae67f51be550)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Unicode conversions (yet more)
29  */
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <iconv.h>
36 #include <libintl.h>
37 
38 #include <sys/u8_textprep.h>
39 
40 #include <netsmb/smb_lib.h>
41 #include "charsets.h"
42 
43 
/*
 * Count the 16-bit units in a zero-terminated Unicode string.
 * The terminating zero unit is not counted, so the storage
 * needed for the characters alone is twice the returned value.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *end;

	/* Walk forward until the zero terminator is found. */
	for (end = us; *end != 0; end++)
		;

	return ((size_t)(end - us));
}
57 
58 static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);
59 
/*
 * Convert a zero-terminated "UCS-2" (native byte order) string
 * to UTF-8.  The result comes from malloc; NULL is returned when
 * the conversion descriptor cannot be opened or memory runs out.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	/* Opened on first use and kept for later calls. */
	static iconv_t native_to_utf8 = (iconv_t)-1;
	char *utf8;

	if (native_to_utf8 == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode) */
		native_to_utf8 = iconv_open("UTF-8", "UCS-2");
	}

	utf8 = convert_ucs2xx_to_utf8(native_to_utf8, us);
	return (utf8);
}
75 
/*
 * Convert a zero-terminated little-endian ("UCS-2LE") string
 * to UTF-8.  The result comes from malloc; NULL is returned when
 * the conversion descriptor cannot be opened or memory runs out.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	/* Opened on first use and kept for later calls. */
	static iconv_t le_to_utf8 = (iconv_t)-1;
	char *utf8;

	if (le_to_utf8 == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode) */
		le_to_utf8 = iconv_open("UTF-8", "UCS-2LE");
	}

	utf8 = convert_ucs2xx_to_utf8(le_to_utf8, us);
	return (utf8);
}
91 
92 static char *
93 convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
94 {
95 	char *obuf, *optr;
96 	const char *iptr;
97 	size_t  ileft, obsize, oleft, ret;
98 
99 	if (cd == (iconv_t)-1) {
100 		smb_error(dgettext(TEXT_DOMAIN,
101 		    "iconv_open(UTF-8/UCS-2)"), -1);
102 		return (NULL);
103 	}
104 
105 	iptr = (const char *)us;
106 	ileft = unicode_strlen(us);
107 	ileft *= 2; /* now bytes */
108 
109 	/* Worst-case output size is 2x input size. */
110 	oleft = ileft * 2;
111 	obsize = oleft + 2; /* room for null */
112 	obuf = malloc(obsize);
113 	if (!obuf)
114 		return (NULL);
115 	optr = obuf;
116 
117 	ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
118 	*optr = '\0';
119 	if (ret == (size_t)-1) {
120 		smb_error(dgettext(TEXT_DOMAIN,
121 		    "iconv(%s) failed"), errno, obuf);
122 	}
123 	if (ileft) {
124 		smb_error(dgettext(TEXT_DOMAIN,
125 		    "iconv(%s) failed"), -1, obuf);
126 		/*
127 		 * XXX: What's better?  return NULL?
128 		 * The truncated string? << for now
129 		 */
130 	}
131 
132 	return (obuf);
133 }
134 
135 static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);
136 
/*
 * Convert a NUL-terminated UTF-8 string to "UCS-2" (native byte
 * order).  The result comes from malloc; NULL is returned when the
 * conversion descriptor cannot be opened or memory runs out.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	/* Opened on first use and kept for later calls. */
	static iconv_t utf8_to_native = (iconv_t)-1;
	uint16_t *ucs2;

	if (utf8_to_native == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode) */
		utf8_to_native = iconv_open("UCS-2", "UTF-8");
	}

	ucs2 = convert_utf8_to_ucs2xx(utf8_to_native, utf8_string);
	return (ucs2);
}
151 
/*
 * Convert a NUL-terminated UTF-8 string to little-endian Unicode
 * ("UCS-2LE").  The result comes from malloc; NULL is returned when
 * the conversion descriptor cannot be opened or memory runs out.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	/* Opened on first use and kept for later calls. */
	static iconv_t utf8_to_le = (iconv_t)-1;
	uint16_t *ucs2le;

	if (utf8_to_le == (iconv_t)-1) {
		/* iconv_open() takes (tocode, fromcode) */
		utf8_to_le = iconv_open("UCS-2LE", "UTF-8");
	}

	ucs2le = convert_utf8_to_ucs2xx(utf8_to_le, utf8_string);
	return (ucs2le);
}
166 
167 static uint16_t *
168 convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
169 {
170 	uint16_t *obuf, *optr;
171 	const char *iptr;
172 	size_t  ileft, obsize, oleft, ret;
173 
174 	if (cd == (iconv_t)-1) {
175 		smb_error(dgettext(TEXT_DOMAIN,
176 		    "iconv_open(UCS-2/UTF-8)"), -1);
177 		return (NULL);
178 	}
179 
180 	iptr = utf8_string;
181 	ileft = strlen(iptr);
182 
183 	/* Worst-case output size is 2x input size. */
184 	oleft = ileft * 2;
185 	obsize = oleft + 2; /* room for null */
186 	obuf = malloc(obsize);
187 	if (!obuf)
188 		return (NULL);
189 	optr = obuf;
190 
191 	ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
192 	*optr = '\0';
193 	if (ret == (size_t)-1) {
194 		smb_error(dgettext(TEXT_DOMAIN,
195 		    "iconv(%s) failed"), errno, utf8_string);
196 	}
197 	if (ileft) {
198 		smb_error(dgettext(TEXT_DOMAIN,
199 		    "iconv(%s) failed"), -1, utf8_string);
200 		/*
201 		 * XXX: What's better?  return NULL?
202 		 * The truncated string? << for now
203 		 */
204 	}
205 
206 	return (obuf);
207 }
208 
209 
210 /*
211  * A simple wrapper around u8_textprep_str() that returns the Unicode
212  * upper-case version of some string.  Returns memory from malloc.
213  * Borrowed from idmapd.
214  */
215 static char *
216 utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217 {
218 	char *res = NULL;
219 	char *outs;
220 	size_t inlen, outlen, inbleft, outbleft;
221 	int rc, err;
222 
223 	/*
224 	 * u8_textprep_str() does not allocate memory.  The input and
225 	 * output buffers may differ in size (though that would be more
226 	 * likely when normalization is done).  We have to loop over it...
227 	 *
228 	 * To improve the chances that we can avoid looping we add 10
229 	 * bytes of output buffer room the first go around.
230 	 */
231 	inlen = inbleft = strlen(s);
232 	outlen = outbleft = inlen + 10;
233 	if ((res = malloc(outlen)) == NULL)
234 		return (NULL);
235 	outs = res;
236 
237 	while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238 	    &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239 	    err == E2BIG) {
240 		if ((res = realloc(res, outlen + inbleft)) == NULL)
241 			return (NULL);
242 		/* adjust input/output buffer pointers */
243 		s += (inlen - inbleft);
244 		outs = res + outlen - outbleft;
245 		/* adjust outbleft and outlen */
246 		outlen += inbleft;
247 		outbleft += inbleft;
248 	}
249 
250 	if (rc < 0) {
251 		free(res);
252 		res = NULL;
253 		return (NULL);
254 	}
255 
256 	res[outlen - outbleft] = '\0';
257 
258 	return (res);
259 }
260 
261 char *
262 utf8_str_toupper(const char *s)
263 {
264 	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265 }
266 
267 char *
268 utf8_str_tolower(const char *s)
269 {
270 	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271 }
272