1 /*
2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 /*
7 * util/support/utf8_conv.c
8 *
9 * Copyright 2008 by the Massachusetts Institute of Technology.
10 * All Rights Reserved.
11 *
12 * Export of this software from the United States of America may
13 * require a specific license from the United States Government.
14 * It is the responsibility of any person or organization contemplating
15 * export to obtain such a license before exporting.
16 *
17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
18 * distribute this software and its documentation for any purpose and
19 * without fee is hereby granted, provided that the above copyright
20 * notice appear in all copies and that both that copyright notice and
21 * this permission notice appear in supporting documentation, and that
22 * the name of M.I.T. not be used in advertising or publicity pertaining
23 * to distribution of the software without specific, written prior
24 * permission. Furthermore if you modify this software you must label
25 * your software as modified software and not distribute it in such a
26 * fashion that it might be confused with the original M.I.T. software.
27 * M.I.T. makes no representations about the suitability of
28 * this software for any purpose. It is provided "as is" without express
29 * or implied warranty.
30 */
31 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
32 *
33 * Copyright 1998-2008 The OpenLDAP Foundation.
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted only as authorized by the OpenLDAP
38 * Public License.
39 *
40 * A copy of this license is available in the file LICENSE in the
41 * top-level directory of the distribution or, alternatively, at
42 * <http://www.OpenLDAP.org/license.html>.
43 */
44 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
45 *
46 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
47 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
48 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
49 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
50 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
51 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
52 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
53 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
54 */
55
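/*
 * UTF-8 Conversion Routines
 *
 * These routines convert strings between wide-character (UCS-2) and
 * UTF-8 encodings.  The "les" variants read or produce UCS-2 as a
 * little-endian byte string; the others use host-order krb5_ucs2 arrays.
 *
 * Both NUL-terminated and counted-length string versions of the functions
 * are provided.  The static helpers return -1 if a string cannot be
 * converted; the exported krb5int_* functions return 0 on success or an
 * errno value (EINVAL, ENOMEM or ERANGE) on failure.
 */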
65
66 #include "k5-platform.h"
67 #include "k5-utf8.h"
68 #include "supp-int.h"
69 #include "errno.h" /* SUNW17PACresync */
70
/*
 * Masks that strip the length tag from the first byte of a UTF-8 sequence,
 * indexed by the sequence length in bytes (index 0 is unused filler).
 * The table is only ever read, so it is const.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
72
73 static ssize_t
k5_utf8s_to_ucs2s(krb5_ucs2 * ucs2str,const char * utf8str,size_t count,int little_endian)74 k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
75 const char *utf8str,
76 size_t count,
77 int little_endian)
78 {
79 size_t ucs2len = 0;
80 size_t utflen, i;
81 krb5_ucs2 ch;
82
83 /* If input ptr is NULL or empty... */
84 if (utf8str == NULL || *utf8str == '\0') {
85 *ucs2str = 0;
86
87 return 0;
88 }
89
90 /* Examine next UTF-8 character. */
91 while (*utf8str && ucs2len < count) {
92 /* Get UTF-8 sequence length from 1st byte */
93 utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
94
95 if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
96 return -1;
97
98 /* First byte minus length tag */
99 ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
100
101 for (i = 1; i < utflen; i++) {
102 /* Subsequent bytes must start with 10 */
103 if ((utf8str[i] & 0xc0) != 0x80)
104 return -1;
105
106 ch <<= 6; /* 6 bits of data in each subsequent byte */
107 ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
108 }
109
110 if (ucs2str != NULL) {
111 #ifdef K5_BE
112 #ifndef SWAP16
113 #define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF)
114 #endif
115 if (little_endian)
116 ucs2str[ucs2len] = SWAP16(ch);
117 else
118 #endif
119 ucs2str[ucs2len] = ch;
120 }
121
122 utf8str += utflen; /* Move to next UTF-8 character */
123 ucs2len++; /* Count number of wide chars stored/required */
124 }
125
126 assert(ucs2len < count);
127
128 if (ucs2str != NULL) {
129 /* Add null terminator if there's room in the buffer. */
130 ucs2str[ucs2len] = 0;
131 }
132
133 return ucs2len;
134 }
135
136 int
krb5int_utf8s_to_ucs2s(const char * utf8s,krb5_ucs2 ** ucs2s,size_t * ucs2chars)137 krb5int_utf8s_to_ucs2s(const char *utf8s,
138 krb5_ucs2 **ucs2s,
139 size_t *ucs2chars)
140 {
141 ssize_t len;
142 size_t chars;
143
144 chars = krb5int_utf8_chars(utf8s);
145 *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
146 if (*ucs2s == NULL) {
147 return ENOMEM;
148 }
149
150 len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
151 if (len < 0) {
152 free(*ucs2s);
153 *ucs2s = NULL;
154 return EINVAL;
155 }
156
157 if (ucs2chars != NULL) {
158 *ucs2chars = chars;
159 }
160
161 return 0;
162 }
163
164 int
krb5int_utf8cs_to_ucs2s(const char * utf8s,size_t utf8slen,krb5_ucs2 ** ucs2s,size_t * ucs2chars)165 krb5int_utf8cs_to_ucs2s(const char *utf8s,
166 size_t utf8slen,
167 krb5_ucs2 **ucs2s,
168 size_t *ucs2chars)
169 {
170 ssize_t len;
171 size_t chars;
172
173 chars = krb5int_utf8c_chars(utf8s, utf8slen);
174 *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
175 if (*ucs2s == NULL) {
176 return ENOMEM;
177 }
178
179 len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
180 if (len < 0) {
181 free(*ucs2s);
182 *ucs2s = NULL;
183 return EINVAL;
184 }
185
186 if (ucs2chars != NULL) {
187 *ucs2chars = chars;
188 }
189
190 return 0;
191 }
192
193 int
krb5int_utf8s_to_ucs2les(const char * utf8s,unsigned char ** ucs2les,size_t * ucs2leslen)194 krb5int_utf8s_to_ucs2les(const char *utf8s,
195 unsigned char **ucs2les,
196 size_t *ucs2leslen)
197 {
198 ssize_t len;
199 size_t chars;
200
201 chars = krb5int_utf8_chars(utf8s);
202
203 *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
204 if (*ucs2les == NULL) {
205 return ENOMEM;
206 }
207
208 len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
209 if (len < 0) {
210 free(*ucs2les);
211 *ucs2les = NULL;
212 return EINVAL;
213 }
214
215 if (ucs2leslen != NULL) {
216 *ucs2leslen = chars * sizeof(krb5_ucs2);
217 }
218
219 return 0;
220 }
221
222 int
krb5int_utf8cs_to_ucs2les(const char * utf8s,size_t utf8slen,unsigned char ** ucs2les,size_t * ucs2leslen)223 krb5int_utf8cs_to_ucs2les(const char *utf8s,
224 size_t utf8slen,
225 unsigned char **ucs2les,
226 size_t *ucs2leslen)
227 {
228 ssize_t len;
229 size_t chars;
230
231 chars = krb5int_utf8c_chars(utf8s, utf8slen);
232
233 *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
234 if (*ucs2les == NULL) {
235 return ENOMEM;
236 }
237
238 len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
239 if (len < 0) {
240 free(*ucs2les);
241 *ucs2les = NULL;
242 return EINVAL;
243 }
244
245 if (ucs2leslen != NULL) {
246 *ucs2leslen = chars * sizeof(krb5_ucs2);
247 }
248
249 return 0;
250 }
251
252 /*-----------------------------------------------------------------------------
253 Convert a wide char string to a UTF-8 string.
254 No more than 'count' bytes will be written to the output buffer.
255 Return the # of bytes written to the output buffer, excl null terminator.
256
257 ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
258 length of the UCS-2 string in characters
259 */
260 static ssize_t
k5_ucs2s_to_utf8s(char * utf8str,const krb5_ucs2 * ucs2str,size_t count,ssize_t ucs2len,int little_endian)261 k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
262 size_t count, ssize_t ucs2len, int little_endian)
263 {
264 int len = 0;
265 int n;
266 char *p = utf8str;
267 krb5_ucs2 empty = 0, ch;
268
269 if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */
270 ucs2str = ∅
271
272 if (utf8str == NULL) /* Just compute size of output, excl null */
273 {
274 while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
275 /* Get UTF-8 size of next wide char */
276 ch = *ucs2str++;
277 #ifdef K5_BE
278 if (little_endian)
279 ch = SWAP16(ch);
280 #endif
281
282 n = krb5int_ucs2_to_utf8(ch, NULL);
283 if (n < 1)
284 return -1;
285 if (len + n < len)
286 return -1; /* overflow */
287 len += n;
288 }
289
290 return len;
291 }
292
293 /* Do the actual conversion. */
294
295 n = 1; /* In case of empty ucs2str */
296 while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
297 ch = *ucs2str++;
298 #ifdef K5_BE
299 if (little_endian)
300 ch = SWAP16(ch);
301 #endif
302
303 n = krb5int_ucs2_to_utf8(ch, p);
304
305 if (n < 1)
306 break;
307
308 p += n;
309 count -= n; /* Space left in output buffer */
310 }
311
312 /* If not enough room for last character, pad remainder with null
313 so that return value = original count, indicating buffer full. */
314 if (n == 0) {
315 while (count--)
316 *p++ = 0;
317 }
318 /* Add a null terminator if there's room. */
319 else if (count)
320 *p = 0;
321
322 if (n == -1) /* Conversion encountered invalid wide char. */
323 return -1;
324
325 /* Return the number of bytes written to output buffer, excl null. */
326 return (p - utf8str);
327 }
328
329 int
krb5int_ucs2s_to_utf8s(const krb5_ucs2 * ucs2s,char ** utf8s,size_t * utf8slen)330 krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
331 char **utf8s,
332 size_t *utf8slen)
333 {
334 ssize_t len;
335
336 len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
337 if (len < 0) {
338 return EINVAL;
339 }
340
341 *utf8s = (char *)malloc((size_t)len + 1);
342 if (*utf8s == NULL) {
343 return ENOMEM;
344 }
345
346 len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
347 if (len < 0) {
348 free(*utf8s);
349 *utf8s = NULL;
350 return EINVAL;
351 }
352
353 if (utf8slen != NULL) {
354 *utf8slen = len;
355 }
356
357 return 0;
358 }
359
360 int
krb5int_ucs2les_to_utf8s(const unsigned char * ucs2les,char ** utf8s,size_t * utf8slen)361 krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
362 char **utf8s,
363 size_t *utf8slen)
364 {
365 ssize_t len;
366
367 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
368 if (len < 0)
369 return EINVAL;
370
371 *utf8s = (char *)malloc((size_t)len + 1);
372 if (*utf8s == NULL) {
373 return ENOMEM;
374 }
375
376 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
377 if (len < 0) {
378 free(*utf8s);
379 *utf8s = NULL;
380 return EINVAL;
381 }
382
383 if (utf8slen != NULL) {
384 *utf8slen = len;
385 }
386
387 return 0;
388 }
389
390 int
krb5int_ucs2cs_to_utf8s(const krb5_ucs2 * ucs2s,size_t ucs2slen,char ** utf8s,size_t * utf8slen)391 krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
392 size_t ucs2slen,
393 char **utf8s,
394 size_t *utf8slen)
395 {
396 ssize_t len;
397
398 if (ucs2slen > SSIZE_MAX)
399 return ERANGE;
400
401 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
402 (ssize_t)ucs2slen, 0);
403 if (len < 0)
404 return EINVAL;
405
406 *utf8s = (char *)malloc((size_t)len + 1);
407 if (*utf8s == NULL) {
408 return ENOMEM;
409 }
410
411 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s,
412 (size_t)len + 1, (ssize_t)ucs2slen, 0);
413 if (len < 0) {
414 free(*utf8s);
415 *utf8s = NULL;
416 return EINVAL;
417 }
418
419 if (utf8slen != NULL) {
420 *utf8slen = len;
421 }
422
423 return 0;
424 }
425
426 int
krb5int_ucs2lecs_to_utf8s(const unsigned char * ucs2les,size_t ucs2leslen,char ** utf8s,size_t * utf8slen)427 krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
428 size_t ucs2leslen,
429 char **utf8s,
430 size_t *utf8slen)
431 {
432 ssize_t len;
433
434 if (ucs2leslen > SSIZE_MAX)
435 return ERANGE;
436
437 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
438 (ssize_t)ucs2leslen, 1);
439 if (len < 0)
440 return EINVAL;
441
442 *utf8s = (char *)malloc((size_t)len + 1);
443 if (*utf8s == NULL) {
444 return ENOMEM;
445 }
446
447 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les,
448 (size_t)len + 1, (ssize_t)ucs2leslen, 1);
449 if (len < 0) {
450 free(*utf8s);
451 *utf8s = NULL;
452 return EINVAL;
453 }
454
455 if (utf8slen != NULL) {
456 *utf8slen = len;
457 }
458
459 return 0;
460 }
461
462