xref: /freebsd/crypto/krb5/src/util/support/utf8_conv.c (revision 7f2fe78b9dd5f51c821d771b63d2e096f6fd49e9)
1*7f2fe78bSCy Schubert /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2*7f2fe78bSCy Schubert /* util/support/utf8_conv.c */
3*7f2fe78bSCy Schubert /*
4*7f2fe78bSCy Schubert  * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
5*7f2fe78bSCy Schubert  * All Rights Reserved.
6*7f2fe78bSCy Schubert  *
7*7f2fe78bSCy Schubert  * Export of this software from the United States of America may
8*7f2fe78bSCy Schubert  *   require a specific license from the United States Government.
9*7f2fe78bSCy Schubert  *   It is the responsibility of any person or organization contemplating
10*7f2fe78bSCy Schubert  *   export to obtain such a license before exporting.
11*7f2fe78bSCy Schubert  *
12*7f2fe78bSCy Schubert  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13*7f2fe78bSCy Schubert  * distribute this software and its documentation for any purpose and
14*7f2fe78bSCy Schubert  * without fee is hereby granted, provided that the above copyright
15*7f2fe78bSCy Schubert  * notice appear in all copies and that both that copyright notice and
16*7f2fe78bSCy Schubert  * this permission notice appear in supporting documentation, and that
17*7f2fe78bSCy Schubert  * the name of M.I.T. not be used in advertising or publicity pertaining
18*7f2fe78bSCy Schubert  * to distribution of the software without specific, written prior
19*7f2fe78bSCy Schubert  * permission.  Furthermore if you modify this software you must label
20*7f2fe78bSCy Schubert  * your software as modified software and not distribute it in such a
21*7f2fe78bSCy Schubert  * fashion that it might be confused with the original M.I.T. software.
22*7f2fe78bSCy Schubert  * M.I.T. makes no representations about the suitability of
23*7f2fe78bSCy Schubert  * this software for any purpose.  It is provided "as is" without express
24*7f2fe78bSCy Schubert  * or implied warranty.
25*7f2fe78bSCy Schubert  */
26*7f2fe78bSCy Schubert /*
27*7f2fe78bSCy Schubert  * Copyright 1998-2008 The OpenLDAP Foundation.
28*7f2fe78bSCy Schubert  * All rights reserved.
29*7f2fe78bSCy Schubert  *
30*7f2fe78bSCy Schubert  * Redistribution and use in source and binary forms, with or without
31*7f2fe78bSCy Schubert  * modification, are permitted only as authorized by the OpenLDAP
32*7f2fe78bSCy Schubert  * Public License.
33*7f2fe78bSCy Schubert  *
34*7f2fe78bSCy Schubert  * A copy of this license is available in the file LICENSE in the
35*7f2fe78bSCy Schubert  * top-level directory of the distribution or, alternatively, at
36*7f2fe78bSCy Schubert  * <https://www.OpenLDAP.org/license.html>.
37*7f2fe78bSCy Schubert  */
38*7f2fe78bSCy Schubert /* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
39*7f2fe78bSCy Schubert  *
40*7f2fe78bSCy Schubert  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
41*7f2fe78bSCy Schubert  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
42*7f2fe78bSCy Schubert  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
43*7f2fe78bSCy Schubert  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
44*7f2fe78bSCy Schubert  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
45*7f2fe78bSCy Schubert  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
46*7f2fe78bSCy Schubert  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
47*7f2fe78bSCy Schubert  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
48*7f2fe78bSCy Schubert  */
49*7f2fe78bSCy Schubert 
50*7f2fe78bSCy Schubert /* This work is based on OpenLDAP Software <https://www.openldap.org/>. */
51*7f2fe78bSCy Schubert 
52*7f2fe78bSCy Schubert /*
53*7f2fe78bSCy Schubert  * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
54*7f2fe78bSCy Schubert  * character in either two or four bytes.  Characters in the Basic Multilingual
55*7f2fe78bSCy Schubert  * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
56*7f2fe78bSCy Schubert  * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
57*7f2fe78bSCy Schubert  * surrogate and a low surrogate, each containing ten bits of the character
58*7f2fe78bSCy Schubert  * value, and encoded in four bytes.
59*7f2fe78bSCy Schubert  */
60*7f2fe78bSCy Schubert 
61*7f2fe78bSCy Schubert #include "k5-platform.h"
62*7f2fe78bSCy Schubert #include "k5-utf8.h"
63*7f2fe78bSCy Schubert #include "k5-buf.h"
64*7f2fe78bSCy Schubert #include "k5-input.h"
65*7f2fe78bSCy Schubert #include "supp-int.h"
66*7f2fe78bSCy Schubert 
/*
 * Data-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Index 0 is a placeholder so that the
 * length can be used directly as the index.  The table is never modified, so
 * it is declared const.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
68*7f2fe78bSCy Schubert 
/* Boundaries of the two UTF-16 surrogate ranges.  High (leading) surrogates
 * occupy D800..DBFF; low (trailing) surrogates occupy DC00..DFFF. */
#define HIGH_SURROGATE_FIRST 0xD800
#define HIGH_SURROGATE_LAST 0xDBFF
#define LOW_SURROGATE_FIRST 0xDC00
#define LOW_SURROGATE_LAST 0xDFFF

/* Largest code point in the Basic Multilingual Plane, and largest Unicode code
 * point overall. */
#define BMP_LAST 0xFFFF
#define UNICODE_LAST 0x10FFFF

/* True if c is a high surrogate (0xD800 plus ten bits of payload). */
#define IS_HIGH_SURROGATE(c)                                    \
    ((c) >= HIGH_SURROGATE_FIRST && (c) <= HIGH_SURROGATE_LAST)

/* True if c is a low surrogate (0xDC00 plus ten bits of payload). */
#define IS_LOW_SURROGATE(c)                                     \
    ((c) >= LOW_SURROGATE_FIRST && (c) <= LOW_SURROGATE_LAST)

/* True if c is a surrogate of either kind.  The two ranges are adjacent, so
 * together they cover D800..DFFF. */
#define IS_SURROGATE(c) (IS_HIGH_SURROGATE(c) || IS_LOW_SURROGATE(c))

/* True if c is a valid Unicode code point: no greater than 10FFFF and not a
 * surrogate value. */
#define IS_VALID_UNICODE(c) (!IS_SURROGATE(c) && (c) <= UNICODE_LAST)

/* True if c is a Basic Multilingual Plane character: no greater than FFFF and
 * not a surrogate value. */
#define IS_BMP(c) (!IS_SURROGATE(c) && (c) <= BMP_LAST)

/*
 * A Supplementary Plane code point has BASE subtracted from it to form a
 * 20-bit value.  HIGH_SURROGATE() carries the upper ten bits of that value and
 * LOW_SURROGATE() carries the lower ten bits.  COMPOSE() reverses the split,
 * rebuilding the code point from a high surrogate c1 and a low surrogate c2.
 */
#define BASE 0x10000
#define HIGH_SURROGATE(c) (HIGH_SURROGATE_FIRST | (((c) - BASE) >> 10))
#define LOW_SURROGATE(c) (LOW_SURROGATE_FIRST | (((c) - BASE) & 0x3FF))
#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
90*7f2fe78bSCy Schubert 
91*7f2fe78bSCy Schubert int
k5_utf8_to_utf16le(const char * utf8,uint8_t ** utf16_out,size_t * nbytes_out)92*7f2fe78bSCy Schubert k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
93*7f2fe78bSCy Schubert {
94*7f2fe78bSCy Schubert     struct k5buf buf;
95*7f2fe78bSCy Schubert     krb5_ucs4 ch;
96*7f2fe78bSCy Schubert     size_t chlen, i;
97*7f2fe78bSCy Schubert 
98*7f2fe78bSCy Schubert     *utf16_out = NULL;
99*7f2fe78bSCy Schubert     *nbytes_out = 0;
100*7f2fe78bSCy Schubert 
101*7f2fe78bSCy Schubert     /* UTF-16 conversion is used for RC4 string-to-key, so treat this data as
102*7f2fe78bSCy Schubert      * sensitive. */
103*7f2fe78bSCy Schubert     k5_buf_init_dynamic_zap(&buf);
104*7f2fe78bSCy Schubert 
105*7f2fe78bSCy Schubert     /* Examine next UTF-8 character. */
106*7f2fe78bSCy Schubert     while (*utf8 != '\0') {
107*7f2fe78bSCy Schubert         /* Get UTF-8 sequence length from first byte. */
108*7f2fe78bSCy Schubert         chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
109*7f2fe78bSCy Schubert         if (chlen == 0)
110*7f2fe78bSCy Schubert             goto invalid;
111*7f2fe78bSCy Schubert 
112*7f2fe78bSCy Schubert         /* First byte minus length tag */
113*7f2fe78bSCy Schubert         ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
114*7f2fe78bSCy Schubert 
115*7f2fe78bSCy Schubert         for (i = 1; i < chlen; i++) {
116*7f2fe78bSCy Schubert             /* Subsequent bytes must start with 10. */
117*7f2fe78bSCy Schubert             if ((utf8[i] & 0xc0) != 0x80)
118*7f2fe78bSCy Schubert                 goto invalid;
119*7f2fe78bSCy Schubert 
120*7f2fe78bSCy Schubert             /* 6 bits of data in each subsequent byte */
121*7f2fe78bSCy Schubert             ch <<= 6;
122*7f2fe78bSCy Schubert             ch |= (krb5_ucs4)(utf8[i] & 0x3f);
123*7f2fe78bSCy Schubert         }
124*7f2fe78bSCy Schubert         if (!IS_VALID_UNICODE(ch))
125*7f2fe78bSCy Schubert             goto invalid;
126*7f2fe78bSCy Schubert 
127*7f2fe78bSCy Schubert         /* Characters in the basic multilingual plane are encoded using two
128*7f2fe78bSCy Schubert          * bytes; other characters are encoded using four bytes. */
129*7f2fe78bSCy Schubert         if (IS_BMP(ch)) {
130*7f2fe78bSCy Schubert             k5_buf_add_uint16_le(&buf, ch);
131*7f2fe78bSCy Schubert         } else {
132*7f2fe78bSCy Schubert             /* 0x10000 is subtracted from ch; then the high ten bits plus
133*7f2fe78bSCy Schubert              * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
134*7f2fe78bSCy Schubert             k5_buf_add_uint16_le(&buf, HIGH_SURROGATE(ch));
135*7f2fe78bSCy Schubert             k5_buf_add_uint16_le(&buf, LOW_SURROGATE(ch));
136*7f2fe78bSCy Schubert         }
137*7f2fe78bSCy Schubert 
138*7f2fe78bSCy Schubert         /* Move to next UTF-8 character. */
139*7f2fe78bSCy Schubert         utf8 += chlen;
140*7f2fe78bSCy Schubert     }
141*7f2fe78bSCy Schubert 
142*7f2fe78bSCy Schubert     *utf16_out = buf.data;
143*7f2fe78bSCy Schubert     *nbytes_out = buf.len;
144*7f2fe78bSCy Schubert     return 0;
145*7f2fe78bSCy Schubert 
146*7f2fe78bSCy Schubert invalid:
147*7f2fe78bSCy Schubert     k5_buf_free(&buf);
148*7f2fe78bSCy Schubert     return EINVAL;
149*7f2fe78bSCy Schubert }
150*7f2fe78bSCy Schubert 
151*7f2fe78bSCy Schubert int
k5_utf16le_to_utf8(const uint8_t * utf16bytes,size_t nbytes,char ** utf8_out)152*7f2fe78bSCy Schubert k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
153*7f2fe78bSCy Schubert {
154*7f2fe78bSCy Schubert     struct k5buf buf;
155*7f2fe78bSCy Schubert     struct k5input in;
156*7f2fe78bSCy Schubert     uint16_t ch1, ch2;
157*7f2fe78bSCy Schubert     krb5_ucs4 ch;
158*7f2fe78bSCy Schubert     size_t chlen;
159*7f2fe78bSCy Schubert     void *p;
160*7f2fe78bSCy Schubert 
161*7f2fe78bSCy Schubert     *utf8_out = NULL;
162*7f2fe78bSCy Schubert 
163*7f2fe78bSCy Schubert     if (nbytes % 2 != 0)
164*7f2fe78bSCy Schubert         return EINVAL;
165*7f2fe78bSCy Schubert 
166*7f2fe78bSCy Schubert     k5_buf_init_dynamic(&buf);
167*7f2fe78bSCy Schubert     k5_input_init(&in, utf16bytes, nbytes);
168*7f2fe78bSCy Schubert     while (!in.status && in.len > 0) {
169*7f2fe78bSCy Schubert         /* Get the next character or high surrogate.  A low surrogate without a
170*7f2fe78bSCy Schubert          * preceding high surrogate is invalid. */
171*7f2fe78bSCy Schubert         ch1 = k5_input_get_uint16_le(&in);
172*7f2fe78bSCy Schubert         if (IS_LOW_SURROGATE(ch1))
173*7f2fe78bSCy Schubert             goto invalid;
174*7f2fe78bSCy Schubert         if (IS_HIGH_SURROGATE(ch1)) {
175*7f2fe78bSCy Schubert             /* Get the low surrogate and combine the pair. */
176*7f2fe78bSCy Schubert             ch2 = k5_input_get_uint16_le(&in);
177*7f2fe78bSCy Schubert             if (!IS_LOW_SURROGATE(ch2))
178*7f2fe78bSCy Schubert                 goto invalid;
179*7f2fe78bSCy Schubert             ch = COMPOSE(ch1, ch2);
180*7f2fe78bSCy Schubert         } else {
181*7f2fe78bSCy Schubert             ch = ch1;
182*7f2fe78bSCy Schubert         }
183*7f2fe78bSCy Schubert 
184*7f2fe78bSCy Schubert         chlen = krb5int_ucs4_to_utf8(ch, NULL);
185*7f2fe78bSCy Schubert         p = k5_buf_get_space(&buf, chlen);
186*7f2fe78bSCy Schubert         if (p == NULL)
187*7f2fe78bSCy Schubert             return ENOMEM;
188*7f2fe78bSCy Schubert         (void)krb5int_ucs4_to_utf8(ch, p);
189*7f2fe78bSCy Schubert     }
190*7f2fe78bSCy Schubert 
191*7f2fe78bSCy Schubert     if (in.status)
192*7f2fe78bSCy Schubert         goto invalid;
193*7f2fe78bSCy Schubert 
194*7f2fe78bSCy Schubert     *utf8_out = k5_buf_cstring(&buf);
195*7f2fe78bSCy Schubert     return (*utf8_out == NULL) ? ENOMEM : 0;
196*7f2fe78bSCy Schubert 
197*7f2fe78bSCy Schubert invalid:
198*7f2fe78bSCy Schubert     k5_buf_free(&buf);
199*7f2fe78bSCy Schubert     return EINVAL;
200*7f2fe78bSCy Schubert }
201