1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* util/support/utf8_conv.c */ 3 /* 4 * Copyright 2008, 2017 by the Massachusetts Institute of Technology. 5 * All Rights Reserved. 6 * 7 * Export of this software from the United States of America may 8 * require a specific license from the United States Government. 9 * It is the responsibility of any person or organization contemplating 10 * export to obtain such a license before exporting. 11 * 12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 13 * distribute this software and its documentation for any purpose and 14 * without fee is hereby granted, provided that the above copyright 15 * notice appear in all copies and that both that copyright notice and 16 * this permission notice appear in supporting documentation, and that 17 * the name of M.I.T. not be used in advertising or publicity pertaining 18 * to distribution of the software without specific, written prior 19 * permission. Furthermore if you modify this software you must label 20 * your software as modified software and not distribute it in such a 21 * fashion that it might be confused with the original M.I.T. software. 22 * M.I.T. makes no representations about the suitability of 23 * this software for any purpose. It is provided "as is" without express 24 * or implied warranty. 25 */ 26 /* 27 * Copyright 1998-2008 The OpenLDAP Foundation. 28 * All rights reserved. 29 * 30 * Redistribution and use in source and binary forms, with or without 31 * modification, are permitted only as authorized by the OpenLDAP 32 * Public License. 33 * 34 * A copy of this license is available in the file LICENSE in the 35 * top-level directory of the distribution or, alternatively, at 36 * <https://www.OpenLDAP.org/license.html>. 37 */ 38 /* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. 39 * 40 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND 41 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT 42 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS 43 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" 44 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION 45 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP 46 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT 47 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 48 */ 49 50 /* This work is based on OpenLDAP Software <https://www.openldap.org/>. */ 51 52 /* 53 * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode 54 * character in either two or four bytes. Characters in the Basic Multilingual 55 * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes. 56 * Characters in the Supplementary Planes (10000..10FFFF) are split into a high 57 * surrogate and a low surrogate, each containing ten bits of the character 58 * value, and encoded in four bytes. 59 */ 60 61 #include "k5-platform.h" 62 #include "k5-utf8.h" 63 #include "k5-buf.h" 64 #include "k5-input.h" 65 #include "supp-int.h" 66 67 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 68 69 /* A high surrogate is ten bits masked with 0xD800. */ 70 #define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF) 71 72 /* A low surrogate is ten bits masked with 0xDC00. */ 73 #define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF) 74 75 /* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate 76 * value. */ 77 #define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF) 78 #define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c)) 79 80 /* A Basic Multilingual Plane character is in the range 0..FFFF and is not a 81 * surrogate value. */ 82 #define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c)) 83 84 /* Characters in the Supplementary Planes have a base value subtracted from 85 * their code points to form a 20-bit value; ten bits go in each surrogate. */ 86 #define BASE 0x10000 87 #define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10)) 88 #define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF)) 89 #define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF))) 90 91 int 92 k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out) 93 { 94 struct k5buf buf; 95 krb5_ucs4 ch; 96 size_t chlen, i; 97 98 *utf16_out = NULL; 99 *nbytes_out = 0; 100 101 /* UTF-16 conversion is used for RC4 string-to-key, so treat this data as 102 * sensitive. */ 103 k5_buf_init_dynamic_zap(&buf); 104 105 /* Examine next UTF-8 character. */ 106 while (*utf8 != '\0') { 107 /* Get UTF-8 sequence length from first byte. */ 108 chlen = KRB5_UTF8_CHARLEN2(utf8, chlen); 109 if (chlen == 0) 110 goto invalid; 111 112 /* First byte minus length tag */ 113 ch = (krb5_ucs4)(utf8[0] & mask[chlen]); 114 115 for (i = 1; i < chlen; i++) { 116 /* Subsequent bytes must start with 10. */ 117 if ((utf8[i] & 0xc0) != 0x80) 118 goto invalid; 119 120 /* 6 bits of data in each subsequent byte */ 121 ch <<= 6; 122 ch |= (krb5_ucs4)(utf8[i] & 0x3f); 123 } 124 if (!IS_VALID_UNICODE(ch)) 125 goto invalid; 126 127 /* Characters in the basic multilingual plane are encoded using two 128 * bytes; other characters are encoded using four bytes. */ 129 if (IS_BMP(ch)) { 130 k5_buf_add_uint16_le(&buf, ch); 131 } else { 132 /* 0x10000 is subtracted from ch; then the high ten bits plus 133 * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */ 134 k5_buf_add_uint16_le(&buf, HIGH_SURROGATE(ch)); 135 k5_buf_add_uint16_le(&buf, LOW_SURROGATE(ch)); 136 } 137 138 /* Move to next UTF-8 character. */ 139 utf8 += chlen; 140 } 141 142 *utf16_out = buf.data; 143 *nbytes_out = buf.len; 144 return 0; 145 146 invalid: 147 k5_buf_free(&buf); 148 return EINVAL; 149 } 150 151 int 152 k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out) 153 { 154 struct k5buf buf; 155 struct k5input in; 156 uint16_t ch1, ch2; 157 krb5_ucs4 ch; 158 size_t chlen; 159 void *p; 160 161 *utf8_out = NULL; 162 163 if (nbytes % 2 != 0) 164 return EINVAL; 165 166 k5_buf_init_dynamic(&buf); 167 k5_input_init(&in, utf16bytes, nbytes); 168 while (!in.status && in.len > 0) { 169 /* Get the next character or high surrogate. A low surrogate without a 170 * preceding high surrogate is invalid. */ 171 ch1 = k5_input_get_uint16_le(&in); 172 if (IS_LOW_SURROGATE(ch1)) 173 goto invalid; 174 if (IS_HIGH_SURROGATE(ch1)) { 175 /* Get the low surrogate and combine the pair. */ 176 ch2 = k5_input_get_uint16_le(&in); 177 if (!IS_LOW_SURROGATE(ch2)) 178 goto invalid; 179 ch = COMPOSE(ch1, ch2); 180 } else { 181 ch = ch1; 182 } 183 184 chlen = krb5int_ucs4_to_utf8(ch, NULL); 185 p = k5_buf_get_space(&buf, chlen); 186 if (p == NULL) 187 return ENOMEM; 188 (void)krb5int_ucs4_to_utf8(ch, p); 189 } 190 191 if (in.status) 192 goto invalid; 193 194 *utf8_out = k5_buf_cstring(&buf); 195 return (*utf8_out == NULL) ? ENOMEM : 0; 196 197 invalid: 198 k5_buf_free(&buf); 199 return EINVAL; 200 } 201