1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* util/support/utf8.c */
3 /*
4 * Copyright 2008 by the Massachusetts Institute of Technology.
5 * All Rights Reserved.
6 *
7 * Export of this software from the United States of America may
8 * require a specific license from the United States Government.
9 * It is the responsibility of any person or organization contemplating
10 * export to obtain such a license before exporting.
11 *
12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13 * distribute this software and its documentation for any purpose and
14 * without fee is hereby granted, provided that the above copyright
15 * notice appear in all copies and that both that copyright notice and
16 * this permission notice appear in supporting documentation, and that
17 * the name of M.I.T. not be used in advertising or publicity pertaining
18 * to distribution of the software without specific, written prior
19 * permission. Furthermore if you modify this software you must label
20 * your software as modified software and not distribute it in such a
21 * fashion that it might be confused with the original M.I.T. software.
22 * M.I.T. makes no representations about the suitability of
23 * this software for any purpose. It is provided "as is" without express
24 * or implied warranty.
25 */
26 /*
27 * Copyright 1998-2008 The OpenLDAP Foundation.
28 * All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted only as authorized by the OpenLDAP
32 * Public License.
33 *
34 * A copy of this license is available in the file LICENSE in the
35 * top-level directory of the distribution or, alternatively, at
36 * <https://www.OpenLDAP.org/license.html>.
37 */
38
39 /* This work is part of OpenLDAP Software <https://www.openldap.org/>. */
40
41 /* Basic UTF-8 routines
42 *
43 * These routines are "dumb". Though they understand UTF-8,
44 * they don't grok Unicode. That is, they can push bits,
45 * but don't have a clue what the bits represent. That's
46 * good enough for use with the KRB5 Client SDK.
47 *
48 * These routines are not optimized.
49 */
50
51 #include "k5-platform.h"
52 #include "k5-utf8.h"
53 #include "supp-int.h"
54
/*
 * Sequence length indicated by the first byte of a multi-octet UTF-8
 * character: 2, 3 or 4 where the byte can start a sequence, 0 where it
 * cannot.
 *
 * The table has 128 entries; the comment on each row gives the range of
 * table indexes it covers.
 *
 * NOTE(review): presumably the lookup macros in k5-utf8.h index this table
 * with (first byte ^ 0x80), so that index 0x00 stands for byte 0x80 and
 * index 0x7f for byte 0xff -- confirm against the header.  Under that
 * assumption the zeros at indexes 0x40 and 0x41 reject the first bytes that
 * could only begin an overlong two-octet encoding.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x00-0x0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4f */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x50-0x5f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x60-0x6f */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x70-0x7f */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
67
68 /*
 * Make sure the UTF-8 char used the shortest possible encoding;
 * returns charlen if valid, 0 if not.
71 *
72 * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4.
73 * The table is slightly modified from that of the RFC.
74 *
75 * UCS-4 range (hex) UTF-8 sequence (binary)
76 * 0000 0000-0000 007F 0.......
77 * 0000 0080-0000 07FF 110++++. 10......
78 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
79 * 0001 0000-0010 FFFF 11110+++ 10++.... 10...... 10......
80 *
81 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
82 * at least one of the '+' bits must be set, otherwise the character
83 * should have been encoded in fewer octets. Note that in the two-octet
84 * case, only the first octet needs to be validated, and this is done
85 * in the krb5int_utf8_lentab[] above.
86 */
87
/*
 * Mask of required bits in the second octet of a multi-octet sequence.
 * The table has 32 entries; the comment on each row gives the range of
 * table indexes it covers.
 *
 * NOTE(review): presumably KRB5_UTF8_CHARLEN2 in k5-utf8.h indexes this
 * table with (first byte & 0x1f) and accepts the sequence only when the
 * second octet has at least one of the mask bits set -- confirm against
 * the header.
 */
#define MINTAB_ENTRY(bits) ((const char)(bits))
const char krb5int_utf8_mintab[] = {
    /* 0x00      */ MINTAB_ENTRY(0x20),
    /* 0x01-0x07 */ MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80),
    /* 0x08-0x0f */ MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
    /* 0x10      */ MINTAB_ENTRY(0x30),
    /* 0x11-0x14 */ MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
                    MINTAB_ENTRY(0x80), MINTAB_ENTRY(0x80),
    /* 0x15-0x17 */ MINTAB_ENTRY(0x00), MINTAB_ENTRY(0x00),
                    MINTAB_ENTRY(0x00),
    /* 0x18-0x1f */ MINTAB_ENTRY(0x00), MINTAB_ENTRY(0x00),
                    MINTAB_ENTRY(0x00), MINTAB_ENTRY(0x00),
                    MINTAB_ENTRY(0x00), MINTAB_ENTRY(0x00),
                    MINTAB_ENTRY(0x00), MINTAB_ENTRY(0x00)
};
#undef MINTAB_ENTRY
/*
 * Ensure no macro named c is in effect; code below uses c as an ordinary
 * identifier.
 */
#undef c
97
98 /*
99 * Convert a UTF8 character to a UCS4 character. Return 0 on success,
100 * -1 on failure.
101 */
krb5int_utf8_to_ucs4(const char * p,krb5_ucs4 * out)102 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
103 {
104 const unsigned char *c = (const unsigned char *) p;
105 krb5_ucs4 ch;
106 int len, i;
107 static unsigned char mask[] = {
108 0, 0x7f, 0x1f, 0x0f, 0x07 };
109
110 *out = 0;
111 len = KRB5_UTF8_CHARLEN2(p, len);
112
113 if (len == 0)
114 return -1;
115
116 ch = c[0] & mask[len];
117
118 for (i = 1; i < len; i++) {
119 if ((c[i] & 0xc0) != 0x80)
120 return -1;
121
122 ch <<= 6;
123 ch |= c[i] & 0x3f;
124 }
125
126 if (ch > 0x10ffff)
127 return -1;
128
129 *out = ch;
130 return 0;
131 }
132
133 /* conv UCS-4 to UTF-8 */
krb5int_ucs4_to_utf8(krb5_ucs4 c,char * buf)134 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
135 {
136 size_t len = 0;
137 unsigned char *p = (unsigned char *) buf;
138
139 /* not a valid Unicode character */
140 if (c > 0x10ffff)
141 return 0;
142
143 /* Just return length, don't convert */
144 if (buf == NULL) {
145 if (c < 0x80) return 1;
146 else if (c < 0x800) return 2;
147 else if (c < 0x10000) return 3;
148 else return 4;
149 }
150
151 if (c < 0x80) {
152 p[len++] = c;
153 } else if (c < 0x800) {
154 p[len++] = 0xc0 | ( c >> 6 );
155 p[len++] = 0x80 | ( c & 0x3f );
156 } else if (c < 0x10000) {
157 p[len++] = 0xe0 | ( c >> 12 );
158 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
159 p[len++] = 0x80 | ( c & 0x3f );
160 } else /* if (c < 0x110000) */ {
161 p[len++] = 0xf0 | ( c >> 18 );
162 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
163 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
164 p[len++] = 0x80 | ( c & 0x3f );
165 }
166
167 return len;
168 }
169