xref: /freebsd/crypto/krb5/src/util/support/utf8.c (revision 24e4dcf4ba5e9dedcf89efd358ea3e1fe5867020)
1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* util/support/utf8.c */
3 /*
4  * Copyright 2008 by the Massachusetts Institute of Technology.
5  * All Rights Reserved.
6  *
7  * Export of this software from the United States of America may
8  *   require a specific license from the United States Government.
9  *   It is the responsibility of any person or organization contemplating
10  *   export to obtain such a license before exporting.
11  *
12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13  * distribute this software and its documentation for any purpose and
14  * without fee is hereby granted, provided that the above copyright
15  * notice appear in all copies and that both that copyright notice and
16  * this permission notice appear in supporting documentation, and that
17  * the name of M.I.T. not be used in advertising or publicity pertaining
18  * to distribution of the software without specific, written prior
19  * permission.  Furthermore if you modify this software you must label
20  * your software as modified software and not distribute it in such a
21  * fashion that it might be confused with the original M.I.T. software.
22  * M.I.T. makes no representations about the suitability of
23  * this software for any purpose.  It is provided "as is" without express
24  * or implied warranty.
25  */
26 /*
27  * Copyright 1998-2008 The OpenLDAP Foundation.
28  * All rights reserved.
29  *
30  * Redistribution and use in source and binary forms, with or without
31  * modification, are permitted only as authorized by the OpenLDAP
32  * Public License.
33  *
34  * A copy of this license is available in the file LICENSE in the
35  * top-level directory of the distribution or, alternatively, at
36  * <https://www.OpenLDAP.org/license.html>.
37  */
38 
39 /* This work is part of OpenLDAP Software <https://www.openldap.org/>. */
40 
41 /* Basic UTF-8 routines
42  *
43  * These routines are "dumb".  Though they understand UTF-8,
44  * they don't grok Unicode.  That is, they can push bits,
45  * but don't have a clue what the bits represent.  That's
46  * good enough for use with the KRB5 Client SDK.
47  *
48  * These routines are not optimized.
49  */
50 
51 #include "k5-platform.h"
52 #include "k5-utf8.h"
53 #include "supp-int.h"
54 
/*
 * UTF-8 sequence length implied by the first byte of a character.
 *
 * The table holds 128 entries.  The range comments below assume entry i
 * describes the byte (0x80 | i), i.e. the table covers only bytes with
 * the high bit set -- TODO confirm against the code that indexes it.
 * An entry of 0 means the byte cannot start a sequence.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80-0xBF: continuation octets, never a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xC1: two-octet forms that are not the shortest encoding */
    0, 0,

    /* 0xC2-0xDF: two-octet sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three-octet sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF4: four-octet sequences */
    4, 4, 4, 4, 4,

    /* 0xF5-0xFF: no valid sequence starts with these */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
67 
68 /*
 * Make sure the UTF-8 char used the shortest possible encoding;
 * returns charlen if valid, 0 if not.
71  *
72  * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4.
73  * The table is slightly modified from that of the RFC.
74  *
75  * UCS-4 range (hex)      UTF-8 sequence (binary)
76  * 0000 0000-0000 007F   0.......
77  * 0000 0080-0000 07FF   110++++. 10......
78  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
79  * 0001 0000-0010 FFFF   11110+++ 10++.... 10...... 10......
80  *
81  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
82  * at least one of the '+' bits must be set, otherwise the character
83  * should have been encoded in fewer octets. Note that in the two-octet
84  * case, only the first octet needs to be validated, and this is done
85  * in the krb5int_utf8_lentab[] above.
86  */
87 
/*
 * Mask of required bits in the second octet of a three- or four-octet
 * sequence.  At least one masked bit must be set in the second octet;
 * otherwise the character should have been encoded in fewer octets.
 *
 * The table holds 32 entries.  The range comments below assume entry i
 * belongs to the first octet whose low five bits equal i (0xE0-0xFF) --
 * TODO confirm against the code that indexes it.
 */
const char krb5int_utf8_mintab[] = {
    /* 0xE0: second octet needs bit 0x20 */
    (char)0x20,

    /* 0xE1-0xEF: 0x80 is set in every continuation octet */
    (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
    (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
    (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,

    /* 0xF0: second octet needs one of the bits in 0x30 */
    (char)0x30,

    /* 0xF1-0xF4: 0x80 is set in every continuation octet */
    (char)0x80, (char)0x80, (char)0x80, (char)0x80,

    /* 0xF5-0xFF: empty mask, no second octet can satisfy it */
    (char)0x00, (char)0x00, (char)0x00, (char)0x00, (char)0x00,
    (char)0x00, (char)0x00, (char)0x00, (char)0x00, (char)0x00,
    (char)0x00
};
97 
98 /*
99  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
100  * -1 on failure.
101  */
102 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
103 {
104     const unsigned char *c = (const unsigned char *) p;
105     krb5_ucs4 ch;
106     int len, i;
107     static unsigned char mask[] = {
108         0, 0x7f, 0x1f, 0x0f, 0x07 };
109 
110     *out = 0;
111     len = KRB5_UTF8_CHARLEN2(p, len);
112 
113     if (len == 0)
114         return -1;
115 
116     ch = c[0] & mask[len];
117 
118     for (i = 1; i < len; i++) {
119         if ((c[i] & 0xc0) != 0x80)
120             return -1;
121 
122         ch <<= 6;
123         ch |= c[i] & 0x3f;
124     }
125 
126     if (ch > 0x10ffff)
127         return -1;
128 
129     *out = ch;
130     return 0;
131 }
132 
133 /* conv UCS-4 to UTF-8 */
134 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
135 {
136     size_t len = 0;
137     unsigned char *p = (unsigned char *) buf;
138 
139     /* not a valid Unicode character */
140     if (c > 0x10ffff)
141         return 0;
142 
143     /* Just return length, don't convert */
144     if (buf == NULL) {
145         if (c < 0x80) return 1;
146         else if (c < 0x800) return 2;
147         else if (c < 0x10000) return 3;
148         else return 4;
149     }
150 
151     if (c < 0x80) {
152         p[len++] = c;
153     } else if (c < 0x800) {
154         p[len++] = 0xc0 | ( c >> 6 );
155         p[len++] = 0x80 | ( c & 0x3f );
156     } else if (c < 0x10000) {
157         p[len++] = 0xe0 | ( c >> 12 );
158         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
159         p[len++] = 0x80 | ( c & 0x3f );
160     } else /* if (c < 0x110000) */ {
161         p[len++] = 0xf0 | ( c >> 18 );
162         p[len++] = 0x80 | ( (c >> 12) & 0x3f );
163         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
164         p[len++] = 0x80 | ( c & 0x3f );
165     }
166 
167     return len;
168 }
169