/*
 * Copyright 1998-2008 The OpenLDAP Foundation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP Public
 * License.
 *
 * A copy of this license is available in file LICENSE in the top-level
 * directory of the distribution or, alternatively, at
 * <https://www.OpenLDAP.org/license.html>.
 */
12
/*
 * This work is part of OpenLDAP Software <https://www.openldap.org/>.
 * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucstr.c,v 1.40 2008/03/04 06:24:05 hyc Exp $
 */
17
18 #include "k5-int.h"
19 #include "k5-utf8.h"
20 #include "k5-unicode.h"
21 #include "k5-input.h"
22 #include "ucdata/ucdata.h"
23
24 #include <ctype.h>
25
26 static int
krb5int_ucstrncmp(const krb5_unicode * u1,const krb5_unicode * u2,size_t n)27 krb5int_ucstrncmp(
28 const krb5_unicode * u1,
29 const krb5_unicode * u2,
30 size_t n)
31 {
32 for (; 0 < n; ++u1, ++u2, --n) {
33 if (*u1 != *u2) {
34 return *u1 < *u2 ? -1 : +1;
35 }
36 if (*u1 == 0) {
37 return 0;
38 }
39 }
40 return 0;
41 }
42
43 static int
krb5int_ucstrncasecmp(const krb5_unicode * u1,const krb5_unicode * u2,size_t n)44 krb5int_ucstrncasecmp(
45 const krb5_unicode * u1,
46 const krb5_unicode * u2,
47 size_t n)
48 {
49 for (; 0 < n; ++u1, ++u2, --n) {
50 krb5_unicode uu1 = uctolower(*u1);
51 krb5_unicode uu2 = uctolower(*u2);
52
53 if (uu1 != uu2) {
54 return uu1 < uu2 ? -1 : +1;
55 }
56 if (uu1 == 0) {
57 return 0;
58 }
59 }
60 return 0;
61 }
62
63 /* Return true if data contains valid UTF-8 sequences. */
64 krb5_boolean
k5_utf8_validate(const krb5_data * data)65 k5_utf8_validate(const krb5_data *data)
66 {
67 struct k5input in;
68 int len, tmplen, i;
69 const uint8_t *bytes;
70
71 k5_input_init(&in, data->data, data->length);
72 while (!in.status && in.len > 0) {
73 len = KRB5_UTF8_CHARLEN(in.ptr);
74 if (len < 1 || len > 4)
75 return FALSE;
76 bytes = k5_input_get_bytes(&in, len);
77 if (bytes == NULL)
78 return FALSE;
79 if (KRB5_UTF8_CHARLEN2(bytes, tmplen) != len)
80 return FALSE;
81 for (i = 1; i < len; i++) {
82 if ((bytes[i] & 0xc0) != 0x80)
83 return FALSE;
84 }
85 }
86 return !in.status;
87 }
88
/*
 * Yield the lowercase form of c if isupper() reports it as an uppercase
 * letter, and c itself otherwise.
 *
 * The argument is converted to unsigned char before it reaches the
 * <ctype.h> functions: their behavior is undefined for negative values
 * other than EOF, and plain char may be signed.  Note that c is evaluated
 * more than once, so it must not have side effects.
 */
#define TOLOWER(c) \
    (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
90
91 /* compare UTF8-strings, optionally ignore casing */
92 /* slow, should be optimized */
93 int
krb5int_utf8_normcmp(const krb5_data * data1,const krb5_data * data2,unsigned flags)94 krb5int_utf8_normcmp(
95 const krb5_data * data1,
96 const krb5_data * data2,
97 unsigned flags)
98 {
99 int i, l1, l2, len, ulen, res = 0;
100 char *s1, *s2, *done;
101 krb5_ucs4 *ucs, *ucsout1, *ucsout2;
102
103 unsigned casefold = flags & KRB5_UTF8_CASEFOLD;
104 unsigned norm1 = flags & KRB5_UTF8_ARG1NFC;
105 unsigned norm2 = flags & KRB5_UTF8_ARG2NFC;
106
107 if (data1 == NULL) {
108 return data2 == NULL ? 0 : -1;
109
110 } else if (data2 == NULL) {
111 return 1;
112 }
113 l1 = data1->length;
114 l2 = data2->length;
115
116 len = (l1 < l2) ? l1 : l2;
117 if (len == 0) {
118 return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
119 }
120 s1 = data1->data;
121 s2 = data2->data;
122 done = s1 + len;
123
124 while ((s1 < done) && KRB5_UTF8_ISASCII(s1) && KRB5_UTF8_ISASCII(s2)) {
125 if (casefold) {
126 char c1 = TOLOWER(*s1);
127 char c2 = TOLOWER(*s2);
128 res = c1 - c2;
129 } else {
130 res = *s1 - *s2;
131 }
132 s1++;
133 s2++;
134 if (res) {
135 /* done unless next character in s1 or s2 is non-ascii */
136 if (s1 < done) {
137 if (!KRB5_UTF8_ISASCII(s1) || !KRB5_UTF8_ISASCII(s2)) {
138 break;
139 }
140 } else if (((len < l1) && !KRB5_UTF8_ISASCII(s1)) ||
141 ((len < l2) && !KRB5_UTF8_ISASCII(s2))) {
142 break;
143 }
144 return res;
145 }
146 }
147
148 /* We have encountered non-ascii or strings equal up to len */
149
150 /* set i to number of iterations */
151 i = s1 - done + len;
152 /* passed through loop at least once? */
153 if (i > 0) {
154 if (!res && (s1 == done) &&
155 ((len == l1) || KRB5_UTF8_ISASCII(s1)) &&
156 ((len == l2) || KRB5_UTF8_ISASCII(s2))) {
157 /* all ascii and equal up to len */
158 return l1 - l2;
159 }
160 /* rewind one char, and do normalized compare from there */
161 s1--;
162 s2--;
163 l1 -= i - 1;
164 l2 -= i - 1;
165 }
166 /*
167 * Should first check to see if strings are already in proper normalized
168 * form.
169 */
170 ucs = malloc(((norm1 || l1 > l2) ? l1 : l2) * sizeof(*ucs));
171 if (ucs == NULL) {
172 return l1 > l2 ? 1 : -1;/* what to do??? */
173 }
174 /*
175 * XXYYZ: we convert to ucs4 even though -llunicode
176 * expects ucs2 in an ac_uint4
177 */
178
179 /* convert and normalize 1st string */
180 for (i = 0, ulen = 0; i < l1; i += len, ulen++) {
181 if (krb5int_utf8_to_ucs4(s1 + i, &ucs[ulen]) == -1) {
182 free(ucs);
183 return -1; /* what to do??? */
184 }
185 len = KRB5_UTF8_CHARLEN(s1 + i);
186 }
187
188 if (norm1) {
189 ucsout1 = ucs;
190 l1 = ulen;
191 ucs = malloc(l2 * sizeof(*ucs));
192 if (ucs == NULL) {
193 free(ucsout1);
194 return l1 > l2 ? 1 : -1; /* what to do??? */
195 }
196 } else {
197 uccompatdecomp(ucs, ulen, &ucsout1, &l1);
198 l1 = uccanoncomp(ucsout1, l1);
199 }
200
201 /* convert and normalize 2nd string */
202 for (i = 0, ulen = 0; i < l2; i += len, ulen++) {
203 if (krb5int_utf8_to_ucs4(s2 + i, &ucs[ulen]) == -1) {
204 free(ucsout1);
205 free(ucs);
206 return 1; /* what to do??? */
207 }
208 len = KRB5_UTF8_CHARLEN(s2 + i);
209 }
210
211 if (norm2) {
212 ucsout2 = ucs;
213 l2 = ulen;
214 } else {
215 uccompatdecomp(ucs, ulen, &ucsout2, &l2);
216 l2 = uccanoncomp(ucsout2, l2);
217 free(ucs);
218 }
219
220 res = casefold
221 ? krb5int_ucstrncasecmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2)
222 : krb5int_ucstrncmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2);
223 free(ucsout1);
224 free(ucsout2);
225
226 if (res != 0) {
227 return res;
228 }
229 if (l1 == l2) {
230 return 0;
231 }
232 return l1 > l2 ? 1 : -1;
233 }
234