xref: /freebsd/crypto/krb5/src/lib/krb5/unicode/ucstr.c (revision 7f2fe78b9dd5f51c821d771b63d2e096f6fd49e9)
/*
 * Copyright 1998-2008 The OpenLDAP Foundation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP Public
 * License.
 *
 * A copy of this license is available in file LICENSE in the top-level
 * directory of the distribution or, alternatively, at
 * <https://www.OpenLDAP.org/license.html>.
 */
12 
/*
 * This work is part of OpenLDAP Software <https://www.openldap.org/>.
 * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucstr.c,v 1.40 2008/03/04 06:24:05 hyc Exp $
 */
17 
18 #include "k5-int.h"
19 #include "k5-utf8.h"
20 #include "k5-unicode.h"
21 #include "k5-input.h"
22 #include "ucdata/ucdata.h"
23 
24 #include <ctype.h>
25 
26 static int
krb5int_ucstrncmp(const krb5_unicode * u1,const krb5_unicode * u2,size_t n)27 krb5int_ucstrncmp(
28 		  const krb5_unicode * u1,
29 		  const krb5_unicode * u2,
30 		  size_t n)
31 {
32     for (; 0 < n; ++u1, ++u2, --n) {
33 	if (*u1 != *u2) {
34 	    return *u1 < *u2 ? -1 : +1;
35 	}
36 	if (*u1 == 0) {
37 	    return 0;
38 	}
39     }
40     return 0;
41 }
42 
43 static int
krb5int_ucstrncasecmp(const krb5_unicode * u1,const krb5_unicode * u2,size_t n)44 krb5int_ucstrncasecmp(
45 		      const krb5_unicode * u1,
46 		      const krb5_unicode * u2,
47 		      size_t n)
48 {
49     for (; 0 < n; ++u1, ++u2, --n) {
50 	krb5_unicode uu1 = uctolower(*u1);
51 	krb5_unicode uu2 = uctolower(*u2);
52 
53 	if (uu1 != uu2) {
54 	    return uu1 < uu2 ? -1 : +1;
55 	}
56 	if (uu1 == 0) {
57 	    return 0;
58 	}
59     }
60     return 0;
61 }
62 
63 /* Return true if data contains valid UTF-8 sequences. */
64 krb5_boolean
k5_utf8_validate(const krb5_data * data)65 k5_utf8_validate(const krb5_data *data)
66 {
67     struct k5input in;
68     int len, tmplen, i;
69     const uint8_t *bytes;
70 
71     k5_input_init(&in, data->data, data->length);
72     while (!in.status && in.len > 0) {
73 	len = KRB5_UTF8_CHARLEN(in.ptr);
74 	if (len < 1 || len > 4)
75 	    return FALSE;
76 	bytes = k5_input_get_bytes(&in, len);
77 	if (bytes == NULL)
78 	    return FALSE;
79 	if (KRB5_UTF8_CHARLEN2(bytes, tmplen) != len)
80 	    return FALSE;
81 	for (i = 1; i < len; i++) {
82 	    if ((bytes[i] & 0xc0) != 0x80)
83 		return FALSE;
84 	}
85     }
86     return !in.status;
87 }
88 
/*
 * Yield the lowercase form of c if c is an uppercase letter, otherwise c
 * itself.  The argument is converted to unsigned char before it is handed to
 * the <ctype.h> classification functions, because their behavior is undefined
 * for negative values other than EOF and plain char may be signed.  c is
 * evaluated more than once, so it must not have side effects.
 */
#define TOLOWER(c) \
    (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
90 
91 /* compare UTF8-strings, optionally ignore casing */
92 /* slow, should be optimized */
93 int
krb5int_utf8_normcmp(const krb5_data * data1,const krb5_data * data2,unsigned flags)94 krb5int_utf8_normcmp(
95 		     const krb5_data * data1,
96 		     const krb5_data * data2,
97 		     unsigned flags)
98 {
99     int i, l1, l2, len, ulen, res = 0;
100     char *s1, *s2, *done;
101     krb5_ucs4 *ucs, *ucsout1, *ucsout2;
102 
103     unsigned casefold = flags & KRB5_UTF8_CASEFOLD;
104     unsigned norm1 = flags & KRB5_UTF8_ARG1NFC;
105     unsigned norm2 = flags & KRB5_UTF8_ARG2NFC;
106 
107     if (data1 == NULL) {
108 	return data2 == NULL ? 0 : -1;
109 
110     } else if (data2 == NULL) {
111 	return 1;
112     }
113     l1 = data1->length;
114     l2 = data2->length;
115 
116     len = (l1 < l2) ? l1 : l2;
117     if (len == 0) {
118 	return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
119     }
120     s1 = data1->data;
121     s2 = data2->data;
122     done = s1 + len;
123 
124     while ((s1 < done) && KRB5_UTF8_ISASCII(s1) && KRB5_UTF8_ISASCII(s2)) {
125 	if (casefold) {
126 	    char c1 = TOLOWER(*s1);
127 	    char c2 = TOLOWER(*s2);
128 	    res = c1 - c2;
129 	} else {
130 	    res = *s1 - *s2;
131 	}
132 	s1++;
133 	s2++;
134 	if (res) {
135 	    /* done unless next character in s1 or s2 is non-ascii */
136 	    if (s1 < done) {
137 		if (!KRB5_UTF8_ISASCII(s1) || !KRB5_UTF8_ISASCII(s2)) {
138 		    break;
139 		}
140 	    } else if (((len < l1) && !KRB5_UTF8_ISASCII(s1)) ||
141 		       ((len < l2) && !KRB5_UTF8_ISASCII(s2))) {
142 		break;
143 	    }
144 	    return res;
145 	}
146     }
147 
148     /* We have encountered non-ascii or strings equal up to len */
149 
150     /* set i to number of iterations */
151     i = s1 - done + len;
152     /* passed through loop at least once? */
153     if (i > 0) {
154 	if (!res && (s1 == done) &&
155 	    ((len == l1) || KRB5_UTF8_ISASCII(s1)) &&
156 	    ((len == l2) || KRB5_UTF8_ISASCII(s2))) {
157 	    /* all ascii and equal up to len */
158 	    return l1 - l2;
159 	}
160 	/* rewind one char, and do normalized compare from there */
161 	s1--;
162 	s2--;
163 	l1 -= i - 1;
164 	l2 -= i - 1;
165     }
166     /*
167      * Should first check to see if strings are already in proper normalized
168      * form.
169      */
170     ucs = malloc(((norm1 || l1 > l2) ? l1 : l2) * sizeof(*ucs));
171     if (ucs == NULL) {
172 	return l1 > l2 ? 1 : -1;/* what to do??? */
173     }
174     /*
175      * XXYYZ: we convert to ucs4 even though -llunicode
176      * expects ucs2 in an ac_uint4
177      */
178 
179     /* convert and normalize 1st string */
180     for (i = 0, ulen = 0; i < l1; i += len, ulen++) {
181 	if (krb5int_utf8_to_ucs4(s1 + i, &ucs[ulen]) == -1) {
182 	    free(ucs);
183 	    return -1;		/* what to do??? */
184 	}
185 	len = KRB5_UTF8_CHARLEN(s1 + i);
186     }
187 
188     if (norm1) {
189 	ucsout1 = ucs;
190 	l1 = ulen;
191 	ucs = malloc(l2 * sizeof(*ucs));
192 	if (ucs == NULL) {
193 	    free(ucsout1);
194 	    return l1 > l2 ? 1 : -1;	/* what to do??? */
195 	}
196     } else {
197 	uccompatdecomp(ucs, ulen, &ucsout1, &l1);
198 	l1 = uccanoncomp(ucsout1, l1);
199     }
200 
201     /* convert and normalize 2nd string */
202     for (i = 0, ulen = 0; i < l2; i += len, ulen++) {
203 	if (krb5int_utf8_to_ucs4(s2 + i, &ucs[ulen]) == -1) {
204 	    free(ucsout1);
205 	    free(ucs);
206 	    return 1;		/* what to do??? */
207 	}
208 	len = KRB5_UTF8_CHARLEN(s2 + i);
209     }
210 
211     if (norm2) {
212 	ucsout2 = ucs;
213 	l2 = ulen;
214     } else {
215 	uccompatdecomp(ucs, ulen, &ucsout2, &l2);
216 	l2 = uccanoncomp(ucsout2, l2);
217 	free(ucs);
218     }
219 
220     res = casefold
221 	? krb5int_ucstrncasecmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2)
222 	: krb5int_ucstrncmp(ucsout1, ucsout2, l1 < l2 ? l1 : l2);
223     free(ucsout1);
224     free(ucsout2);
225 
226     if (res != 0) {
227 	return res;
228     }
229     if (l1 == l2) {
230 	return 0;
231     }
232     return l1 > l2 ? 1 : -1;
233 }
234