xref: /illumos-gate/usr/src/lib/libslp/clib/slp_utf8.c (revision 3299f39fdcbdab4be7a9c70daa3873f2b78a398d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 1999 by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
/*
 * UTF-8 encoded Unicode parsing routines. For efficiency, we convert
 * to wide chars only when absolutely needed. The following interfaces
 * are exported to libslp:
 *
 * slp_utf_strchr:	same semantics as strchr, but handles UTF-8 strings
 * slp_fold_space:	folds white space around and in between words;
 *				handles UTF-8 strings
 * slp_strcasecmp:	same semantics as strcasecmp, but also folds white
 *				space and attempts locale-specific
 *				case-insensitive comparisons.
 */
41 
42 #include <stdio.h>
43 #include <string.h>
44 #include <widec.h>
45 #include <stdlib.h>
46 #include <syslog.h>
47 #include <slp-internal.h>
48 
/*
 * Multibyte-aware variant of strchr: returns a pointer to the first
 * occurrence of c in s, or NULL if c does not occur.
 *
 * Assumes that s starts on a character boundary and that c is a 7-bit
 * ASCII char. Only characters that mblen() reports as exactly one byte
 * long are compared against c, so a byte that is part of a longer
 * multibyte character never matches.
 *
 * Unlike strchr, searching for '\0' returns NULL, because the scan stops
 * at the terminator without examining it.
 *
 * Bytes that do not form a valid multibyte character in the current
 * locale are skipped one at a time and never match.
 */
char *slp_utf_strchr(const char *s, char c) {
	int len;
	char *p;

	for (p = (char *)s; *p; p += len) {
		len = mblen(p, MB_CUR_MAX);
		if (len == 1 && *p == c)
			return (p);
		if (len < 1) {
			/*
			 * mblen() returns -1 for an invalid sequence; adding
			 * that to p would move the scan backwards. Reset the
			 * conversion state and step over the offending byte.
			 */
			(void) mblen(NULL, 0);
			len = 1;
		}
	}
	return (NULL);
}
65 
/*
 * Folds white space around and in between words:
 * " aa    bb   " becomes "aa bb".
 *
 * Leading and trailing white space is removed and every interior run of
 * white space is replaced by a single ' '. Only characters that mblen()
 * reports as one byte long are tested with isspace(); multibyte
 * characters are copied through unchanged. Bytes that do not form a
 * valid multibyte character in the current locale are copied verbatim,
 * one at a time.
 *
 * Returns a newly allocated string, or NULL if memory could not be
 * allocated (the failure is reported through slp_err). The caller must
 * free the result when done.
 */
static char *slp_fold_space(const char *s) {
	int len;
	char *folded, *f;

	/* the folded string is never longer than the input */
	if (!(folded = malloc(strlen(s) + 1))) {
		slp_err(LOG_CRIT, 0, "slp_fold_space", "out of memory");
		return (NULL);
	}

	f = folded;
	for (;;) {
		/* step 1: skip white space */
		while (*s && mblen(s, MB_CUR_MAX) == 1 &&
		    isspace((unsigned char)*s))
			s++;

		if (!*s)
			break;	/* end of string */

		/* if we are in between words, keep one space */
		if (f != folded)
			*f++ = ' ';

		/* step 2: copy into folded until we hit more white space */
		while (*s) {
			int i;

			len = mblen(s, MB_CUR_MAX);
			if (len == 1 && isspace((unsigned char)*s))
				break;

			if (len < 1) {
				/*
				 * Invalid sequence: mblen() returned -1.
				 * Copy a single byte so the scan always
				 * makes progress, and reset the conversion
				 * state before the next call.
				 */
				(void) mblen(NULL, 0);
				len = 1;
			}

			for (i = 0; i < len; i++)
				*f++ = *s++;
		}
	}

	*f = '\0';
	return (folded);
}
116 
/*
 * Case-insensitive string comparison that also ignores differences in
 * white space (leading, trailing, and repeated interior white space)
 * and can compare UTF-8 strings case-insensitively. The application's
 * locale must have been set to a UTF-8 locale for the wide-character
 * comparison to work properly.
 *
 * The comparison is attempted in three stages, stopping at the first
 * one that reports equality:
 *   1. plain strcasecmp on the raw strings;
 *   2. strcasecmp on the white-space-folded strings;
 *   3. wscasecmp on the folded strings converted to wide chars.
 *
 * Returns 0 if the strings are considered equal. Returns -1 if the
 * strings could not be folded. If the wide-char conversion cannot be
 * performed, the non-zero result of stage 2 is returned.
 */
int slp_strcasecmp(const char *s1, const char *s2) {
	char *folded1 = NULL;
	char *folded2 = NULL;
	wchar_t *wide1 = NULL;
	wchar_t *wide2 = NULL;
	size_t nwide1, nwide2;
	int result = -1;

	/* stage 1: the cheap comparison settles most cases */
	if (strcasecmp(s1, s2) == 0)
		return (0);

	/* stage 2: compare again with white space folded */
	folded1 = slp_fold_space(s1);
	folded2 = slp_fold_space(s2);
	if (folded1 == NULL || folded2 == NULL)
		goto done;

	result = strcasecmp(folded1, folded2);
	if (result == 0)
		goto done;

	/*
	 * stage 3: convert to wide chars and compare; this only works in
	 * a locale which supports the UTF8 codeset.
	 */
	nwide1 = mbstowcs(NULL, folded1, 0);
	if (nwide1 == (size_t)-1)
		goto done;

	wide1 = malloc((nwide1 + 1) * sizeof (*wide1));
	if (wide1 == NULL) {
		slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
		goto done;
	}

	nwide2 = mbstowcs(NULL, folded2, 0);
	if (nwide2 == (size_t)-1)
		goto done;

	wide2 = malloc((nwide2 + 1) * sizeof (*wide2));
	if (wide2 == NULL) {
		slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
		goto done;
	}

	if (mbstowcs(wide1, folded1, nwide1 + 1) == (size_t)-1)
		goto done;
	if (mbstowcs(wide2, folded2, nwide2 + 1) == (size_t)-1)
		goto done;

	result = wscasecmp(wide1, wide2);

done:
	/* free(NULL) is a no-op, so no guards are needed */
	free(folded1);
	free(folded2);
	free(wide1);
	free(wide2);
	return (result);
}
177