xref: /titanic_44/usr/src/lib/libslp/clib/slp_utf8.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1999 by Sun Microsystems, Inc.
24*7c478bd9Sstevel@tonic-gate  * All rights reserved.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*7c478bd9Sstevel@tonic-gate 
/*
 * UTF-8 encoded Unicode parsing routines. For efficiency, we convert
 * to wide chars only when absolutely needed. The following interfaces
 * are exported to libslp:
 *
 * slp_utf_strchr:	same semantics as strchr, but handles UTF-8 strings
 * slp_fold_space:	folds white space around and in between words;
 *				handles UTF-8 strings
 * slp_strcasecmp:	same semantics as strcasecmp, but also folds white
 *				space and attempts locale-specific
 *				case-insensitive comparisons.
 */
41*7c478bd9Sstevel@tonic-gate 
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <widec.h>
#include <stdlib.h>
#include <syslog.h>
#include <slp-internal.h>
48*7c478bd9Sstevel@tonic-gate 
/*
 * Locate the first occurrence of the single-byte character c in the
 * multibyte string s, stepping over the string one whole character at
 * a time so that a byte inside a multibyte character is never matched.
 *
 * Assumes that s starts on a character boundary and that c is a 7-bit
 * ASCII char. Character lengths are determined with mblen(), so the
 * result depends on the current locale.
 *
 * Returns a pointer to the matching byte, or NULL if c does not occur
 * before the terminating NUL. Unlike strchr, searching for '\0' yields
 * NULL because the scan stops at the terminator.
 */
char *
slp_utf_strchr(const char *s, char c)
{
	int len;
	char *p;

	for (p = (char *)s; *p != '\0'; p += len) {
		len = mblen(p, MB_CUR_MAX);
		if (len < 1) {
			/*
			 * Invalid or truncated sequence: mblen() returned -1.
			 * Adding that to p would walk backwards, so reset the
			 * conversion state and step over exactly one byte.
			 */
			(void) mblen(NULL, 0);
			len = 1;
			continue;
		}
		if (len == 1 && *p == c)
			return (p);
	}
	return (NULL);
}
65*7c478bd9Sstevel@tonic-gate 
/*
 * Folds white space around and in between words:
 * " aa    bb   " becomes "aa bb".
 *
 * Leading and trailing white space is dropped and every interior run
 * of white space is replaced by a single ' '. Only single-byte
 * characters (as reported by mblen() in the current locale) are ever
 * treated as white space; multibyte characters are copied through
 * unchanged. Bytes that do not form a valid multibyte character are
 * copied through one at a time.
 *
 * Returns a newly allocated string, or NULL if memory could not be
 * allocated (the failure is reported through slp_err). The caller must
 * free the result when done.
 */
static char *
slp_fold_space(const char *s)
{
	int len;
	char *folded, *f;

	/* the folded string can never be longer than the input */
	if (!(folded = malloc(strlen(s) + 1))) {
		slp_err(LOG_CRIT, 0, "slp_fold_space", "out of memory");
		return (NULL);
	}

	f = folded;
	for (;;) {
		/* step 1: skip white space */
		for (; *s != '\0'; s++) {
			len = mblen(s, MB_CUR_MAX);
			/* cast: isspace() is undefined for negative values */
			if (len != 1 || !isspace((unsigned char)*s))
				break;
		}

		if (*s == '\0')
			break;

		/* if we are in between words, keep one space */
		if (f != folded)
			*f++ = ' ';

		/* step 2: copy into folded until we hit more white space */
		while (*s != '\0') {
			len = mblen(s, MB_CUR_MAX);
			if (len < 1) {
				/*
				 * Invalid sequence: without this the loop
				 * would copy nothing and never advance. Reset
				 * the conversion state and copy a single byte.
				 */
				(void) mblen(NULL, 0);
				len = 1;
			} else if (len == 1 && isspace((unsigned char)*s)) {
				break;
			}

			/* never copy past the terminator */
			while (len-- > 0 && *s != '\0')
				*f++ = *s++;
		}
	}

	*f = '\0';
	return (folded);
}
116*7c478bd9Sstevel@tonic-gate 
/*
 * Case-insensitive string comparison that also ignores differences in
 * white space, and falls back to a wide-character comparison so that
 * UTF-8 text can be compared without regard to case. The wide-char
 * fallback only works when the application's locale has been set to
 * one that uses the UTF-8 codeset.
 *
 * The comparison is attempted in three stages, stopping at the first
 * one that reports equality:
 *   1. strcasecmp on the strings as given;
 *   2. strcasecmp on the white-space-folded copies;
 *   3. wscasecmp on the folded copies converted to wide chars.
 *
 * Returns 0 if the strings are considered equal. Returns -1 if the
 * folded copies could not be allocated. If the wide-char stage cannot
 * be carried out (conversion or allocation failure), the non-zero
 * result of stage 2 is returned; otherwise the result of wscasecmp
 * is returned.
 */
int
slp_strcasecmp(const char *s1, const char *s2)
{
	char *folded1 = NULL;
	char *folded2 = NULL;
	wchar_t *wide1 = NULL;
	wchar_t *wide2 = NULL;
	size_t nwide1, nwide2;
	int result = -1;

	/* stage 1: cheapest check, no allocation required */
	if (strcasecmp(s1, s2) == 0)
		return (0);

	/* stage 2: compare again with white space folded */
	folded1 = slp_fold_space(s1);
	folded2 = slp_fold_space(s2);
	if (folded1 == NULL || folded2 == NULL)
		goto done;

	result = strcasecmp(folded1, folded2);
	if (result == 0)
		goto done;

	/*
	 * stage 3: convert both strings to wide chars and compare those.
	 * A first mbstowcs call with a NULL destination yields the number
	 * of wide chars needed; one more is allocated for the terminator.
	 */
	nwide1 = mbstowcs(NULL, folded1, 0);
	if (nwide1 == (size_t)-1)
		goto done;

	wide1 = malloc((nwide1 + 1) * sizeof (*wide1));
	if (wide1 == NULL) {
		slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
		goto done;
	}

	nwide2 = mbstowcs(NULL, folded2, 0);
	if (nwide2 == (size_t)-1)
		goto done;

	wide2 = malloc((nwide2 + 1) * sizeof (*wide2));
	if (wide2 == NULL) {
		slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
		goto done;
	}

	/* the second conversion is only attempted if the first succeeds */
	if (mbstowcs(wide1, folded1, nwide1 + 1) == (size_t)-1 ||
	    mbstowcs(wide2, folded2, nwide2 + 1) == (size_t)-1)
		goto done;

	result = wscasecmp(wide1, wide2);

done:
	/* free(NULL) is a no-op, so no guards are needed */
	free(folded1);
	free(folded2);
	free(wide1);
	free(wide2);
	return (result);
}
177