xref: /titanic_44/usr/src/lib/libast/common/regex/regcoll.c (revision edcc07547a39d6570197493a9836083bd6b2a197)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                      by AT&T Knowledge Ventures                      *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                   Phong Vo <kpv@research.att.com>                    *
20 *                                                                      *
21 ***********************************************************************/
22 #pragma prototyped
23 /*
24  * regex collation symbol support
25  */
26 
27 #include "reglib.h"
28 
29 #include <ccode.h>
30 
31 #ifndef UCS_BYTE
32 #define UCS_BYTE	1
33 #endif
34 
35 #include "ucs_names.h"
36 
37 typedef struct Ucs_map_s
38 {
39 	Ucs_attr_t		attr[3];
40 	Ucs_code_t		code;
41 	const char*		name;
42 	Dtlink_t		link;
43 	struct Ucs_map_s*	next;
44 } Ucs_map_t;
45 
/*
 * set, test and clear bit i of the attribute set a, an array of
 * Ucs_attr_t words holding 32 bits each: i>>5 selects the word and
 * the low 5 bits of i select the bit inside it
 *
 * the mask is built in Ucs_attr_t rather than int so that bit 31 of a
 * word does not shift a 1 into the sign bit of an int
 * NOTE(review): assumes Ucs_attr_t is unsigned and at least 32 bits
 * wide -- confirm against ucs_names.h
 */

#define setattr(a,i)	((a)[(i)>>5]|=((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define tstattr(a,i)	((a)[(i)>>5]&((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define clrattr(a,i)	((a)[(i)>>5]&=~((Ucs_attr_t)1<<((i)&((1<<5)-1))))
49 
50 static struct Local_s
51 {
52 	int		fatal;
53 	Dt_t*		attrs;
54 	Dt_t*		names;
55 	Dtdisc_t	dtdisc;
56 #if CC_NATIVE != CC_ASCII
57 	unsigned char*	a2n;
58 #endif
59 } local;
60 
61 /*
62  * initialize the writeable tables from the readonly data
63  * the tables are big enough to be concerned about text vs. data vs. bss
64  *	UCS_BYTE==0 100K
65  *	UCS_BYTE==1  20K
66  */
67 
68 static int
69 initialize(void)
70 {
71 	register int		i;
72 	register Ucs_map_t*	a;
73 	register Ucs_map_t*	w;
74 
75 	if (local.fatal)
76 		return -1;
77 	local.dtdisc.link = offsetof(Ucs_map_t, link);
78 	local.dtdisc.key = offsetof(Ucs_map_t, name);
79 	local.dtdisc.size = -1;
80 	if (!(w = (Ucs_map_t*)malloc(sizeof(Ucs_map_t) * (elementsof(ucs_attrs) + elementsof(ucs_names)))))
81 	{
82 		local.fatal = 1;
83 		return -1;
84 	}
85 	if (!(local.attrs = dtopen(&local.dtdisc, Dttree)))
86 	{
87 		free(w);
88 		local.fatal = 1;
89 		return -1;
90 	}
91 	if (!(local.names = dtopen(&local.dtdisc, Dttree)))
92 	{
93 		free(w);
94 		dtclose(local.attrs);
95 		local.fatal = 1;
96 		return -1;
97 	}
98 	for (i = 0; i < elementsof(ucs_attrs); i++, w++)
99 	{
100 		memcpy(w, &ucs_attrs[i], offsetof(Ucs_dat_t, table));
101 		w->name = ucs_strings[ucs_attrs[i].table] + ucs_attrs[i].index;
102 		w->next = 0;
103 		dtinsert(local.attrs, w);
104 	}
105 	for (i = 0; i < elementsof(ucs_names); i++, w++)
106 	{
107 		memcpy(w, &ucs_names[i], offsetof(Ucs_dat_t, table));
108 		w->name = ucs_strings[ucs_names[i].table] + ucs_names[i].index;
109 		w->next = 0;
110 		if (a = (Ucs_map_t*)dtsearch(local.names, w))
111 		{
112 			while (a->next)
113 				a = a->next;
114 			a->next = w;
115 		}
116 		else
117 			dtinsert(local.names, w);
118 	}
119 #if CC_NATIVE != CC_ASCII
120 	local.a2n = ccmap(CC_ASCII, CC_NATIVE);
121 #endif
122 	return 0;
123 }
124 
125 /*
126  * return the collating symbol delimited by [c c], where c is either '=' or '.'
127  * s points to the first char after the initial [
128  * if e!=0 it is set to point to the next char in s on return
129  *
130  * the collating symbol is converted to multibyte in <buf,size>
131  * the return value is:
132  *	-1	syntax error or buf not large enough
133  *	>=0	size with 0-terminated mb collation element
134  *		or ligature value in buf
135  */
136 
137 int
138 regcollate(register const char* s, char** e, char* buf, int size)
139 {
140 	register int		c;
141 	register char*		u;
142 	register char*		b;
143 	register char*		x;
144 	register Ucs_map_t*	a;
145 	Ucs_map_t*		z;
146 	const char*		t;
147 	const char*		v;
148 	int			n;
149 	int			r;
150 	int			ul;
151 	int			term;
152 	wchar_t			w[2];
153 	Ucs_attr_t		attr[3];
154 
155 	if (size < 2)
156 		r = -1;
157 	else if ((term = *s++) != '.' && term != '=')
158 	{
159 		s--;
160 		r = -1;
161 	}
162 	else if (*s == term && *(s + 1) == ']')
163 		r = -1;
164 	else
165 	{
166 		t = s;
167 		mbchar(s);
168 		if ((n = (s - t)) == 1)
169 		{
170 			if (*s == term && *(s + 1) == ']')
171 			{
172 				s += 2;
173 				r = -1;
174 			}
175 			else
176 			{
177 				if (!local.attrs && initialize())
178 					return -1;
179 				attr[0] = attr[1] = attr[2] = 0;
180 				ul = 0;
181 				b = buf;
182 				x = buf + size - 2;
183 				r = 1;
184 				s = t;
185 				do
186 				{
187 					v = s;
188 					u = b;
189 					for (;;)
190 					{
191 						if (!(c = *s++))
192 							return -1;
193 						if (c == term)
194 						{
195 							if (!(c = *s++))
196 								return -1;
197 							if (c != term)
198 							{
199 								if (c != ']')
200 									return -1;
201 								r = -1;
202 								break;
203 							}
204 						}
205 						if (c == ' ' || c == '-' && u > b && *s != ' ' && *s != '-')
206 							break;
207 						if (isupper(c))
208 							c = tolower(c);
209 						if (u > x)
210 							break;
211 						*u++ = c;
212 					}
213 					*u = 0;
214 					if (a = (Ucs_map_t*)dtmatch(local.attrs, b))
215 						setattr(attr, a->code);
216 					else
217 					{
218 						if (u < x)
219 							*u++ = ' ';
220 						if (b == buf)
221 						{
222 							if (isupper(*v))
223 								ul = UCS_UC;
224 							else if (islower(*v))
225 								ul = UCS_LC;
226 						}
227 						b = u;
228 					}
229 				} while (r > 0);
230 				if (b > buf && *(b - 1) == ' ')
231 					b--;
232 				*b = 0;
233 				attr[0] &= ~((Ucs_attr_t)1);
234 				if (ul)
235 				{
236 					if (tstattr(attr, UCS_UC) || tstattr(attr, UCS_LC))
237 						ul = 0;
238 					else
239 						setattr(attr, ul);
240 				}
241 				if (z = (Ucs_map_t*)dtmatch(local.names, buf))
242 					for(;;)
243 					{
244 						for (a = z; a; a = a->next)
245 							if ((attr[0] & a->attr[0]) == attr[0] && (attr[1] & a->attr[1]) == attr[1] && (attr[2] & a->attr[2]) == attr[2])
246 							{
247 								if (a->code <= 0xff)
248 								{
249 #if CC_NATIVE != CC_ASCII
250 									buf[0] = local.a2n[a->code];
251 #else
252 									buf[0] = a->code;
253 #endif
254 									buf[r = 1] = 0;
255 									ul = 0;
256 									break;
257 								}
258 								w[0] = a->code;
259 								w[1] = 0;
260 								if ((r = wcstombs(buf, w, size)) > 0)
261 								{
262 									r--;
263 									ul = 0;
264 								}
265 								break;
266 							}
267 						if (!ul)
268 							break;
269 						clrattr(attr, ul);
270 						ul = 0;
271 					}
272 			}
273 			if (r < 0)
274 			{
275 				if ((r = s - t - 2) > (size - 1))
276 					return -1;
277 				memcpy(buf, t, r);
278 				buf[r] = 0;
279 			}
280 		}
281 		else if (*s++ != term || *s++ != ']')
282 		{
283 			s--;
284 			r = -1;
285 		}
286 		else if (n > (size - 1))
287 			r = -1;
288 		else
289 		{
290 			memcpy(buf, t, n);
291 			buf[r = n] = 0;
292 		}
293 	}
294 	if (e)
295 		*e = (char*)s;
296 	return r;
297 }
298