1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23 /*
24 * regex collation symbol support
25 */
26
27 #include "reglib.h"
28
29 #include <ccode.h>
30
31 #ifndef UCS_BYTE
32 #define UCS_BYTE 1
33 #endif
34
35 #include "ucs_names.h"
36
37 typedef struct Ucs_map_s
38 {
39 Ucs_attr_t attr[3];
40 Ucs_code_t code;
41 const char* name;
42 Dtlink_t link;
43 struct Ucs_map_s* next;
44 } Ucs_map_t;
45
/*
 * attribute bit set operations on an array of Ucs_attr_t words
 * bit i lives in word i>>5 at position i&31 (32 bits used per word)
 *
 * the mask is built from (Ucs_attr_t)1 rather than the int constant 1
 * so that selecting bit 31 does not left shift into the sign bit of a
 * signed int, which is undefined
 *
 * a is evaluated once, i is evaluated twice -- avoid side effects in i
 */

#define setattr(a,i)	((a)[(i)>>5]|=((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define tstattr(a,i)	((a)[(i)>>5]&((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define clrattr(a,i)	((a)[(i)>>5]&=~((Ucs_attr_t)1<<((i)&((1<<5)-1))))
49
50 static struct Local_s
51 {
52 int fatal;
53 Dt_t* attrs;
54 Dt_t* names;
55 Dtdisc_t dtdisc;
56 #if CC_NATIVE != CC_ASCII
57 unsigned char* a2n;
58 #endif
59 } local;
60
61 /*
62 * initialize the writeable tables from the readonly data
63 * the tables are big enough to be concerned about text vs. data vs. bss
64 * UCS_BYTE==0 100K
65 * UCS_BYTE==1 20K
66 */
67
68 static int
initialize(void)69 initialize(void)
70 {
71 register int i;
72 register Ucs_map_t* a;
73 register Ucs_map_t* w;
74
75 if (local.fatal)
76 return -1;
77 local.dtdisc.link = offsetof(Ucs_map_t, link);
78 local.dtdisc.key = offsetof(Ucs_map_t, name);
79 local.dtdisc.size = -1;
80 if (!(w = (Ucs_map_t*)malloc(sizeof(Ucs_map_t) * (elementsof(ucs_attrs) + elementsof(ucs_names)))))
81 {
82 local.fatal = 1;
83 return -1;
84 }
85 if (!(local.attrs = dtopen(&local.dtdisc, Dttree)))
86 {
87 free(w);
88 local.fatal = 1;
89 return -1;
90 }
91 if (!(local.names = dtopen(&local.dtdisc, Dttree)))
92 {
93 free(w);
94 dtclose(local.attrs);
95 local.fatal = 1;
96 return -1;
97 }
98 for (i = 0; i < elementsof(ucs_attrs); i++, w++)
99 {
100 memcpy(w, &ucs_attrs[i], offsetof(Ucs_dat_t, table));
101 w->name = ucs_strings[ucs_attrs[i].table] + ucs_attrs[i].index;
102 w->next = 0;
103 dtinsert(local.attrs, w);
104 }
105 for (i = 0; i < elementsof(ucs_names); i++, w++)
106 {
107 memcpy(w, &ucs_names[i], offsetof(Ucs_dat_t, table));
108 w->name = ucs_strings[ucs_names[i].table] + ucs_names[i].index;
109 w->next = 0;
110 if (a = (Ucs_map_t*)dtsearch(local.names, w))
111 {
112 while (a->next)
113 a = a->next;
114 a->next = w;
115 }
116 else
117 dtinsert(local.names, w);
118 }
119 #if CC_NATIVE != CC_ASCII
120 local.a2n = ccmap(CC_ASCII, CC_NATIVE);
121 #endif
122 return 0;
123 }
124
125 /*
126 * return the collating symbol delimited by [c c], where c is either '=' or '.'
127 * s points to the first char after the initial [
128 * if e!=0 it is set to point to the next char in s on return
129 *
130 * the collating symbol is converted to multibyte in <buf,size>
131 * the return value is:
132 * -1 syntax error or buf not large enough
133 * >=0 size with 0-terminated mb collation element
134 * or ligature value in buf
135 */
136
137 int
regcollate(register const char * s,char ** e,char * buf,int size)138 regcollate(register const char* s, char** e, char* buf, int size)
139 {
140 register int c;
141 register char* u;
142 register char* b;
143 register char* x;
144 register Ucs_map_t* a;
145 Ucs_map_t* z;
146 const char* t;
147 const char* v;
148 int n;
149 int r;
150 int ul;
151 int term;
152 wchar_t w[2];
153 Ucs_attr_t attr[3];
154
155 if (size < 2)
156 r = -1;
157 else if ((term = *s++) != '.' && term != '=')
158 {
159 s--;
160 r = -1;
161 }
162 else if (*s == term && *(s + 1) == ']')
163 r = -1;
164 else
165 {
166 t = s;
167 mbchar(s);
168 if ((n = (s - t)) == 1)
169 {
170 if (*s == term && *(s + 1) == ']')
171 {
172 s += 2;
173 r = -1;
174 }
175 else
176 {
177 if (!local.attrs && initialize())
178 return -1;
179 attr[0] = attr[1] = attr[2] = 0;
180 ul = 0;
181 b = buf;
182 x = buf + size - 2;
183 r = 1;
184 s = t;
185 do
186 {
187 v = s;
188 u = b;
189 for (;;)
190 {
191 if (!(c = *s++))
192 return -1;
193 if (c == term)
194 {
195 if (!(c = *s++))
196 return -1;
197 if (c != term)
198 {
199 if (c != ']')
200 return -1;
201 r = -1;
202 break;
203 }
204 }
205 if (c == ' ' || c == '-' && u > b && *s != ' ' && *s != '-')
206 break;
207 if (isupper(c))
208 c = tolower(c);
209 if (u > x)
210 break;
211 *u++ = c;
212 }
213 *u = 0;
214 if (a = (Ucs_map_t*)dtmatch(local.attrs, b))
215 setattr(attr, a->code);
216 else
217 {
218 if (u < x)
219 *u++ = ' ';
220 if (b == buf)
221 {
222 if (isupper(*v))
223 ul = UCS_UC;
224 else if (islower(*v))
225 ul = UCS_LC;
226 }
227 b = u;
228 }
229 } while (r > 0);
230 if (b > buf && *(b - 1) == ' ')
231 b--;
232 *b = 0;
233 attr[0] &= ~((Ucs_attr_t)1);
234 if (ul)
235 {
236 if (tstattr(attr, UCS_UC) || tstattr(attr, UCS_LC))
237 ul = 0;
238 else
239 setattr(attr, ul);
240 }
241 if (z = (Ucs_map_t*)dtmatch(local.names, buf))
242 for(;;)
243 {
244 for (a = z; a; a = a->next)
245 if ((attr[0] & a->attr[0]) == attr[0] && (attr[1] & a->attr[1]) == attr[1] && (attr[2] & a->attr[2]) == attr[2])
246 {
247 #if 0
248 if (a->code <= 0xff)
249 {
250 #if CC_NATIVE != CC_ASCII
251 buf[0] = local.a2n[a->code];
252 #else
253 buf[0] = a->code;
254 #endif
255 buf[r = 1] = 0;
256 ul = 0;
257 break;
258 }
259 #endif
260 w[0] = a->code;
261 w[1] = 0;
262 if ((r = wcstombs(buf, w, size)) > 0)
263 ul = 0;
264 break;
265 }
266 if (!ul)
267 break;
268 clrattr(attr, ul);
269 ul = 0;
270 }
271 }
272 if (r < 0)
273 {
274 if ((n = s - t - 2) > (size - 1))
275 return -1;
276 memcpy(buf, t, n);
277 buf[n] = 0;
278 if (n == 1)
279 r = n;
280 else
281 {
282 for (t = buf; isalnum(*t); t++);
283 if (!*t)
284 r = n;
285 }
286 }
287 }
288 else if (*s++ != term || *s++ != ']')
289 {
290 s--;
291 r = -1;
292 }
293 else if (n > (size - 1))
294 r = -1;
295 else
296 {
297 memcpy(buf, t, n);
298 buf[r = n] = 0;
299 }
300 }
301 if (e)
302 *e = (char*)s;
303 return r;
304 }
305