1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin * *
3da2e3ebdSchin * This software is part of the ast package *
4*3e14f97fSRoger A. Faulkner * Copyright (c) 1985-2010 AT&T Intellectual Property *
5da2e3ebdSchin * and is licensed under the *
6da2e3ebdSchin * Common Public License, Version 1.0 *
77c2fbfb3SApril Chin * by AT&T Intellectual Property *
8da2e3ebdSchin * *
9da2e3ebdSchin * A copy of the License is available at *
10da2e3ebdSchin * http://www.opensource.org/licenses/cpl1.0.txt *
11da2e3ebdSchin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12da2e3ebdSchin * *
13da2e3ebdSchin * Information and Software Systems Research *
14da2e3ebdSchin * AT&T Research *
15da2e3ebdSchin * Florham Park NJ *
16da2e3ebdSchin * *
17da2e3ebdSchin * Glenn Fowler <gsf@research.att.com> *
18da2e3ebdSchin * David Korn <dgk@research.att.com> *
19da2e3ebdSchin * Phong Vo <kpv@research.att.com> *
20da2e3ebdSchin * *
21da2e3ebdSchin ***********************************************************************/
22da2e3ebdSchin #pragma prototyped
23da2e3ebdSchin /*
24da2e3ebdSchin * regex collation symbol support
25da2e3ebdSchin */
26da2e3ebdSchin
27da2e3ebdSchin #include "reglib.h"
28da2e3ebdSchin
29da2e3ebdSchin #include <ccode.h>
30da2e3ebdSchin
31da2e3ebdSchin #ifndef UCS_BYTE
32da2e3ebdSchin #define UCS_BYTE 1
33da2e3ebdSchin #endif
34da2e3ebdSchin
35da2e3ebdSchin #include "ucs_names.h"
36da2e3ebdSchin
37da2e3ebdSchin typedef struct Ucs_map_s
38da2e3ebdSchin {
39da2e3ebdSchin Ucs_attr_t attr[3];
40da2e3ebdSchin Ucs_code_t code;
41da2e3ebdSchin const char* name;
42da2e3ebdSchin Dtlink_t link;
43da2e3ebdSchin struct Ucs_map_s* next;
44da2e3ebdSchin } Ucs_map_t;
45da2e3ebdSchin
/*
 * attribute bit vector operations: bit i of vector a lives in
 * word i>>5 at position i&31
 *
 * the mask is built in Ucs_attr_t rather than int so that selecting
 * bit 31 does not shift into the sign bit of a signed int
 * (assumes Ucs_attr_t is an unsigned type of at least 32 bits --
 * confirm against ucs_names.h)
 *
 * NOTE: both arguments are evaluated more than once
 */

#define setattr(a,i)	((a)[(i)>>5]|=((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define tstattr(a,i)	((a)[(i)>>5]&((Ucs_attr_t)1<<((i)&((1<<5)-1))))
#define clrattr(a,i)	((a)[(i)>>5]&=~((Ucs_attr_t)1<<((i)&((1<<5)-1))))
49da2e3ebdSchin
50da2e3ebdSchin static struct Local_s
51da2e3ebdSchin {
52da2e3ebdSchin int fatal;
53da2e3ebdSchin Dt_t* attrs;
54da2e3ebdSchin Dt_t* names;
55da2e3ebdSchin Dtdisc_t dtdisc;
56da2e3ebdSchin #if CC_NATIVE != CC_ASCII
57da2e3ebdSchin unsigned char* a2n;
58da2e3ebdSchin #endif
59da2e3ebdSchin } local;
60da2e3ebdSchin
61da2e3ebdSchin /*
62da2e3ebdSchin * initialize the writeable tables from the readonly data
63da2e3ebdSchin * the tables are big enough to be concerned about text vs. data vs. bss
64da2e3ebdSchin * UCS_BYTE==0 100K
65da2e3ebdSchin * UCS_BYTE==1 20K
66da2e3ebdSchin */
67da2e3ebdSchin
68da2e3ebdSchin static int
initialize(void)69da2e3ebdSchin initialize(void)
70da2e3ebdSchin {
71da2e3ebdSchin register int i;
72da2e3ebdSchin register Ucs_map_t* a;
73da2e3ebdSchin register Ucs_map_t* w;
74da2e3ebdSchin
75da2e3ebdSchin if (local.fatal)
76da2e3ebdSchin return -1;
77da2e3ebdSchin local.dtdisc.link = offsetof(Ucs_map_t, link);
78da2e3ebdSchin local.dtdisc.key = offsetof(Ucs_map_t, name);
79da2e3ebdSchin local.dtdisc.size = -1;
80da2e3ebdSchin if (!(w = (Ucs_map_t*)malloc(sizeof(Ucs_map_t) * (elementsof(ucs_attrs) + elementsof(ucs_names)))))
81da2e3ebdSchin {
82da2e3ebdSchin local.fatal = 1;
83da2e3ebdSchin return -1;
84da2e3ebdSchin }
85da2e3ebdSchin if (!(local.attrs = dtopen(&local.dtdisc, Dttree)))
86da2e3ebdSchin {
87da2e3ebdSchin free(w);
88da2e3ebdSchin local.fatal = 1;
89da2e3ebdSchin return -1;
90da2e3ebdSchin }
91da2e3ebdSchin if (!(local.names = dtopen(&local.dtdisc, Dttree)))
92da2e3ebdSchin {
93da2e3ebdSchin free(w);
94da2e3ebdSchin dtclose(local.attrs);
95da2e3ebdSchin local.fatal = 1;
96da2e3ebdSchin return -1;
97da2e3ebdSchin }
98da2e3ebdSchin for (i = 0; i < elementsof(ucs_attrs); i++, w++)
99da2e3ebdSchin {
100da2e3ebdSchin memcpy(w, &ucs_attrs[i], offsetof(Ucs_dat_t, table));
101da2e3ebdSchin w->name = ucs_strings[ucs_attrs[i].table] + ucs_attrs[i].index;
102da2e3ebdSchin w->next = 0;
103da2e3ebdSchin dtinsert(local.attrs, w);
104da2e3ebdSchin }
105da2e3ebdSchin for (i = 0; i < elementsof(ucs_names); i++, w++)
106da2e3ebdSchin {
107da2e3ebdSchin memcpy(w, &ucs_names[i], offsetof(Ucs_dat_t, table));
108da2e3ebdSchin w->name = ucs_strings[ucs_names[i].table] + ucs_names[i].index;
109da2e3ebdSchin w->next = 0;
110da2e3ebdSchin if (a = (Ucs_map_t*)dtsearch(local.names, w))
111da2e3ebdSchin {
112da2e3ebdSchin while (a->next)
113da2e3ebdSchin a = a->next;
114da2e3ebdSchin a->next = w;
115da2e3ebdSchin }
116da2e3ebdSchin else
117da2e3ebdSchin dtinsert(local.names, w);
118da2e3ebdSchin }
119da2e3ebdSchin #if CC_NATIVE != CC_ASCII
120da2e3ebdSchin local.a2n = ccmap(CC_ASCII, CC_NATIVE);
121da2e3ebdSchin #endif
122da2e3ebdSchin return 0;
123da2e3ebdSchin }
124da2e3ebdSchin
125da2e3ebdSchin /*
126da2e3ebdSchin * return the collating symbol delimited by [c c], where c is either '=' or '.'
127da2e3ebdSchin * s points to the first char after the initial [
128da2e3ebdSchin * if e!=0 it is set to point to the next char in s on return
129da2e3ebdSchin *
130da2e3ebdSchin * the collating symbol is converted to multibyte in <buf,size>
131da2e3ebdSchin * the return value is:
132da2e3ebdSchin * -1 syntax error or buf not large enough
133da2e3ebdSchin * >=0 size with 0-terminated mb collation element
134da2e3ebdSchin * or ligature value in buf
135da2e3ebdSchin */
136da2e3ebdSchin
137da2e3ebdSchin int
regcollate(register const char * s,char ** e,char * buf,int size)138da2e3ebdSchin regcollate(register const char* s, char** e, char* buf, int size)
139da2e3ebdSchin {
140da2e3ebdSchin register int c;
141da2e3ebdSchin register char* u;
142da2e3ebdSchin register char* b;
143da2e3ebdSchin register char* x;
144da2e3ebdSchin register Ucs_map_t* a;
145da2e3ebdSchin Ucs_map_t* z;
146da2e3ebdSchin const char* t;
147da2e3ebdSchin const char* v;
148da2e3ebdSchin int n;
149da2e3ebdSchin int r;
150da2e3ebdSchin int ul;
151da2e3ebdSchin int term;
152da2e3ebdSchin wchar_t w[2];
153da2e3ebdSchin Ucs_attr_t attr[3];
154da2e3ebdSchin
155da2e3ebdSchin if (size < 2)
156da2e3ebdSchin r = -1;
157da2e3ebdSchin else if ((term = *s++) != '.' && term != '=')
158da2e3ebdSchin {
159da2e3ebdSchin s--;
160da2e3ebdSchin r = -1;
161da2e3ebdSchin }
162da2e3ebdSchin else if (*s == term && *(s + 1) == ']')
163da2e3ebdSchin r = -1;
164da2e3ebdSchin else
165da2e3ebdSchin {
166da2e3ebdSchin t = s;
167da2e3ebdSchin mbchar(s);
168da2e3ebdSchin if ((n = (s - t)) == 1)
169da2e3ebdSchin {
170da2e3ebdSchin if (*s == term && *(s + 1) == ']')
171da2e3ebdSchin {
172da2e3ebdSchin s += 2;
173da2e3ebdSchin r = -1;
174da2e3ebdSchin }
175da2e3ebdSchin else
176da2e3ebdSchin {
177da2e3ebdSchin if (!local.attrs && initialize())
178da2e3ebdSchin return -1;
179da2e3ebdSchin attr[0] = attr[1] = attr[2] = 0;
180da2e3ebdSchin ul = 0;
181da2e3ebdSchin b = buf;
182da2e3ebdSchin x = buf + size - 2;
183da2e3ebdSchin r = 1;
184da2e3ebdSchin s = t;
185da2e3ebdSchin do
186da2e3ebdSchin {
187da2e3ebdSchin v = s;
188da2e3ebdSchin u = b;
189da2e3ebdSchin for (;;)
190da2e3ebdSchin {
191da2e3ebdSchin if (!(c = *s++))
192da2e3ebdSchin return -1;
193da2e3ebdSchin if (c == term)
194da2e3ebdSchin {
195da2e3ebdSchin if (!(c = *s++))
196da2e3ebdSchin return -1;
197da2e3ebdSchin if (c != term)
198da2e3ebdSchin {
199da2e3ebdSchin if (c != ']')
200da2e3ebdSchin return -1;
201da2e3ebdSchin r = -1;
202da2e3ebdSchin break;
203da2e3ebdSchin }
204da2e3ebdSchin }
205da2e3ebdSchin if (c == ' ' || c == '-' && u > b && *s != ' ' && *s != '-')
206da2e3ebdSchin break;
207da2e3ebdSchin if (isupper(c))
208da2e3ebdSchin c = tolower(c);
209da2e3ebdSchin if (u > x)
210da2e3ebdSchin break;
211da2e3ebdSchin *u++ = c;
212da2e3ebdSchin }
213da2e3ebdSchin *u = 0;
214da2e3ebdSchin if (a = (Ucs_map_t*)dtmatch(local.attrs, b))
215da2e3ebdSchin setattr(attr, a->code);
216da2e3ebdSchin else
217da2e3ebdSchin {
218da2e3ebdSchin if (u < x)
219da2e3ebdSchin *u++ = ' ';
220da2e3ebdSchin if (b == buf)
221da2e3ebdSchin {
222da2e3ebdSchin if (isupper(*v))
223da2e3ebdSchin ul = UCS_UC;
224da2e3ebdSchin else if (islower(*v))
225da2e3ebdSchin ul = UCS_LC;
226da2e3ebdSchin }
227da2e3ebdSchin b = u;
228da2e3ebdSchin }
229da2e3ebdSchin } while (r > 0);
230da2e3ebdSchin if (b > buf && *(b - 1) == ' ')
231da2e3ebdSchin b--;
232da2e3ebdSchin *b = 0;
233da2e3ebdSchin attr[0] &= ~((Ucs_attr_t)1);
234da2e3ebdSchin if (ul)
235da2e3ebdSchin {
236da2e3ebdSchin if (tstattr(attr, UCS_UC) || tstattr(attr, UCS_LC))
237da2e3ebdSchin ul = 0;
238da2e3ebdSchin else
239da2e3ebdSchin setattr(attr, ul);
240da2e3ebdSchin }
241da2e3ebdSchin if (z = (Ucs_map_t*)dtmatch(local.names, buf))
242da2e3ebdSchin for(;;)
243da2e3ebdSchin {
244da2e3ebdSchin for (a = z; a; a = a->next)
245da2e3ebdSchin if ((attr[0] & a->attr[0]) == attr[0] && (attr[1] & a->attr[1]) == attr[1] && (attr[2] & a->attr[2]) == attr[2])
246da2e3ebdSchin {
2477c2fbfb3SApril Chin #if 0
248da2e3ebdSchin if (a->code <= 0xff)
249da2e3ebdSchin {
250da2e3ebdSchin #if CC_NATIVE != CC_ASCII
251da2e3ebdSchin buf[0] = local.a2n[a->code];
252da2e3ebdSchin #else
253da2e3ebdSchin buf[0] = a->code;
254da2e3ebdSchin #endif
255da2e3ebdSchin buf[r = 1] = 0;
256da2e3ebdSchin ul = 0;
257da2e3ebdSchin break;
258da2e3ebdSchin }
2597c2fbfb3SApril Chin #endif
260da2e3ebdSchin w[0] = a->code;
261da2e3ebdSchin w[1] = 0;
262da2e3ebdSchin if ((r = wcstombs(buf, w, size)) > 0)
263da2e3ebdSchin ul = 0;
264da2e3ebdSchin break;
265da2e3ebdSchin }
266da2e3ebdSchin if (!ul)
267da2e3ebdSchin break;
268da2e3ebdSchin clrattr(attr, ul);
269da2e3ebdSchin ul = 0;
270da2e3ebdSchin }
271da2e3ebdSchin }
272da2e3ebdSchin if (r < 0)
273da2e3ebdSchin {
2747c2fbfb3SApril Chin if ((n = s - t - 2) > (size - 1))
275da2e3ebdSchin return -1;
2767c2fbfb3SApril Chin memcpy(buf, t, n);
2777c2fbfb3SApril Chin buf[n] = 0;
2787c2fbfb3SApril Chin if (n == 1)
2797c2fbfb3SApril Chin r = n;
2807c2fbfb3SApril Chin else
2817c2fbfb3SApril Chin {
2827c2fbfb3SApril Chin for (t = buf; isalnum(*t); t++);
2837c2fbfb3SApril Chin if (!*t)
2847c2fbfb3SApril Chin r = n;
2857c2fbfb3SApril Chin }
286da2e3ebdSchin }
287da2e3ebdSchin }
288da2e3ebdSchin else if (*s++ != term || *s++ != ']')
289da2e3ebdSchin {
290da2e3ebdSchin s--;
291da2e3ebdSchin r = -1;
292da2e3ebdSchin }
293da2e3ebdSchin else if (n > (size - 1))
294da2e3ebdSchin r = -1;
295da2e3ebdSchin else
296da2e3ebdSchin {
297da2e3ebdSchin memcpy(buf, t, n);
298da2e3ebdSchin buf[r = n] = 0;
299da2e3ebdSchin }
300da2e3ebdSchin }
301da2e3ebdSchin if (e)
302da2e3ebdSchin *e = (char*)s;
303da2e3ebdSchin return r;
304da2e3ebdSchin }
305