1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2009 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 /* 24 * regex collation symbol support 25 */ 26 27 #include "reglib.h" 28 29 #include <ccode.h> 30 31 #ifndef UCS_BYTE 32 #define UCS_BYTE 1 33 #endif 34 35 #include "ucs_names.h" 36 37 typedef struct Ucs_map_s 38 { 39 Ucs_attr_t attr[3]; 40 Ucs_code_t code; 41 const char* name; 42 Dtlink_t link; 43 struct Ucs_map_s* next; 44 } Ucs_map_t; 45 46 #define setattr(a,i) ((a)[(i)>>5]|=(1<<((i)&((1<<5)-1)))) 47 #define tstattr(a,i) ((a)[(i)>>5]&(1<<((i)&((1<<5)-1)))) 48 #define clrattr(a,i) ((a)[(i)>>5]&=~(1<<((i)&((1<<5)-1)))) 49 50 static struct Local_s 51 { 52 int fatal; 53 Dt_t* attrs; 54 Dt_t* names; 55 Dtdisc_t dtdisc; 56 #if CC_NATIVE != CC_ASCII 57 unsigned char* a2n; 58 #endif 59 } local; 60 61 /* 62 * initialize the writeable tables from the readonly data 63 * the tables are big enough to be concerned about text vs. data vs. bss 64 * UCS_BYTE==0 100K 65 * UCS_BYTE==1 20K 66 */ 67 68 static int 69 initialize(void) 70 { 71 register int i; 72 register Ucs_map_t* a; 73 register Ucs_map_t* w; 74 75 if (local.fatal) 76 return -1; 77 local.dtdisc.link = offsetof(Ucs_map_t, link); 78 local.dtdisc.key = offsetof(Ucs_map_t, name); 79 local.dtdisc.size = -1; 80 if (!(w = (Ucs_map_t*)malloc(sizeof(Ucs_map_t) * (elementsof(ucs_attrs) + elementsof(ucs_names))))) 81 { 82 local.fatal = 1; 83 return -1; 84 } 85 if (!(local.attrs = dtopen(&local.dtdisc, Dttree))) 86 { 87 free(w); 88 local.fatal = 1; 89 return -1; 90 } 91 if (!(local.names = dtopen(&local.dtdisc, Dttree))) 92 { 93 free(w); 94 dtclose(local.attrs); 95 local.fatal = 1; 96 return -1; 97 } 98 for (i = 0; i < elementsof(ucs_attrs); i++, w++) 99 { 100 memcpy(w, &ucs_attrs[i], offsetof(Ucs_dat_t, table)); 101 w->name = ucs_strings[ucs_attrs[i].table] + ucs_attrs[i].index; 102 w->next = 0; 103 dtinsert(local.attrs, w); 104 } 105 for (i = 0; i < elementsof(ucs_names); i++, w++) 106 { 107 memcpy(w, &ucs_names[i], offsetof(Ucs_dat_t, table)); 108 w->name = ucs_strings[ucs_names[i].table] + ucs_names[i].index; 109 w->next = 0; 110 if (a = (Ucs_map_t*)dtsearch(local.names, w)) 111 { 112 while (a->next) 113 a = a->next; 114 a->next = w; 115 } 116 else 117 dtinsert(local.names, w); 118 } 119 #if CC_NATIVE != CC_ASCII 120 local.a2n = ccmap(CC_ASCII, CC_NATIVE); 121 #endif 122 return 0; 123 } 124 125 /* 126 * return the collating symbol delimited by [c c], where c is either '=' or '.' 127 * s points to the first char after the initial [ 128 * if e!=0 it is set to point to the next char in s on return 129 * 130 * the collating symbol is converted to multibyte in <buf,size> 131 * the return value is: 132 * -1 syntax error or buf not large enough 133 * >=0 size with 0-terminated mb collation element 134 * or ligature value in buf 135 */ 136 137 int 138 regcollate(register const char* s, char** e, char* buf, int size) 139 { 140 register int c; 141 register char* u; 142 register char* b; 143 register char* x; 144 register Ucs_map_t* a; 145 Ucs_map_t* z; 146 const char* t; 147 const char* v; 148 int n; 149 int r; 150 int ul; 151 int term; 152 wchar_t w[2]; 153 Ucs_attr_t attr[3]; 154 155 if (size < 2) 156 r = -1; 157 else if ((term = *s++) != '.' && term != '=') 158 { 159 s--; 160 r = -1; 161 } 162 else if (*s == term && *(s + 1) == ']') 163 r = -1; 164 else 165 { 166 t = s; 167 mbchar(s); 168 if ((n = (s - t)) == 1) 169 { 170 if (*s == term && *(s + 1) == ']') 171 { 172 s += 2; 173 r = -1; 174 } 175 else 176 { 177 if (!local.attrs && initialize()) 178 return -1; 179 attr[0] = attr[1] = attr[2] = 0; 180 ul = 0; 181 b = buf; 182 x = buf + size - 2; 183 r = 1; 184 s = t; 185 do 186 { 187 v = s; 188 u = b; 189 for (;;) 190 { 191 if (!(c = *s++)) 192 return -1; 193 if (c == term) 194 { 195 if (!(c = *s++)) 196 return -1; 197 if (c != term) 198 { 199 if (c != ']') 200 return -1; 201 r = -1; 202 break; 203 } 204 } 205 if (c == ' ' || c == '-' && u > b && *s != ' ' && *s != '-') 206 break; 207 if (isupper(c)) 208 c = tolower(c); 209 if (u > x) 210 break; 211 *u++ = c; 212 } 213 *u = 0; 214 if (a = (Ucs_map_t*)dtmatch(local.attrs, b)) 215 setattr(attr, a->code); 216 else 217 { 218 if (u < x) 219 *u++ = ' '; 220 if (b == buf) 221 { 222 if (isupper(*v)) 223 ul = UCS_UC; 224 else if (islower(*v)) 225 ul = UCS_LC; 226 } 227 b = u; 228 } 229 } while (r > 0); 230 if (b > buf && *(b - 1) == ' ') 231 b--; 232 *b = 0; 233 attr[0] &= ~((Ucs_attr_t)1); 234 if (ul) 235 { 236 if (tstattr(attr, UCS_UC) || tstattr(attr, UCS_LC)) 237 ul = 0; 238 else 239 setattr(attr, ul); 240 } 241 if (z = (Ucs_map_t*)dtmatch(local.names, buf)) 242 for(;;) 243 { 244 for (a = z; a; a = a->next) 245 if ((attr[0] & a->attr[0]) == attr[0] && (attr[1] & a->attr[1]) == attr[1] && (attr[2] & a->attr[2]) == attr[2]) 246 { 247 #if 0 248 if (a->code <= 0xff) 249 { 250 #if CC_NATIVE != CC_ASCII 251 buf[0] = local.a2n[a->code]; 252 #else 253 buf[0] = a->code; 254 #endif 255 buf[r = 1] = 0; 256 ul = 0; 257 break; 258 } 259 #endif 260 w[0] = a->code; 261 w[1] = 0; 262 if ((r = wcstombs(buf, w, size)) > 0) 263 ul = 0; 264 break; 265 } 266 if (!ul) 267 break; 268 clrattr(attr, ul); 269 ul = 0; 270 } 271 } 272 if (r < 0) 273 { 274 if ((n = s - t - 2) > (size - 1)) 275 return -1; 276 memcpy(buf, t, n); 277 buf[n] = 0; 278 if (n == 1) 279 r = n; 280 else 281 { 282 for (t = buf; isalnum(*t); t++); 283 if (!*t) 284 r = n; 285 } 286 } 287 } 288 else if (*s++ != term || *s++ != ']') 289 { 290 s--; 291 r = -1; 292 } 293 else if (n > (size - 1)) 294 r = -1; 295 else 296 { 297 memcpy(buf, t, n); 298 buf[r = n] = 0; 299 } 300 } 301 if (e) 302 *e = (char*)s; 303 return r; 304 } 305