xref: /titanic_41/usr/src/lib/libast/common/regex/reglib.h (revision 4a7ceb24cfcc0a97f96d86cfe5852ae445b50e57)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                      by AT&T Knowledge Ventures                      *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                   Phong Vo <kpv@research.att.com>                    *
20 *                                                                      *
21 ***********************************************************************/
22 #pragma prototyped
23 
24 /*
25  * posix regex implementation
26  *
27  * based on Doug McIlroy's C++ implementation
28  * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29  * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
30  */
31 
32 #ifndef _REGLIB_H
33 #define _REGLIB_H
34 
35 #define REG_VERSION_EXEC	20020509L
36 #define REG_VERSION_MAP		20030916L	/* regdisc_t.re_map	*/
37 
38 #define re_info		env
39 
40 #define alloc		_reg_alloc
41 #define classfun	_reg_classfun
42 #define drop		_reg_drop
43 #define fatal		_reg_fatal
44 #define state		_reg_state
45 
46 typedef struct regsubop_s
47 {
48 	int		op;		/* REG_SUB_LOWER,REG_SUB_UPPER	*/
49 	int		off;		/* re_rhs or match[] offset	*/
50 	int		len;		/* re_rhs len or len==0 match[]	*/
51 } regsubop_t;
52 
53 #define _REG_SUB_PRIVATE_ \
54 	char*		re_cur;		/* re_buf cursor		*/ \
55 	char*		re_end;		/* re_buf end			*/ \
56 	regsubop_t*	re_ops;		/* rhs ops			*/ \
57 	char		re_rhs[1];	/* substitution rhs		*/
58 
59 #include <ast.h>
60 #include <cdt.h>
61 #include <stk.h>
62 
63 #include "regex.h"
64 
65 #include <ctype.h>
66 #include <errno.h>
67 
68 #define MBSIZE(p)	((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
69 
70 #undef	RE_DUP_MAX			/* posix puts this in limits.h!	*/
71 #define RE_DUP_MAX	(INT_MAX/2-1)	/* 2*RE_DUP_MAX won't overflow	*/
72 #define RE_DUP_INF	(RE_DUP_MAX+1)	/* infinity, for *		*/
73 #define BACK_REF_MAX	9
74 
75 #define REG_COMP	(REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM)
76 #define REG_EXEC	(REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
77 
78 #define REX_NULL		0	/* null string (internal)	*/
79 #define REX_ALT			1	/* a|b				*/
80 #define REX_ALT_CATCH		2	/* REX_ALT catcher		*/
81 #define REX_BACK		3	/* \1, \2, etc			*/
82 #define REX_BEG			4	/* initial ^			*/
83 #define REX_BEG_STR		5	/* initial ^ w/ no newline	*/
84 #define REX_BM			6	/* Boyer-Moore			*/
85 #define REX_CAT			7	/* catenation catcher		*/
86 #define REX_CLASS		8	/* [...]			*/
87 #define REX_COLL_CLASS		9	/* collation order [...]	*/
88 #define REX_CONJ		10	/* a&b				*/
89 #define REX_CONJ_LEFT		11	/* REX_CONJ left catcher	*/
90 #define REX_CONJ_RIGHT		12	/* REX_CONJ right catcher	*/
91 #define REX_DONE		13	/* completed match (internal)	*/
92 #define REX_DOT			14	/* .				*/
93 #define REX_END			15	/* final $			*/
94 #define REX_END_STR		16	/* final $ before tail newline	*/
95 #define REX_EXEC		17	/* call re.re_exec()		*/
96 #define REX_FIN_STR		18	/* final $ w/ no newline	*/
97 #define REX_GROUP		19	/* \(...\)			*/
98 #define REX_GROUP_CATCH		20	/* REX_GROUP catcher		*/
99 #define REX_GROUP_AHEAD		21	/* 0-width lookahead		*/
100 #define REX_GROUP_AHEAD_CATCH	22	/* REX_GROUP_AHEAD catcher	*/
101 #define REX_GROUP_AHEAD_NOT	23	/* inverted 0-width lookahead	*/
102 #define REX_GROUP_BEHIND	24	/* 0-width lookbehind		*/
103 #define REX_GROUP_BEHIND_CATCH	25	/* REX_GROUP_BEHIND catcher	*/
104 #define REX_GROUP_BEHIND_NOT	26	/* inverted 0-width lookbehind	*/
105 #define REX_GROUP_BEHIND_NOT_CATCH 27	/* REX_GROUP_BEHIND_NOT catcher	*/
106 #define REX_GROUP_COND		28	/* conditional group		*/
107 #define REX_GROUP_COND_CATCH	29	/* conditional group catcher	*/
108 #define REX_GROUP_CUT		30	/* don't backtrack over this	*/
109 #define REX_GROUP_CUT_CATCH	31	/* REX_GROUP_CUT catcher	*/
110 #define REX_KMP			32	/* Knuth-Morris-Pratt		*/
111 #define REX_NEG			33	/* negation			*/
112 #define REX_NEG_CATCH		34	/* REX_NEG catcher		*/
113 #define REX_NEST		35	/* nested match			*/
114 #define REX_ONECHAR		36	/* a single-character literal	*/
115 #define REX_REP			37	/* Kleene closure		*/
116 #define REX_REP_CATCH		38	/* REX_REP catcher		*/
117 #define REX_STRING		39	/* some chars			*/
118 #define REX_TRIE		40	/* alternation of strings	*/
119 #define REX_WBEG		41	/* \<				*/
120 #define REX_WEND		42	/* \>				*/
121 #define REX_WORD		43	/* word boundary		*/
122 #define REX_WORD_NOT		44	/* not word boundary		*/
123 
124 #define T_META		((int)UCHAR_MAX+1)
125 #define T_STAR		(T_META+0)
126 #define T_PLUS		(T_META+1)
127 #define T_QUES		(T_META+2)
128 #define T_BANG		(T_META+3)
129 #define T_AT		(T_META+4)
130 #define T_TILDE		(T_META+5)
131 #define T_PERCENT	(T_META+6)
132 #define T_LEFT		(T_META+7)
133 #define T_OPEN		(T_META+8)
134 #define T_CLOSE		(T_OPEN+1)
135 #define T_RIGHT		(T_OPEN+2)
136 #define T_CFLX		(T_OPEN+3)
137 #define T_DOT		(T_OPEN+4)
138 #define T_DOTSTAR	(T_OPEN+5)
139 #define T_END		(T_OPEN+6)
140 #define T_BAD		(T_OPEN+7)
141 #define T_DOLL		(T_OPEN+8)
142 #define T_BRA		(T_OPEN+9)
143 #define T_BAR		(T_OPEN+10)
144 #define T_AND		(T_OPEN+11)
145 #define T_LT		(T_OPEN+12)
146 #define T_GT		(T_OPEN+13)
147 #define T_SLASHPLUS	(T_OPEN+14)
148 #define T_GROUP		(T_OPEN+15)
149 #define T_WORD		(T_OPEN+16)
150 #define T_WORD_NOT	(T_WORD+1)
151 #define T_BEG_STR	(T_WORD+2)
152 #define T_END_STR	(T_WORD+3)
153 #define T_FIN_STR	(T_WORD+4)
154 #define T_ESCAPE	(T_WORD+5)
155 #define T_ALNUM		(T_WORD+6)
156 #define T_ALNUM_NOT	(T_ALNUM+1)
157 #define T_DIGIT		(T_ALNUM+2)
158 #define T_DIGIT_NOT	(T_ALNUM+3)
159 #define T_SPACE		(T_ALNUM+4)
160 #define T_SPACE_NOT	(T_ALNUM+5)
161 #define T_BACK		(T_ALNUM+6)
162 
163 #define BRE		0
164 #define ERE		3
165 #define ARE		6
166 #define SRE		9
167 #define KRE		12
168 
169 #define HIT		SSIZE_MAX
170 
171 #define bitclr(p,c)	((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
172 #define bitset(p,c)	((p)[((c)>>3)&037]|=(1<<((c)&07)))
173 #define bittst(p,c)	((p)[((c)>>3)&037]&(1<<((c)&07)))
174 
175 #define setadd(p,c)	bitset((p)->bits,c)
176 #define setclr(p,c)	bitclr((p)->bits,c)
177 #define settst(p,c)	bittst((p)->bits,c)
178 
179 #if _hdr_wchar && _lib_wctype && _lib_iswctype
180 
181 #include <stdio.h> /* because <wchar.h> includes it and we generate it */
182 #include <wchar.h>
183 #if _hdr_wctype
184 #include <wctype.h>
185 #endif
186 
187 #if !defined(iswblank) && !_lib_iswblank
188 #define _need_iswblank	1
189 #define iswblank(x)	_reg_iswblank(x)
190 extern int		_reg_iswblank(wint_t);
191 #endif
192 
193 #if !defined(towupper) && !_lib_towupper
194 #define towupper(x)	toupper(x)
195 #endif
196 
197 #if !defined(towlower) && !_lib_towlower
198 #define towlower(x)	tolower(x)
199 #endif
200 
201 #else
202 
203 #undef	_lib_wctype
204 
205 #ifndef iswalnum
206 #define iswalnum(x)	isalnum(x)
207 #endif
208 #ifndef iswalpha
209 #define iswalpha(x)	isalpha(x)
210 #endif
211 #ifndef iswcntrl
212 #define iswcntrl(x)	iscntrl(x)
213 #endif
214 #ifndef iswdigit
215 #define iswdigit(x)	isdigit(x)
216 #endif
217 #ifndef iswgraph
218 #define iswgraph(x)	isgraph(x)
219 #endif
220 #ifndef iswlower
221 #define iswlower(x)	islower(x)
222 #endif
223 #ifndef iswprint
224 #define iswprint(x)	isprint(x)
225 #endif
226 #ifndef iswpunct
227 #define iswpunct(x)	ispunct(x)
228 #endif
229 #ifndef iswspace
230 #define iswspace(x)	isspace(x)
231 #endif
232 #ifndef iswupper
233 #define iswupper(x)	isupper(x)
234 #endif
235 #ifndef iswxdigit
236 #define iswxdigit(x)	isxdigit(x)
237 #endif
238 
239 #ifndef towlower
240 #define towlower(x)	tolower(x)
241 #endif
242 #ifndef towupper
243 #define towupper(x)	toupper(x)
244 #endif
245 
246 #endif
247 
248 #ifndef	iswblank
249 #define	iswblank(x)	((x)==' '||(x)=='\t')
250 #endif
251 
252 #ifndef iswgraph
253 #define	iswgraph(x)	(iswprint(x)&&!iswblank(x))
254 #endif
255 
256 #define isword(x)	(isalnum(x)||(x)=='_')
257 
258 /*
259  * collation element support
260  */
261 
262 #define COLL_KEY_MAX	15
263 
264 #if COLL_KEY_MAX < MB_LEN_MAX
265 #undef	COLL_KEY_MAX
266 #define COLL_KEY_MAX	MB_LEN_MAX
267 #endif
268 
269 typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
270 
271 #define COLL_end	0
272 #define COLL_call	1
273 #define COLL_char	2
274 #define COLL_range	3
275 #define COLL_range_lc	4
276 #define COLL_range_uc	5
277 
278 typedef struct Celt_s
279 {
280 	short		typ;
281 	short		min;
282 	short		max;
283 	regclass_t	fun;
284 	Ckey_t		beg;
285 	Ckey_t		end;
286 } Celt_t;
287 
288 /*
289  * private stuff hanging off regex_t
290  */
291 
292 typedef struct Stk_pos_s
293 {
294 	off_t		offset;
295 	char*		base;
296 } Stk_pos_t;
297 
298 typedef struct Vector_s
299 {
300 	Stk_t*		stk;		/* stack pointer		*/
301 	char*		vec;		/* the data			*/
302 	int		inc;		/* growth increment		*/
303 	int		siz;		/* element size			*/
304 	int		max;		/* max index			*/
305 	int		cur;		/* current index -- user domain	*/
306 } Vector_t;
307 
308 /*
309  * Rex_t subtypes
310  */
311 
312 typedef struct Cond_s
313 {
314 	unsigned char*	beg;		/* beginning of next match	*/
315 	struct Rex_s*	next[2];	/* 0:no 1:yes next pattern	*/
316 	struct Rex_s*	cont;		/* right catcher		*/
317 	int		yes;		/* yes condition hit		*/
318 } Cond_t;
319 
320 typedef struct Conj_left_s
321 {
322 	unsigned char*	beg;		/* beginning of left match	*/
323 	struct Rex_s*	right;		/* right pattern		*/
324 	struct Rex_s*	cont;		/* right catcher		*/
325 } Conj_left_t;
326 
327 typedef struct Conj_right_s
328 {
329 	unsigned char*	end;		/* end of left match		*/
330 	struct Rex_s*	cont;		/* ambient continuation		*/
331 } Conj_right_t;
332 
333 typedef unsigned int Bm_mask_t;
334 
335 typedef struct Bm_s
336 {
337 	Bm_mask_t**	mask;
338 	size_t*		skip;
339 	size_t*		fail;
340 	size_t		size;
341 	ssize_t		back;
342 	ssize_t		left;
343 	ssize_t		right;
344 	size_t		complete;
345 } Bm_t;
346 
347 typedef struct String_s
348 {
349 	int*		fail;
350 	unsigned char*	base;
351 	size_t		size;
352 } String_t;
353 
354 typedef struct Set_s
355 {
356 	unsigned char	bits[(UCHAR_MAX+1)/CHAR_BIT];
357 } Set_t;
358 
359 typedef struct Collate_s
360 {
361 	int		invert;
362 	Celt_t*		elements;
363 } Collate_t;
364 
365 typedef struct Binary_s
366 {
367 	struct Rex_s*	left;
368 	struct Rex_s*	right;
369 	int		serial;
370 } Binary_t;
371 
372 typedef struct Group_s
373 {
374 	int		number;		/* group number			*/
375 	int		last;		/* last contained group number	*/
376 	int		size;		/* lookbehind size		*/
377 	int		back;		/* backreferenced		*/
378 	regflags_t	flags;		/* group flags			*/
379 	union
380 	{
381 	Binary_t	binary;
382 	struct Rex_s*	rex;
383 	}		expr;
384 } Group_t;
385 
386 typedef struct Exec_s
387 {
388 	void*		data;
389 	const char*	text;
390 	size_t		size;
391 } Exec_t;
392 
393 #define REX_NEST_open		0x01
394 #define REX_NEST_close		0x02
395 #define REX_NEST_escape		0x04
396 #define REX_NEST_quote		0x08
397 #define REX_NEST_literal	0x10
398 #define REX_NEST_delimiter	0x20
399 #define REX_NEST_terminator	0x40
400 #define REX_NEST_separator	0x80
401 
402 #define REX_NEST_SHIFT		8
403 
404 typedef struct Nest_s
405 {
406 	int		primary;
407 	unsigned short	none;		/* for Nest_t.type[-1] */
408 	unsigned short	type[1];
409 } Nest_t;
410 
411 /*
412  * REX_ALT catcher, solely to get control at the end of an
413  * alternative to keep records for comparing matches.
414  */
415 
416 typedef struct Alt_catch_s
417 {
418 	struct Rex_s*	cont;
419 } Alt_catch_t;
420 
421 typedef struct Group_catch_s
422 {
423 	struct Rex_s*	cont;
424 	regoff_t*	eo;
425 } Group_catch_t;
426 
427 typedef struct Behind_catch_s
428 {
429 	struct Rex_s*	cont;
430 	unsigned char*	beg;
431 	unsigned char*	end;
432 } Behind_catch_t;
433 
434 /*
435  * REX_NEG catcher determines what string lengths can be matched,
436  * then Neg investigates continuations of other lengths.
437  * This is inefficient.  For !POSITIONS expressions, we can do better:
438  * since matches to rex will be enumerated in decreasing order,
439  * we can investigate continuations whenever a length is skipped.
440  */
441 
442 typedef struct Neg_catch_s
443 {
444 	unsigned char*	beg;
445 	unsigned char*	index;
446 } Neg_catch_t;
447 
448 /*
449  * REX_REP catcher.  One is created on the stack for
450  * each iteration of a complex repetition.
451  */
452 
453 typedef struct Rep_catch_s
454 {
455 	struct Rex_s*	cont;
456 	struct Rex_s*	ref;
457 	unsigned char*	beg;
458 	int		n;
459 } Rep_catch_t;
460 
461 /*
462  * data structure for an alternation of pure strings
463  * son points to a subtree of all strings with a common
464  * prefix ending in character c.  sib links alternate
465  * letters in the same position of a word.  end=1 if
466  * some word ends with c.  the order of strings is
467  * irrelevant, except long words must be investigated
468  * before short ones.
469  */
470 
471 typedef struct Trie_node_s
472 {
473 	unsigned char		c;
474 	unsigned char		end;
475 	struct Trie_node_s*	son;
476 	struct Trie_node_s*	sib;
477 } Trie_node_t;
478 
479 typedef struct Trie_s
480 {
481 	Trie_node_t**	root;
482 	int		min;
483 	int		max;
484 } Trie_t;
485 
486 /*
487  * Rex_t is a node in a regular expression
488  */
489 
490 typedef struct Rex_s
491 {
492 	unsigned char	type;			/* node type		*/
493 	unsigned char	marked;			/* already marked	*/
494 	short		serial;			/* subpattern number	*/
495 	regflags_t	flags;			/* scoped flags		*/
496 	int		explicit;		/* scoped explicit match*/
497 	struct Rex_s*	next;			/* remaining parts	*/
498 	int		lo;			/* lo dup count		*/
499 	int		hi;			/* hi dup count		*/
500 	unsigned char*	map;			/* fold and/or ccode map*/
501 	union
502 	{
503 	Alt_catch_t	alt_catch;		/* alt catcher		*/
504 	Bm_t		bm;			/* bm			*/
505 	Behind_catch_t	behind_catch;		/* behind catcher	*/
506 	Set_t*		charclass;		/* char class		*/
507 	Collate_t	collate;		/* collation class	*/
508 	Cond_t		cond_catch;		/* cond catcher		*/
509 	Conj_left_t	conj_left;		/* conj left catcher	*/
510 	Conj_right_t	conj_right;		/* conj right catcher	*/
511 	void*		data;			/* data after Rex_t	*/
512 	Exec_t		exec;			/* re.re_exec() args	*/
513 	Group_t		group;			/* a|b or rep		*/
514 	Group_catch_t	group_catch;		/* group catcher	*/
515 	Neg_catch_t	neg_catch;		/* neg catcher		*/
516 	Nest_t		nest;			/* nested match		*/
517 	unsigned char	onechar;		/* single char		*/
518 	Rep_catch_t	rep_catch;		/* rep catcher		*/
519 	String_t	string;			/* string/kmp		*/
520 	Trie_t		trie;			/* trie			*/
521 	}		re;
522 } Rex_t;
523 
524 typedef struct reglib_s			/* library private regex_t info	*/
525 {
526 	struct Rex_s*	rex;		/* compiled expression		*/
527 	regdisc_t*	disc;		/* REG_DISCIPLINE discipline	*/
528 	const regex_t*	regex;		/* from regexec			*/
529 	unsigned char*	beg;		/* beginning of string		*/
530 	unsigned char*	end;		/* end of string		*/
531 	Vector_t*	pos;		/* posns of certain subpatterns	*/
532 	Vector_t*	bestpos;	/* ditto for best match		*/
533 	regmatch_t*	match;		/* subexrs in current match 	*/
534 	regmatch_t*	best;		/* ditto in best match yet	*/
535 	Stk_pos_t	stk;		/* exec stack pos		*/
536 	size_t		min;		/* minimum match length		*/
537 	size_t		nsub;		/* internal re_nsub		*/
538 	regflags_t	flags;		/* flags from regcomp()		*/
539 	int		error;		/* last error			*/
540 	int		explicit;	/* explicit match on this char	*/
541 	int		leading;	/* leading match on this char	*/
542 	int		refs;		/* regcomp()+regdup() references*/
543 	Rex_t		done;		/* the last continuation	*/
544 	regstat_t	stats;		/* for regstat()		*/
545 	unsigned char	fold[UCHAR_MAX+1]; /* REG_ICASE map		*/
546 	unsigned char	hard;		/* hard comp			*/
547 	unsigned char	once;		/* if 1st parse fails, quit	*/
548 	unsigned char	separate;	/* cannot combine		*/
549 	unsigned char	stack;		/* hard comp or exec		*/
550 	unsigned char	sub;		/* re_sub is valid		*/
551 	unsigned char	test;		/* debug/test bitmask		*/
552 } Env_t;
553 
554 typedef struct State_s				/* shared state		*/
555 {
556 	regmatch_t	nomatch;
557 	struct
558 	{
559 	unsigned char	key;
560 	short		val[15];
561 	}		escape[52];
562 	short*		magic[UCHAR_MAX+1];
563 	regdisc_t	disc;
564 	int		fatal;
565 	int		initialized;
566 	Dt_t*		attrs;
567 	Dt_t*		names;
568 	Dtdisc_t	dtdisc;
569 } State_t;
570 
571 extern State_t		state;
572 
573 extern void*		alloc(regdisc_t*, void*, size_t);
574 extern regclass_t	classfun(int);
575 extern void		drop(regdisc_t*, Rex_t*);
576 extern int		fatal(regdisc_t*, int, const char*);
577 
578 #endif
579