xref: /illumos-gate/usr/src/contrib/ast/src/lib/libast/regex/reglib.h (revision b30d193948be5a7794d7ae3ba0ed9c2f72c88e0f)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1985-2012 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                   Phong Vo <kpv@research.att.com>                    *
20 *                                                                      *
21 ***********************************************************************/
22 #pragma prototyped
23 
24 /*
25  * posix regex implementation
26  *
27  * based on Doug McIlroy's C++ implementation
28  * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29  * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
30  */
31 
32 #ifndef _REGLIB_H
33 #define _REGLIB_H
34 
35 #define REG_VERSION_EXEC	20020509L
36 #define REG_VERSION_MAP		20030916L	/* regdisc_t.re_map	*/
37 
38 #define re_info		env
39 
40 #define alloc		_reg_alloc
41 #define classfun	_reg_classfun
42 #define drop		_reg_drop
43 #define fatal		_reg_fatal
44 #define state		_reg_state
45 
46 typedef struct regsubop_s
47 {
48 	int		op;		/* REG_SUB_LOWER,REG_SUB_UPPER	*/
49 	int		off;		/* re_rhs or match[] offset	*/
50 	int		len;		/* re_rhs len or len==0 match[]	*/
51 } regsubop_t;
52 
53 #define _REG_SUB_PRIVATE_ \
54 	char*		re_cur;		/* re_buf cursor		*/ \
55 	char*		re_end;		/* re_buf end			*/ \
56 	regsubop_t*	re_ops;		/* rhs ops			*/ \
57 	char		re_rhs[1];	/* substitution rhs		*/
58 
59 #include <ast.h>
60 #include <cdt.h>
61 #include <stk.h>
62 
63 #include "regex.h"
64 
65 #include <ctype.h>
66 #include <errno.h>
67 
68 #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
69 #define _AST_REGEX_DEBUG	1
70 #endif
71 
72 #define MBSIZE(p)	((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
73 
74 #undef	RE_DUP_MAX			/* posix puts this in limits.h!	*/
75 #define RE_DUP_MAX	(INT_MAX/2-1)	/* 2*RE_DUP_MAX won't overflow	*/
76 #define RE_DUP_INF	(RE_DUP_MAX+1)	/* infinity, for *		*/
77 #define BACK_REF_MAX	9
78 
79 #define REG_COMP	(~REG_EXEC)
80 #define REG_EXEC	(REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
81 
82 #define REX_NULL		0	/* null string (internal)	*/
83 #define REX_ALT			1	/* a|b				*/
84 #define REX_ALT_CATCH		2	/* REX_ALT catcher		*/
85 #define REX_BACK		3	/* \1, \2, etc			*/
86 #define REX_BEG			4	/* initial ^			*/
87 #define REX_BEG_STR		5	/* initial ^ w/ no newline	*/
88 #define REX_BM			6	/* Boyer-Moore			*/
89 #define REX_CAT			7	/* catenation catcher		*/
90 #define REX_CLASS		8	/* [...]			*/
91 #define REX_COLL_CLASS		9	/* collation order [...]	*/
92 #define REX_CONJ		10	/* a&b				*/
93 #define REX_CONJ_LEFT		11	/* REX_CONJ left catcher	*/
94 #define REX_CONJ_RIGHT		12	/* REX_CONJ right catcher	*/
95 #define REX_DONE		13	/* completed match (internal)	*/
96 #define REX_DOT			14	/* .				*/
97 #define REX_END			15	/* final $			*/
98 #define REX_END_STR		16	/* final $ before tail newline	*/
99 #define REX_EXEC		17	/* call re.re_exec()		*/
100 #define REX_FIN_STR		18	/* final $ w/ no newline	*/
101 #define REX_GROUP		19	/* \(...\)			*/
102 #define REX_GROUP_CATCH		20	/* REX_GROUP catcher		*/
103 #define REX_GROUP_AHEAD		21	/* 0-width lookahead		*/
104 #define REX_GROUP_AHEAD_CATCH	22	/* REX_GROUP_AHEAD catcher	*/
105 #define REX_GROUP_AHEAD_NOT	23	/* inverted 0-width lookahead	*/
106 #define REX_GROUP_BEHIND	24	/* 0-width lookbehind		*/
107 #define REX_GROUP_BEHIND_CATCH	25	/* REX_GROUP_BEHIND catcher	*/
108 #define REX_GROUP_BEHIND_NOT	26	/* inverted 0-width lookbehind	*/
109 #define REX_GROUP_BEHIND_NOT_CATCH 27	/* REX_GROUP_BEHIND_NOT catcher	*/
110 #define REX_GROUP_COND		28	/* conditional group		*/
111 #define REX_GROUP_COND_CATCH	29	/* conditional group catcher	*/
112 #define REX_GROUP_CUT		30	/* don't backtrack over this	*/
113 #define REX_GROUP_CUT_CATCH	31	/* REX_GROUP_CUT catcher	*/
114 #define REX_KMP			32	/* Knuth-Morris-Pratt		*/
115 #define REX_NEG			33	/* negation			*/
116 #define REX_NEG_CATCH		34	/* REX_NEG catcher		*/
117 #define REX_NEST		35	/* nested match			*/
118 #define REX_ONECHAR		36	/* a single-character literal	*/
119 #define REX_REP			37	/* Kleene closure		*/
120 #define REX_REP_CATCH		38	/* REX_REP catcher		*/
121 #define REX_STRING		39	/* some chars			*/
122 #define REX_TRIE		40	/* alternation of strings	*/
123 #define REX_WBEG		41	/* \<				*/
124 #define REX_WEND		42	/* \>				*/
125 #define REX_WORD		43	/* word boundary		*/
126 #define REX_WORD_NOT		44	/* not word boundary		*/
127 
128 #define T_META		((int)UCHAR_MAX+1)
129 #define T_STAR		(T_META+0)
130 #define T_PLUS		(T_META+1)
131 #define T_QUES		(T_META+2)
132 #define T_BANG		(T_META+3)
133 #define T_AT		(T_META+4)
134 #define T_TILDE		(T_META+5)
135 #define T_PERCENT	(T_META+6)
136 #define T_LEFT		(T_META+7)
137 #define T_OPEN		(T_META+8)
138 #define T_CLOSE		(T_OPEN+1)
139 #define T_RIGHT		(T_OPEN+2)
140 #define T_CFLX		(T_OPEN+3)
141 #define T_DOT		(T_OPEN+4)
142 #define T_DOTSTAR	(T_OPEN+5)
143 #define T_END		(T_OPEN+6)
144 #define T_BAD		(T_OPEN+7)
145 #define T_DOLL		(T_OPEN+8)
146 #define T_BRA		(T_OPEN+9)
147 #define T_BAR		(T_OPEN+10)
148 #define T_AND		(T_OPEN+11)
149 #define T_LT		(T_OPEN+12)
150 #define T_GT		(T_OPEN+13)
151 #define T_SLASHPLUS	(T_OPEN+14)
152 #define T_GROUP		(T_OPEN+15)
153 #define T_WORD		(T_OPEN+16)
154 #define T_WORD_NOT	(T_WORD+1)
155 #define T_BEG_STR	(T_WORD+2)
156 #define T_END_STR	(T_WORD+3)
157 #define T_FIN_STR	(T_WORD+4)
158 #define T_ESCAPE	(T_WORD+5)
159 #define T_ALNUM		(T_WORD+6)
160 #define T_ALNUM_NOT	(T_ALNUM+1)
161 #define T_DIGIT		(T_ALNUM+2)
162 #define T_DIGIT_NOT	(T_ALNUM+3)
163 #define T_SPACE		(T_ALNUM+4)
164 #define T_SPACE_NOT	(T_ALNUM+5)
165 #define T_BACK		(T_ALNUM+6)
166 
167 #define BRE		0
168 #define ERE		3
169 #define ARE		6
170 #define SRE		9
171 #define KRE		12
172 
173 #define HIT		SSIZE_MAX
174 
175 #define bitclr(p,c)	((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176 #define bitset(p,c)	((p)[((c)>>3)&037]|=(1<<((c)&07)))
177 #define bittst(p,c)	((p)[((c)>>3)&037]&(1<<((c)&07)))
178 
179 #define setadd(p,c)	bitset((p)->bits,c)
180 #define setclr(p,c)	bitclr((p)->bits,c)
181 #define settst(p,c)	bittst((p)->bits,c)
182 
183 #if _hdr_wchar && _lib_wctype && _lib_iswctype
184 
185 #include <stdio.h> /* because <wchar.h> includes it and we generate it */
186 #include <wchar.h>
187 #if _hdr_wctype
188 #include <wctype.h>
189 #endif
190 
191 #if !defined(iswblank) && !_lib_iswblank
192 #define _need_iswblank	1
193 #define iswblank(x)	_reg_iswblank(x)
194 extern int		_reg_iswblank(wint_t);
195 #endif
196 
197 #if !defined(towupper) && !_lib_towupper
198 #define towupper(x)	toupper(x)
199 #endif
200 
201 #if !defined(towlower) && !_lib_towlower
202 #define towlower(x)	tolower(x)
203 #endif
204 
205 #else
206 
207 #undef	_lib_wctype
208 
209 #ifndef iswalnum
210 #define iswalnum(x)	isalnum(x)
211 #endif
212 #ifndef iswalpha
213 #define iswalpha(x)	isalpha(x)
214 #endif
215 #ifndef iswcntrl
216 #define iswcntrl(x)	iscntrl(x)
217 #endif
218 #ifndef iswdigit
219 #define iswdigit(x)	isdigit(x)
220 #endif
221 #ifndef iswgraph
222 #define iswgraph(x)	isgraph(x)
223 #endif
224 #ifndef iswlower
225 #define iswlower(x)	islower(x)
226 #endif
227 #ifndef iswprint
228 #define iswprint(x)	isprint(x)
229 #endif
230 #ifndef iswpunct
231 #define iswpunct(x)	ispunct(x)
232 #endif
233 #ifndef iswspace
234 #define iswspace(x)	isspace(x)
235 #endif
236 #ifndef iswupper
237 #define iswupper(x)	isupper(x)
238 #endif
239 #ifndef iswxdigit
240 #define iswxdigit(x)	isxdigit(x)
241 #endif
242 
243 #ifndef towlower
244 #define towlower(x)	tolower(x)
245 #endif
246 #ifndef towupper
247 #define towupper(x)	toupper(x)
248 #endif
249 
250 #endif
251 
252 #ifndef	iswblank
253 #define	iswblank(x)	((x)==' '||(x)=='\t')
254 #endif
255 
256 #ifndef iswgraph
257 #define	iswgraph(x)	(iswprint(x)&&!iswblank(x))
258 #endif
259 
260 #define isword(x)	(isalnum(x)||(x)=='_')
261 
262 /*
263  * collation element support
264  */
265 
266 #define COLL_KEY_MAX	32
267 
268 #if COLL_KEY_MAX < MB_LEN_MAX
269 #undef	COLL_KEY_MAX
270 #define COLL_KEY_MAX	MB_LEN_MAX
271 #endif
272 
273 typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
274 
275 #define COLL_end	0
276 #define COLL_call	1
277 #define COLL_char	2
278 #define COLL_range	3
279 #define COLL_range_lc	4
280 #define COLL_range_uc	5
281 
282 typedef struct Celt_s
283 {
284 	short		typ;
285 	short		min;
286 	short		max;
287 	regclass_t	fun;
288 	Ckey_t		beg;
289 	Ckey_t		end;
290 } Celt_t;
291 
292 /*
293  * private stuff hanging off regex_t
294  */
295 
296 typedef struct Stk_pos_s
297 {
298 	off_t		offset;
299 	char*		base;
300 } Stk_pos_t;
301 
302 typedef struct Vector_s
303 {
304 	Stk_t*		stk;		/* stack pointer		*/
305 	char*		vec;		/* the data			*/
306 	int		inc;		/* growth increment		*/
307 	int		siz;		/* element size			*/
308 	ssize_t		max;		/* max index			*/
309 	ssize_t		cur;		/* current index -- user domain	*/
310 } Vector_t;
311 
312 /*
313  * Rex_t subtypes
314  */
315 
316 typedef struct Cond_s
317 {
318 	unsigned char*	beg;		/* beginning of next match	*/
319 	struct Rex_s*	next[2];	/* 0:no 1:yes next pattern	*/
320 	struct Rex_s*	cont;		/* right catcher		*/
321 	int		yes;		/* yes condition hit		*/
322 } Cond_t;
323 
324 typedef struct Conj_left_s
325 {
326 	unsigned char*	beg;		/* beginning of left match	*/
327 	struct Rex_s*	right;		/* right pattern		*/
328 	struct Rex_s*	cont;		/* right catcher		*/
329 } Conj_left_t;
330 
331 typedef struct Conj_right_s
332 {
333 	unsigned char*	end;		/* end of left match		*/
334 	struct Rex_s*	cont;		/* ambient continuation		*/
335 } Conj_right_t;
336 
337 typedef unsigned int Bm_mask_t;
338 
339 typedef struct Bm_s
340 {
341 	Bm_mask_t**	mask;
342 	size_t*		skip;
343 	size_t*		fail;
344 	size_t		size;
345 	ssize_t		back;
346 	ssize_t		left;
347 	ssize_t		right;
348 	size_t		complete;
349 } Bm_t;
350 
351 typedef struct String_s
352 {
353 	int*		fail;
354 	unsigned char*	base;
355 	size_t		size;
356 } String_t;
357 
358 typedef struct Set_s
359 {
360 	unsigned char	bits[(UCHAR_MAX+1)/CHAR_BIT];
361 } Set_t;
362 
363 typedef struct Collate_s
364 {
365 	int		invert;
366 	Celt_t*		elements;
367 } Collate_t;
368 
369 typedef struct Binary_s
370 {
371 	struct Rex_s*	left;
372 	struct Rex_s*	right;
373 	int		serial;
374 } Binary_t;
375 
376 typedef struct Group_s
377 {
378 	int		number;		/* group number			*/
379 	int		last;		/* last contained group number	*/
380 	int		size;		/* lookbehind size		*/
381 	int		back;		/* backreferenced		*/
382 	regflags_t	flags;		/* group flags			*/
383 	union
384 	{
385 	Binary_t	binary;
386 	struct Rex_s*	rex;
387 	}		expr;
388 } Group_t;
389 
390 typedef struct Exec_s
391 {
392 	void*		data;
393 	const char*	text;
394 	size_t		size;
395 } Exec_t;
396 
397 #define REX_NEST_open		0x01
398 #define REX_NEST_close		0x02
399 #define REX_NEST_escape		0x04
400 #define REX_NEST_quote		0x08
401 #define REX_NEST_literal	0x10
402 #define REX_NEST_delimiter	0x20
403 #define REX_NEST_terminator	0x40
404 #define REX_NEST_separator	0x80
405 
406 #define REX_NEST_SHIFT		8
407 
408 typedef struct Nest_s
409 {
410 	int		primary;
411 	unsigned short	none;		/* for Nest_t.type[-1] */
412 	unsigned short	type[1];
413 } Nest_t;
414 
415 /*
416  * REX_ALT catcher, solely to get control at the end of an
417  * alternative to keep records for comparing matches.
418  */
419 
420 typedef struct Alt_catch_s
421 {
422 	struct Rex_s*	cont;
423 } Alt_catch_t;
424 
425 typedef struct Group_catch_s
426 {
427 	struct Rex_s*	cont;
428 	regoff_t*	eo;
429 } Group_catch_t;
430 
431 typedef struct Behind_catch_s
432 {
433 	struct Rex_s*	cont;
434 	unsigned char*	beg;
435 	unsigned char*	end;
436 } Behind_catch_t;
437 
438 /*
439  * REX_NEG catcher determines what string lengths can be matched,
440  * then Neg investigates continuations of other lengths.
441  * This is inefficient.  For !POSITIONS expressions, we can do better:
442  * since matches to rex will be enumerated in decreasing order,
443  * we can investigate continuations whenever a length is skipped.
444  */
445 
446 typedef struct Neg_catch_s
447 {
448 	unsigned char*	beg;
449 	unsigned char*	index;
450 } Neg_catch_t;
451 
452 /*
453  * REX_REP catcher.  One is created on the stack for
454  * each iteration of a complex repetition.
455  */
456 
457 typedef struct Rep_catch_s
458 {
459 	struct Rex_s*	cont;
460 	struct Rex_s*	ref;
461 	unsigned char*	beg;
462 	int		n;
463 } Rep_catch_t;
464 
465 /*
466  * data structure for an alternation of pure strings
467  * son points to a subtree of all strings with a common
468  * prefix ending in character c.  sib links alternate
469  * letters in the same position of a word.  end=1 if
470  * some word ends with c.  the order of strings is
471  * irrelevant, except long words must be investigated
472  * before short ones.
473  */
474 
475 typedef struct Trie_node_s
476 {
477 	unsigned char		c;
478 	unsigned char		end;
479 	struct Trie_node_s*	son;
480 	struct Trie_node_s*	sib;
481 } Trie_node_t;
482 
483 typedef struct Trie_s
484 {
485 	Trie_node_t**	root;
486 	int		min;
487 	int		max;
488 } Trie_t;
489 
490 /*
491  * Rex_t is a node in a regular expression
492  */
493 
494 typedef struct Rex_s
495 {
496 	unsigned char	type;			/* node type		*/
497 	unsigned char	marked;			/* already marked	*/
498 	short		serial;			/* subpattern number	*/
499 	regflags_t	flags;			/* scoped flags		*/
500 	int		explicit;		/* scoped explicit match*/
501 	struct Rex_s*	next;			/* remaining parts	*/
502 	int		lo;			/* lo dup count		*/
503 	int		hi;			/* hi dup count		*/
504 	unsigned char*	map;			/* fold and/or ccode map*/
505 	union
506 	{
507 	Alt_catch_t	alt_catch;		/* alt catcher		*/
508 	Bm_t		bm;			/* bm			*/
509 	Behind_catch_t	behind_catch;		/* behind catcher	*/
510 	Set_t*		charclass;		/* char class		*/
511 	Collate_t	collate;		/* collation class	*/
512 	Cond_t		cond_catch;		/* cond catcher		*/
513 	Conj_left_t	conj_left;		/* conj left catcher	*/
514 	Conj_right_t	conj_right;		/* conj right catcher	*/
515 	void*		data;			/* data after Rex_t	*/
516 	Exec_t		exec;			/* re.re_exec() args	*/
517 	Group_t		group;			/* a|b or rep		*/
518 	Group_catch_t	group_catch;		/* group catcher	*/
519 	Neg_catch_t	neg_catch;		/* neg catcher		*/
520 	Nest_t		nest;			/* nested match		*/
521 	unsigned char	onechar;		/* single char		*/
522 	Rep_catch_t	rep_catch;		/* rep catcher		*/
523 	String_t	string;			/* string/kmp		*/
524 	Trie_t		trie;			/* trie			*/
525 	}		re;
526 } Rex_t;
527 
528 typedef struct reglib_s			/* library private regex_t info	*/
529 {
530 	struct Rex_s*	rex;		/* compiled expression		*/
531 	regdisc_t*	disc;		/* REG_DISCIPLINE discipline	*/
532 	const regex_t*	regex;		/* from regexec			*/
533 	unsigned char*	beg;		/* beginning of string		*/
534 	unsigned char*	end;		/* end of string		*/
535 	Vector_t*	pos;		/* posns of certain subpatterns	*/
536 	Vector_t*	bestpos;	/* ditto for best match		*/
537 	regmatch_t*	match;		/* subexrs in current match 	*/
538 	regmatch_t*	best;		/* ditto in best match yet	*/
539 	Stk_pos_t	stk;		/* exec stack pos		*/
540 	size_t		min;		/* minimum match length		*/
541 	size_t		nsub;		/* internal re_nsub		*/
542 	regflags_t	flags;		/* flags from regcomp()		*/
543 	int		error;		/* last error			*/
544 	int		explicit;	/* explicit match on this char	*/
545 	int		leading;	/* leading match on this char	*/
546 	int		refs;		/* regcomp()+regdup() references*/
547 	Rex_t		done;		/* the last continuation	*/
548 	regstat_t	stats;		/* for regstat()		*/
549 	unsigned char	fold[UCHAR_MAX+1]; /* REG_ICASE map		*/
550 	unsigned char	hard;		/* hard comp			*/
551 	unsigned char	once;		/* if 1st parse fails, quit	*/
552 	unsigned char	separate;	/* cannot combine		*/
553 	unsigned char	stack;		/* hard comp or exec		*/
554 	unsigned char	sub;		/* re_sub is valid		*/
555 	unsigned char	test;		/* debug/test bitmask		*/
556 } Env_t;
557 
558 typedef struct oldregmatch_s		/* pre-20120528 regmatch_t	*/
559 {
560 	int		rm_so;		/* offset of start		*/
561 	int		rm_eo;		/* offset of end		*/
562 } oldregmatch_t;
563 
564 typedef struct State_s				/* shared state		*/
565 {
566 	regmatch_t	nomatch;
567 	struct
568 	{
569 	unsigned char	key;
570 	short		val[15];
571 	}		escape[52];
572 	short*		magic[UCHAR_MAX+1];
573 	regdisc_t	disc;
574 	int		fatal;
575 	int		initialized;
576 	Dt_t*		attrs;
577 	Dt_t*		names;
578 	Dtdisc_t	dtdisc;
579 } State_t;
580 
581 extern State_t		state;
582 
583 extern void*		alloc(regdisc_t*, void*, size_t);
584 extern regclass_t	classfun(int);
585 extern void		drop(regdisc_t*, Rex_t*);
586 extern int		fatal(regdisc_t*, int, const char*);
587 
588 #endif
589