xref: /freebsd/contrib/less/pattern.c (revision 252d6dde57d5dd0184929d1f8fb65e7713f51c6d)
1 /*
2  * Copyright (C) 1984-2025  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information, see the README file.
8  */
9 
10 /*
11  * Routines to do pattern matching.
12  */
13 
14 #include "less.h"
15 
16 extern int caseless;
17 extern int is_caseless;
18 extern int utf_mode;
19 
20 /*
21  * Compile a search pattern, for future use by match_pattern.
22  */
compile_pattern2(constant char * pattern,int search_type,PATTERN_TYPE * comp_pattern,int show_error)23 static int compile_pattern2(constant char *pattern, int search_type, PATTERN_TYPE *comp_pattern, int show_error)
24 {
25 	if (search_type & SRCH_NO_REGEX)
26 		return (0);
27   {
28 #if HAVE_GNU_REGEX
29 	struct re_pattern_buffer *comp = (struct re_pattern_buffer *)
30 		ecalloc(1, sizeof(struct re_pattern_buffer));
31 	re_set_syntax(RE_SYNTAX_POSIX_EXTENDED);
32 	if (re_compile_pattern(pattern, strlen(pattern), comp))
33 	{
34 		free(comp);
35 		if (show_error)
36 			error("Invalid pattern", NULL_PARG);
37 		return (-1);
38 	}
39 	if (*comp_pattern != NULL)
40 	{
41 		regfree(*comp_pattern);
42 		free(*comp_pattern);
43 	}
44 	*comp_pattern = comp;
45 #endif
46 #if HAVE_POSIX_REGCOMP
47 	regex_t *comp = (regex_t *) ecalloc(1, sizeof(regex_t));
48 	if (regcomp(comp, pattern, REGCOMP_FLAG | (is_caseless ? REG_ICASE : 0)))
49 	{
50 		free(comp);
51 		if (show_error)
52 			error("Invalid pattern", NULL_PARG);
53 		return (-1);
54 	}
55 	if (*comp_pattern != NULL)
56 	{
57 		regfree(*comp_pattern);
58 		free(*comp_pattern);
59 	}
60 	*comp_pattern = comp;
61 #endif
62 #if HAVE_PCRE
63 	constant char *errstring;
64 	int erroffset;
65 	PARG parg;
66 	pcre *comp = pcre_compile(pattern,
67 			((utf_mode) ? PCRE_UTF8 | PCRE_NO_UTF8_CHECK : 0) |
68 			(is_caseless ? PCRE_CASELESS : 0),
69 			&errstring, &erroffset, NULL);
70 	if (comp == NULL)
71 	{
72 		parg.p_string = (char *) errstring;
73 		if (show_error)
74 			error("%s", &parg);
75 		return (-1);
76 	}
77 	*comp_pattern = comp;
78 #endif
79 #if HAVE_PCRE2
80 	int errcode;
81 	PCRE2_SIZE erroffset;
82 	PARG parg;
83 	pcre2_code *comp = pcre2_compile((PCRE2_SPTR)pattern, strlen(pattern),
84 			((utf_mode) ? PCRE2_UTF | PCRE2_NO_UTF_CHECK : 0) |
85 			(is_caseless ? PCRE2_CASELESS : 0),
86 			&errcode, &erroffset, NULL);
87 	if (comp == NULL)
88 	{
89 		if (show_error)
90 		{
91 			char msg[160];
92 			pcre2_get_error_message(errcode, (PCRE2_UCHAR*)msg, sizeof(msg));
93 			parg.p_string = msg;
94 			error("%s", &parg);
95 		}
96 		return (-1);
97 	}
98 	*comp_pattern = comp;
99 #endif
100 #if HAVE_RE_COMP
101 	PARG parg;
102 	if ((parg.p_string = re_comp(pattern)) != NULL)
103 	{
104 		if (show_error)
105 			error("%s", &parg);
106 		return (-1);
107 	}
108 	*comp_pattern = 1;
109 #endif
110 #if HAVE_REGCMP
111 	char *comp;
112 	if ((comp = regcmp(pattern, 0)) == NULL)
113 	{
114 		if (show_error)
115 			error("Invalid pattern", NULL_PARG);
116 		return (-1);
117 	}
118 	if (comp_pattern != NULL)
119 		free(*comp_pattern);
120 	*comp_pattern = comp;
121 #endif
122 #if HAVE_V8_REGCOMP
123 	struct regexp *comp;
124 	reg_show_error = show_error;
125 	comp = regcomp(pattern);
126 	reg_show_error = 1;
127 	if (comp == NULL)
128 	{
129 		/*
130 		 * regcomp has already printed an error message
131 		 * via regerror().
132 		 */
133 		return (-1);
134 	}
135 	if (*comp_pattern != NULL)
136 		free(*comp_pattern);
137 	*comp_pattern = comp;
138 #endif
139   }
140 	return (0);
141 }
142 
143 /*
144  * Like compile_pattern2, but convert the pattern to lowercase if necessary.
145  */
compile_pattern(constant char * pattern,int search_type,int show_error,PATTERN_TYPE * comp_pattern)146 public int compile_pattern(constant char *pattern, int search_type, int show_error, PATTERN_TYPE *comp_pattern)
147 {
148 	int result;
149 
150 	if (caseless != OPT_ONPLUS || (re_handles_caseless && !(search_type & SRCH_NO_REGEX)))
151 	{
152 		result = compile_pattern2(pattern, search_type, comp_pattern, show_error);
153 	} else
154 	{
155 		char *cvt_pattern = (char*) ecalloc(1, cvt_length(strlen(pattern), CVT_TO_LC));
156 		cvt_text(cvt_pattern, pattern, NULL, NULL, CVT_TO_LC);
157 		result = compile_pattern2(cvt_pattern, search_type, comp_pattern, show_error);
158 		free(cvt_pattern);
159 	}
160 	return (result);
161 }
162 
163 /*
164  * Forget that we have a compiled pattern.
165  */
uncompile_pattern(PATTERN_TYPE * pattern)166 public void uncompile_pattern(PATTERN_TYPE *pattern)
167 {
168 #if HAVE_GNU_REGEX
169 	if (*pattern != NULL)
170 	{
171 		regfree(*pattern);
172 		free(*pattern);
173 	}
174 	*pattern = NULL;
175 #endif
176 #if HAVE_POSIX_REGCOMP
177 	if (*pattern != NULL)
178 	{
179 		regfree(*pattern);
180 		free(*pattern);
181 	}
182 	*pattern = NULL;
183 #endif
184 #if HAVE_PCRE
185 	if (*pattern != NULL)
186 		pcre_free(*pattern);
187 	*pattern = NULL;
188 #endif
189 #if HAVE_PCRE2
190 	if (*pattern != NULL)
191 		pcre2_code_free(*pattern);
192 	*pattern = NULL;
193 #endif
194 #if HAVE_RE_COMP
195 	*pattern = 0;
196 #endif
197 #if HAVE_REGCMP
198 	if (*pattern != NULL)
199 		free(*pattern);
200 	*pattern = NULL;
201 #endif
202 #if HAVE_V8_REGCOMP
203 	if (*pattern != NULL)
204 		free(*pattern);
205 	*pattern = NULL;
206 #endif
207 }
208 
#if 0
/*
 * Can a pattern be successfully compiled?
 * Returns 1 if it compiles, 0 otherwise.  Errors are not shown and the
 * trial compilation is discarded.
 */
public int valid_pattern(char *pattern)
{
	PATTERN_TYPE trial;

	SET_NULL_PATTERN(trial);
	if (compile_pattern2(pattern, 0, &trial, 0) != 0)
		return (0);
	uncompile_pattern(&trial);
	return (1);
}
#endif
226 
227 /*
228  * Is a compiled pattern null?
229  */
is_null_pattern(PATTERN_TYPE pattern)230 public lbool is_null_pattern(PATTERN_TYPE pattern)
231 {
232 #if HAVE_GNU_REGEX
233 	return (pattern == NULL);
234 #endif
235 #if HAVE_POSIX_REGCOMP
236 	return (pattern == NULL);
237 #endif
238 #if HAVE_PCRE
239 	return (pattern == NULL);
240 #endif
241 #if HAVE_PCRE2
242 	return (pattern == NULL);
243 #endif
244 #if HAVE_RE_COMP
245 	return (pattern == 0);
246 #endif
247 #if HAVE_REGCMP
248 	return (pattern == NULL);
249 #endif
250 #if HAVE_V8_REGCOMP
251 	return (pattern == NULL);
252 #endif
253 #if NO_REGEX
254 	return (pattern == NULL);
255 #endif
256 }
257 /*
258  * Simple pattern matching function.
259  * It supports no metacharacters like *, etc.
260  */
match(constant char * pattern,size_t pattern_len,constant char * buf,int buf_len,constant char *** sp,constant char *** ep,int nsubs)261 static int match(constant char *pattern, size_t pattern_len, constant char *buf, int buf_len, constant char ***sp, constant char ***ep, int nsubs)
262 {
263 	constant char *pp;
264 	constant char *lp;
265 	constant char *pattern_end = pattern + pattern_len;
266 	constant char *buf_end = buf + buf_len;
267 
268 	(void) nsubs;
269 	for ( ;  buf < buf_end;  buf++)
270 	{
271 		for (pp = pattern, lp = buf;  ;  pp++, lp++)
272 		{
273 			char cp = *pp;
274 			char cl = *lp;
275 			if (caseless == OPT_ONPLUS && ASCII_IS_UPPER(cp))
276 				cp = ASCII_TO_LOWER(cp);
277 			if (cp != cl)
278 				break;
279 			if (pp == pattern_end || lp == buf_end)
280 				break;
281 		}
282 		if (pp == pattern_end)
283 		{
284 			*(*sp)++ = buf;
285 			*(*ep)++ = lp;
286 			return (1);
287 		}
288 	}
289 	**sp = **ep = NULL;
290 	return (0);
291 }
292 
293 /*
294  * Perform a pattern match with the previously compiled pattern.
295  * Set sp[0] and ep[0] to the start and end of the matched string.
296  * Set sp[i] and ep[i] to the start and end of the i-th matched subpattern.
297  * Subpatterns are defined by parentheses in the regex language.
298  */
match_pattern1(PATTERN_TYPE pattern,constant char * tpattern,constant char * line,size_t aline_len,size_t line_off,constant char ** sp,constant char ** ep,int nsp,int notbol,int search_type)299 static lbool match_pattern1(PATTERN_TYPE pattern, constant char *tpattern, constant char *line, size_t aline_len, size_t line_off, constant char **sp, constant char **ep, int nsp, int notbol, int search_type)
300 {
301 	int matched;
302 	int line_len = (int) aline_len; /*{{type-issue}}*/
303 
304 #if NO_REGEX
305 	search_type |= SRCH_NO_REGEX;
306 #endif
307 	if (search_type & SRCH_NO_REGEX)
308 		matched = match(tpattern, strlen(tpattern), line + line_off, line_len - line_off, &sp, &ep, nsp);
309 	else
310 	{
311 #if HAVE_GNU_REGEX
312 	{
313 		struct re_registers search_regs;
314 		pattern->not_bol = notbol;
315 		pattern->regs_allocated = REGS_UNALLOCATED;
316 		matched = re_search(pattern, line, line_len, line_off, line_len - line_off, &search_regs) >= 0;
317 		if (matched)
318 		{
319 			*sp++ = line + search_regs.start[0];
320 			*ep++ = line + search_regs.end[0];
321 		}
322 	}
323 #endif
324 #if HAVE_POSIX_REGCOMP
325 	{
326 		#define RM_COUNT (NUM_SEARCH_COLORS+2)
327 		regmatch_t rm[RM_COUNT];
328 		int flags = (notbol) ? REG_NOTBOL : 0;
329 #ifdef REG_STARTEND
330 		flags |= REG_STARTEND;
331 		rm[0].rm_so = line_off;
332 		rm[0].rm_eo = line_len;
333 #else
334 		line += line_off;
335 #endif
336 		matched = !regexec(pattern, line, RM_COUNT, rm, flags);
337 		if (matched)
338 		{
339 			int i;
340 			int ecount;
341 			for (ecount = RM_COUNT;  ecount > 0;  ecount--)
342 				if (rm[ecount-1].rm_so >= 0)
343 					break;
344 			if (ecount >= nsp)
345 				ecount = nsp-1;
346 			for (i = 0;  i < ecount;  i++)
347 			{
348 				if (rm[i].rm_so < 0)
349 				{
350 					*sp++ = *ep++ = line;
351 				} else
352 				{
353 #ifndef __WATCOMC__
354 					*sp++ = line + rm[i].rm_so;
355 					*ep++ = line + rm[i].rm_eo;
356 #else
357 					*sp++ = rm[i].rm_sp;
358 					*ep++ = rm[i].rm_ep;
359 #endif
360 				}
361 			}
362 		}
363 	}
364 #endif
365 #if HAVE_PCRE
366 	{
367 		#define OVECTOR_COUNT ((3*NUM_SEARCH_COLORS)+3)
368 		int ovector[OVECTOR_COUNT];
369 		int flags = (notbol) ? PCRE_NOTBOL : 0;
370 		int i;
371 		int ecount;
372 		int mcount = pcre_exec(pattern, NULL, line, line_len,
373 			line_off, flags, ovector, OVECTOR_COUNT);
374 		matched = (mcount > 0);
375 		ecount = nsp-1;
376 		if (ecount > mcount) ecount = mcount;
377 		for (i = 0;  i < ecount*2; )
378 		{
379 			if (ovector[i] < 0 || ovector[i+1] < 0)
380 			{
381 				*sp++ = *ep++ = line;
382 				i += 2;
383 			} else
384 			{
385 				*sp++ = line + ovector[i++];
386 				*ep++ = line + ovector[i++];
387 			}
388 		}
389 	}
390 #endif
391 #if HAVE_PCRE2
392 	{
393 		int flags = (notbol) ? PCRE2_NOTBOL : 0;
394 		pcre2_match_data *md = pcre2_match_data_create(nsp-1, NULL);
395 		int mcount = pcre2_match(pattern, (PCRE2_SPTR)line, line_len,
396 			line_off, flags, md, NULL);
397 		matched = (mcount > 0);
398 		if (matched)
399 		{
400 			PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md);
401 			int i;
402 			int ecount = nsp-1;
403 			if (ecount > mcount) ecount = mcount;
404 			for (i = 0;  i < ecount*2; )
405 			{
406 				if (ovector[i] < 0 || ovector[i+1] < 0)
407 				{
408 					*sp++ = *ep++ = line;
409 					i += 2;
410 				} else
411 				{
412 					*sp++ = line + ovector[i++];
413 					*ep++ = line + ovector[i++];
414 				}
415 			}
416 		}
417 		pcre2_match_data_free(md);
418 	}
419 #endif
420 #if HAVE_RE_COMP
421 	matched = (re_exec(line + line_off) == 1);
422 	/*
423 	 * re_exec doesn't seem to provide a way to get the matched string.
424 	 */
425 #endif
426 #if HAVE_REGCMP
427 	matched = ((*ep++ = regex(pattern, line + line_off)) != NULL);
428 	if (matched)
429 		*sp++ = __loc1;
430 #endif
431 #if HAVE_V8_REGCOMP
432 #if HAVE_REGEXEC2
433 	matched = regexec2(pattern, line + line_off, notbol);
434 #else
435 	matched = regexec(pattern, line + line_off);
436 #endif
437 	if (matched)
438 	{
439 		*sp++ = pattern->startp[0];
440 		*ep++ = pattern->endp[0];
441 	}
442 #endif
443 	}
444 	*sp = *ep = NULL;
445 	matched = (!(search_type & SRCH_NO_MATCH) && matched) ||
446 			((search_type & SRCH_NO_MATCH) && !matched);
447 	return (matched != 0);
448 }
449 
450 /*
451  * Return TRUE if the match satisfies all SUBSEARCH conditions.
452  */
subsearch_ok(constant char ** sp,constant char ** ep,int search_type)453 static lbool subsearch_ok(constant char **sp, constant char **ep, int search_type)
454 {
455 	int i;
456 	for (i = 1;  i <= NUM_SEARCH_COLORS;  i++)
457 	{
458 		if ((search_type & SRCH_SUBSEARCH(i)) && ep[i] == sp[i])
459 			return FALSE;
460 	}
461 	return TRUE;
462 }
463 
match_pattern(PATTERN_TYPE pattern,constant char * tpattern,constant char * line,size_t line_len,size_t line_off,constant char ** sp,constant char ** ep,int nsp,int notbol,int search_type)464 public lbool match_pattern(PATTERN_TYPE pattern, constant char *tpattern, constant char *line, size_t line_len, size_t line_off, constant char **sp, constant char **ep, int nsp, int notbol, int search_type)
465 {
466 	for (;;)
467 	{
468 		size_t mlen;
469 		lbool matched = match_pattern1(pattern, tpattern, line, line_len, line_off, sp, ep, nsp, notbol, search_type);
470 		if (!matched || subsearch_ok(sp, ep, search_type))
471 			return matched;
472 		mlen = ep[0] - line;
473 		line += mlen;
474 		line_len -= mlen;
475 		notbol = 1;
476 	}
477 }
478 
479 /*
480  * Return the name of the pattern matching library.
481  */
pattern_lib_name(void)482 public constant char * pattern_lib_name(void)
483 {
484 #if HAVE_GNU_REGEX
485 	return ("GNU");
486 #else
487 #if HAVE_POSIX_REGCOMP
488 	return ("POSIX");
489 #else
490 #if HAVE_PCRE2
491 	return ("PCRE2");
492 #else
493 #if HAVE_PCRE
494 	return ("PCRE");
495 #else
496 #if HAVE_RE_COMP
497 	return ("BSD");
498 #else
499 #if HAVE_REGCMP
500 	return ("V8");
501 #else
502 #if HAVE_V8_REGCOMP
503 	return ("Spencer V8");
504 #else
505 	return ("no");
506 #endif
507 #endif
508 #endif
509 #endif
510 #endif
511 #endif
512 #endif
513 }
514