xref: /titanic_44/usr/src/cmd/grep_xpg4/grep.c (revision 2d4be7aab5a43b4b2bf4e1b527dcddb9bff39ee6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * grep - pattern matching program - combined grep, egrep, and fgrep.
29  *	Based on MKS grep command, with XCU & Solaris mods.
30  */
31 
32 /*
33  * Copyright 1985, 1992 by Mortice Kern Systems Inc.  All rights reserved.
34  *
35  */
36 
37 /* Copyright 2012 Nexenta Systems, Inc.  All rights reserved. */
38 
39 #include <string.h>
40 #include <stdlib.h>
41 #include <ctype.h>
42 #include <stdarg.h>
43 #include <regex.h>
44 #include <limits.h>
45 #include <sys/types.h>
46 #include <sys/stat.h>
47 #include <fcntl.h>
48 #include <stdio.h>
49 #include <locale.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <wctype.h>
54 #include <ftw.h>
55 #include <sys/param.h>
56 
57 #define	BSIZE		512		/* Size of block for -b */
58 #define	BUFSIZE		8192		/* Input buffer size */
59 #define	MAX_DEPTH	1000		/* how deep to recurse */
60 
61 #define	M_CSETSIZE	256		/* singlebyte chars */
62 static int	bmglen;			/* length of BMG pattern */
63 static char	*bmgpat;		/* BMG pattern */
64 static int	bmgtab[M_CSETSIZE];	/* BMG delta1 table */
65 
66 typedef	struct	_PATTERN	{
67 	char	*pattern;		/* original pattern */
68 	wchar_t	*wpattern;		/* wide, lowercased pattern */
69 	struct	_PATTERN	*next;
70 	regex_t	re;			/* compiled pattern */
71 } PATTERN;
72 
73 static PATTERN	*patterns;
74 static char	errstr[128];		/* regerror string buffer */
75 static int	regflags = 0;		/* regcomp options */
76 static int	matched = 0;		/* return of the grep() */
77 static int	errors = 0;		/* count of errors */
78 static uchar_t	fgrep = 0;		/* Invoked as fgrep */
79 static uchar_t	egrep = 0;		/* Invoked as egrep */
80 static uchar_t	nvflag = 1;		/* Print matching lines */
81 static uchar_t	cflag;			/* Count of matches */
82 static uchar_t	iflag;			/* Case insensitve matching */
83 static uchar_t	hflag;			/* Supress printing of filename */
84 static uchar_t	lflag;			/* Print file names of matches */
85 static uchar_t	nflag;			/* Precede lines by line number */
86 static uchar_t	rflag;			/* Search directories recursively */
87 static uchar_t	bflag;			/* Preccede matches by block number */
88 static uchar_t	sflag;			/* Suppress file error messages */
89 static uchar_t	qflag;			/* Suppress standard output */
90 static uchar_t	wflag;			/* Search for expression as a word */
91 static uchar_t	xflag;			/* Anchoring */
92 static uchar_t	Eflag;			/* Egrep or -E flag */
93 static uchar_t	Fflag;			/* Fgrep or -F flag */
94 static uchar_t	Rflag;			/* Like rflag, but follow symlinks */
95 static uchar_t	outfn;			/* Put out file name */
96 static char	*cmdname;
97 
98 static int	use_wchar, use_bmg, mblocale;
99 
100 static size_t	outbuflen, prntbuflen;
101 static char	*prntbuf;
102 static wchar_t	*outline;
103 
104 static void	addfile(const char *fn);
105 static void	addpattern(char *s);
106 static void	fixpatterns(void);
107 static void	usage(void);
108 static int	grep(int, const char *);
109 static void	bmgcomp(char *, int);
110 static char	*bmgexec(char *, char *);
111 static int	recursive(const char *, const struct stat *, int, struct FTW *);
112 static void	process_path(const char *);
113 static void	process_file(const char *, int);
114 
115 /*
116  * mainline for grep
117  */
118 int
119 main(int argc, char **argv)
120 {
121 	char	*ap;
122 	int	c;
123 	int	fflag = 0;
124 	int	i, n_pattern = 0, n_file = 0;
125 	char	**pattern_list = NULL;
126 	char	**file_list = NULL;
127 
128 	(void) setlocale(LC_ALL, "");
129 #if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
130 #define	TEXT_DOMAIN	"SYS_TEST"	/* Use this only if it weren't */
131 #endif
132 	(void) textdomain(TEXT_DOMAIN);
133 
134 	/*
135 	 * true if this is running on the multibyte locale
136 	 */
137 	mblocale = (MB_CUR_MAX > 1);
138 	/*
139 	 * Skip leading slashes
140 	 */
141 	cmdname = argv[0];
142 	if (ap = strrchr(cmdname, '/'))
143 		cmdname = ap + 1;
144 
145 	ap = cmdname;
146 	/*
147 	 * Detect egrep/fgrep via command name, map to -E and -F options.
148 	 */
149 	if (*ap == 'e' || *ap == 'E') {
150 		regflags |= REG_EXTENDED;
151 		egrep++;
152 	} else {
153 		if (*ap == 'f' || *ap == 'F') {
154 			fgrep++;
155 		}
156 	}
157 
158 	while ((c = getopt(argc, argv, "vwchilnrbse:f:qxEFIR")) != EOF) {
159 		switch (c) {
160 		case 'v':	/* POSIX: negate matches */
161 			nvflag = 0;
162 			break;
163 
164 		case 'c':	/* POSIX: write count */
165 			cflag++;
166 			break;
167 
168 		case 'i':	/* POSIX: ignore case */
169 			iflag++;
170 			regflags |= REG_ICASE;
171 			break;
172 
173 		case 'l':	/* POSIX: Write filenames only */
174 			lflag++;
175 			break;
176 
177 		case 'n':	/* POSIX: Write line numbers */
178 			nflag++;
179 			break;
180 
181 		case 'r':	/* Solaris: search recursively */
182 			rflag++;
183 			break;
184 
185 		case 'b':	/* Solaris: Write file block numbers */
186 			bflag++;
187 			break;
188 
189 		case 's':	/* POSIX: No error msgs for files */
190 			sflag++;
191 			break;
192 
193 		case 'e':	/* POSIX: pattern list */
194 			n_pattern++;
195 			pattern_list = realloc(pattern_list,
196 			    sizeof (char *) * n_pattern);
197 			if (pattern_list == NULL) {
198 				(void) fprintf(stderr,
199 				    gettext("%s: out of memory\n"),
200 				    cmdname);
201 				exit(2);
202 			}
203 			*(pattern_list + n_pattern - 1) = optarg;
204 			break;
205 
206 		case 'f':	/* POSIX: pattern file */
207 			fflag = 1;
208 			n_file++;
209 			file_list = realloc(file_list,
210 			    sizeof (char *) * n_file);
211 			if (file_list == NULL) {
212 				(void) fprintf(stderr,
213 				    gettext("%s: out of memory\n"),
214 				    cmdname);
215 				exit(2);
216 			}
217 			*(file_list + n_file - 1) = optarg;
218 			break;
219 		case 'h':	/* Solaris: supress printing of file name */
220 			hflag = 1;
221 			break;
222 
223 		case 'q':	/* POSIX: quiet: status only */
224 			qflag++;
225 			break;
226 
227 		case 'w':	/* Solaris: treat pattern as word */
228 			wflag++;
229 			break;
230 
231 		case 'x':	/* POSIX: full line matches */
232 			xflag++;
233 			regflags |= REG_ANCHOR;
234 			break;
235 
236 		case 'E':	/* POSIX: Extended RE's */
237 			regflags |= REG_EXTENDED;
238 			Eflag++;
239 			break;
240 
241 		case 'F':	/* POSIX: strings, not RE's */
242 			Fflag++;
243 			break;
244 
245 		case 'R':	/* Solaris: like rflag, but follow symlinks */
246 			Rflag++;
247 			rflag++;
248 			break;
249 
250 		default:
251 			usage();
252 		}
253 	}
254 	/*
255 	 * If we're invoked as egrep or fgrep we need to do some checks
256 	 */
257 
258 	if (egrep || fgrep) {
259 		/*
260 		 * Use of -E or -F with egrep or fgrep is illegal
261 		 */
262 		if (Eflag || Fflag)
263 			usage();
264 		/*
265 		 * Don't allow use of wflag with egrep / fgrep
266 		 */
267 		if (wflag)
268 			usage();
269 		/*
270 		 * For Solaris the -s flag is equivalent to XCU -q
271 		 */
272 		if (sflag)
273 			qflag++;
274 		/*
275 		 * done with above checks - set the appropriate flags
276 		 */
277 		if (egrep)
278 			Eflag++;
279 		else			/* Else fgrep */
280 			Fflag++;
281 	}
282 
283 	if (wflag && (Eflag || Fflag)) {
284 		/*
285 		 * -w cannot be specified with grep -F
286 		 */
287 		usage();
288 	}
289 
290 	/*
291 	 * -E and -F flags are mutually exclusive - check for this
292 	 */
293 	if (Eflag && Fflag)
294 		usage();
295 
296 	/*
297 	 * -c, -l and -q flags are mutually exclusive
298 	 * We have -c override -l like in Solaris.
299 	 * -q overrides -l & -c programmatically in grep() function.
300 	 */
301 	if (cflag && lflag)
302 		lflag = 0;
303 
304 	argv += optind - 1;
305 	argc -= optind - 1;
306 
307 	/*
308 	 * Now handling -e and -f option
309 	 */
310 	if (pattern_list) {
311 		for (i = 0; i < n_pattern; i++) {
312 			addpattern(pattern_list[i]);
313 		}
314 		free(pattern_list);
315 	}
316 	if (file_list) {
317 		for (i = 0; i < n_file; i++) {
318 			addfile(file_list[i]);
319 		}
320 		free(file_list);
321 	}
322 
323 	/*
324 	 * No -e or -f?  Make sure there is one more arg, use it as the pattern.
325 	 */
326 	if (patterns == NULL && !fflag) {
327 		if (argc < 2)
328 			usage();
329 		addpattern(argv[1]);
330 		argc--;
331 		argv++;
332 	}
333 
334 	/*
335 	 * If -x flag is not specified or -i flag is specified
336 	 * with fgrep in a multibyte locale, need to use
337 	 * the wide character APIs.  Otherwise, byte-oriented
338 	 * process will be done.
339 	 */
340 	use_wchar = Fflag && mblocale && (!xflag || iflag);
341 
342 	/*
343 	 * Compile Patterns and also decide if BMG can be used
344 	 */
345 	fixpatterns();
346 
347 	/* Process all files: stdin, or rest of arg list */
348 	if (argc < 2) {
349 		matched = grep(0, gettext("(standard input)"));
350 	} else {
351 		if (argc > 2 && hflag == 0)
352 			outfn = 1;	/* Print filename on match line */
353 		for (argv++; *argv != NULL; argv++) {
354 			process_path(*argv);
355 		}
356 	}
357 	/*
358 	 * Return() here is used instead of exit
359 	 */
360 
361 	(void) fflush(stdout);
362 
363 	if (errors)
364 		return (2);
365 	return (matched ? 0 : 1);
366 }
367 
368 static void
369 process_path(const char *path)
370 {
371 	struct	stat st;
372 	int	walkflags = FTW_CHDIR;
373 	char	*buf = NULL;
374 
375 	if (rflag) {
376 		if (stat(path, &st) != -1 &&
377 		    (st.st_mode & S_IFMT) == S_IFDIR) {
378 			outfn = 1; /* Print filename */
379 
380 			/*
381 			 * Add trailing slash if arg
382 			 * is directory, to resolve symlinks.
383 			 */
384 			if (path[strlen(path) - 1] != '/') {
385 				(void) asprintf(&buf, "%s/", path);
386 				if (buf != NULL)
387 					path = buf;
388 			}
389 
390 			/*
391 			 * Search through subdirs if path is directory.
392 			 * Don't follow symlinks if Rflag is not set.
393 			 */
394 			if (!Rflag)
395 				walkflags |= FTW_PHYS;
396 
397 			if (nftw(path, recursive, MAX_DEPTH, walkflags) != 0) {
398 				if (!sflag)
399 					(void) fprintf(stderr,
400 					    gettext("%s: can't open \"%s\"\n"),
401 					    cmdname, path);
402 				errors = 1;
403 			}
404 			return;
405 		}
406 	}
407 	process_file(path, 0);
408 }
409 
410 /*
411  * Read and process all files in directory recursively.
412  */
413 static int
414 recursive(const char *name, const struct stat *statp, int info, struct FTW *ftw)
415 {
416 	/*
417 	 * Process files and follow symlinks if Rflag set.
418 	 */
419 	if (info != FTW_F) {
420 		/* Report broken symlinks and unreadable files */
421 		if (!sflag &&
422 		    (info == FTW_SLN || info == FTW_DNR || info == FTW_NS)) {
423 			(void) fprintf(stderr,
424 			    gettext("%s: can't open \"%s\"\n"), cmdname, name);
425 		}
426 		return (0);
427 	}
428 
429 
430 	/* Skip devices and pipes if Rflag is not set */
431 	if (!Rflag && !S_ISREG(statp->st_mode))
432 		return (0);
433 	/* Pass offset to relative name from FTW_CHDIR */
434 	process_file(name, ftw->base);
435 	return (0);
436 }
437 
438 /*
439  * Opens file and call grep function.
440  */
441 static void
442 process_file(const char *name, int base)
443 {
444 	int fd;
445 
446 	if ((fd = open(name + base, O_RDONLY)) == -1) {
447 		errors = 1;
448 		if (!sflag) /* Silent mode */
449 			(void) fprintf(stderr, gettext(
450 			    "%s: can't open \"%s\"\n"),
451 			    cmdname, name);
452 		return;
453 	}
454 	matched |= grep(fd, name);
455 	(void) close(fd);
456 
457 	if (ferror(stdout)) {
458 		(void) fprintf(stderr, gettext(
459 		    "%s: error writing to stdout\n"),
460 		    cmdname);
461 		(void) fflush(stdout);
462 		exit(2);
463 	}
464 
465 }
466 
467 /*
468  * Add a file of strings to the pattern list.
469  */
470 static void
471 addfile(const char *fn)
472 {
473 	FILE	*fp;
474 	char	*inbuf;
475 	char	*bufp;
476 	size_t	bufsiz, buflen, bufused;
477 
478 	/*
479 	 * Open the pattern file
480 	 */
481 	if ((fp = fopen(fn, "r")) == NULL) {
482 		(void) fprintf(stderr, gettext("%s: can't open \"%s\"\n"),
483 		    cmdname, fn);
484 		exit(2);
485 	}
486 	bufsiz = BUFSIZE;
487 	if ((inbuf = malloc(bufsiz)) == NULL) {
488 		(void) fprintf(stderr,
489 		    gettext("%s: out of memory\n"), cmdname);
490 		exit(2);
491 	}
492 	bufp = inbuf;
493 	bufused = 0;
494 	/*
495 	 * Read in the file, reallocing as we need more memory
496 	 */
497 	while (fgets(bufp, bufsiz - bufused, fp) != NULL) {
498 		buflen = strlen(bufp);
499 		bufused += buflen;
500 		if (bufused + 1 == bufsiz && bufp[buflen - 1] != '\n') {
501 			/*
502 			 * if this line does not fit to the buffer,
503 			 * realloc larger buffer
504 			 */
505 			bufsiz += BUFSIZE;
506 			if ((inbuf = realloc(inbuf, bufsiz)) == NULL) {
507 				(void) fprintf(stderr,
508 				    gettext("%s: out of memory\n"),
509 				    cmdname);
510 				exit(2);
511 			}
512 			bufp = inbuf + bufused;
513 			continue;
514 		}
515 		if (bufp[buflen - 1] == '\n') {
516 			bufp[--buflen] = '\0';
517 		}
518 		addpattern(inbuf);
519 
520 		bufp = inbuf;
521 		bufused = 0;
522 	}
523 	free(inbuf);
524 	(void) fclose(fp);
525 }
526 
527 /*
528  * Add a string to the pattern list.
529  */
530 static void
531 addpattern(char *s)
532 {
533 	PATTERN	*pp;
534 	char	*wordbuf;
535 	char	*np;
536 
537 	for (; ; ) {
538 		np = strchr(s, '\n');
539 		if (np != NULL)
540 			*np = '\0';
541 		if ((pp = malloc(sizeof (PATTERN))) == NULL) {
542 			(void) fprintf(stderr, gettext(
543 			    "%s: out of memory\n"),
544 			    cmdname);
545 			exit(2);
546 		}
547 		if (wflag) {
548 			/*
549 			 * Solaris wflag support: Add '<' '>' to pattern to
550 			 * select it as a word. Doesn't make sense with -F
551 			 * but we're Libertarian.
552 			 */
553 			size_t	slen, wordlen;
554 
555 			slen = strlen(s);
556 			wordlen = slen + 5; /* '\\' '<' s '\\' '>' '\0' */
557 			if ((wordbuf = malloc(wordlen)) == NULL) {
558 				(void) fprintf(stderr,
559 				    gettext("%s: out of memory\n"),
560 				    cmdname);
561 				exit(2);
562 			}
563 			(void) strcpy(wordbuf, "\\<");
564 			(void) strcpy(wordbuf + 2, s);
565 			(void) strcpy(wordbuf + 2 + slen, "\\>");
566 		} else {
567 			if ((wordbuf = strdup(s)) == NULL) {
568 				(void) fprintf(stderr,
569 				    gettext("%s: out of memory\n"),
570 				    cmdname);
571 				exit(2);
572 			}
573 		}
574 		pp->pattern = wordbuf;
575 		pp->next = patterns;
576 		patterns = pp;
577 		if (np == NULL)
578 			break;
579 		s = np + 1;
580 	}
581 }
582 
583 /*
584  * Fix patterns.
585  * Must do after all arguments read, in case later -i option.
586  */
587 static void
588 fixpatterns(void)
589 {
590 	PATTERN	*pp;
591 	int	rv, fix_pattern, npatterns;
592 
593 	/*
594 	 * As REG_ANCHOR flag is not supported in the current Solaris,
595 	 * need to fix the specified pattern if -x is specified with
596 	 * grep or egrep
597 	 */
598 	fix_pattern = !Fflag && xflag;
599 
600 	for (npatterns = 0, pp = patterns; pp != NULL; pp = pp->next) {
601 		npatterns++;
602 		if (fix_pattern) {
603 			char	*cp, *cq;
604 			size_t	plen, nplen;
605 
606 			plen = strlen(pp->pattern);
607 			/* '^' pattern '$' */
608 			nplen = 1 + plen + 1 + 1;
609 			if ((cp = malloc(nplen)) == NULL) {
610 				(void) fprintf(stderr,
611 				    gettext("%s: out of memory\n"),
612 				    cmdname);
613 				exit(2);
614 			}
615 			cq = cp;
616 			*cq++ = '^';
617 			cq = strcpy(cq, pp->pattern) + plen;
618 			*cq++ = '$';
619 			*cq = '\0';
620 			free(pp->pattern);
621 			pp->pattern = cp;
622 		}
623 
624 		if (Fflag) {
625 			if (use_wchar) {
626 				/*
627 				 * Fflag && mblocale && iflag
628 				 * Fflag && mblocale && !xflag
629 				 */
630 				size_t	n;
631 				n = strlen(pp->pattern) + 1;
632 				if ((pp->wpattern =
633 				    malloc(sizeof (wchar_t) * n)) == NULL) {
634 					(void) fprintf(stderr,
635 					    gettext("%s: out of memory\n"),
636 					    cmdname);
637 					exit(2);
638 				}
639 				if (mbstowcs(pp->wpattern, pp->pattern, n) ==
640 				    (size_t)-1) {
641 					(void) fprintf(stderr,
642 					    gettext("%s: failed to convert "
643 					    "\"%s\" to wide-characters\n"),
644 					    cmdname, pp->pattern);
645 					exit(2);
646 				}
647 				if (iflag) {
648 					wchar_t	*wp;
649 					for (wp = pp->wpattern; *wp != L'\0';
650 					    wp++) {
651 						*wp = towlower((wint_t)*wp);
652 					}
653 				}
654 				free(pp->pattern);
655 			} else {
656 				/*
657 				 * Fflag && mblocale && !iflag
658 				 * Fflag && !mblocale && iflag
659 				 * Fflag && !mblocale && !iflag
660 				 */
661 				if (iflag) {
662 					unsigned char	*cp;
663 					for (cp = (unsigned char *)pp->pattern;
664 					    *cp != '\0'; cp++) {
665 						*cp = tolower(*cp);
666 					}
667 				}
668 			}
669 			/*
670 			 * fgrep: No regular expressions.
671 			 */
672 			continue;
673 		}
674 
675 		/*
676 		 * For non-fgrep, compile the regular expression,
677 		 * give an informative error message, and exit if
678 		 * it didn't compile.
679 		 */
680 		if ((rv = regcomp(&pp->re, pp->pattern, regflags)) != 0) {
681 			(void) regerror(rv, &pp->re, errstr, sizeof (errstr));
682 			(void) fprintf(stderr,
683 			    gettext("%s: RE error in %s: %s\n"),
684 			    cmdname, pp->pattern, errstr);
685 			exit(2);
686 		}
687 		free(pp->pattern);
688 	}
689 
690 	/*
691 	 * Decide if we are able to run the Boyer-Moore-Gosper algorithm.
692 	 * Use the Boyer-Moore-Gosper algorithm if:
693 	 * - fgrep			(Fflag)
694 	 * - singlebyte locale		(!mblocale)
695 	 * - no ignoring case		(!iflag)
696 	 * - no printing line numbers	(!nflag)
697 	 * - no negating the output	(nvflag)
698 	 * - only one pattern		(npatterns == 1)
699 	 * - non zero length pattern	(strlen(patterns->pattern) != 0)
700 	 *
701 	 * It's guaranteed patterns->pattern is still alive
702 	 * when Fflag && !mblocale.
703 	 */
704 	use_bmg = Fflag && !mblocale && !iflag && !nflag && nvflag &&
705 	    (npatterns == 1) && (strlen(patterns->pattern) != 0);
706 }
707 
708 /*
709  * Search a newline from the beginning of the string
710  */
711 static char *
712 find_nl(const char *ptr, size_t len)
713 {
714 	while (len-- != 0) {
715 		if (*ptr++ == '\n') {
716 			return ((char *)--ptr);
717 		}
718 	}
719 	return (NULL);
720 }
721 
722 /*
723  * Search a newline from the end of the string
724  */
725 static char *
726 rfind_nl(const char *ptr, size_t len)
727 {
728 	const char	*uptr = ptr + len;
729 	while (len--) {
730 		if (*--uptr == '\n') {
731 			return ((char *)uptr);
732 		}
733 	}
734 	return (NULL);
735 }
736 
737 /*
738  * Duplicate the specified string converting each character
739  * into a lower case.
740  */
741 static char *
742 istrdup(const char *s1)
743 {
744 	static size_t	ibuflen = 0;
745 	static char	*ibuf = NULL;
746 	size_t	slen;
747 	char	*p;
748 
749 	slen = strlen(s1);
750 	if (slen >= ibuflen) {
751 		/* ibuf does not fit to s1 */
752 		ibuflen = slen + 1;
753 		ibuf = realloc(ibuf, ibuflen);
754 		if (ibuf == NULL) {
755 			(void) fprintf(stderr,
756 			    gettext("%s: out of memory\n"), cmdname);
757 			exit(2);
758 		}
759 	}
760 	p = ibuf;
761 	do {
762 		*p++ = tolower(*s1);
763 	} while (*s1++ != '\0');
764 	return (ibuf);
765 }
766 
767 /*
768  * Do grep on a single file.
769  * Return true in any lines matched.
770  *
771  * We have two strategies:
772  * The fast one is used when we have a single pattern with
773  * a string known to occur in the pattern. We can then
774  * do a BMG match on the whole buffer.
775  * This is an order of magnitude faster.
776  * Otherwise we split the buffer into lines,
777  * and check for a match on each line.
778  */
779 static int
780 grep(int fd, const char *fn)
781 {
782 	PATTERN *pp;
783 	off_t	data_len;	/* length of the data chunk */
784 	off_t	line_len;	/* length of the current line */
785 	off_t	line_offset;	/* current line's offset from the beginning */
786 	long long	lineno;
787 	long long	matches = 0;	/* Number of matching lines */
788 	int	newlinep;	/* 0 if the last line of file has no newline */
789 	char	*ptr, *ptrend;
790 
791 
792 	if (patterns == NULL)
793 		return (0);	/* no patterns to match -- just return */
794 
795 	pp = patterns;
796 
797 	if (use_bmg) {
798 		bmgcomp(pp->pattern, strlen(pp->pattern));
799 	}
800 
801 	if (use_wchar && outline == NULL) {
802 		outbuflen = BUFSIZE + 1;
803 		outline = malloc(sizeof (wchar_t) * outbuflen);
804 		if (outline == NULL) {
805 			(void) fprintf(stderr, gettext("%s: out of memory\n"),
806 			    cmdname);
807 			exit(2);
808 		}
809 	}
810 
811 	if (prntbuf == NULL) {
812 		prntbuflen = BUFSIZE;
813 		if ((prntbuf = malloc(prntbuflen + 1)) == NULL) {
814 			(void) fprintf(stderr, gettext("%s: out of memory\n"),
815 			    cmdname);
816 			exit(2);
817 		}
818 	}
819 
820 	line_offset = 0;
821 	lineno = 0;
822 	newlinep = 1;
823 	data_len = 0;
824 	for (; ; ) {
825 		long	count;
826 		off_t	offset = 0;
827 
828 		if (data_len == 0) {
829 			/*
830 			 * If no data in the buffer, reset ptr
831 			 */
832 			ptr = prntbuf;
833 		}
834 		if (ptr == prntbuf) {
835 			/*
836 			 * The current data chunk starts from prntbuf.
837 			 * This means either the buffer has no data
838 			 * or the buffer has no newline.
839 			 * So, read more data from input.
840 			 */
841 			count = read(fd, ptr + data_len, prntbuflen - data_len);
842 			if (count < 0) {
843 				/* read error */
844 				if (cflag) {
845 					if (outfn && !rflag) {
846 						(void) fprintf(stdout,
847 						    "%s:", fn);
848 					}
849 					if (!qflag && !rflag) {
850 						(void) fprintf(stdout, "%lld\n",
851 						    matches);
852 					}
853 				}
854 				return (0);
855 			} else if (count == 0) {
856 				/* no new data */
857 				if (data_len == 0) {
858 					/* end of file already reached */
859 					break;
860 				}
861 				/* last line of file has no newline */
862 				ptrend = ptr + data_len;
863 				newlinep = 0;
864 				goto L_start_process;
865 			}
866 			offset = data_len;
867 			data_len += count;
868 		}
869 
870 		/*
871 		 * Look for newline in the chunk
872 		 * between ptr + offset and ptr + data_len - offset.
873 		 */
874 		ptrend = find_nl(ptr + offset, data_len - offset);
875 		if (ptrend == NULL) {
876 			/* no newline found in this chunk */
877 			if (ptr > prntbuf) {
878 				/*
879 				 * Move remaining data to the beginning
880 				 * of the buffer.
881 				 * Remaining data lie from ptr for
882 				 * data_len bytes.
883 				 */
884 				(void) memmove(prntbuf, ptr, data_len);
885 			}
886 			if (data_len == prntbuflen) {
887 				/*
888 				 * No enough room in the buffer
889 				 */
890 				prntbuflen += BUFSIZE;
891 				prntbuf = realloc(prntbuf, prntbuflen + 1);
892 				if (prntbuf == NULL) {
893 					(void) fprintf(stderr,
894 					    gettext("%s: out of memory\n"),
895 					    cmdname);
896 					exit(2);
897 				}
898 			}
899 			ptr = prntbuf;
900 			/* read the next input */
901 			continue;
902 		}
903 L_start_process:
904 
905 		/*
906 		 * Beginning of the chunk:	ptr
907 		 * End of the chunk:		ptr + data_len
908 		 * Beginning of the line:	ptr
909 		 * End of the line:		ptrend
910 		 */
911 
912 		if (use_bmg) {
913 			/*
914 			 * Use Boyer-Moore-Gosper algorithm to find out if
915 			 * this chunk (not this line) contains the specified
916 			 * pattern.  If not, restart from the last line
917 			 * of this chunk.
918 			 */
919 			char	*bline;
920 			bline = bmgexec(ptr, ptr + data_len);
921 			if (bline == NULL) {
922 				/*
923 				 * No pattern found in this chunk.
924 				 * Need to find the last line
925 				 * in this chunk.
926 				 */
927 				ptrend = rfind_nl(ptr, data_len);
928 
929 				/*
930 				 * When this chunk does not contain newline,
931 				 * ptrend becomes NULL, which should happen
932 				 * when the last line of file does not end
933 				 * with a newline.  At such a point,
934 				 * newlinep should have been set to 0.
935 				 * Therefore, just after jumping to
936 				 * L_skip_line, the main for-loop quits,
937 				 * and the line_len value won't be
938 				 * used.
939 				 */
940 				line_len = ptrend - ptr;
941 				goto L_skip_line;
942 			}
943 			if (bline > ptrend) {
944 				/*
945 				 * Pattern found not in the first line
946 				 * of this chunk.
947 				 * Discard the first line.
948 				 */
949 				line_len = ptrend - ptr;
950 				goto L_skip_line;
951 			}
952 			/*
953 			 * Pattern found in the first line of this chunk.
954 			 * Using this result.
955 			 */
956 			*ptrend = '\0';
957 			line_len = ptrend - ptr;
958 
959 			/*
960 			 * before jumping to L_next_line,
961 			 * need to handle xflag if specified
962 			 */
963 			if (xflag && (line_len != bmglen ||
964 			    strcmp(bmgpat, ptr) != 0)) {
965 				/* didn't match */
966 				pp = NULL;
967 			} else {
968 				pp = patterns; /* to make it happen */
969 			}
970 			goto L_next_line;
971 		}
972 		lineno++;
973 		/*
974 		 * Line starts from ptr and ends at ptrend.
975 		 * line_len will be the length of the line.
976 		 */
977 		*ptrend = '\0';
978 		line_len = ptrend - ptr;
979 
980 		/*
981 		 * From now, the process will be performed based
982 		 * on the line from ptr to ptrend.
983 		 */
984 		if (use_wchar) {
985 			size_t	len;
986 
987 			if (line_len >= outbuflen) {
988 				outbuflen = line_len + 1;
989 				outline = realloc(outline,
990 				    sizeof (wchar_t) * outbuflen);
991 				if (outline == NULL) {
992 					(void) fprintf(stderr,
993 					    gettext("%s: out of memory\n"),
994 					    cmdname);
995 					exit(2);
996 				}
997 			}
998 
999 			len = mbstowcs(outline, ptr, line_len);
1000 			if (len == (size_t)-1) {
1001 				(void) fprintf(stderr, gettext(
1002 	"%s: input file \"%s\": line %lld: invalid multibyte character\n"),
1003 				    cmdname, fn, lineno);
1004 				/* never match a line with invalid sequence */
1005 				goto L_skip_line;
1006 			}
1007 			outline[len] = L'\0';
1008 
1009 			if (iflag) {
1010 				wchar_t	*cp;
1011 				for (cp = outline; *cp != '\0'; cp++) {
1012 					*cp = towlower((wint_t)*cp);
1013 				}
1014 			}
1015 
1016 			if (xflag) {
1017 				for (pp = patterns; pp; pp = pp->next) {
1018 					if (outline[0] == pp->wpattern[0] &&
1019 					    wcscmp(outline,
1020 					    pp->wpattern) == 0) {
1021 						/* matched */
1022 						break;
1023 					}
1024 				}
1025 			} else {
1026 				for (pp = patterns; pp; pp = pp->next) {
1027 					if (wcswcs(outline, pp->wpattern)
1028 					    != NULL) {
1029 						/* matched */
1030 						break;
1031 					}
1032 				}
1033 			}
1034 		} else if (Fflag) {
1035 			/* fgrep in byte-oriented handling */
1036 			char	*fptr;
1037 			if (iflag) {
1038 				fptr = istrdup(ptr);
1039 			} else {
1040 				fptr = ptr;
1041 			}
1042 			if (xflag) {
1043 				/* fgrep -x */
1044 				for (pp = patterns; pp; pp = pp->next) {
1045 					if (fptr[0] == pp->pattern[0] &&
1046 					    strcmp(fptr, pp->pattern) == 0) {
1047 						/* matched */
1048 						break;
1049 					}
1050 				}
1051 			} else {
1052 				for (pp = patterns; pp; pp = pp->next) {
1053 					if (strstr(fptr, pp->pattern) != NULL) {
1054 						/* matched */
1055 						break;
1056 					}
1057 				}
1058 			}
1059 		} else {
1060 			/* grep or egrep */
1061 			for (pp = patterns; pp; pp = pp->next) {
1062 				int	rv;
1063 
1064 				rv = regexec(&pp->re, ptr, 0, NULL, 0);
1065 				if (rv == REG_OK) {
1066 					/* matched */
1067 					break;
1068 				}
1069 
1070 				switch (rv) {
1071 				case REG_NOMATCH:
1072 					break;
1073 				case REG_ECHAR:
1074 					(void) fprintf(stderr, gettext(
1075 	    "%s: input file \"%s\": line %lld: invalid multibyte character\n"),
1076 					    cmdname, fn, lineno);
1077 					break;
1078 				default:
1079 					(void) regerror(rv, &pp->re, errstr,
1080 					    sizeof (errstr));
1081 					(void) fprintf(stderr, gettext(
1082 	    "%s: input file \"%s\": line %lld: %s\n"),
1083 					    cmdname, fn, lineno, errstr);
1084 					exit(2);
1085 				}
1086 			}
1087 		}
1088 
1089 L_next_line:
1090 		/*
1091 		 * Here, if pp points to non-NULL, something has been matched
1092 		 * to the pattern.
1093 		 */
1094 		if (nvflag == (pp != NULL)) {
1095 			matches++;
1096 			/*
1097 			 * Handle q, l, and c flags.
1098 			 */
1099 			if (qflag) {
1100 				/* no need to continue */
1101 				/*
1102 				 * End of this line is ptrend.
1103 				 * We have read up to ptr + data_len.
1104 				 */
1105 				off_t	pos;
1106 				pos = ptr + data_len - (ptrend + 1);
1107 				(void) lseek(fd, -pos, SEEK_CUR);
1108 				exit(0);
1109 			}
1110 			if (lflag) {
1111 				(void) printf("%s\n", fn);
1112 				break;
1113 			}
1114 			if (!cflag) {
1115 				if (outfn) {
1116 					(void) printf("%s:", fn);
1117 				}
1118 				if (bflag) {
1119 					(void) printf("%lld:", (offset_t)
1120 					    (line_offset / BSIZE));
1121 				}
1122 				if (nflag) {
1123 					(void) printf("%lld:", lineno);
1124 				}
1125 				*ptrend = '\n';
1126 				(void) fwrite(ptr, 1, line_len + 1, stdout);
1127 			}
1128 			if (ferror(stdout)) {
1129 				return (0);
1130 			}
1131 		}
1132 L_skip_line:
1133 		if (!newlinep)
1134 			break;
1135 
1136 		data_len -= line_len + 1;
1137 		line_offset += line_len + 1;
1138 		ptr = ptrend + 1;
1139 	}
1140 
1141 	if (cflag) {
1142 		if (outfn) {
1143 			(void) printf("%s:", fn);
1144 		}
1145 		if (!qflag) {
1146 			(void) printf("%lld\n", matches);
1147 		}
1148 	}
1149 	return (matches != 0);
1150 }
1151 
1152 /*
1153  * usage message for grep
1154  */
1155 static void
1156 usage(void)
1157 {
1158 	if (egrep || fgrep) {
1159 		(void) fprintf(stderr, gettext("Usage:\t%s"), cmdname);
1160 		(void) fprintf(stderr,
1161 		    gettext(" [-c|-l|-q] [-r|-R] [-bhinsvx] "
1162 		    "pattern_list [file ...]\n"));
1163 
1164 		(void) fprintf(stderr, "\t%s", cmdname);
1165 		(void) fprintf(stderr,
1166 		    gettext(" [-c|-l|-q] [-r|-R] [-bhinsvx] "
1167 		    "[-e pattern_list]... "
1168 		    "[-f pattern_file]... [file...]\n"));
1169 	} else {
1170 		(void) fprintf(stderr, gettext("Usage:\t%s"), cmdname);
1171 		(void) fprintf(stderr,
1172 		    gettext(" [-c|-l|-q] [-r|-R] [-bhinsvwx] "
1173 		    "pattern_list [file ...]\n"));
1174 
1175 		(void) fprintf(stderr, "\t%s", cmdname);
1176 		(void) fprintf(stderr,
1177 		    gettext(" [-c|-l|-q] [-r|-R] [-bhinsvwx] "
1178 		    "[-e pattern_list]... "
1179 		    "[-f pattern_file]... [file...]\n"));
1180 
1181 		(void) fprintf(stderr, "\t%s", cmdname);
1182 		(void) fprintf(stderr,
1183 		    gettext(" -E [-c|-l|-q] [-r|-R] [-bhinsvx] "
1184 		    "pattern_list [file ...]\n"));
1185 
1186 		(void) fprintf(stderr, "\t%s", cmdname);
1187 		(void) fprintf(stderr,
1188 		    gettext(" -E [-c|-l|-q] [-r|-R] [-bhinsvx] "
1189 		    "[-e pattern_list]... "
1190 		    "[-f pattern_file]... [file...]\n"));
1191 
1192 		(void) fprintf(stderr, "\t%s", cmdname);
1193 		(void) fprintf(stderr,
1194 		    gettext(" -F [-c|-l|-q] [-r|-R] [-bhinsvx] "
1195 		    "pattern_list [file ...]\n"));
1196 
1197 		(void) fprintf(stderr, "\t%s", cmdname);
1198 		(void) fprintf(stderr,
1199 		    gettext(" -F [-c|-l|-q] [-bhinsvx] [-e pattern_list]... "
1200 		    "[-f pattern_file]... [file...]\n"));
1201 	}
1202 	exit(2);
1203 	/* NOTREACHED */
1204 }
1205 
1206 /*
1207  * Compile literal pattern into BMG tables
1208  */
1209 static void
1210 bmgcomp(char *pat, int len)
1211 {
1212 	int	i;
1213 	int	tlen;
1214 	unsigned char	*uc = (unsigned char *)pat;
1215 
1216 	bmglen = len;
1217 	bmgpat = pat;
1218 
1219 	for (i = 0; i < M_CSETSIZE; i++) {
1220 		bmgtab[i] = len;
1221 	}
1222 
1223 	len--;
1224 	for (tlen = len, i = 0; i <= len; i++, tlen--) {
1225 		bmgtab[*uc++] = tlen;
1226 	}
1227 }
1228 
1229 /*
1230  * BMG search.
1231  */
1232 static char *
1233 bmgexec(char *str, char *end)
1234 {
1235 	int	t;
1236 	char	*k, *s, *p;
1237 
1238 	k = str + bmglen - 1;
1239 	if (bmglen == 1) {
1240 		return (memchr(str, bmgpat[0], end - str));
1241 	}
1242 	for (; ; ) {
1243 		/* inner loop, should be most optimized */
1244 		while (k < end && (t = bmgtab[(unsigned char)*k]) != 0) {
1245 			k += t;
1246 		}
1247 		if (k >= end) {
1248 			return (NULL);
1249 		}
1250 		for (s = k, p = bmgpat + bmglen - 1; *--s == *--p; ) {
1251 			if (p == bmgpat) {
1252 				return (s);
1253 			}
1254 		}
1255 		k++;
1256 	}
1257 	/* NOTREACHED */
1258 }
1259