xref: /freebsd/usr.bin/grep/file.c (revision 2a2234c0f41da33b8cfc938e46b54a8234b64135)
1 /*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2 /*	$FreeBSD$	*/
3 /*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4 
5 /*-
6  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
7  *
8  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
9  * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
10  * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/mman.h>
40 #include <sys/stat.h>
41 #include <sys/types.h>
42 
43 #include <err.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <stddef.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <wchar.h>
51 #include <wctype.h>
52 #include <zlib.h>
53 
54 #ifndef WITHOUT_LZMA
55 #include <lzma.h>
56 #endif
57 
58 #ifndef WITHOUT_BZIP2
59 #include <bzlib.h>
60 #endif
61 
62 #include "grep.h"
63 
/* Size in bytes of the read buffer and of the LZMA input staging buffer. */
#define	MAXBUFSIZ	(32 * 1024)
/* Slack added to the requested size when grep_fgetln() grows the line buffer. */
#define	LNBUFBUMP	80

/* zlib handle; created by gzdopen() in grep_open(), read in grep_refill(). */
static gzFile gzbufdesc;
#ifndef WITHOUT_LZMA
/* liblzma decoder state, (re)initialized in grep_open() for xz/lzma input. */
static lzma_stream lstrm = LZMA_STREAM_INIT;
/* Action passed to lzma_code(); becomes LZMA_FINISH once read() returns 0. */
static lzma_action laction;
/* Holds compressed bytes read from the file before they are decoded. */
static uint8_t lin_buf[MAXBUFSIZ];
#endif
#ifndef WITHOUT_BZIP2
/* bzip2 handle; set to NULL by grep_refill() when the input is not bzip2. */
static BZFILE* bzbufdesc;
#endif

/* Read buffer: either MAXBUFSIZ bytes from grep_malloc() or an mmap()ed file. */
static unsigned char *buffer;
/* Next unconsumed byte inside the read buffer. */
static unsigned char *bufpos;
/* Number of unconsumed bytes starting at bufpos. */
static size_t bufrem;
/* Length of the current mapping; used by munmap() in grep_close(). */
static size_t fsiz;

/* Line buffer for lines that are not fully contained in one read buffer. */
static unsigned char *lnbuf;
/* Currently allocated size of lnbuf in bytes. */
static size_t lnbuflen;
84 
85 static inline int
86 grep_refill(struct file *f)
87 {
88 	ssize_t nr;
89 #ifndef WITHOUT_LZMA
90 	lzma_ret lzmaret;
91 #endif
92 
93 	if (filebehave == FILE_MMAP)
94 		return (0);
95 
96 	bufpos = buffer;
97 	bufrem = 0;
98 
99 	switch (filebehave) {
100 	case FILE_GZIP:
101 		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
102 		break;
103 #ifndef WITHOUT_BZIP2
104 	case FILE_BZIP:
105 		if (bzbufdesc != NULL) {
106 			int bzerr;
107 
108 			nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
109 			switch (bzerr) {
110 			case BZ_OK:
111 			case BZ_STREAM_END:
112 				/* No problem, nr will be okay */
113 				break;
114 			case BZ_DATA_ERROR_MAGIC:
115 				/*
116 				 * As opposed to gzread(), which simply returns the
117 				 * plain file data, if it is not in the correct
118 				 * compressed format, BZ2_bzRead() instead aborts.
119 				 *
120 				 * So, just restart at the beginning of the file again,
121 				 * and use plain reads from now on.
122 				 */
123 				BZ2_bzReadClose(&bzerr, bzbufdesc);
124 				bzbufdesc = NULL;
125 				if (lseek(f->fd, 0, SEEK_SET) == -1)
126 					return (-1);
127 				nr = read(f->fd, buffer, MAXBUFSIZ);
128 				break;
129 			default:
130 				/* Make sure we exit with an error */
131 				nr = -1;
132 			}
133 		} else
134 			/*
135 			 * Also an error case; we should never have a scenario
136 			 * where we have an open file but no bzip descriptor
137 			 * at this point. See: grep_open
138 			 */
139 			nr = -1;
140 		break;
141 #endif
142 #ifndef WITHOUT_LZMA
143 	case FILE_XZ:
144 	case FILE_LZMA:
145 		lstrm.next_out = buffer;
146 
147 		do {
148 			if (lstrm.avail_in == 0) {
149 				lstrm.next_in = lin_buf;
150 				nr = read(f->fd, lin_buf, MAXBUFSIZ);
151 
152 				if (nr < 0)
153 					return (-1);
154 				else if (nr == 0)
155 					laction = LZMA_FINISH;
156 
157 				lstrm.avail_in = nr;
158 			}
159 
160 			lzmaret = lzma_code(&lstrm, laction);
161 
162 			if (lzmaret != LZMA_OK && lzmaret != LZMA_STREAM_END)
163 				return (-1);
164 
165 			if (lstrm.avail_out == 0 || lzmaret == LZMA_STREAM_END) {
166 				bufrem = MAXBUFSIZ - lstrm.avail_out;
167 				lstrm.next_out = buffer;
168 				lstrm.avail_out = MAXBUFSIZ;
169 			}
170 		} while (bufrem == 0 && lzmaret != LZMA_STREAM_END);
171 
172 		return (0);
173 #endif	/* WITHOUT_LZMA */
174 	default:
175 		nr = read(f->fd, buffer, MAXBUFSIZ);
176 	}
177 	if (nr < 0)
178 		return (-1);
179 
180 	bufrem = nr;
181 	return (0);
182 }
183 
184 static inline int
185 grep_lnbufgrow(size_t newlen)
186 {
187 
188 	if (lnbuflen < newlen) {
189 		lnbuf = grep_realloc(lnbuf, newlen);
190 		lnbuflen = newlen;
191 	}
192 
193 	return (0);
194 }
195 
196 char *
197 grep_fgetln(struct file *f, size_t *lenp)
198 {
199 	unsigned char *p;
200 	char *ret;
201 	size_t len;
202 	size_t off;
203 	ptrdiff_t diff;
204 
205 	/* Fill the buffer, if necessary */
206 	if (bufrem == 0 && grep_refill(f) != 0)
207 		goto error;
208 
209 	if (bufrem == 0) {
210 		/* Return zero length to indicate EOF */
211 		*lenp = 0;
212 		return (bufpos);
213 	}
214 
215 	/* Look for a newline in the remaining part of the buffer */
216 	if ((p = memchr(bufpos, fileeol, bufrem)) != NULL) {
217 		++p; /* advance over newline */
218 		ret = bufpos;
219 		len = p - bufpos;
220 		bufrem -= len;
221 		bufpos = p;
222 		*lenp = len;
223 		return (ret);
224 	}
225 
226 	/* We have to copy the current buffered data to the line buffer */
227 	for (len = bufrem, off = 0; ; len += bufrem) {
228 		/* Make sure there is room for more data */
229 		if (grep_lnbufgrow(len + LNBUFBUMP))
230 			goto error;
231 		memcpy(lnbuf + off, bufpos, len - off);
232 		/* With FILE_MMAP, this is EOF; there's no more to refill */
233 		if (filebehave == FILE_MMAP) {
234 			bufrem -= len;
235 			break;
236 		}
237 		off = len;
238 		/* Fetch more to try and find EOL/EOF */
239 		if (grep_refill(f) != 0)
240 			goto error;
241 		if (bufrem == 0)
242 			/* EOF: return partial line */
243 			break;
244 		if ((p = memchr(bufpos, fileeol, bufrem)) == NULL)
245 			continue;
246 		/* got it: finish up the line (like code above) */
247 		++p;
248 		diff = p - bufpos;
249 		len += diff;
250 		if (grep_lnbufgrow(len))
251 		    goto error;
252 		memcpy(lnbuf + off, bufpos, diff);
253 		bufrem -= diff;
254 		bufpos = p;
255 		break;
256 	}
257 	*lenp = len;
258 	return (lnbuf);
259 
260 error:
261 	*lenp = 0;
262 	return (NULL);
263 }
264 
265 /*
266  * Opens a file for processing.
267  */
268 struct file *
269 grep_open(const char *path)
270 {
271 	struct file *f;
272 #ifndef WITHOUT_LZMA
273 	lzma_ret lzmaret;
274 #endif
275 
276 	f = grep_malloc(sizeof *f);
277 	memset(f, 0, sizeof *f);
278 	if (path == NULL) {
279 		/* Processing stdin implies --line-buffered. */
280 		lbflag = true;
281 		f->fd = STDIN_FILENO;
282 	} else if ((f->fd = open(path, O_RDONLY)) == -1)
283 		goto error1;
284 
285 	if (filebehave == FILE_MMAP) {
286 		struct stat st;
287 
288 		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
289 		    (!S_ISREG(st.st_mode)))
290 			filebehave = FILE_STDIO;
291 		else {
292 			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
293 #ifdef MAP_PREFAULT_READ
294 			flags |= MAP_PREFAULT_READ;
295 #endif
296 			fsiz = st.st_size;
297 			buffer = mmap(NULL, fsiz, PROT_READ, flags,
298 			     f->fd, (off_t)0);
299 			if (buffer == MAP_FAILED)
300 				filebehave = FILE_STDIO;
301 			else {
302 				bufrem = st.st_size;
303 				bufpos = buffer;
304 				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
305 			}
306 		}
307 	}
308 
309 	if ((buffer == NULL) || (buffer == MAP_FAILED))
310 		buffer = grep_malloc(MAXBUFSIZ);
311 
312 	switch (filebehave) {
313 	case FILE_GZIP:
314 		if ((gzbufdesc = gzdopen(f->fd, "r")) == NULL)
315 			goto error2;
316 		break;
317 #ifndef WITHOUT_BZIP2
318 	case FILE_BZIP:
319 		if ((bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
320 			goto error2;
321 		break;
322 #endif
323 #ifndef WITHOUT_LZMA
324 	case FILE_XZ:
325 	case FILE_LZMA:
326 
327 		if (filebehave == FILE_XZ)
328 			lzmaret = lzma_stream_decoder(&lstrm, UINT64_MAX,
329 			    LZMA_CONCATENATED);
330 		else
331 			lzmaret = lzma_alone_decoder(&lstrm, UINT64_MAX);
332 
333 		if (lzmaret != LZMA_OK)
334 			goto error2;
335 
336 		lstrm.avail_in = 0;
337 		lstrm.avail_out = MAXBUFSIZ;
338 		laction = LZMA_RUN;
339 		break;
340 	}
341 #endif
342 
343 	/* Fill read buffer, also catches errors early */
344 	if (bufrem == 0 && grep_refill(f) != 0)
345 		goto error2;
346 
347 	/* Check for binary stuff, if necessary */
348 	if (binbehave != BINFILE_TEXT && fileeol != '\0' &&
349 	    memchr(bufpos, '\0', bufrem) != NULL)
350 		f->binary = true;
351 
352 	return (f);
353 
354 error2:
355 	close(f->fd);
356 error1:
357 	free(f);
358 	return (NULL);
359 }
360 
361 /*
362  * Closes a file.
363  */
364 void
365 grep_close(struct file *f)
366 {
367 
368 	close(f->fd);
369 
370 	/* Reset read buffer and line buffer */
371 	if (filebehave == FILE_MMAP) {
372 		munmap(buffer, fsiz);
373 		buffer = NULL;
374 	}
375 	bufpos = buffer;
376 	bufrem = 0;
377 
378 	free(lnbuf);
379 	lnbuf = NULL;
380 	lnbuflen = 0;
381 }
382