xref: /freebsd/usr.bin/grep/file.c (revision eb69d1f144a6fcc765d1b9d44a5ae8082353e70b)
1 /*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2 /*	$FreeBSD$	*/
3 /*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4 
5 /*-
6  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
7  *
8  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
9  * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
10  * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/mman.h>
40 #include <sys/stat.h>
41 #include <sys/types.h>
42 
43 #include <err.h>
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <stddef.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <wchar.h>
51 #include <wctype.h>
52 #include <zlib.h>
53 
54 #ifndef WITHOUT_LZMA
55 #include <lzma.h>
56 #endif
57 
58 #ifndef WITHOUT_BZIP2
59 #include <bzlib.h>
60 #endif
61 
62 #include "grep.h"
63 
64 #define	MAXBUFSIZ	(32 * 1024)
65 #define	LNBUFBUMP	80
66 
67 static gzFile gzbufdesc;
68 #ifndef WITHOUT_LZMA
69 static lzma_stream lstrm = LZMA_STREAM_INIT;
70 static lzma_action laction;
71 static uint8_t lin_buf[MAXBUFSIZ];
72 #endif
73 #ifndef WITHOUT_BZIP2
74 static BZFILE* bzbufdesc;
75 #endif
76 
77 static unsigned char *buffer;
78 static unsigned char *bufpos;
79 static size_t bufrem;
80 static size_t fsiz;
81 
82 static unsigned char *lnbuf;
83 static size_t lnbuflen;
84 
85 static inline int
86 grep_refill(struct file *f)
87 {
88 	ssize_t nr;
89 
90 	if (filebehave == FILE_MMAP)
91 		return (0);
92 
93 	bufpos = buffer;
94 	bufrem = 0;
95 
96 	if (filebehave == FILE_GZIP) {
97 		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
98 #ifndef WITHOUT_BZIP2
99 	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
100 		int bzerr;
101 
102 		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
103 		switch (bzerr) {
104 		case BZ_OK:
105 		case BZ_STREAM_END:
106 			/* No problem, nr will be okay */
107 			break;
108 		case BZ_DATA_ERROR_MAGIC:
109 			/*
110 			 * As opposed to gzread(), which simply returns the
111 			 * plain file data, if it is not in the correct
112 			 * compressed format, BZ2_bzRead() instead aborts.
113 			 *
114 			 * So, just restart at the beginning of the file again,
115 			 * and use plain reads from now on.
116 			 */
117 			BZ2_bzReadClose(&bzerr, bzbufdesc);
118 			bzbufdesc = NULL;
119 			if (lseek(f->fd, 0, SEEK_SET) == -1)
120 				return (-1);
121 			nr = read(f->fd, buffer, MAXBUFSIZ);
122 			break;
123 		default:
124 			/* Make sure we exit with an error */
125 			nr = -1;
126 		}
127 #endif
128 #ifndef WITHOUT_LZMA
129 	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
130 		lzma_ret ret;
131 		lstrm.next_out = buffer;
132 
133 		do {
134 			if (lstrm.avail_in == 0) {
135 				lstrm.next_in = lin_buf;
136 				nr = read(f->fd, lin_buf, MAXBUFSIZ);
137 
138 				if (nr < 0)
139 					return (-1);
140 				else if (nr == 0)
141 					laction = LZMA_FINISH;
142 
143 				lstrm.avail_in = nr;
144 			}
145 
146 			ret = lzma_code(&lstrm, laction);
147 
148 			if (ret != LZMA_OK && ret != LZMA_STREAM_END)
149 				return (-1);
150 
151 			if (lstrm.avail_out == 0 || ret == LZMA_STREAM_END) {
152 				bufrem = MAXBUFSIZ - lstrm.avail_out;
153 				lstrm.next_out = buffer;
154 				lstrm.avail_out = MAXBUFSIZ;
155 			}
156 		} while (bufrem == 0 && ret != LZMA_STREAM_END);
157 
158 		return (0);
159 #endif	/* WIHTOUT_LZMA */
160 	} else
161 		nr = read(f->fd, buffer, MAXBUFSIZ);
162 
163 	if (nr < 0)
164 		return (-1);
165 
166 	bufrem = nr;
167 	return (0);
168 }
169 
170 static inline int
171 grep_lnbufgrow(size_t newlen)
172 {
173 
174 	if (lnbuflen < newlen) {
175 		lnbuf = grep_realloc(lnbuf, newlen);
176 		lnbuflen = newlen;
177 	}
178 
179 	return (0);
180 }
181 
182 char *
183 grep_fgetln(struct file *f, size_t *lenp)
184 {
185 	unsigned char *p;
186 	char *ret;
187 	size_t len;
188 	size_t off;
189 	ptrdiff_t diff;
190 
191 	/* Fill the buffer, if necessary */
192 	if (bufrem == 0 && grep_refill(f) != 0)
193 		goto error;
194 
195 	if (bufrem == 0) {
196 		/* Return zero length to indicate EOF */
197 		*lenp = 0;
198 		return (bufpos);
199 	}
200 
201 	/* Look for a newline in the remaining part of the buffer */
202 	if ((p = memchr(bufpos, fileeol, bufrem)) != NULL) {
203 		++p; /* advance over newline */
204 		ret = bufpos;
205 		len = p - bufpos;
206 		bufrem -= len;
207 		bufpos = p;
208 		*lenp = len;
209 		return (ret);
210 	}
211 
212 	/* We have to copy the current buffered data to the line buffer */
213 	for (len = bufrem, off = 0; ; len += bufrem) {
214 		/* Make sure there is room for more data */
215 		if (grep_lnbufgrow(len + LNBUFBUMP))
216 			goto error;
217 		memcpy(lnbuf + off, bufpos, len - off);
218 		/* With FILE_MMAP, this is EOF; there's no more to refill */
219 		if (filebehave == FILE_MMAP) {
220 			bufrem -= len;
221 			break;
222 		}
223 		off = len;
224 		/* Fetch more to try and find EOL/EOF */
225 		if (grep_refill(f) != 0)
226 			goto error;
227 		if (bufrem == 0)
228 			/* EOF: return partial line */
229 			break;
230 		if ((p = memchr(bufpos, fileeol, bufrem)) == NULL)
231 			continue;
232 		/* got it: finish up the line (like code above) */
233 		++p;
234 		diff = p - bufpos;
235 		len += diff;
236 		if (grep_lnbufgrow(len))
237 		    goto error;
238 		memcpy(lnbuf + off, bufpos, diff);
239 		bufrem -= diff;
240 		bufpos = p;
241 		break;
242 	}
243 	*lenp = len;
244 	return (lnbuf);
245 
246 error:
247 	*lenp = 0;
248 	return (NULL);
249 }
250 
251 /*
252  * Opens a file for processing.
253  */
254 struct file *
255 grep_open(const char *path)
256 {
257 	struct file *f;
258 
259 	f = grep_malloc(sizeof *f);
260 	memset(f, 0, sizeof *f);
261 	if (path == NULL) {
262 		/* Processing stdin implies --line-buffered. */
263 		lbflag = true;
264 		f->fd = STDIN_FILENO;
265 	} else if ((f->fd = open(path, O_RDONLY)) == -1)
266 		goto error1;
267 
268 	if (filebehave == FILE_MMAP) {
269 		struct stat st;
270 
271 		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
272 		    (!S_ISREG(st.st_mode)))
273 			filebehave = FILE_STDIO;
274 		else {
275 			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
276 #ifdef MAP_PREFAULT_READ
277 			flags |= MAP_PREFAULT_READ;
278 #endif
279 			fsiz = st.st_size;
280 			buffer = mmap(NULL, fsiz, PROT_READ, flags,
281 			     f->fd, (off_t)0);
282 			if (buffer == MAP_FAILED)
283 				filebehave = FILE_STDIO;
284 			else {
285 				bufrem = st.st_size;
286 				bufpos = buffer;
287 				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
288 			}
289 		}
290 	}
291 
292 	if ((buffer == NULL) || (buffer == MAP_FAILED))
293 		buffer = grep_malloc(MAXBUFSIZ);
294 
295 	if (filebehave == FILE_GZIP &&
296 	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
297 		goto error2;
298 
299 #ifndef WITHOUT_BZIP2
300 	if (filebehave == FILE_BZIP &&
301 	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
302 		goto error2;
303 #endif
304 #ifndef WITHOUT_LZMA
305 	else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
306 		lzma_ret ret;
307 
308 		ret = (filebehave == FILE_XZ) ?
309 			lzma_stream_decoder(&lstrm, UINT64_MAX,
310 					LZMA_CONCATENATED) :
311 			lzma_alone_decoder(&lstrm, UINT64_MAX);
312 
313 		if (ret != LZMA_OK)
314 			goto error2;
315 
316 		lstrm.avail_in = 0;
317 		lstrm.avail_out = MAXBUFSIZ;
318 		laction = LZMA_RUN;
319 	}
320 #endif
321 
322 	/* Fill read buffer, also catches errors early */
323 	if (bufrem == 0 && grep_refill(f) != 0)
324 		goto error2;
325 
326 	/* Check for binary stuff, if necessary */
327 	if (binbehave != BINFILE_TEXT && fileeol != '\0' &&
328 	    memchr(bufpos, '\0', bufrem) != NULL)
329 	f->binary = true;
330 
331 	return (f);
332 
333 error2:
334 	close(f->fd);
335 error1:
336 	free(f);
337 	return (NULL);
338 }
339 
340 /*
341  * Closes a file.
342  */
343 void
344 grep_close(struct file *f)
345 {
346 
347 	close(f->fd);
348 
349 	/* Reset read buffer and line buffer */
350 	if (filebehave == FILE_MMAP) {
351 		munmap(buffer, fsiz);
352 		buffer = NULL;
353 	}
354 	bufpos = buffer;
355 	bufrem = 0;
356 
357 	free(lnbuf);
358 	lnbuf = NULL;
359 	lnbuflen = 0;
360 }
361