xref: /freebsd/usr.bin/grep/file.c (revision ea825d02749f382c3f7e17f28247f20a48733eab)
1 /*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2 /*	$FreeBSD$	*/
3 /*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4 
5 /*-
6  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7  * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8  * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/mman.h>
38 #include <sys/stat.h>
39 #include <sys/types.h>
40 
41 #include <err.h>
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <stddef.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <wchar.h>
49 #include <wctype.h>
50 #include <zlib.h>
51 
52 #ifndef WITHOUT_LZMA
53 #include <lzma.h>
54 #endif
55 
56 #ifndef WITHOUT_BZIP2
57 #include <bzlib.h>
58 #endif
59 
60 #include "grep.h"
61 
62 #define	MAXBUFSIZ	(32 * 1024)
63 #define	LNBUFBUMP	80
64 
65 static gzFile gzbufdesc;
66 #ifndef WITHOUT_LZMA
67 static lzma_stream lstrm = LZMA_STREAM_INIT;
68 static lzma_action laction;
69 static uint8_t lin_buf[MAXBUFSIZ];
70 #endif
71 #ifndef WITHOUT_BZIP2
72 static BZFILE* bzbufdesc;
73 #endif
74 
75 static unsigned char *buffer;
76 static unsigned char *bufpos;
77 static size_t bufrem;
78 static size_t fsiz;
79 
80 static unsigned char *lnbuf;
81 static size_t lnbuflen;
82 
83 static inline int
84 grep_refill(struct file *f)
85 {
86 	ssize_t nr;
87 
88 	if (filebehave == FILE_MMAP)
89 		return (0);
90 
91 	bufpos = buffer;
92 	bufrem = 0;
93 
94 	if (filebehave == FILE_GZIP) {
95 		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
96 #ifndef WITHOUT_BZIP2
97 	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
98 		int bzerr;
99 
100 		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
101 		switch (bzerr) {
102 		case BZ_OK:
103 		case BZ_STREAM_END:
104 			/* No problem, nr will be okay */
105 			break;
106 		case BZ_DATA_ERROR_MAGIC:
107 			/*
108 			 * As opposed to gzread(), which simply returns the
109 			 * plain file data, if it is not in the correct
110 			 * compressed format, BZ2_bzRead() instead aborts.
111 			 *
112 			 * So, just restart at the beginning of the file again,
113 			 * and use plain reads from now on.
114 			 */
115 			BZ2_bzReadClose(&bzerr, bzbufdesc);
116 			bzbufdesc = NULL;
117 			if (lseek(f->fd, 0, SEEK_SET) == -1)
118 				return (-1);
119 			nr = read(f->fd, buffer, MAXBUFSIZ);
120 			break;
121 		default:
122 			/* Make sure we exit with an error */
123 			nr = -1;
124 		}
125 #endif
126 #ifndef WITHOUT_LZMA
127 	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
128 		lzma_ret ret;
129 		lstrm.next_out = buffer;
130 
131 		do {
132 			if (lstrm.avail_in == 0) {
133 				lstrm.next_in = lin_buf;
134 				nr = read(f->fd, lin_buf, MAXBUFSIZ);
135 
136 				if (nr < 0)
137 					return (-1);
138 				else if (nr == 0)
139 					laction = LZMA_FINISH;
140 
141 				lstrm.avail_in = nr;
142 			}
143 
144 			ret = lzma_code(&lstrm, laction);
145 
146 			if (ret != LZMA_OK && ret != LZMA_STREAM_END)
147 				return (-1);
148 
149 			if (lstrm.avail_out == 0 || ret == LZMA_STREAM_END) {
150 				bufrem = MAXBUFSIZ - lstrm.avail_out;
151 				lstrm.next_out = buffer;
152 				lstrm.avail_out = MAXBUFSIZ;
153 			}
154 		} while (bufrem == 0 && ret != LZMA_STREAM_END);
155 
156 		return (0);
157 #endif	/* WIHTOUT_LZMA */
158 	} else
159 		nr = read(f->fd, buffer, MAXBUFSIZ);
160 
161 	if (nr < 0)
162 		return (-1);
163 
164 	bufrem = nr;
165 	return (0);
166 }
167 
168 static inline int
169 grep_lnbufgrow(size_t newlen)
170 {
171 
172 	if (lnbuflen < newlen) {
173 		lnbuf = grep_realloc(lnbuf, newlen);
174 		lnbuflen = newlen;
175 	}
176 
177 	return (0);
178 }
179 
180 char *
181 grep_fgetln(struct file *f, size_t *lenp)
182 {
183 	unsigned char *p;
184 	char *ret;
185 	size_t len;
186 	size_t off;
187 	ptrdiff_t diff;
188 
189 	/* Fill the buffer, if necessary */
190 	if (bufrem == 0 && grep_refill(f) != 0)
191 		goto error;
192 
193 	if (bufrem == 0) {
194 		/* Return zero length to indicate EOF */
195 		*lenp = 0;
196 		return (bufpos);
197 	}
198 
199 	/* Look for a newline in the remaining part of the buffer */
200 	if ((p = memchr(bufpos, fileeol, bufrem)) != NULL) {
201 		++p; /* advance over newline */
202 		ret = bufpos;
203 		len = p - bufpos;
204 		bufrem -= len;
205 		bufpos = p;
206 		*lenp = len;
207 		return (ret);
208 	}
209 
210 	/* We have to copy the current buffered data to the line buffer */
211 	for (len = bufrem, off = 0; ; len += bufrem) {
212 		/* Make sure there is room for more data */
213 		if (grep_lnbufgrow(len + LNBUFBUMP))
214 			goto error;
215 		memcpy(lnbuf + off, bufpos, len - off);
216 		/* With FILE_MMAP, this is EOF; there's no more to refill */
217 		if (filebehave == FILE_MMAP) {
218 			bufrem -= len;
219 			break;
220 		}
221 		off = len;
222 		/* Fetch more to try and find EOL/EOF */
223 		if (grep_refill(f) != 0)
224 			goto error;
225 		if (bufrem == 0)
226 			/* EOF: return partial line */
227 			break;
228 		if ((p = memchr(bufpos, fileeol, bufrem)) == NULL)
229 			continue;
230 		/* got it: finish up the line (like code above) */
231 		++p;
232 		diff = p - bufpos;
233 		len += diff;
234 		if (grep_lnbufgrow(len))
235 		    goto error;
236 		memcpy(lnbuf + off, bufpos, diff);
237 		bufrem -= diff;
238 		bufpos = p;
239 		break;
240 	}
241 	*lenp = len;
242 	return (lnbuf);
243 
244 error:
245 	*lenp = 0;
246 	return (NULL);
247 }
248 
249 /*
250  * Opens a file for processing.
251  */
252 struct file *
253 grep_open(const char *path)
254 {
255 	struct file *f;
256 
257 	f = grep_malloc(sizeof *f);
258 	memset(f, 0, sizeof *f);
259 	if (path == NULL) {
260 		/* Processing stdin implies --line-buffered. */
261 		lbflag = true;
262 		f->fd = STDIN_FILENO;
263 	} else if ((f->fd = open(path, O_RDONLY)) == -1)
264 		goto error1;
265 
266 	if (filebehave == FILE_MMAP) {
267 		struct stat st;
268 
269 		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
270 		    (!S_ISREG(st.st_mode)))
271 			filebehave = FILE_STDIO;
272 		else {
273 			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
274 #ifdef MAP_PREFAULT_READ
275 			flags |= MAP_PREFAULT_READ;
276 #endif
277 			fsiz = st.st_size;
278 			buffer = mmap(NULL, fsiz, PROT_READ, flags,
279 			     f->fd, (off_t)0);
280 			if (buffer == MAP_FAILED)
281 				filebehave = FILE_STDIO;
282 			else {
283 				bufrem = st.st_size;
284 				bufpos = buffer;
285 				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
286 			}
287 		}
288 	}
289 
290 	if ((buffer == NULL) || (buffer == MAP_FAILED))
291 		buffer = grep_malloc(MAXBUFSIZ);
292 
293 	if (filebehave == FILE_GZIP &&
294 	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
295 		goto error2;
296 
297 #ifndef WITHOUT_BZIP2
298 	if (filebehave == FILE_BZIP &&
299 	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
300 		goto error2;
301 #endif
302 #ifndef WITHOUT_LZMA
303 	else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
304 		lzma_ret ret;
305 
306 		ret = (filebehave == FILE_XZ) ?
307 			lzma_stream_decoder(&lstrm, UINT64_MAX,
308 					LZMA_CONCATENATED) :
309 			lzma_alone_decoder(&lstrm, UINT64_MAX);
310 
311 		if (ret != LZMA_OK)
312 			goto error2;
313 
314 		lstrm.avail_in = 0;
315 		lstrm.avail_out = MAXBUFSIZ;
316 		laction = LZMA_RUN;
317 	}
318 #endif
319 
320 	/* Fill read buffer, also catches errors early */
321 	if (bufrem == 0 && grep_refill(f) != 0)
322 		goto error2;
323 
324 	/* Check for binary stuff, if necessary */
325 	if (binbehave != BINFILE_TEXT && fileeol != '\0' &&
326 	    memchr(bufpos, '\0', bufrem) != NULL)
327 	f->binary = true;
328 
329 	return (f);
330 
331 error2:
332 	close(f->fd);
333 error1:
334 	free(f);
335 	return (NULL);
336 }
337 
338 /*
339  * Closes a file.
340  */
341 void
342 grep_close(struct file *f)
343 {
344 
345 	close(f->fd);
346 
347 	/* Reset read buffer and line buffer */
348 	if (filebehave == FILE_MMAP) {
349 		munmap(buffer, fsiz);
350 		buffer = NULL;
351 	}
352 	bufpos = buffer;
353 	bufrem = 0;
354 
355 	free(lnbuf);
356 	lnbuf = NULL;
357 	lnbuflen = 0;
358 }
359