xref: /freebsd/usr.bin/grep/file.c (revision 3823d5e198425b4f5e5a80267d195769d1063773)
1 /*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2 /*	$FreeBSD$	*/
3 /*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4 
5 /*-
6  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7  * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8  * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/mman.h>
38 #include <sys/stat.h>
39 #include <sys/types.h>
40 
41 #include <err.h>
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <stddef.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <wchar.h>
49 #include <wctype.h>
50 #include <zlib.h>
51 
52 #ifndef WITHOUT_LZMA
53 #include <lzma.h>
54 #endif
55 
56 #ifndef WITHOUT_BZIP2
57 #include <bzlib.h>
58 #endif
59 
60 #include "grep.h"
61 
62 #define	MAXBUFSIZ	(32 * 1024)
63 #define	LNBUFBUMP	80
64 
65 static gzFile gzbufdesc;
66 #ifndef WITHOUT_LZMA
67 static lzma_stream lstrm = LZMA_STREAM_INIT;
68 #endif
69 #ifndef WITHOUT_BZIP2
70 static BZFILE* bzbufdesc;
71 #endif
72 
73 static unsigned char *buffer;
74 static unsigned char *bufpos;
75 static size_t bufrem;
76 static size_t fsiz;
77 
78 static unsigned char *lnbuf;
79 static size_t lnbuflen;
80 
81 static inline int
82 grep_refill(struct file *f)
83 {
84 	ssize_t nr;
85 
86 	if (filebehave == FILE_MMAP)
87 		return (0);
88 
89 	bufpos = buffer;
90 	bufrem = 0;
91 
92 	if (filebehave == FILE_GZIP) {
93 		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
94 #ifndef WITHOUT_BZIP2
95 	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
96 		int bzerr;
97 
98 		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
99 		switch (bzerr) {
100 		case BZ_OK:
101 		case BZ_STREAM_END:
102 			/* No problem, nr will be okay */
103 			break;
104 		case BZ_DATA_ERROR_MAGIC:
105 			/*
106 			 * As opposed to gzread(), which simply returns the
107 			 * plain file data, if it is not in the correct
108 			 * compressed format, BZ2_bzRead() instead aborts.
109 			 *
110 			 * So, just restart at the beginning of the file again,
111 			 * and use plain reads from now on.
112 			 */
113 			BZ2_bzReadClose(&bzerr, bzbufdesc);
114 			bzbufdesc = NULL;
115 			if (lseek(f->fd, 0, SEEK_SET) == -1)
116 				return (-1);
117 			nr = read(f->fd, buffer, MAXBUFSIZ);
118 			break;
119 		default:
120 			/* Make sure we exit with an error */
121 			nr = -1;
122 		}
123 #endif
124 #ifndef WITHOUT_LZMA
125 	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
126 		lzma_action action = LZMA_RUN;
127 		uint8_t in_buf[MAXBUFSIZ];
128 		lzma_ret ret;
129 
130 		ret = (filebehave == FILE_XZ) ?
131 		    lzma_stream_decoder(&lstrm, UINT64_MAX,
132 		    LZMA_CONCATENATED) :
133 		    lzma_alone_decoder(&lstrm, UINT64_MAX);
134 
135 		if (ret != LZMA_OK)
136 			return (-1);
137 
138 		lstrm.next_out = buffer;
139 		lstrm.avail_out = MAXBUFSIZ;
140 		lstrm.next_in = in_buf;
141 		nr = read(f->fd, in_buf, MAXBUFSIZ);
142 
143 		if (nr < 0)
144 			return (-1);
145 		else if (nr == 0)
146 			action = LZMA_FINISH;
147 
148 		lstrm.avail_in = nr;
149 		ret = lzma_code(&lstrm, action);
150 
151 		if (ret != LZMA_OK && ret != LZMA_STREAM_END)
152 			return (-1);
153 		bufrem = MAXBUFSIZ - lstrm.avail_out;
154 		return (0);
155 #endif	/* WIHTOUT_LZMA */
156 	} else
157 		nr = read(f->fd, buffer, MAXBUFSIZ);
158 
159 	if (nr < 0)
160 		return (-1);
161 
162 	bufrem = nr;
163 	return (0);
164 }
165 
166 static inline int
167 grep_lnbufgrow(size_t newlen)
168 {
169 
170 	if (lnbuflen < newlen) {
171 		lnbuf = grep_realloc(lnbuf, newlen);
172 		lnbuflen = newlen;
173 	}
174 
175 	return (0);
176 }
177 
178 char *
179 grep_fgetln(struct file *f, size_t *lenp)
180 {
181 	unsigned char *p;
182 	char *ret;
183 	size_t len;
184 	size_t off;
185 	ptrdiff_t diff;
186 
187 	/* Fill the buffer, if necessary */
188 	if (bufrem == 0 && grep_refill(f) != 0)
189 		goto error;
190 
191 	if (bufrem == 0) {
192 		/* Return zero length to indicate EOF */
193 		*lenp = 0;
194 		return (bufpos);
195 	}
196 
197 	/* Look for a newline in the remaining part of the buffer */
198 	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
199 		++p; /* advance over newline */
200 		ret = bufpos;
201 		len = p - bufpos;
202 		bufrem -= len;
203 		bufpos = p;
204 		*lenp = len;
205 		return (ret);
206 	}
207 
208 	/* We have to copy the current buffered data to the line buffer */
209 	for (len = bufrem, off = 0; ; len += bufrem) {
210 		/* Make sure there is room for more data */
211 		if (grep_lnbufgrow(len + LNBUFBUMP))
212 			goto error;
213 		memcpy(lnbuf + off, bufpos, len - off);
214 		off = len;
215 		if (grep_refill(f) != 0)
216 			goto error;
217 		if (bufrem == 0)
218 			/* EOF: return partial line */
219 			break;
220 		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
221 			continue;
222 		/* got it: finish up the line (like code above) */
223 		++p;
224 		diff = p - bufpos;
225 		len += diff;
226 		if (grep_lnbufgrow(len))
227 		    goto error;
228 		memcpy(lnbuf + off, bufpos, diff);
229 		bufrem -= diff;
230 		bufpos = p;
231 		break;
232 	}
233 	*lenp = len;
234 	return (lnbuf);
235 
236 error:
237 	*lenp = 0;
238 	return (NULL);
239 }
240 
241 /*
242  * Opens a file for processing.
243  */
244 struct file *
245 grep_open(const char *path)
246 {
247 	struct file *f;
248 
249 	f = grep_malloc(sizeof *f);
250 	memset(f, 0, sizeof *f);
251 	if (path == NULL) {
252 		/* Processing stdin implies --line-buffered. */
253 		lbflag = true;
254 		f->fd = STDIN_FILENO;
255 	} else if ((f->fd = open(path, O_RDONLY)) == -1)
256 		goto error1;
257 
258 	if (filebehave == FILE_MMAP) {
259 		struct stat st;
260 
261 		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
262 		    (!S_ISREG(st.st_mode)))
263 			filebehave = FILE_STDIO;
264 		else {
265 			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
266 #ifdef MAP_PREFAULT_READ
267 			flags |= MAP_PREFAULT_READ;
268 #endif
269 			fsiz = st.st_size;
270 			buffer = mmap(NULL, fsiz, PROT_READ, flags,
271 			     f->fd, (off_t)0);
272 			if (buffer == MAP_FAILED)
273 				filebehave = FILE_STDIO;
274 			else {
275 				bufrem = st.st_size;
276 				bufpos = buffer;
277 				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
278 			}
279 		}
280 	}
281 
282 	if ((buffer == NULL) || (buffer == MAP_FAILED))
283 		buffer = grep_malloc(MAXBUFSIZ);
284 
285 	if (filebehave == FILE_GZIP &&
286 	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
287 		goto error2;
288 
289 #ifndef WITHOUT_BZIP2
290 	if (filebehave == FILE_BZIP &&
291 	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
292 		goto error2;
293 #endif
294 
295 	/* Fill read buffer, also catches errors early */
296 	if (bufrem == 0 && grep_refill(f) != 0)
297 		goto error2;
298 
299 	/* Check for binary stuff, if necessary */
300 	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
301 	f->binary = true;
302 
303 	return (f);
304 
305 error2:
306 	close(f->fd);
307 error1:
308 	free(f);
309 	return (NULL);
310 }
311 
312 /*
313  * Closes a file.
314  */
315 void
316 grep_close(struct file *f)
317 {
318 
319 	close(f->fd);
320 
321 	/* Reset read buffer and line buffer */
322 	if (filebehave == FILE_MMAP) {
323 		munmap(buffer, fsiz);
324 		buffer = NULL;
325 	}
326 	bufpos = buffer;
327 	bufrem = 0;
328 
329 	free(lnbuf);
330 	lnbuf = NULL;
331 	lnbuflen = 0;
332 }
333