xref: /freebsd/usr.bin/grep/file.c (revision 39ee7a7a6bdd1557b1c3532abf60d139798ac88b)
1 /*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2 /*	$FreeBSD$	*/
3 /*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4 
5 /*-
6  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7  * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8  * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/mman.h>
38 #include <sys/stat.h>
39 #include <sys/types.h>
40 
41 #include <err.h>
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <stddef.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <wchar.h>
49 #include <wctype.h>
50 #include <zlib.h>
51 
52 #ifndef WITHOUT_LZMA
53 #include <lzma.h>
54 #endif
55 
56 #ifndef WITHOUT_BZIP2
57 #include <bzlib.h>
58 #endif
59 
60 #include "grep.h"
61 
/* Size in bytes of the read buffer and of the compressed-input staging area. */
#define	MAXBUFSIZ	(32 * 1024)
/* Slack added on top of the needed length when the line buffer is grown. */
#define	LNBUFBUMP	80

/* zlib handle returned by gzdopen() for FILE_GZIP input. */
static gzFile gzbufdesc;
#ifndef WITHOUT_LZMA
/* liblzma decoder state used for FILE_XZ and FILE_LZMA input. */
static lzma_stream lstrm = LZMA_STREAM_INIT;
/* Action passed to lzma_code(); switched to LZMA_FINISH once read() hits EOF. */
static lzma_action laction;
/* Holds compressed bytes read from the descriptor before they are decoded. */
static uint8_t lin_buf[MAXBUFSIZ];
#endif
#ifndef WITHOUT_BZIP2
/* bzip2 handle for FILE_BZIP input; NULL after falling back to plain reads. */
static BZFILE* bzbufdesc;
#endif

/* Read buffer: either a MAXBUFSIZ heap block or the mmap()ed file. */
static unsigned char *buffer;
/* Next unconsumed byte inside the read buffer. */
static unsigned char *bufpos;
/* Number of unconsumed bytes starting at bufpos. */
static size_t bufrem;
/* Length of the mapping in FILE_MMAP mode, kept for munmap(). */
static size_t fsiz;

/* Line buffer used to assemble lines that span more than one read buffer. */
static unsigned char *lnbuf;
/* Current allocated size of lnbuf in bytes. */
static size_t lnbuflen;
82 
83 static inline int
84 grep_refill(struct file *f)
85 {
86 	ssize_t nr;
87 
88 	if (filebehave == FILE_MMAP)
89 		return (0);
90 
91 	bufpos = buffer;
92 	bufrem = 0;
93 
94 	if (filebehave == FILE_GZIP) {
95 		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
96 #ifndef WITHOUT_BZIP2
97 	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
98 		int bzerr;
99 
100 		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
101 		switch (bzerr) {
102 		case BZ_OK:
103 		case BZ_STREAM_END:
104 			/* No problem, nr will be okay */
105 			break;
106 		case BZ_DATA_ERROR_MAGIC:
107 			/*
108 			 * As opposed to gzread(), which simply returns the
109 			 * plain file data, if it is not in the correct
110 			 * compressed format, BZ2_bzRead() instead aborts.
111 			 *
112 			 * So, just restart at the beginning of the file again,
113 			 * and use plain reads from now on.
114 			 */
115 			BZ2_bzReadClose(&bzerr, bzbufdesc);
116 			bzbufdesc = NULL;
117 			if (lseek(f->fd, 0, SEEK_SET) == -1)
118 				return (-1);
119 			nr = read(f->fd, buffer, MAXBUFSIZ);
120 			break;
121 		default:
122 			/* Make sure we exit with an error */
123 			nr = -1;
124 		}
125 #endif
126 #ifndef WITHOUT_LZMA
127 	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
128 		lzma_ret ret;
129 		lstrm.next_out = buffer;
130 
131 		do {
132 			if (lstrm.avail_in == 0) {
133 				lstrm.next_in = lin_buf;
134 				nr = read(f->fd, lin_buf, MAXBUFSIZ);
135 
136 				if (nr < 0)
137 					return (-1);
138 				else if (nr == 0)
139 					laction = LZMA_FINISH;
140 
141 				lstrm.avail_in = nr;
142 			}
143 
144 			ret = lzma_code(&lstrm, laction);
145 
146 			if (ret != LZMA_OK && ret != LZMA_STREAM_END)
147 				return (-1);
148 
149 			if (lstrm.avail_out == 0 || ret == LZMA_STREAM_END) {
150 				bufrem = MAXBUFSIZ - lstrm.avail_out;
151 				lstrm.next_out = buffer;
152 				lstrm.avail_out = MAXBUFSIZ;
153 			}
154 		} while (bufrem == 0 && ret != LZMA_STREAM_END);
155 
156 		return (0);
157 #endif	/* WIHTOUT_LZMA */
158 	} else
159 		nr = read(f->fd, buffer, MAXBUFSIZ);
160 
161 	if (nr < 0)
162 		return (-1);
163 
164 	bufrem = nr;
165 	return (0);
166 }
167 
168 static inline int
169 grep_lnbufgrow(size_t newlen)
170 {
171 
172 	if (lnbuflen < newlen) {
173 		lnbuf = grep_realloc(lnbuf, newlen);
174 		lnbuflen = newlen;
175 	}
176 
177 	return (0);
178 }
179 
180 char *
181 grep_fgetln(struct file *f, size_t *lenp)
182 {
183 	unsigned char *p;
184 	char *ret;
185 	size_t len;
186 	size_t off;
187 	ptrdiff_t diff;
188 
189 	/* Fill the buffer, if necessary */
190 	if (bufrem == 0 && grep_refill(f) != 0)
191 		goto error;
192 
193 	if (bufrem == 0) {
194 		/* Return zero length to indicate EOF */
195 		*lenp = 0;
196 		return (bufpos);
197 	}
198 
199 	/* Look for a newline in the remaining part of the buffer */
200 	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
201 		++p; /* advance over newline */
202 		ret = bufpos;
203 		len = p - bufpos;
204 		bufrem -= len;
205 		bufpos = p;
206 		*lenp = len;
207 		return (ret);
208 	}
209 
210 	/* We have to copy the current buffered data to the line buffer */
211 	for (len = bufrem, off = 0; ; len += bufrem) {
212 		/* Make sure there is room for more data */
213 		if (grep_lnbufgrow(len + LNBUFBUMP))
214 			goto error;
215 		memcpy(lnbuf + off, bufpos, len - off);
216 		off = len;
217 		if (grep_refill(f) != 0)
218 			goto error;
219 		if (bufrem == 0)
220 			/* EOF: return partial line */
221 			break;
222 		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
223 			continue;
224 		/* got it: finish up the line (like code above) */
225 		++p;
226 		diff = p - bufpos;
227 		len += diff;
228 		if (grep_lnbufgrow(len))
229 		    goto error;
230 		memcpy(lnbuf + off, bufpos, diff);
231 		bufrem -= diff;
232 		bufpos = p;
233 		break;
234 	}
235 	*lenp = len;
236 	return (lnbuf);
237 
238 error:
239 	*lenp = 0;
240 	return (NULL);
241 }
242 
243 /*
244  * Opens a file for processing.
245  */
246 struct file *
247 grep_open(const char *path)
248 {
249 	struct file *f;
250 
251 	f = grep_malloc(sizeof *f);
252 	memset(f, 0, sizeof *f);
253 	if (path == NULL) {
254 		/* Processing stdin implies --line-buffered. */
255 		lbflag = true;
256 		f->fd = STDIN_FILENO;
257 	} else if ((f->fd = open(path, O_RDONLY)) == -1)
258 		goto error1;
259 
260 	if (filebehave == FILE_MMAP) {
261 		struct stat st;
262 
263 		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
264 		    (!S_ISREG(st.st_mode)))
265 			filebehave = FILE_STDIO;
266 		else {
267 			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
268 #ifdef MAP_PREFAULT_READ
269 			flags |= MAP_PREFAULT_READ;
270 #endif
271 			fsiz = st.st_size;
272 			buffer = mmap(NULL, fsiz, PROT_READ, flags,
273 			     f->fd, (off_t)0);
274 			if (buffer == MAP_FAILED)
275 				filebehave = FILE_STDIO;
276 			else {
277 				bufrem = st.st_size;
278 				bufpos = buffer;
279 				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
280 			}
281 		}
282 	}
283 
284 	if ((buffer == NULL) || (buffer == MAP_FAILED))
285 		buffer = grep_malloc(MAXBUFSIZ);
286 
287 	if (filebehave == FILE_GZIP &&
288 	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
289 		goto error2;
290 
291 #ifndef WITHOUT_BZIP2
292 	if (filebehave == FILE_BZIP &&
293 	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
294 		goto error2;
295 #endif
296 #ifndef WITHOUT_LZMA
297 	else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
298 		lzma_ret ret;
299 
300 		ret = (filebehave == FILE_XZ) ?
301 			lzma_stream_decoder(&lstrm, UINT64_MAX,
302 					LZMA_CONCATENATED) :
303 			lzma_alone_decoder(&lstrm, UINT64_MAX);
304 
305 		if (ret != LZMA_OK)
306 			goto error2;
307 
308 		lstrm.avail_in = 0;
309 		lstrm.avail_out = MAXBUFSIZ;
310 		laction = LZMA_RUN;
311 	}
312 #endif
313 
314 	/* Fill read buffer, also catches errors early */
315 	if (bufrem == 0 && grep_refill(f) != 0)
316 		goto error2;
317 
318 	/* Check for binary stuff, if necessary */
319 	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
320 	f->binary = true;
321 
322 	return (f);
323 
324 error2:
325 	close(f->fd);
326 error1:
327 	free(f);
328 	return (NULL);
329 }
330 
331 /*
332  * Closes a file.
333  */
334 void
335 grep_close(struct file *f)
336 {
337 
338 	close(f->fd);
339 
340 	/* Reset read buffer and line buffer */
341 	if (filebehave == FILE_MMAP) {
342 		munmap(buffer, fsiz);
343 		buffer = NULL;
344 	}
345 	bufpos = buffer;
346 	bufrem = 0;
347 
348 	free(lnbuf);
349 	lnbuf = NULL;
350 	lnbuflen = 0;
351 }
352