xref: /illumos-gate/usr/src/cmd/mandoc/read.c (revision c5749750a3e052f1194f65a303456224c51dea63)
1 /*	$Id: read.c,v 1.196 2018/07/28 18:34:15 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2018 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include "config.h"
20 
21 #include <sys/types.h>
22 #include <sys/mman.h>
23 #include <sys/stat.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stdarg.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include <zlib.h>
35 
36 #include "mandoc_aux.h"
37 #include "mandoc.h"
38 #include "roff.h"
39 #include "mdoc.h"
40 #include "man.h"
41 #include "libmandoc.h"
42 
/*
 * Upper bound on curp->reparse_count: once mparse_buf_r() has seen
 * more than this many ROFF_REPARSE results since the counter was last
 * reset, it reports MANDOCERR_ROFFLOOP and stops reparsing.
 */
43 #define	REPARSE_LIMIT	1000
44 
/*
 * State of one parser instance.  Allocated by mparse_alloc(),
 * cleared between documents by mparse_reset(), and released by
 * mparse_free().
 */
45 struct	mparse {
46 	struct roff	 *roff; /* roff parser (!NULL) */
47 	struct roff_man	 *man; /* man parser */
48 	char		 *sodest; /* filename pointed to by .so */
49 	const char	 *file; /* filename of current input file */
50 	struct buf	 *primary; /* buffer currently being parsed */
51 	struct buf	 *secondary; /* preprocessed copy of input */
52 	const char	 *os_s; /* default operating system */
53 	mandocmsg	  mmsg; /* warning/error message handler */
54 	enum mandoclevel  file_status; /* status of current parse */
55 	enum mandocerr	  mmin; /* ignore messages below this */
56 	int		  options; /* parser options */
57 	int		  gzip; /* current input file is gzipped */
58 	int		  filenc; /* encoding of the current file */
59 	int		  reparse_count; /* finite interp. stack */
60 	int		  line; /* line number in the file */
61 };
62 
/* Forward declarations of the file-local helpers defined below. */
63 static	void	  choose_parser(struct mparse *);
64 static	void	  resize_buf(struct buf *, size_t);
65 static	int	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
66 static	int	  read_whole_file(struct mparse *, const char *, int,
67 				struct buf *, int *);
68 static	void	  mparse_end(struct mparse *);
69 static	void	  mparse_parse_buffer(struct mparse *, struct buf,
70 			const char *);
71 
/*
 * Indexed by enum mandoclevel.  mandoc_msg() starts at
 * MANDOCLEVEL_UNSUPP and steps down while the error code is smaller
 * than the entry for the current level, so each entry acts as the
 * lowest error code that is classified at that level.
 */
72 static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
73 	MANDOCERR_OK,
74 	MANDOCERR_OK,
75 	MANDOCERR_WARNING,
76 	MANDOCERR_ERROR,
77 	MANDOCERR_UNSUPP,
78 	MANDOCERR_MAX,
79 	MANDOCERR_MAX
80 };
81 
/*
 * Message text for each enum mandocerr value; mparse_strerror()
 * returns entries of this table by direct index.
 * NOTE(review): the entries presumably have to stay in the same order
 * as the enum mandocerr declaration (not visible in this file) --
 * confirm before inserting or reordering strings.
 */
82 static	const char * const	mandocerrs[MANDOCERR_MAX] = {
83 	"ok",
84 
85 	"base system convention",
86 
87 	"Mdocdate found",
88 	"Mdocdate missing",
89 	"unknown architecture",
90 	"operating system explicitly specified",
91 	"RCS id missing",
92 	"referenced manual not found",
93 
94 	"generic style suggestion",
95 
96 	"legacy man(7) date format",
97 	"normalizing date format to",
98 	"lower case character in document title",
99 	"duplicate RCS id",
100 	"possible typo in section name",
101 	"unterminated quoted argument",
102 	"useless macro",
103 	"consider using OS macro",
104 	"errnos out of order",
105 	"duplicate errno",
106 	"trailing delimiter",
107 	"no blank before trailing delimiter",
108 	"fill mode already enabled, skipping",
109 	"fill mode already disabled, skipping",
110 	"verbatim \"--\", maybe consider using \\(em",
111 	"function name without markup",
112 	"whitespace at end of input line",
113 	"bad comment style",
114 
115 	"generic warning",
116 
117 	/* related to the prologue */
118 	"missing manual title, using UNTITLED",
119 	"missing manual title, using \"\"",
120 	"missing manual section, using \"\"",
121 	"unknown manual section",
122 	"missing date, using today's date",
123 	"cannot parse date, using it verbatim",
124 	"date in the future, using it anyway",
125 	"missing Os macro, using \"\"",
126 	"late prologue macro",
127 	"prologue macros out of order",
128 
129 	/* related to document structure */
130 	".so is fragile, better use ln(1)",
131 	"no document body",
132 	"content before first section header",
133 	"first section is not \"NAME\"",
134 	"NAME section without Nm before Nd",
135 	"NAME section without description",
136 	"description not at the end of NAME",
137 	"bad NAME section content",
138 	"missing comma before name",
139 	"missing description line, using \"\"",
140 	"description line outside NAME section",
141 	"sections out of conventional order",
142 	"duplicate section title",
143 	"unexpected section",
144 	"cross reference to self",
145 	"unusual Xr order",
146 	"unusual Xr punctuation",
147 	"AUTHORS section without An macro",
148 
149 	/* related to macros and nesting */
150 	"obsolete macro",
151 	"macro neither callable nor escaped",
152 	"skipping paragraph macro",
153 	"moving paragraph macro out of list",
154 	"skipping no-space macro",
155 	"blocks badly nested",
156 	"nested displays are not portable",
157 	"moving content out of list",
158 	"first macro on line",
159 	"line scope broken",
160 	"skipping blank line in line scope",
161 
162 	/* related to missing macro arguments */
163 	"skipping empty request",
164 	"conditional request controls empty scope",
165 	"skipping empty macro",
166 	"empty block",
167 	"empty argument, using 0n",
168 	"missing display type, using -ragged",
169 	"list type is not the first argument",
170 	"missing -width in -tag list, using 6n",
171 	"missing utility name, using \"\"",
172 	"missing function name, using \"\"",
173 	"empty head in list item",
174 	"empty list item",
175 	"missing argument, using next line",
176 	"missing font type, using \\fR",
177 	"unknown font type, using \\fR",
178 	"nothing follows prefix",
179 	"empty reference block",
180 	"missing section argument",
181 	"missing -std argument, adding it",
182 	"missing option string, using \"\"",
183 	"missing resource identifier, using \"\"",
184 	"missing eqn box, using \"\"",
185 
186 	/* related to bad macro arguments */
187 	"duplicate argument",
188 	"skipping duplicate argument",
189 	"skipping duplicate display type",
190 	"skipping duplicate list type",
191 	"skipping -width argument",
192 	"wrong number of cells",
193 	"unknown AT&T UNIX version",
194 	"comma in function argument",
195 	"parenthesis in function name",
196 	"unknown library name",
197 	"invalid content in Rs block",
198 	"invalid Boolean argument",
199 	"unknown font, skipping request",
200 	"odd number of characters in request",
201 
202 	/* related to plain text */
203 	"blank line in fill mode, using .sp",
204 	"tab in filled text",
205 	"new sentence, new line",
206 	"invalid escape sequence",
207 	"undefined string, using \"\"",
208 
209 	/* related to tables */
210 	"tbl line starts with span",
211 	"tbl column starts with span",
212 	"skipping vertical bar in tbl layout",
213 
214 	"generic error",
215 
216 	/* related to tables */
217 	"non-alphabetic character in tbl options",
218 	"skipping unknown tbl option",
219 	"missing tbl option argument",
220 	"wrong tbl option argument size",
221 	"empty tbl layout",
222 	"invalid character in tbl layout",
223 	"unmatched parenthesis in tbl layout",
224 	"tbl without any data cells",
225 	"ignoring data in spanned tbl cell",
226 	"ignoring extra tbl data cells",
227 	"data block open at end of tbl",
228 
229 	/* related to document structure and macros */
	/*
	 * NOTE(review): this slot has no fixed text; presumably it
	 * belongs to MANDOCERR_FILE, for which callers in this file
	 * always pass their own message -- confirm against the enum.
	 */
230 	NULL,
231 	"duplicate prologue macro",
232 	"skipping late title macro",
233 	"input stack limit exceeded, infinite loop?",
234 	"skipping bad character",
235 	"skipping unknown macro",
236 	"skipping insecure request",
237 	"skipping item outside list",
238 	"skipping column outside column list",
239 	"skipping end of block that is not open",
240 	"fewer RS blocks open, skipping",
241 	"inserting missing end of block",
242 	"appending missing end of block",
243 
244 	/* related to request and macro arguments */
245 	"escaped character not allowed in a name",
246 	"NOT IMPLEMENTED: Bd -file",
247 	"skipping display without arguments",
248 	"missing list type, using -item",
249 	"argument is not numeric, using 1",
250 	"missing manual name, using \"\"",
251 	"uname(3) system call failed, using UNKNOWN",
252 	"unknown standard specifier",
253 	"skipping request without numeric argument",
254 	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
255 	".so request failed",
256 	"skipping all arguments",
257 	"skipping excess arguments",
258 	"divide by zero",
259 
260 	"unsupported feature",
261 	"input too large",
262 	"unsupported control character",
263 	"unsupported roff request",
264 	"eqn delim option in tbl",
265 	"unsupported tbl layout modifier",
266 	"ignoring macro in table",
267 };
268 
/*
 * Printable name of each enum mandoclevel value; mparse_strlevel()
 * returns entries of this table by direct index.
 */
269 static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
270 	"SUCCESS",
271 	"STYLE",
272 	"WARNING",
273 	"ERROR",
274 	"UNSUPP",
275 	"BADARG",
276 	"SYSERR"
277 };
278 
279 
280 static void
281 resize_buf(struct buf *buf, size_t initial)
282 {
283 
284 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
285 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
286 }
287 
288 static void
289 choose_parser(struct mparse *curp)
290 {
291 	char		*cp, *ep;
292 	int		 format;
293 
294 	/*
295 	 * If neither command line arguments -mdoc or -man select
296 	 * a parser nor the roff parser found a .Dd or .TH macro
297 	 * yet, look ahead in the main input buffer.
298 	 */
299 
300 	if ((format = roff_getformat(curp->roff)) == 0) {
301 		cp = curp->primary->buf;
302 		ep = cp + curp->primary->sz;
303 		while (cp < ep) {
304 			if (*cp == '.' || *cp == '\'') {
305 				cp++;
306 				if (cp[0] == 'D' && cp[1] == 'd') {
307 					format = MPARSE_MDOC;
308 					break;
309 				}
310 				if (cp[0] == 'T' && cp[1] == 'H') {
311 					format = MPARSE_MAN;
312 					break;
313 				}
314 			}
315 			cp = memchr(cp, '\n', ep - cp);
316 			if (cp == NULL)
317 				break;
318 			cp++;
319 		}
320 	}
321 
322 	if (format == MPARSE_MDOC) {
323 		curp->man->macroset = MACROSET_MDOC;
324 		if (curp->man->mdocmac == NULL)
325 			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
326 	} else {
327 		curp->man->macroset = MACROSET_MAN;
328 		if (curp->man->manmac == NULL)
329 			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
330 	}
331 	curp->man->first->tok = TOKEN_NONE;
332 }
333 
334 /*
335  * Main parse routine for a buffer.
336  * It assumes encoding and line numbering are already set up.
337  * It can recurse directly (for invocations of user-defined
338  * macros, inline equations, and input line traps)
339  * and indirectly (for .so file inclusion).
340  */
/*
 * Arguments:
 *   blk    buffer to read input from
 *   i      byte offset in blk where reading starts
 *   start  non-zero when blk is real file input: the line counter is
 *          maintained and NUL bytes do not end a line; zero for
 *          temporary buffers, where a NUL byte ends the input
 *
 * Returns 0 if the reparse limit was exceeded, or if a nested reparse
 * returned 0 while start is not 1; returns 1 in all other cases.
 */
341 static int
342 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
343 {
344 	struct buf	 ln;
345 	const char	*save_file;
346 	char		*cp;
347 	size_t		 pos; /* byte number in the ln buffer */
348 	enum rofferr	 rr;
349 	int		 of;
350 	int		 lnn; /* line number in the real file */
351 	int		 fd;
352 	unsigned char	 c;
353 
354 	memset(&ln, 0, sizeof(ln));
355 
356 	lnn = curp->line;
357 	pos = 0;
358 
359 	while (i < blk.sz) {
		/* A NUL byte at the beginning of a line ends the input. */
360 		if (0 == pos && '\0' == blk.buf[i])
361 			break;
362 
363 		if (start) {
364 			curp->line = lnn;
365 			curp->reparse_count = 0;
366 
			/*
			 * While both encodings are still possible, let
			 * preconv_cue() inspect the first two lines.
			 */
367 			if (lnn < 3 &&
368 			    curp->filenc & MPARSE_UTF8 &&
369 			    curp->filenc & MPARSE_LATIN1)
370 				curp->filenc = preconv_cue(&blk, i);
371 		}
372 
		/* Copy one input line from blk into ln, byte by byte. */
373 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
374 
375 			/*
376 			 * When finding an unescaped newline character,
377 			 * leave the character loop to process the line.
378 			 * Skip a preceding carriage return, if any.
379 			 */
380 
381 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
382 			    '\n' == blk.buf[i + 1])
383 				++i;
384 			if ('\n' == blk.buf[i]) {
385 				++i;
386 				++lnn;
387 				break;
388 			}
389 
390 			/*
391 			 * Make sure we have space for the worst
392 			 * case of 11 bytes: "\\[u10ffff]\0"
393 			 */
394 
395 			if (pos + 11 > ln.sz)
396 				resize_buf(&ln, 256);
397 
398 			/*
399 			 * Encode 8-bit input.
400 			 */
401 
402 			c = blk.buf[i];
403 			if (c & 0x80) {
				/*
				 * If no encoding is selected or the
				 * conversion fails, report the byte
				 * and substitute a question mark.
				 */
404 				if ( ! (curp->filenc && preconv_encode(
405 				    &blk, &i, &ln, &pos, &curp->filenc))) {
406 					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
407 					    curp->line, pos, "0x%x", c);
408 					ln.buf[pos++] = '?';
409 					i++;
410 				}
411 				continue;
412 			}
413 
414 			/*
415 			 * Exclude control characters.
416 			 */
417 
418 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
419 				mandoc_vmsg(c == 0x00 || c == 0x04 ||
420 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
421 				    MANDOCERR_CHAR_UNSUPP,
422 				    curp, curp->line, pos, "0x%x", c);
423 				i++;
				/* A stray carriage return is dropped. */
424 				if (c != '\r')
425 					ln.buf[pos++] = '?';
426 				continue;
427 			}
428 
429 			ln.buf[pos++] = blk.buf[i++];
430 		}
431 
432 		if (pos + 1 >= ln.sz)
433 			resize_buf(&ln, 256);
434 
		/*
		 * When the input is exhausted (end of the buffer or a
		 * NUL byte comes next), append a newline to the line
		 * before terminating it.
		 */
435 		if (i == blk.sz || blk.buf[i] == '\0')
436 			ln.buf[pos++] = '\n';
437 		ln.buf[pos] = '\0';
438 
439 		/*
440 		 * A significant amount of complexity is contained by
441 		 * the roff preprocessor.  It's line-oriented but can be
442 		 * expressed on one line, so we need at times to
443 		 * readjust our starting point and re-run it.  The roff
444 		 * preprocessor can also readjust the buffers with new
445 		 * data, so we pass them in wholesale.
446 		 */
447 
448 		of = 0;
449 
450 		/*
451 		 * Maintain a lookaside buffer of all parsed lines.  We
452 		 * only do this if mparse_keep() has been invoked (the
453 		 * buffer may be accessed with mparse_getkeep()).
454 		 */
455 
456 		if (curp->secondary) {
457 			curp->secondary->buf = mandoc_realloc(
458 			    curp->secondary->buf,
459 			    curp->secondary->sz + pos + 2);
460 			memcpy(curp->secondary->buf +
461 			    curp->secondary->sz,
462 			    ln.buf, pos);
463 			curp->secondary->sz += pos;
464 			curp->secondary->buf
465 				[curp->secondary->sz] = '\n';
466 			curp->secondary->sz++;
467 			curp->secondary->buf
468 				[curp->secondary->sz] = '\0';
469 		}
470 rerun:
471 		rr = roff_parseln(curp->roff, curp->line, &ln, &of);
472 
473 		switch (rr) {
474 		case ROFF_REPARSE:
			/*
			 * Parse the line buffer again, recursively and
			 * starting at offset "of", unless the reparse
			 * limit has been exceeded.
			 */
475 			if (++curp->reparse_count > REPARSE_LIMIT)
476 				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
477 				    curp->line, pos, NULL);
478 			else if (mparse_buf_r(curp, ln, of, 0) == 1 ||
479 			    start == 1) {
480 				pos = 0;
481 				continue;
482 			}
483 			free(ln.buf);
484 			return 0;
485 		case ROFF_APPEND:
			/* Keep the line and append further input to it. */
486 			pos = strlen(ln.buf);
487 			continue;
488 		case ROFF_RERUN:
489 			goto rerun;
490 		case ROFF_IGN:
491 			pos = 0;
492 			continue;
493 		case ROFF_SO:
			/*
			 * Without MPARSE_SO, a .so request at the very
			 * end of the input is not followed; the file
			 * name is only recorded in sodest.
			 */
494 			if ( ! (curp->options & MPARSE_SO) &&
495 			    (i >= blk.sz || blk.buf[i] == '\0')) {
496 				curp->sodest = mandoc_strdup(ln.buf + of);
497 				free(ln.buf);
498 				return 1;
499 			}
500 			/*
501 			 * We remove `so' clauses from our lookaside
502 			 * buffer because we're going to descend into
503 			 * the file recursively.
504 			 */
505 			if (curp->secondary)
506 				curp->secondary->sz -= pos + 1;
507 			save_file = curp->file;
508 			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
509 				mparse_readfd(curp, fd, ln.buf + of);
510 				close(fd);
511 				curp->file = save_file;
512 			} else {
				/*
				 * The file cannot be opened: report it
				 * and parse a replacement text pointing
				 * the reader to the file instead.
				 */
513 				curp->file = save_file;
514 				mandoc_vmsg(MANDOCERR_SO_FAIL,
515 				    curp, curp->line, pos,
516 				    ".so %s", ln.buf + of);
517 				ln.sz = mandoc_asprintf(&cp,
518 				    ".sp\nSee the file %s.\n.sp",
519 				    ln.buf + of);
520 				free(ln.buf);
521 				ln.buf = cp;
522 				of = 0;
523 				mparse_buf_r(curp, ln, of, 0);
524 			}
525 			pos = 0;
526 			continue;
527 		default:
528 			break;
529 		}
530 
		/* Select the macro set if that has not happened yet. */
531 		if (curp->man->macroset == MACROSET_NONE)
532 			choose_parser(curp);
533 
		/* A return value of 2 from the line parser ends parsing. */
534 		if ((curp->man->macroset == MACROSET_MDOC ?
535 		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
536 		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
537 				break;
538 
539 		/* Temporary buffers typically are not full. */
540 
541 		if (0 == start && '\0' == blk.buf[i])
542 			break;
543 
544 		/* Start the next input line. */
545 
546 		pos = 0;
547 	}
548 
549 	free(ln.buf);
550 	return 1;
551 }
552 
553 static int
554 read_whole_file(struct mparse *curp, const char *file, int fd,
555 		struct buf *fb, int *with_mmap)
556 {
557 	struct stat	 st;
558 	gzFile		 gz;
559 	size_t		 off;
560 	ssize_t		 ssz;
561 	int		 gzerrnum, retval;
562 
563 	if (fstat(fd, &st) == -1) {
564 		mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0,
565 		    "fstat: %s", strerror(errno));
566 		return 0;
567 	}
568 
569 	/*
570 	 * If we're a regular file, try just reading in the whole entry
571 	 * via mmap().  This is faster than reading it into blocks, and
572 	 * since each file is only a few bytes to begin with, I'm not
573 	 * concerned that this is going to tank any machines.
574 	 */
575 
576 	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
577 		if (st.st_size > 0x7fffffff) {
578 			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
579 			return 0;
580 		}
581 		*with_mmap = 1;
582 		fb->sz = (size_t)st.st_size;
583 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
584 		if (fb->buf != MAP_FAILED)
585 			return 1;
586 	}
587 
588 	if (curp->gzip) {
589 		/*
590 		 * Duplicating the file descriptor is required
591 		 * because we will have to call gzclose(3)
592 		 * to free memory used internally by zlib,
593 		 * but that will also close the file descriptor,
594 		 * which this function must not do.
595 		 */
596 		if ((fd = dup(fd)) == -1) {
597 			mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0,
598 			    "dup: %s", strerror(errno));
599 			return 0;
600 		}
601 		if ((gz = gzdopen(fd, "rb")) == NULL) {
602 			mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0,
603 			    "gzdopen: %s", strerror(errno));
604 			close(fd);
605 			return 0;
606 		}
607 	} else
608 		gz = NULL;
609 
610 	/*
611 	 * If this isn't a regular file (like, say, stdin), then we must
612 	 * go the old way and just read things in bit by bit.
613 	 */
614 
615 	*with_mmap = 0;
616 	off = 0;
617 	retval = 0;
618 	fb->sz = 0;
619 	fb->buf = NULL;
620 	for (;;) {
621 		if (off == fb->sz) {
622 			if (fb->sz == (1U << 31)) {
623 				mandoc_msg(MANDOCERR_TOOLARGE, curp,
624 				    0, 0, NULL);
625 				break;
626 			}
627 			resize_buf(fb, 65536);
628 		}
629 		ssz = curp->gzip ?
630 		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
631 		    read(fd, fb->buf + (int)off, fb->sz - off);
632 		if (ssz == 0) {
633 			fb->sz = off;
634 			retval = 1;
635 			break;
636 		}
637 		if (ssz == -1) {
638 			if (curp->gzip)
639 				(void)gzerror(gz, &gzerrnum);
640 			mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0, "read: %s",
641 			    curp->gzip && gzerrnum != Z_ERRNO ?
642 			    zError(gzerrnum) : strerror(errno));
643 			break;
644 		}
645 		off += (size_t)ssz;
646 	}
647 
648 	if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
649 		mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0, "gzclose: %s",
650 		    gzerrnum == Z_ERRNO ? strerror(errno) :
651 		    zError(gzerrnum));
652 	if (retval == 0) {
653 		free(fb->buf);
654 		fb->buf = NULL;
655 	}
656 	return retval;
657 }
658 
659 static void
660 mparse_end(struct mparse *curp)
661 {
662 	if (curp->man->macroset == MACROSET_NONE)
663 		curp->man->macroset = MACROSET_MAN;
664 	if (curp->man->macroset == MACROSET_MDOC)
665 		mdoc_endparse(curp->man);
666 	else
667 		man_endparse(curp->man);
668 	roff_endparse(curp->roff);
669 }
670 
671 static void
672 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
673 {
674 	struct buf	*svprimary;
675 	const char	*svfile;
676 	size_t		 offset;
677 	static int	 recursion_depth;
678 
679 	if (64 < recursion_depth) {
680 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
681 		return;
682 	}
683 
684 	/* Line number is per-file. */
685 	svfile = curp->file;
686 	curp->file = file;
687 	svprimary = curp->primary;
688 	curp->primary = &blk;
689 	curp->line = 1;
690 	recursion_depth++;
691 
692 	/* Skip an UTF-8 byte order mark. */
693 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
694 	    (unsigned char)blk.buf[0] == 0xef &&
695 	    (unsigned char)blk.buf[1] == 0xbb &&
696 	    (unsigned char)blk.buf[2] == 0xbf) {
697 		offset = 3;
698 		curp->filenc &= ~MPARSE_LATIN1;
699 	} else
700 		offset = 0;
701 
702 	mparse_buf_r(curp, blk, offset, 1);
703 
704 	if (--recursion_depth == 0)
705 		mparse_end(curp);
706 
707 	curp->primary = svprimary;
708 	curp->file = svfile;
709 }
710 
711 enum mandoclevel
712 mparse_readmem(struct mparse *curp, void *buf, size_t len,
713 		const char *file)
714 {
715 	struct buf blk;
716 
717 	blk.buf = buf;
718 	blk.sz = len;
719 
720 	mparse_parse_buffer(curp, blk, file);
721 	return curp->file_status;
722 }
723 
724 /*
725  * Read the whole file into memory and call the parsers.
726  * Called recursively when an .so request is encountered.
727  */
728 enum mandoclevel
729 mparse_readfd(struct mparse *curp, int fd, const char *file)
730 {
731 	struct buf	 blk;
732 	int		 with_mmap;
733 	int		 save_filenc;
734 
735 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
736 		save_filenc = curp->filenc;
737 		curp->filenc = curp->options &
738 		    (MPARSE_UTF8 | MPARSE_LATIN1);
739 		mparse_parse_buffer(curp, blk, file);
740 		curp->filenc = save_filenc;
741 		if (with_mmap)
742 			munmap(blk.buf, blk.sz);
743 		else
744 			free(blk.buf);
745 	}
746 	return curp->file_status;
747 }
748 
749 int
750 mparse_open(struct mparse *curp, const char *file)
751 {
752 	char		 *cp;
753 	int		  fd;
754 
755 	curp->file = file;
756 	cp = strrchr(file, '.');
757 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
758 
759 	/* First try to use the filename as it is. */
760 
761 	if ((fd = open(file, O_RDONLY)) != -1)
762 		return fd;
763 
764 	/*
765 	 * If that doesn't work and the filename doesn't
766 	 * already  end in .gz, try appending .gz.
767 	 */
768 
769 	if ( ! curp->gzip) {
770 		mandoc_asprintf(&cp, "%s.gz", file);
771 		fd = open(cp, O_RDONLY);
772 		free(cp);
773 		if (fd != -1) {
774 			curp->gzip = 1;
775 			return fd;
776 		}
777 	}
778 
779 	/* Neither worked, give up. */
780 
781 	mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
782 	return -1;
783 }
784 
785 struct mparse *
786 mparse_alloc(int options, enum mandocerr mmin, mandocmsg mmsg,
787     enum mandoc_os os_e, const char *os_s)
788 {
789 	struct mparse	*curp;
790 
791 	curp = mandoc_calloc(1, sizeof(struct mparse));
792 
793 	curp->options = options;
794 	curp->mmin = mmin;
795 	curp->mmsg = mmsg;
796 	curp->os_s = os_s;
797 
798 	curp->roff = roff_alloc(curp, options);
799 	curp->man = roff_man_alloc(curp->roff, curp, curp->os_s,
800 		curp->options & MPARSE_QUICK ? 1 : 0);
801 	if (curp->options & MPARSE_MDOC) {
802 		curp->man->macroset = MACROSET_MDOC;
803 		if (curp->man->mdocmac == NULL)
804 			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
805 	} else if (curp->options & MPARSE_MAN) {
806 		curp->man->macroset = MACROSET_MAN;
807 		if (curp->man->manmac == NULL)
808 			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
809 	}
810 	curp->man->first->tok = TOKEN_NONE;
811 	curp->man->meta.os_e = os_e;
812 	return curp;
813 }
814 
815 void
816 mparse_reset(struct mparse *curp)
817 {
818 	roff_reset(curp->roff);
819 	roff_man_reset(curp->man);
820 
821 	free(curp->sodest);
822 	curp->sodest = NULL;
823 
824 	if (curp->secondary)
825 		curp->secondary->sz = 0;
826 
827 	curp->file_status = MANDOCLEVEL_OK;
828 	curp->gzip = 0;
829 }
830 
831 void
832 mparse_free(struct mparse *curp)
833 {
834 
835 	roffhash_free(curp->man->mdocmac);
836 	roffhash_free(curp->man->manmac);
837 	roff_man_free(curp->man);
838 	roff_free(curp->roff);
839 	if (curp->secondary)
840 		free(curp->secondary->buf);
841 
842 	free(curp->secondary);
843 	free(curp->sodest);
844 	free(curp);
845 }
846 
847 void
848 mparse_result(struct mparse *curp, struct roff_man **man,
849 	char **sodest)
850 {
851 
852 	if (sodest && NULL != (*sodest = curp->sodest)) {
853 		*man = NULL;
854 		return;
855 	}
856 	if (man)
857 		*man = curp->man;
858 }
859 
860 void
861 mparse_updaterc(struct mparse *curp, enum mandoclevel *rc)
862 {
863 	if (curp->file_status > *rc)
864 		*rc = curp->file_status;
865 }
866 
867 void
868 mandoc_vmsg(enum mandocerr t, struct mparse *m,
869 		int ln, int pos, const char *fmt, ...)
870 {
871 	char		 buf[256];
872 	va_list		 ap;
873 
874 	va_start(ap, fmt);
875 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
876 	va_end(ap);
877 
878 	mandoc_msg(t, m, ln, pos, buf);
879 }
880 
881 void
882 mandoc_msg(enum mandocerr er, struct mparse *m,
883 		int ln, int col, const char *msg)
884 {
885 	enum mandoclevel level;
886 
887 	if (er < m->mmin && er != MANDOCERR_FILE)
888 		return;
889 
890 	level = MANDOCLEVEL_UNSUPP;
891 	while (er < mandoclimits[level])
892 		level--;
893 
894 	if (m->mmsg)
895 		(*m->mmsg)(er, level, m->file, ln, col, msg);
896 
897 	if (m->file_status < level)
898 		m->file_status = level;
899 }
900 
901 const char *
902 mparse_strerror(enum mandocerr er)
903 {
904 
905 	return mandocerrs[er];
906 }
907 
908 const char *
909 mparse_strlevel(enum mandoclevel lvl)
910 {
911 	return mandoclevels[lvl];
912 }
913 
914 void
915 mparse_keep(struct mparse *p)
916 {
917 
918 	assert(NULL == p->secondary);
919 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
920 }
921 
922 const char *
923 mparse_getkeep(const struct mparse *p)
924 {
925 
926 	assert(p->secondary);
927 	return p->secondary->sz ? p->secondary->buf : NULL;
928 }
929