xref: /illumos-gate/usr/src/cmd/mandoc/read.c (revision ff31d5bfa079d4db9f78f481637d7ed9f9fa4a49)
1 /*	$Id: read.c,v 1.150.2.5 2017/01/09 02:25:53 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include "config.h"
20 
21 #include <sys/types.h>
22 #if HAVE_MMAP
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #endif
26 
27 #include <assert.h>
28 #include <ctype.h>
29 #if HAVE_ERR
30 #include <err.h>
31 #endif
32 #include <errno.h>
33 #include <fcntl.h>
34 #include <stdarg.h>
35 #include <stdint.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <zlib.h>
41 
42 #include "mandoc_aux.h"
43 #include "mandoc.h"
44 #include "roff.h"
45 #include "mdoc.h"
46 #include "man.h"
47 #include "libmandoc.h"
48 #include "roff_int.h"
49 
50 #define	REPARSE_LIMIT	1000
51 
/*
 * State of one parser handle.
 * Allocated by mparse_alloc(), recycled between input files by
 * mparse_reset(), and released by mparse_free().
 */
struct	mparse {
	struct roff_man	 *man; /* man parser */
	struct roff	 *roff; /* roff parser (!NULL) */
	char		 *sodest; /* filename pointed to by .so; owned here,
				     freed by mparse_reset()/mparse_free() */
	const char	 *file; /* filename of current input file */
	struct buf	 *primary; /* buffer currently being parsed */
	struct buf	 *secondary; /* preprocessed copy of input;
					NULL unless mparse_keep() was called */
	const char	 *defos; /* default operating system */
	mandocmsg	  mmsg; /* warning/error message handler, may be NULL */
	enum mandoclevel  file_status; /* status of current parse: the highest
					  level reported through mandoc_msg() */
	enum mandoclevel  wlevel; /* ignore messages below this */
	int		  options; /* parser options */
	int		  gzip; /* current input file is gzipped */
	int		  filenc; /* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line; /* line number in the file */
};
69 
/* Forward declarations of the file-local helpers. */
static void	resize_buf(struct buf *buf, size_t initial);
static void	choose_parser(struct mparse *curp);
static int	read_whole_file(struct mparse *curp, const char *file,
			int fd, struct buf *fb, int *with_mmap);
static void	mparse_buf_r(struct mparse *curp, struct buf blk,
			size_t i, int start);
static void	mparse_parse_buffer(struct mparse *curp, struct buf blk,
			const char *file);
static void	mparse_end(struct mparse *curp);
78 
/*
 * Lower bound of the error code range for each message level,
 * indexed by enum mandoclevel (same indexing as mandoclevels[]).
 * mandoc_msg() starts at MANDOCLEVEL_UNSUPP and steps down while
 * the error code is below the entry of the current level.
 */
static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
	MANDOCERR_OK,		/* index 0 ("SUCCESS") */
	MANDOCERR_WARNING,	/* index 1 ("RESERVED"): same bound as index 2,
				   so the downward search never stops here */
	MANDOCERR_WARNING,	/* index 2 ("WARNING") */
	MANDOCERR_ERROR,	/* index 3 ("ERROR") */
	MANDOCERR_UNSUPP,	/* index 4 ("UNSUPP"): where the search starts */
	MANDOCERR_MAX,		/* index 5 ("BADARG"): above the search start */
	MANDOCERR_MAX		/* index 6 ("SYSERR"): above the search start */
};
88 
/*
 * Message text for each error code, indexed by enum mandocerr and
 * returned by mparse_strerror().  The order of the entries therefore
 * has to follow the order of the enum constants.
 * NOTE(review): "generic warning", "generic error" and "unsupported
 * feature" presumably sit at MANDOCERR_WARNING, MANDOCERR_ERROR and
 * MANDOCERR_UNSUPP, the range bounds used in mandoclimits[] --
 * confirm against the enum declaration.
 */
static	const char * const	mandocerrs[MANDOCERR_MAX] = {
	"ok",

	"generic warning",

	/* related to the prologue */
	"missing manual title, using UNTITLED",
	"missing manual title, using \"\"",
	"lower case character in document title",
	"missing manual section, using \"\"",
	"unknown manual section",
	"missing date, using today's date",
	"cannot parse date, using it verbatim",
	"missing Os macro, using \"\"",
	"duplicate prologue macro",
	"late prologue macro",
	"skipping late title macro",
	"prologue macros out of order",

	/* related to document structure */
	".so is fragile, better use ln(1)",
	"no document body",
	"content before first section header",
	"first section is not \"NAME\"",
	"NAME section without Nm before Nd",
	"NAME section without description",
	"description not at the end of NAME",
	"bad NAME section content",
	"missing comma before name",
	"missing description line, using \"\"",
	"sections out of conventional order",
	"duplicate section title",
	"unexpected section",
	"unusual Xr order",
	"unusual Xr punctuation",
	"AUTHORS section without An macro",

	/* related to macros and nesting */
	"obsolete macro",
	"macro neither callable nor escaped",
	"skipping paragraph macro",
	"moving paragraph macro out of list",
	"skipping no-space macro",
	"blocks badly nested",
	"nested displays are not portable",
	"moving content out of list",
	"fill mode already enabled, skipping",
	"fill mode already disabled, skipping",
	"line scope broken",

	/* related to missing macro arguments */
	"skipping empty request",
	"conditional request controls empty scope",
	"skipping empty macro",
	"empty block",
	"empty argument, using 0n",
	"missing display type, using -ragged",
	"list type is not the first argument",
	"missing -width in -tag list, using 6n",
	"missing utility name, using \"\"",
	"missing function name, using \"\"",
	"empty head in list item",
	"empty list item",
	"missing font type, using \\fR",
	"unknown font type, using \\fR",
	"nothing follows prefix",
	"empty reference block",
	"missing section argument",
	"missing -std argument, adding it",
	"missing option string, using \"\"",
	"missing resource identifier, using \"\"",
	"missing eqn box, using \"\"",

	/* related to bad macro arguments */
	"unterminated quoted argument",
	"duplicate argument",
	"skipping duplicate argument",
	"skipping duplicate display type",
	"skipping duplicate list type",
	"skipping -width argument",
	"wrong number of cells",
	"unknown AT&T UNIX version",
	"comma in function argument",
	"parenthesis in function name",
	"invalid content in Rs block",
	"invalid Boolean argument",
	"unknown font, skipping request",
	"odd number of characters in request",

	/* related to plain text */
	"blank line in fill mode, using .sp",
	"tab in filled text",
	"whitespace at end of input line",
	"bad comment style",
	"invalid escape sequence",
	"undefined string, using \"\"",

	/* related to tables */
	"tbl line starts with span",
	"tbl column starts with span",
	"skipping vertical bar in tbl layout",

	"generic error",

	/* related to tables */
	"non-alphabetic character in tbl options",
	"skipping unknown tbl option",
	"missing tbl option argument",
	"wrong tbl option argument size",
	"empty tbl layout",
	"invalid character in tbl layout",
	"unmatched parenthesis in tbl layout",
	"tbl without any data cells",
	"ignoring data in spanned tbl cell",
	"ignoring extra tbl data cells",
	"data block open at end of tbl",

	/* related to document structure and macros */
	NULL,	/*
		 * No fixed text for this code.
		 * NOTE(review): presumably MANDOCERR_FILE, for which
		 * mparse_open() passes strerror(errno) as the message --
		 * confirm against the enum declaration.
		 */
	"input stack limit exceeded, infinite loop?",
	"skipping bad character",
	"skipping unknown macro",
	"skipping insecure request",
	"skipping item outside list",
	"skipping column outside column list",
	"skipping end of block that is not open",
	"fewer RS blocks open, skipping",
	"inserting missing end of block",
	"appending missing end of block",

	/* related to request and macro arguments */
	"escaped character not allowed in a name",
	"NOT IMPLEMENTED: Bd -file",
	"skipping display without arguments",
	"missing list type, using -item",
	"missing manual name, using \"\"",
	"uname(3) system call failed, using UNKNOWN",
	"unknown standard specifier",
	"skipping request without numeric argument",
	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
	".so request failed",
	"skipping all arguments",
	"skipping excess arguments",
	"divide by zero",

	"unsupported feature",
	"input too large",
	"unsupported control character",
	"unsupported roff request",
	"eqn delim option in tbl",
	"unsupported tbl layout modifier",
	"ignoring macro in table",
};
242 
/*
 * Printable name of each message level, indexed by enum mandoclevel
 * and returned by mparse_strlevel().
 */
static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
	"SUCCESS",
	"RESERVED",
	"WARNING",
	"ERROR",
	"UNSUPP",
	"BADARG",
	"SYSERR"
};
252 
253 
254 static void
255 resize_buf(struct buf *buf, size_t initial)
256 {
257 
258 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
259 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
260 }
261 
262 static void
263 choose_parser(struct mparse *curp)
264 {
265 	char		*cp, *ep;
266 	int		 format;
267 
268 	/*
269 	 * If neither command line arguments -mdoc or -man select
270 	 * a parser nor the roff parser found a .Dd or .TH macro
271 	 * yet, look ahead in the main input buffer.
272 	 */
273 
274 	if ((format = roff_getformat(curp->roff)) == 0) {
275 		cp = curp->primary->buf;
276 		ep = cp + curp->primary->sz;
277 		while (cp < ep) {
278 			if (*cp == '.' || *cp == '\'') {
279 				cp++;
280 				if (cp[0] == 'D' && cp[1] == 'd') {
281 					format = MPARSE_MDOC;
282 					break;
283 				}
284 				if (cp[0] == 'T' && cp[1] == 'H') {
285 					format = MPARSE_MAN;
286 					break;
287 				}
288 			}
289 			cp = memchr(cp, '\n', ep - cp);
290 			if (cp == NULL)
291 				break;
292 			cp++;
293 		}
294 	}
295 
296 	if (format == MPARSE_MDOC) {
297 		mdoc_hash_init();
298 		curp->man->macroset = MACROSET_MDOC;
299 		curp->man->first->tok = TOKEN_NONE;
300 	} else {
301 		man_hash_init();
302 		curp->man->macroset = MACROSET_MAN;
303 		curp->man->first->tok = TOKEN_NONE;
304 	}
305 }
306 
307 /*
308  * Main parse routine for a buffer.
309  * It assumes encoding and line numbering are already set up.
310  * It can recurse directly (for invocations of user-defined
311  * macros, inline equations, and input line traps)
312  * and indirectly (for .so file inclusion).
313  */
314 static void
315 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
316 {
317 	const struct tbl_span	*span;
318 	struct buf	 ln;
319 	const char	*save_file;
320 	char		*cp;
321 	size_t		 pos; /* byte number in the ln buffer */
322 	size_t		 j;  /* auxiliary byte number in the blk buffer */
323 	enum rofferr	 rr;
324 	int		 of;
325 	int		 lnn; /* line number in the real file */
326 	int		 fd;
327 	unsigned char	 c;
328 
329 	memset(&ln, 0, sizeof(ln));
330 
331 	lnn = curp->line;
332 	pos = 0;
333 
334 	while (i < blk.sz) {
335 		if (0 == pos && '\0' == blk.buf[i])
336 			break;
337 
338 		if (start) {
339 			curp->line = lnn;
340 			curp->reparse_count = 0;
341 
342 			if (lnn < 3 &&
343 			    curp->filenc & MPARSE_UTF8 &&
344 			    curp->filenc & MPARSE_LATIN1)
345 				curp->filenc = preconv_cue(&blk, i);
346 		}
347 
348 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
349 
350 			/*
351 			 * When finding an unescaped newline character,
352 			 * leave the character loop to process the line.
353 			 * Skip a preceding carriage return, if any.
354 			 */
355 
356 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
357 			    '\n' == blk.buf[i + 1])
358 				++i;
359 			if ('\n' == blk.buf[i]) {
360 				++i;
361 				++lnn;
362 				break;
363 			}
364 
365 			/*
366 			 * Make sure we have space for the worst
367 			 * case of 11 bytes: "\\[u10ffff]\0"
368 			 */
369 
370 			if (pos + 11 > ln.sz)
371 				resize_buf(&ln, 256);
372 
373 			/*
374 			 * Encode 8-bit input.
375 			 */
376 
377 			c = blk.buf[i];
378 			if (c & 0x80) {
379 				if ( ! (curp->filenc && preconv_encode(
380 				    &blk, &i, &ln, &pos, &curp->filenc))) {
381 					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
382 					    curp->line, pos, "0x%x", c);
383 					ln.buf[pos++] = '?';
384 					i++;
385 				}
386 				continue;
387 			}
388 
389 			/*
390 			 * Exclude control characters.
391 			 */
392 
393 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
394 				mandoc_vmsg(c == 0x00 || c == 0x04 ||
395 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
396 				    MANDOCERR_CHAR_UNSUPP,
397 				    curp, curp->line, pos, "0x%x", c);
398 				i++;
399 				if (c != '\r')
400 					ln.buf[pos++] = '?';
401 				continue;
402 			}
403 
404 			/* Trailing backslash = a plain char. */
405 
406 			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
407 				ln.buf[pos++] = blk.buf[i++];
408 				continue;
409 			}
410 
411 			/*
412 			 * Found escape and at least one other character.
413 			 * When it's a newline character, skip it.
414 			 * When there is a carriage return in between,
415 			 * skip that one as well.
416 			 */
417 
418 			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
419 			    '\n' == blk.buf[i + 2])
420 				++i;
421 			if ('\n' == blk.buf[i + 1]) {
422 				i += 2;
423 				++lnn;
424 				continue;
425 			}
426 
427 			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
428 				j = i;
429 				i += 2;
430 				/* Comment, skip to end of line */
431 				for (; i < blk.sz; ++i) {
432 					if (blk.buf[i] != '\n')
433 						continue;
434 					if (blk.buf[i - 1] == ' ' ||
435 					    blk.buf[i - 1] == '\t')
436 						mandoc_msg(
437 						    MANDOCERR_SPACE_EOL,
438 						    curp, curp->line,
439 						    pos + i-1 - j, NULL);
440 					++i;
441 					++lnn;
442 					break;
443 				}
444 
445 				/* Backout trailing whitespaces */
446 				for (; pos > 0; --pos) {
447 					if (ln.buf[pos - 1] != ' ')
448 						break;
449 					if (pos > 2 && ln.buf[pos - 2] == '\\')
450 						break;
451 				}
452 				break;
453 			}
454 
455 			/* Catch escaped bogus characters. */
456 
457 			c = (unsigned char) blk.buf[i+1];
458 
459 			if ( ! (isascii(c) &&
460 			    (isgraph(c) || isblank(c)))) {
461 				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
462 				    curp->line, pos, "0x%x", c);
463 				i += 2;
464 				ln.buf[pos++] = '?';
465 				continue;
466 			}
467 
468 			/* Some other escape sequence, copy & cont. */
469 
470 			ln.buf[pos++] = blk.buf[i++];
471 			ln.buf[pos++] = blk.buf[i++];
472 		}
473 
474 		if (pos >= ln.sz)
475 			resize_buf(&ln, 256);
476 
477 		ln.buf[pos] = '\0';
478 
479 		/*
480 		 * A significant amount of complexity is contained by
481 		 * the roff preprocessor.  It's line-oriented but can be
482 		 * expressed on one line, so we need at times to
483 		 * readjust our starting point and re-run it.  The roff
484 		 * preprocessor can also readjust the buffers with new
485 		 * data, so we pass them in wholesale.
486 		 */
487 
488 		of = 0;
489 
490 		/*
491 		 * Maintain a lookaside buffer of all parsed lines.  We
492 		 * only do this if mparse_keep() has been invoked (the
493 		 * buffer may be accessed with mparse_getkeep()).
494 		 */
495 
496 		if (curp->secondary) {
497 			curp->secondary->buf = mandoc_realloc(
498 			    curp->secondary->buf,
499 			    curp->secondary->sz + pos + 2);
500 			memcpy(curp->secondary->buf +
501 			    curp->secondary->sz,
502 			    ln.buf, pos);
503 			curp->secondary->sz += pos;
504 			curp->secondary->buf
505 				[curp->secondary->sz] = '\n';
506 			curp->secondary->sz++;
507 			curp->secondary->buf
508 				[curp->secondary->sz] = '\0';
509 		}
510 rerun:
511 		rr = roff_parseln(curp->roff, curp->line, &ln, &of);
512 
513 		switch (rr) {
514 		case ROFF_REPARSE:
515 			if (REPARSE_LIMIT >= ++curp->reparse_count)
516 				mparse_buf_r(curp, ln, of, 0);
517 			else
518 				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
519 				    curp->line, pos, NULL);
520 			pos = 0;
521 			continue;
522 		case ROFF_APPEND:
523 			pos = strlen(ln.buf);
524 			continue;
525 		case ROFF_RERUN:
526 			goto rerun;
527 		case ROFF_IGN:
528 			pos = 0;
529 			continue;
530 		case ROFF_SO:
531 			if ( ! (curp->options & MPARSE_SO) &&
532 			    (i >= blk.sz || blk.buf[i] == '\0')) {
533 				curp->sodest = mandoc_strdup(ln.buf + of);
534 				free(ln.buf);
535 				return;
536 			}
537 			/*
538 			 * We remove `so' clauses from our lookaside
539 			 * buffer because we're going to descend into
540 			 * the file recursively.
541 			 */
542 			if (curp->secondary)
543 				curp->secondary->sz -= pos + 1;
544 			save_file = curp->file;
545 			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
546 				mparse_readfd(curp, fd, ln.buf + of);
547 				close(fd);
548 				curp->file = save_file;
549 			} else {
550 				curp->file = save_file;
551 				mandoc_vmsg(MANDOCERR_SO_FAIL,
552 				    curp, curp->line, pos,
553 				    ".so %s", ln.buf + of);
554 				ln.sz = mandoc_asprintf(&cp,
555 				    ".sp\nSee the file %s.\n.sp",
556 				    ln.buf + of);
557 				free(ln.buf);
558 				ln.buf = cp;
559 				of = 0;
560 				mparse_buf_r(curp, ln, of, 0);
561 			}
562 			pos = 0;
563 			continue;
564 		default:
565 			break;
566 		}
567 
568 		if (curp->man->macroset == MACROSET_NONE)
569 			choose_parser(curp);
570 
571 		/*
572 		 * Lastly, push down into the parsers themselves.
573 		 * If libroff returns ROFF_TBL, then add it to the
574 		 * currently open parse.  Since we only get here if
575 		 * there does exist data (see tbl_data.c), we're
576 		 * guaranteed that something's been allocated.
577 		 * Do the same for ROFF_EQN.
578 		 */
579 
580 		if (rr == ROFF_TBL)
581 			while ((span = roff_span(curp->roff)) != NULL)
582 				roff_addtbl(curp->man, span);
583 		else if (rr == ROFF_EQN)
584 			roff_addeqn(curp->man, roff_eqn(curp->roff));
585 		else if ((curp->man->macroset == MACROSET_MDOC ?
586 		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
587 		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
588 				break;
589 
590 		/* Temporary buffers typically are not full. */
591 
592 		if (0 == start && '\0' == blk.buf[i])
593 			break;
594 
595 		/* Start the next input line. */
596 
597 		pos = 0;
598 	}
599 
600 	free(ln.buf);
601 }
602 
/*
 * Load the complete content of the open descriptor fd into *fb.
 *
 * curp:      parser state; curp->gzip selects decompression.
 * file:      file name, used in fatal error messages only.
 * fd:        descriptor to read from; it is left open for the caller.
 * fb:        receives the data and its size.
 * with_mmap: set to 1 if fb->buf is a mapping that has to be released
 *            with munmap(), to 0 if it has to be released with free().
 *
 * Returns 1 on success and 0 if the input is too large, in which
 * case MANDOCERR_TOOLARGE has been reported.  System call failures
 * terminate the program through err().
 */
static int
read_whole_file(struct mparse *curp, const char *file, int fd,
		struct buf *fb, int *with_mmap)
{
	gzFile		 gz;
	size_t		 off;
	ssize_t		 ssz;
	int		 gzfd;
	int		 retval;

#if HAVE_MMAP
	struct stat	 st;

	if (fstat(fd, &st) == -1)
		err((int)MANDOCLEVEL_SYSERR, "%s", file);

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
		if (st.st_size > 0x7fffffff) {
			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
			return 0;
		}
		*with_mmap = 1;
		fb->sz = (size_t)st.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return 1;
		/* Mapping failed: fall back to reading below. */
	}
#endif

	gz = NULL;
	if (curp->gzip) {
		/*
		 * The gzFile has to be closed with gzclose() to release
		 * the memory zlib allocates for it, but gzclose() also
		 * closes the descriptor it was opened on.  Work on a
		 * duplicate, such that fd stays open for the caller.
		 */
		if ((gzfd = dup(fd)) == -1)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
		if ((gz = gzdopen(gzfd, "rb")) == NULL)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
	}

	/*
	 * If this isn't a regular file (like, say, stdin), then we must
	 * go the old way and just read things in bit by bit.
	 */

	*with_mmap = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;
	retval = 0;
	for (;;) {
		if (off == fb->sz) {
			/* Buffer full: grow it, up to a limit of 2 GiB. */
			if (fb->sz == (1U << 31)) {
				mandoc_msg(MANDOCERR_TOOLARGE, curp,
				    0, 0, NULL);
				break;
			}
			resize_buf(fb, 65536);
		}
		ssz = curp->gzip ?
		    gzread(gz, fb->buf + off, fb->sz - off) :
		    read(fd, fb->buf + off, fb->sz - off);
		if (ssz == 0) {
			/* End of input: report the number of bytes read. */
			fb->sz = off;
			retval = 1;
			break;
		}
		if (ssz == -1)
			err((int)MANDOCLEVEL_SYSERR, "%s", file);
		off += (size_t)ssz;
	}

	if (gz != NULL)
		(void)gzclose(gz);

	if (retval == 0) {
		free(fb->buf);
		fb->buf = NULL;
	}
	return retval;
}
677 
678 static void
679 mparse_end(struct mparse *curp)
680 {
681 	if (curp->man->macroset == MACROSET_NONE)
682 		curp->man->macroset = MACROSET_MAN;
683 	if (curp->man->macroset == MACROSET_MDOC)
684 		mdoc_endparse(curp->man);
685 	else
686 		man_endparse(curp->man);
687 	roff_endparse(curp->roff);
688 }
689 
690 static void
691 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
692 {
693 	struct buf	*svprimary;
694 	const char	*svfile;
695 	size_t		 offset;
696 	static int	 recursion_depth;
697 
698 	if (64 < recursion_depth) {
699 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
700 		return;
701 	}
702 
703 	/* Line number is per-file. */
704 	svfile = curp->file;
705 	curp->file = file;
706 	svprimary = curp->primary;
707 	curp->primary = &blk;
708 	curp->line = 1;
709 	recursion_depth++;
710 
711 	/* Skip an UTF-8 byte order mark. */
712 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
713 	    (unsigned char)blk.buf[0] == 0xef &&
714 	    (unsigned char)blk.buf[1] == 0xbb &&
715 	    (unsigned char)blk.buf[2] == 0xbf) {
716 		offset = 3;
717 		curp->filenc &= ~MPARSE_LATIN1;
718 	} else
719 		offset = 0;
720 
721 	mparse_buf_r(curp, blk, offset, 1);
722 
723 	if (--recursion_depth == 0)
724 		mparse_end(curp);
725 
726 	curp->primary = svprimary;
727 	curp->file = svfile;
728 }
729 
730 enum mandoclevel
731 mparse_readmem(struct mparse *curp, void *buf, size_t len,
732 		const char *file)
733 {
734 	struct buf blk;
735 
736 	blk.buf = buf;
737 	blk.sz = len;
738 
739 	mparse_parse_buffer(curp, blk, file);
740 	return curp->file_status;
741 }
742 
743 /*
744  * Read the whole file into memory and call the parsers.
745  * Called recursively when an .so request is encountered.
746  */
747 enum mandoclevel
748 mparse_readfd(struct mparse *curp, int fd, const char *file)
749 {
750 	struct buf	 blk;
751 	int		 with_mmap;
752 	int		 save_filenc;
753 
754 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
755 		save_filenc = curp->filenc;
756 		curp->filenc = curp->options &
757 		    (MPARSE_UTF8 | MPARSE_LATIN1);
758 		mparse_parse_buffer(curp, blk, file);
759 		curp->filenc = save_filenc;
760 #if HAVE_MMAP
761 		if (with_mmap)
762 			munmap(blk.buf, blk.sz);
763 		else
764 #endif
765 			free(blk.buf);
766 	}
767 	return curp->file_status;
768 }
769 
770 int
771 mparse_open(struct mparse *curp, const char *file)
772 {
773 	char		 *cp;
774 	int		  fd;
775 
776 	curp->file = file;
777 	cp = strrchr(file, '.');
778 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
779 
780 	/* First try to use the filename as it is. */
781 
782 	if ((fd = open(file, O_RDONLY)) != -1)
783 		return fd;
784 
785 	/*
786 	 * If that doesn't work and the filename doesn't
787 	 * already  end in .gz, try appending .gz.
788 	 */
789 
790 	if ( ! curp->gzip) {
791 		mandoc_asprintf(&cp, "%s.gz", file);
792 		fd = open(cp, O_RDONLY);
793 		free(cp);
794 		if (fd != -1) {
795 			curp->gzip = 1;
796 			return fd;
797 		}
798 	}
799 
800 	/* Neither worked, give up. */
801 
802 	mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
803 	return -1;
804 }
805 
806 struct mparse *
807 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
808     const char *defos)
809 {
810 	struct mparse	*curp;
811 
812 	curp = mandoc_calloc(1, sizeof(struct mparse));
813 
814 	curp->options = options;
815 	curp->wlevel = wlevel;
816 	curp->mmsg = mmsg;
817 	curp->defos = defos;
818 
819 	curp->roff = roff_alloc(curp, options);
820 	curp->man = roff_man_alloc( curp->roff, curp, curp->defos,
821 		curp->options & MPARSE_QUICK ? 1 : 0);
822 	if (curp->options & MPARSE_MDOC) {
823 		mdoc_hash_init();
824 		curp->man->macroset = MACROSET_MDOC;
825 	} else if (curp->options & MPARSE_MAN) {
826 		man_hash_init();
827 		curp->man->macroset = MACROSET_MAN;
828 	}
829 	curp->man->first->tok = TOKEN_NONE;
830 	return curp;
831 }
832 
833 void
834 mparse_reset(struct mparse *curp)
835 {
836 	roff_reset(curp->roff);
837 	roff_man_reset(curp->man);
838 	if (curp->secondary)
839 		curp->secondary->sz = 0;
840 
841 	curp->file_status = MANDOCLEVEL_OK;
842 
843 	free(curp->sodest);
844 	curp->sodest = NULL;
845 }
846 
847 void
848 mparse_free(struct mparse *curp)
849 {
850 
851 	roff_man_free(curp->man);
852 	if (curp->roff)
853 		roff_free(curp->roff);
854 	if (curp->secondary)
855 		free(curp->secondary->buf);
856 
857 	free(curp->secondary);
858 	free(curp->sodest);
859 	free(curp);
860 }
861 
862 void
863 mparse_result(struct mparse *curp, struct roff_man **man,
864 	char **sodest)
865 {
866 
867 	if (sodest && NULL != (*sodest = curp->sodest)) {
868 		*man = NULL;
869 		return;
870 	}
871 	if (man)
872 		*man = curp->man;
873 }
874 
875 void
876 mparse_updaterc(struct mparse *curp, enum mandoclevel *rc)
877 {
878 	if (curp->file_status > *rc)
879 		*rc = curp->file_status;
880 }
881 
882 void
883 mandoc_vmsg(enum mandocerr t, struct mparse *m,
884 		int ln, int pos, const char *fmt, ...)
885 {
886 	char		 buf[256];
887 	va_list		 ap;
888 
889 	va_start(ap, fmt);
890 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
891 	va_end(ap);
892 
893 	mandoc_msg(t, m, ln, pos, buf);
894 }
895 
896 void
897 mandoc_msg(enum mandocerr er, struct mparse *m,
898 		int ln, int col, const char *msg)
899 {
900 	enum mandoclevel level;
901 
902 	level = MANDOCLEVEL_UNSUPP;
903 	while (er < mandoclimits[level])
904 		level--;
905 
906 	if (level < m->wlevel && er != MANDOCERR_FILE)
907 		return;
908 
909 	if (m->mmsg)
910 		(*m->mmsg)(er, level, m->file, ln, col, msg);
911 
912 	if (m->file_status < level)
913 		m->file_status = level;
914 }
915 
916 const char *
917 mparse_strerror(enum mandocerr er)
918 {
919 
920 	return mandocerrs[er];
921 }
922 
923 const char *
924 mparse_strlevel(enum mandoclevel lvl)
925 {
926 	return mandoclevels[lvl];
927 }
928 
929 void
930 mparse_keep(struct mparse *p)
931 {
932 
933 	assert(NULL == p->secondary);
934 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
935 }
936 
937 const char *
938 mparse_getkeep(const struct mparse *p)
939 {
940 
941 	assert(p->secondary);
942 	return p->secondary->sz ? p->secondary->buf : NULL;
943 }
944