xref: /titanic_50/usr/src/cmd/mandoc/read.c (revision 260e9a87725c090ba5835b1f9f0b62fa2f96036f)
1 /*	$Id: read.c,v 1.131 2015/03/11 13:05:20 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2015 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include "config.h"
20 
21 #include <sys/types.h>
22 #if HAVE_MMAP
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #endif
26 #include <sys/wait.h>
27 
28 #include <assert.h>
29 #include <ctype.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <stdarg.h>
33 #include <stdint.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <unistd.h>
38 
39 #include "mandoc.h"
40 #include "mandoc_aux.h"
41 #include "libmandoc.h"
42 #include "mdoc.h"
43 #include "man.h"
44 
/*
 * Maximum number of ROFF_REPARSE rounds allowed for one input line;
 * mparse_buf_r() reports MANDOCERR_ROFFLOOP once the per-line
 * counter curp->reparse_count exceeds this value.
 */
45 #define	REPARSE_LIMIT	1000
46 
/*
 * State of one parser instance.
 * Allocated by mparse_alloc(), reset by mparse_reset() and
 * released by mparse_free().  The "persistent" parsers are kept
 * in pman/pmdoc, while man/mdoc point at whichever of them
 * is in use for the current parse (or are NULL).
 */
47 struct	mparse {
48 	struct man	 *pman; /* persistent man parser */
49 	struct mdoc	 *pmdoc; /* persistent mdoc parser */
50 	struct man	 *man; /* man parser */
51 	struct mdoc	 *mdoc; /* mdoc parser */
52 	struct roff	 *roff; /* roff parser (!NULL) */
53 	const struct mchars *mchars; /* character table */
54 	char		 *sodest; /* filename pointed to by .so */
55 	const char	 *file; /* filename of current input file */
56 	struct buf	 *primary; /* buffer currently being parsed */
57 	struct buf	 *secondary; /* preprocessed copy of input */
58 	const char	 *defos; /* default operating system */
59 	mandocmsg	  mmsg; /* warning/error message handler */
60 	enum mandoclevel  file_status; /* status of current parse */
61 	enum mandoclevel  wlevel; /* ignore messages below this */
62 	int		  options; /* parser options */
63 	int		  filenc; /* encoding of the current file */
64 	int		  reparse_count; /* finite interp. stack */
65 	int		  line; /* line number in the file */
66 	pid_t		  child; /* the gunzip(1) process */
67 };
68 
/* Forward declarations of the file-local helper functions defined below. */
69 static	void	  choose_parser(struct mparse *);
70 static	void	  resize_buf(struct buf *, size_t);
71 static	void	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
72 static	int	  read_whole_file(struct mparse *, const char *, int,
73 				struct buf *, int *);
74 static	void	  mparse_end(struct mparse *);
75 static	void	  mparse_parse_buffer(struct mparse *, struct buf,
76 			const char *);
77 
/*
 * Lower bound of the error codes belonging to each message level,
 * indexed by enum mandoclevel.  mandoc_msg() starts at
 * MANDOCLEVEL_UNSUPP and lowers the level while the error code
 * is smaller than the entry for that level.
 */
78 static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
79 	MANDOCERR_OK,
80 	MANDOCERR_WARNING,
81 	MANDOCERR_WARNING,
82 	MANDOCERR_ERROR,
83 	MANDOCERR_UNSUPP,
84 	MANDOCERR_MAX,
85 	MANDOCERR_MAX
86 };
87 
/*
 * Message text for each error code.
 * mparse_strerror() indexes this table directly with an
 * enum mandocerr value, so the order of the entries has to match
 * the order of the enum constants (declared outside this file).
 * NOTE(review): the single NULL entry presumably belongs to
 * MANDOCERR_FILE, whose text is supplied by the caller - confirm
 * against the enum declaration.
 */
88 static	const char * const	mandocerrs[MANDOCERR_MAX] = {
89 	"ok",
90 
91 	"generic warning",
92 
93 	/* related to the prologue */
94 	"missing manual title, using UNTITLED",
95 	"missing manual title, using \"\"",
96 	"lower case character in document title",
97 	"missing manual section, using \"\"",
98 	"unknown manual section",
99 	"missing date, using today's date",
100 	"cannot parse date, using it verbatim",
101 	"missing Os macro, using \"\"",
102 	"duplicate prologue macro",
103 	"late prologue macro",
104 	"skipping late title macro",
105 	"prologue macros out of order",
106 
107 	/* related to document structure */
108 	".so is fragile, better use ln(1)",
109 	"no document body",
110 	"content before first section header",
111 	"first section is not \"NAME\"",
112 	"NAME section without name",
113 	"NAME section without description",
114 	"description not at the end of NAME",
115 	"bad NAME section content",
116 	"missing description line, using \"\"",
117 	"sections out of conventional order",
118 	"duplicate section title",
119 	"unexpected section",
120 	"unusual Xr order",
121 	"unusual Xr punctuation",
122 	"AUTHORS section without An macro",
123 
124 	/* related to macros and nesting */
125 	"obsolete macro",
126 	"macro neither callable nor escaped",
127 	"skipping paragraph macro",
128 	"moving paragraph macro out of list",
129 	"skipping no-space macro",
130 	"blocks badly nested",
131 	"nested displays are not portable",
132 	"moving content out of list",
133 	".Vt block has child macro",
134 	"fill mode already enabled, skipping",
135 	"fill mode already disabled, skipping",
136 	"line scope broken",
137 
138 	/* related to missing macro arguments */
139 	"skipping empty request",
140 	"conditional request controls empty scope",
141 	"skipping empty macro",
142 	"empty block",
143 	"empty argument, using 0n",
144 	"missing display type, using -ragged",
145 	"list type is not the first argument",
146 	"missing -width in -tag list, using 8n",
147 	"missing utility name, using \"\"",
148 	"missing function name, using \"\"",
149 	"empty head in list item",
150 	"empty list item",
151 	"missing font type, using \\fR",
152 	"unknown font type, using \\fR",
153 	"nothing follows prefix",
154 	"empty reference block",
155 	"missing -std argument, adding it",
156 	"missing option string, using \"\"",
157 	"missing resource identifier, using \"\"",
158 	"missing eqn box, using \"\"",
159 
160 	/* related to bad macro arguments */
161 	"unterminated quoted argument",
162 	"duplicate argument",
163 	"skipping duplicate argument",
164 	"skipping duplicate display type",
165 	"skipping duplicate list type",
166 	"skipping -width argument",
167 	"wrong number of cells",
168 	"unknown AT&T UNIX version",
169 	"comma in function argument",
170 	"parenthesis in function name",
171 	"invalid content in Rs block",
172 	"invalid Boolean argument",
173 	"unknown font, skipping request",
174 	"odd number of characters in request",
175 
176 	/* related to plain text */
177 	"blank line in fill mode, using .sp",
178 	"tab in filled text",
179 	"whitespace at end of input line",
180 	"bad comment style",
181 	"invalid escape sequence",
182 	"undefined string, using \"\"",
183 
184 	/* related to tables */
185 	"tbl line starts with span",
186 	"tbl column starts with span",
187 	"skipping vertical bar in tbl layout",
188 
189 	"generic error",
190 
191 	/* related to tables */
192 	"non-alphabetic character in tbl options",
193 	"skipping unknown tbl option",
194 	"missing tbl option argument",
195 	"wrong tbl option argument size",
196 	"empty tbl layout",
197 	"invalid character in tbl layout",
198 	"unmatched parenthesis in tbl layout",
199 	"tbl without any data cells",
200 	"ignoring data in spanned tbl cell",
201 	"ignoring extra tbl data cells",
202 	"data block open at end of tbl",
203 
204 	/* related to document structure and macros */
205 	NULL,
206 	"input stack limit exceeded, infinite loop?",
207 	"skipping bad character",
208 	"skipping unknown macro",
209 	"skipping insecure request",
210 	"skipping item outside list",
211 	"skipping column outside column list",
212 	"skipping end of block that is not open",
213 	"fewer RS blocks open, skipping",
214 	"inserting missing end of block",
215 	"appending missing end of block",
216 
217 	/* related to request and macro arguments */
218 	"escaped character not allowed in a name",
219 	"NOT IMPLEMENTED: Bd -file",
220 	"missing list type, using -item",
221 	"missing manual name, using \"\"",
222 	"uname(3) system call failed, using UNKNOWN",
223 	"unknown standard specifier",
224 	"skipping request without numeric argument",
225 	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
226 	".so request failed",
227 	"skipping all arguments",
228 	"skipping excess arguments",
229 	"divide by zero",
230 
231 	"unsupported feature",
232 	"input too large",
233 	"unsupported control character",
234 	"unsupported roff request",
235 	"eqn delim option in tbl",
236 	"unsupported tbl layout modifier",
237 	"ignoring macro in table",
238 };
239 
/*
 * Printable name of each message level, indexed by
 * enum mandoclevel and returned by mparse_strlevel().
 */
240 static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
241 	"SUCCESS",
242 	"RESERVED",
243 	"WARNING",
244 	"ERROR",
245 	"UNSUPP",
246 	"BADARG",
247 	"SYSERR"
248 };
249 
250 
251 static void
resize_buf(struct buf * buf,size_t initial)252 resize_buf(struct buf *buf, size_t initial)
253 {
254 
255 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
256 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
257 }
258 
259 static void
choose_parser(struct mparse * curp)260 choose_parser(struct mparse *curp)
261 {
262 	char		*cp, *ep;
263 	int		 format;
264 
265 	/*
266 	 * If neither command line arguments -mdoc or -man select
267 	 * a parser nor the roff parser found a .Dd or .TH macro
268 	 * yet, look ahead in the main input buffer.
269 	 */
270 
271 	if ((format = roff_getformat(curp->roff)) == 0) {
272 		cp = curp->primary->buf;
273 		ep = cp + curp->primary->sz;
274 		while (cp < ep) {
275 			if (*cp == '.' || *cp == '\'') {
276 				cp++;
277 				if (cp[0] == 'D' && cp[1] == 'd') {
278 					format = MPARSE_MDOC;
279 					break;
280 				}
281 				if (cp[0] == 'T' && cp[1] == 'H') {
282 					format = MPARSE_MAN;
283 					break;
284 				}
285 			}
286 			cp = memchr(cp, '\n', ep - cp);
287 			if (cp == NULL)
288 				break;
289 			cp++;
290 		}
291 	}
292 
293 	if (format == MPARSE_MDOC) {
294 		if (NULL == curp->pmdoc)
295 			curp->pmdoc = mdoc_alloc(
296 			    curp->roff, curp, curp->defos,
297 			    MPARSE_QUICK & curp->options ? 1 : 0);
298 		assert(curp->pmdoc);
299 		curp->mdoc = curp->pmdoc;
300 		return;
301 	}
302 
303 	/* Fall back to man(7) as a last resort. */
304 
305 	if (NULL == curp->pman)
306 		curp->pman = man_alloc(
307 		    curp->roff, curp, curp->defos,
308 		    MPARSE_QUICK & curp->options ? 1 : 0);
309 	assert(curp->pman);
310 	curp->man = curp->pman;
311 }
312 
313 /*
314  * Main parse routine for a buffer.
315  * It assumes encoding and line numbering are already set up.
316  * It can recurse directly (for invocations of user-defined
317  * macros, inline equations, and input line traps)
318  * and indirectly (for .so file inclusion).
319  */
/*
 * Parameters:
 *   curp   parse state; curp->line, curp->filenc, curp->reparse_count,
 *          curp->sodest and the secondary buffer may be updated.
 *   blk    input buffer, passed by value; blk.sz bounds the scan.
 *   i      byte offset into blk where parsing starts.
 *   start  non-zero for real file input: line numbers are tracked,
 *          the reparse counter is reset for every line, an encoding
 *          cue is looked for in the first two lines, and NUL bytes
 *          inside a line are treated as bad characters.
 *          Zero for temporary buffers, which end at the first NUL.
 *
 * Each pass of the outer loop assembles one logical input line in
 * the local buffer "ln" (joining lines at escaped newlines, cutting
 * off \" and \# comments, encoding or replacing 8-bit and control
 * bytes), passes it to roff_parseln(), and then acts on the result:
 * recursing, re-running, appending, handling .so, or handing the
 * line to the mdoc or man parser.
 */
320 static void
mparse_buf_r(struct mparse * curp,struct buf blk,size_t i,int start)321 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
322 {
323 	const struct tbl_span	*span;
324 	struct buf	 ln;
325 	const char	*save_file;
326 	char		*cp;
327 	size_t		 pos; /* byte number in the ln buffer */
328 	enum rofferr	 rr;
329 	int		 of;
330 	int		 lnn; /* line number in the real file */
331 	int		 fd;
332 	pid_t		 save_child;
333 	unsigned char	 c;
334 
335 	memset(&ln, 0, sizeof(ln));
336 
337 	lnn = curp->line;
338 	pos = 0;
339 
340 	while (i < blk.sz) {
		/* A NUL byte at the start of a line ends parsing of this buffer. */
341 		if (0 == pos && '\0' == blk.buf[i])
342 			break;
343 
		/* Per-line bookkeeping that only applies to real file input. */
344 		if (start) {
345 			curp->line = lnn;
346 			curp->reparse_count = 0;
347 
348 			if (lnn < 3 &&
349 			    curp->filenc & MPARSE_UTF8 &&
350 			    curp->filenc & MPARSE_LATIN1)
351 				curp->filenc = preconv_cue(&blk, i);
352 		}
353 
354 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
355 
356 			/*
357 			 * When finding an unescaped newline character,
358 			 * leave the character loop to process the line.
359 			 * Skip a preceding carriage return, if any.
360 			 */
361 
362 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
363 			    '\n' == blk.buf[i + 1])
364 				++i;
365 			if ('\n' == blk.buf[i]) {
366 				++i;
367 				++lnn;
368 				break;
369 			}
370 
371 			/*
372 			 * Make sure we have space for the worst
373 			 * case of 11 bytes: "\\[u10ffff]\0"
374 			 */
375 
376 			if (pos + 11 > ln.sz)
377 				resize_buf(&ln, 256);
378 
379 			/*
380 			 * Encode 8-bit input.
381 			 */
382 
383 			c = blk.buf[i];
384 			if (c & 0x80) {
385 				if ( ! (curp->filenc && preconv_encode(
386 				    &blk, &i, &ln, &pos, &curp->filenc))) {
387 					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
388 					    curp->line, pos, "0x%x", c);
389 					ln.buf[pos++] = '?';
390 					i++;
391 				}
392 				continue;
393 			}
394 
395 			/*
396 			 * Exclude control characters.
397 			 */
398 
399 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
400 				mandoc_vmsg(c == 0x00 || c == 0x04 ||
401 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
402 				    MANDOCERR_CHAR_UNSUPP,
403 				    curp, curp->line, pos, "0x%x", c);
404 				i++;
				/* A stray carriage return is dropped, anything else becomes '?'. */
405 				if (c != '\r')
406 					ln.buf[pos++] = '?';
407 				continue;
408 			}
409 
410 			/* Trailing backslash = a plain char. */
411 
412 			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
413 				ln.buf[pos++] = blk.buf[i++];
414 				continue;
415 			}
416 
417 			/*
418 			 * Found escape and at least one other character.
419 			 * When it's a newline character, skip it.
420 			 * When there is a carriage return in between,
421 			 * skip that one as well.
422 			 */
423 
424 			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
425 			    '\n' == blk.buf[i + 2])
426 				++i;
427 			if ('\n' == blk.buf[i + 1]) {
428 				i += 2;
429 				++lnn;
430 				continue;
431 			}
432 
433 			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
434 				i += 2;
435 				/* Comment, skip to end of line */
436 				for (; i < blk.sz; ++i) {
437 					if ('\n' == blk.buf[i]) {
438 						++i;
439 						++lnn;
440 						break;
441 					}
442 				}
443 
444 				/* Backout trailing whitespaces */
445 				for (; pos > 0; --pos) {
446 					if (ln.buf[pos - 1] != ' ')
447 						break;
448 					if (pos > 2 && ln.buf[pos - 2] == '\\')
449 						break;
450 				}
451 				break;
452 			}
453 
454 			/* Catch escaped bogus characters. */
455 
456 			c = (unsigned char) blk.buf[i+1];
457 
458 			if ( ! (isascii(c) &&
459 			    (isgraph(c) || isblank(c)))) {
460 				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
461 				    curp->line, pos, "0x%x", c);
462 				i += 2;
463 				ln.buf[pos++] = '?';
464 				continue;
465 			}
466 
467 			/* Some other escape sequence, copy & cont. */
468 
469 			ln.buf[pos++] = blk.buf[i++];
470 			ln.buf[pos++] = blk.buf[i++];
471 		}
472 
		/* Make room for the terminating NUL, also covering an empty first line. */
473 		if (pos >= ln.sz)
474 			resize_buf(&ln, 256);
475 
476 		ln.buf[pos] = '\0';
477 
478 		/*
479 		 * A significant amount of complexity is contained by
480 		 * the roff preprocessor.  It's line-oriented but can be
481 		 * expressed on one line, so we need at times to
482 		 * readjust our starting point and re-run it.  The roff
483 		 * preprocessor can also readjust the buffers with new
484 		 * data, so we pass them in wholesale.
485 		 */
486 
487 		of = 0;
488 
489 		/*
490 		 * Maintain a lookaside buffer of all parsed lines.  We
491 		 * only do this if mparse_keep() has been invoked (the
492 		 * buffer may be accessed with mparse_getkeep()).
493 		 */
494 
495 		if (curp->secondary) {
496 			curp->secondary->buf = mandoc_realloc(
497 			    curp->secondary->buf,
498 			    curp->secondary->sz + pos + 2);
499 			memcpy(curp->secondary->buf +
500 			    curp->secondary->sz,
501 			    ln.buf, pos);
502 			curp->secondary->sz += pos;
503 			curp->secondary->buf
504 				[curp->secondary->sz] = '\n';
505 			curp->secondary->sz++;
506 			curp->secondary->buf
507 				[curp->secondary->sz] = '\0';
508 		}
509 rerun:
510 		rr = roff_parseln(curp->roff, curp->line, &ln, &of);
511 
512 		switch (rr) {
513 		case ROFF_REPARSE:
			/* Parse the rewritten line recursively unless the per-line limit is exceeded. */
514 			if (REPARSE_LIMIT >= ++curp->reparse_count)
515 				mparse_buf_r(curp, ln, of, 0);
516 			else
517 				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
518 				    curp->line, pos, NULL);
519 			pos = 0;
520 			continue;
521 		case ROFF_APPEND:
			/* Keep the current content; the next input is appended after it. */
522 			pos = strlen(ln.buf);
523 			continue;
524 		case ROFF_RERUN:
525 			goto rerun;
526 		case ROFF_IGN:
527 			pos = 0;
528 			continue;
529 		case ROFF_SO:
			/*
			 * Without MPARSE_SO, a .so request that ends the
			 * buffer is only recorded in curp->sodest rather
			 * than being followed.
			 */
530 			if ( ! (curp->options & MPARSE_SO) &&
531 			    (i >= blk.sz || blk.buf[i] == '\0')) {
532 				curp->sodest = mandoc_strdup(ln.buf + of);
533 				free(ln.buf);
534 				return;
535 			}
536 			/*
537 			 * We remove `so' clauses from our lookaside
538 			 * buffer because we're going to descend into
539 			 * the file recursively.
540 			 */
541 			if (curp->secondary)
542 				curp->secondary->sz -= pos + 1;
			/* mparse_open() overwrites curp->file and curp->child; restore them afterwards. */
543 			save_file = curp->file;
544 			save_child = curp->child;
545 			if (mparse_open(curp, &fd, ln.buf + of) ==
546 			    MANDOCLEVEL_OK) {
547 				mparse_readfd(curp, fd, ln.buf + of);
548 				curp->file = save_file;
549 			} else {
				/* On failure, parse a short text naming the file in place of its content. */
550 				curp->file = save_file;
551 				mandoc_vmsg(MANDOCERR_SO_FAIL,
552 				    curp, curp->line, pos,
553 				    ".so %s", ln.buf + of);
554 				ln.sz = mandoc_asprintf(&cp,
555 				    ".sp\nSee the file %s.\n.sp",
556 				    ln.buf + of);
557 				free(ln.buf);
558 				ln.buf = cp;
559 				of = 0;
560 				mparse_buf_r(curp, ln, of, 0);
561 			}
562 			curp->child = save_child;
563 			pos = 0;
564 			continue;
565 		default:
566 			break;
567 		}
568 
569 		/*
570 		 * If input parsers have not been allocated, do so now.
571 		 * We keep these instanced between parsers, but set them
572 		 * locally per parse routine since we can use different
573 		 * parsers with each one.
574 		 */
575 
576 		if ( ! (curp->man || curp->mdoc))
577 			choose_parser(curp);
578 
579 		/*
580 		 * Lastly, push down into the parsers themselves.
581 		 * If libroff returns ROFF_TBL, then add it to the
582 		 * currently open parse.  Since we only get here if
583 		 * there does exist data (see tbl_data.c), we're
584 		 * guaranteed that something's been allocated.
585 		 * Do the same for ROFF_EQN.
586 		 */
587 
		/*
		 * NOTE(review): a return value of 2 from the line parsers
		 * ends the loop; presumably it signals that parsing should
		 * stop - confirm in mdoc_parseln() and man_parseln().
		 */
588 		if (rr == ROFF_TBL) {
589 			while ((span = roff_span(curp->roff)) != NULL)
590 				if (curp->man == NULL)
591 					mdoc_addspan(curp->mdoc, span);
592 				else
593 					man_addspan(curp->man, span);
594 		} else if (rr == ROFF_EQN) {
595 			if (curp->man == NULL)
596 				mdoc_addeqn(curp->mdoc, roff_eqn(curp->roff));
597 			else
598 				man_addeqn(curp->man, roff_eqn(curp->roff));
599 		} else if ((curp->man == NULL ?
600 		    mdoc_parseln(curp->mdoc, curp->line, ln.buf, of) :
601 		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
602 				break;
603 
604 		/* Temporary buffers typically are not full. */
605 
606 		if (0 == start && '\0' == blk.buf[i])
607 			break;
608 
609 		/* Start the next input line. */
610 
611 		pos = 0;
612 	}
613 
614 	free(ln.buf);
615 }
616 
617 static int
read_whole_file(struct mparse * curp,const char * file,int fd,struct buf * fb,int * with_mmap)618 read_whole_file(struct mparse *curp, const char *file, int fd,
619 		struct buf *fb, int *with_mmap)
620 {
621 	size_t		 off;
622 	ssize_t		 ssz;
623 
624 #if HAVE_MMAP
625 	struct stat	 st;
626 	if (-1 == fstat(fd, &st)) {
627 		perror(file);
628 		exit((int)MANDOCLEVEL_SYSERR);
629 	}
630 
631 	/*
632 	 * If we're a regular file, try just reading in the whole entry
633 	 * via mmap().  This is faster than reading it into blocks, and
634 	 * since each file is only a few bytes to begin with, I'm not
635 	 * concerned that this is going to tank any machines.
636 	 */
637 
638 	if (S_ISREG(st.st_mode)) {
639 		if (st.st_size > 0x7fffffff) {
640 			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
641 			return(0);
642 		}
643 		*with_mmap = 1;
644 		fb->sz = (size_t)st.st_size;
645 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
646 		if (fb->buf != MAP_FAILED)
647 			return(1);
648 	}
649 #endif
650 
651 	/*
652 	 * If this isn't a regular file (like, say, stdin), then we must
653 	 * go the old way and just read things in bit by bit.
654 	 */
655 
656 	*with_mmap = 0;
657 	off = 0;
658 	fb->sz = 0;
659 	fb->buf = NULL;
660 	for (;;) {
661 		if (off == fb->sz) {
662 			if (fb->sz == (1U << 31)) {
663 				mandoc_msg(MANDOCERR_TOOLARGE, curp,
664 				    0, 0, NULL);
665 				break;
666 			}
667 			resize_buf(fb, 65536);
668 		}
669 		ssz = read(fd, fb->buf + (int)off, fb->sz - off);
670 		if (ssz == 0) {
671 			fb->sz = off;
672 			return(1);
673 		}
674 		if (ssz == -1) {
675 			perror(file);
676 			exit((int)MANDOCLEVEL_SYSERR);
677 		}
678 		off += (size_t)ssz;
679 	}
680 
681 	free(fb->buf);
682 	fb->buf = NULL;
683 	return(0);
684 }
685 
686 static void
mparse_end(struct mparse * curp)687 mparse_end(struct mparse *curp)
688 {
689 
690 	if (curp->mdoc == NULL &&
691 	    curp->man == NULL &&
692 	    curp->sodest == NULL) {
693 		if (curp->options & MPARSE_MDOC)
694 			curp->mdoc = curp->pmdoc;
695 		else {
696 			if (curp->pman == NULL)
697 				curp->pman = man_alloc(
698 				    curp->roff, curp, curp->defos,
699 				    curp->options & MPARSE_QUICK ? 1 : 0);
700 			curp->man = curp->pman;
701 		}
702 	}
703 	if (curp->mdoc)
704 		mdoc_endparse(curp->mdoc);
705 	if (curp->man)
706 		man_endparse(curp->man);
707 	roff_endparse(curp->roff);
708 }
709 
710 static void
mparse_parse_buffer(struct mparse * curp,struct buf blk,const char * file)711 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
712 {
713 	struct buf	*svprimary;
714 	const char	*svfile;
715 	size_t		 offset;
716 	static int	 recursion_depth;
717 
718 	if (64 < recursion_depth) {
719 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
720 		return;
721 	}
722 
723 	/* Line number is per-file. */
724 	svfile = curp->file;
725 	curp->file = file;
726 	svprimary = curp->primary;
727 	curp->primary = &blk;
728 	curp->line = 1;
729 	recursion_depth++;
730 
731 	/* Skip an UTF-8 byte order mark. */
732 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
733 	    (unsigned char)blk.buf[0] == 0xef &&
734 	    (unsigned char)blk.buf[1] == 0xbb &&
735 	    (unsigned char)blk.buf[2] == 0xbf) {
736 		offset = 3;
737 		curp->filenc &= ~MPARSE_LATIN1;
738 	} else
739 		offset = 0;
740 
741 	mparse_buf_r(curp, blk, offset, 1);
742 
743 	if (--recursion_depth == 0)
744 		mparse_end(curp);
745 
746 	curp->primary = svprimary;
747 	curp->file = svfile;
748 }
749 
750 enum mandoclevel
mparse_readmem(struct mparse * curp,void * buf,size_t len,const char * file)751 mparse_readmem(struct mparse *curp, void *buf, size_t len,
752 		const char *file)
753 {
754 	struct buf blk;
755 
756 	blk.buf = buf;
757 	blk.sz = len;
758 
759 	mparse_parse_buffer(curp, blk, file);
760 	return(curp->file_status);
761 }
762 
763 /*
764  * Read the whole file into memory and call the parsers.
765  * Called recursively when an .so request is encountered.
766  */
767 enum mandoclevel
mparse_readfd(struct mparse * curp,int fd,const char * file)768 mparse_readfd(struct mparse *curp, int fd, const char *file)
769 {
770 	struct buf	 blk;
771 	int		 with_mmap;
772 	int		 save_filenc;
773 
774 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
775 		save_filenc = curp->filenc;
776 		curp->filenc = curp->options &
777 		    (MPARSE_UTF8 | MPARSE_LATIN1);
778 		mparse_parse_buffer(curp, blk, file);
779 		curp->filenc = save_filenc;
780 #if HAVE_MMAP
781 		if (with_mmap)
782 			munmap(blk.buf, blk.sz);
783 		else
784 #endif
785 			free(blk.buf);
786 	}
787 
788 	if (fd != STDIN_FILENO && close(fd) == -1)
789 		perror(file);
790 
791 	mparse_wait(curp);
792 	return(curp->file_status);
793 }
794 
795 enum mandoclevel
mparse_open(struct mparse * curp,int * fd,const char * file)796 mparse_open(struct mparse *curp, int *fd, const char *file)
797 {
798 	int		  pfd[2];
799 	int		  save_errno;
800 	char		 *cp;
801 
802 	curp->file = file;
803 
804 	/* Unless zipped, try to just open the file. */
805 
806 	if ((cp = strrchr(file, '.')) == NULL ||
807 	    strcmp(cp + 1, "gz")) {
808 		curp->child = 0;
809 		if ((*fd = open(file, O_RDONLY)) != -1)
810 			return(MANDOCLEVEL_OK);
811 
812 		/* Open failed; try to append ".gz". */
813 
814 		mandoc_asprintf(&cp, "%s.gz", file);
815 		file = cp;
816 	} else
817 		cp = NULL;
818 
819 	/* Before forking, make sure the file can be read. */
820 
821 	save_errno = errno;
822 	if (access(file, R_OK) == -1) {
823 		if (cp != NULL)
824 			errno = save_errno;
825 		free(cp);
826 		*fd = -1;
827 		curp->child = 0;
828 		mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
829 		return(MANDOCLEVEL_ERROR);
830 	}
831 
832 	/* Run gunzip(1). */
833 
834 	if (pipe(pfd) == -1) {
835 		perror("pipe");
836 		exit((int)MANDOCLEVEL_SYSERR);
837 	}
838 
839 	switch (curp->child = fork()) {
840 	case -1:
841 		perror("fork");
842 		exit((int)MANDOCLEVEL_SYSERR);
843 	case 0:
844 		close(pfd[0]);
845 		if (dup2(pfd[1], STDOUT_FILENO) == -1) {
846 			perror("dup");
847 			exit((int)MANDOCLEVEL_SYSERR);
848 		}
849 		execlp("gunzip", "gunzip", "-c", file, NULL);
850 		perror("exec");
851 		exit((int)MANDOCLEVEL_SYSERR);
852 	default:
853 		close(pfd[1]);
854 		*fd = pfd[0];
855 		return(MANDOCLEVEL_OK);
856 	}
857 }
858 
859 enum mandoclevel
mparse_wait(struct mparse * curp)860 mparse_wait(struct mparse *curp)
861 {
862 	int	  status;
863 
864 	if (curp->child == 0)
865 		return(MANDOCLEVEL_OK);
866 
867 	if (waitpid(curp->child, &status, 0) == -1) {
868 		perror("wait");
869 		exit((int)MANDOCLEVEL_SYSERR);
870 	}
871 	curp->child = 0;
872 	if (WIFSIGNALED(status)) {
873 		mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0,
874 		    "gunzip died from signal %d", WTERMSIG(status));
875 		return(MANDOCLEVEL_ERROR);
876 	}
877 	if (WEXITSTATUS(status)) {
878 		mandoc_vmsg(MANDOCERR_FILE, curp, 0, 0,
879 		    "gunzip failed with code %d", WEXITSTATUS(status));
880 		return(MANDOCLEVEL_ERROR);
881 	}
882 	return(MANDOCLEVEL_OK);
883 }
884 
885 struct mparse *
mparse_alloc(int options,enum mandoclevel wlevel,mandocmsg mmsg,const struct mchars * mchars,const char * defos)886 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
887     const struct mchars *mchars, const char *defos)
888 {
889 	struct mparse	*curp;
890 
891 	curp = mandoc_calloc(1, sizeof(struct mparse));
892 
893 	curp->options = options;
894 	curp->wlevel = wlevel;
895 	curp->mmsg = mmsg;
896 	curp->defos = defos;
897 
898 	curp->mchars = mchars;
899 	curp->roff = roff_alloc(curp, curp->mchars, options);
900 	if (curp->options & MPARSE_MDOC)
901 		curp->pmdoc = mdoc_alloc(
902 		    curp->roff, curp, curp->defos,
903 		    curp->options & MPARSE_QUICK ? 1 : 0);
904 	if (curp->options & MPARSE_MAN)
905 		curp->pman = man_alloc(
906 		    curp->roff, curp, curp->defos,
907 		    curp->options & MPARSE_QUICK ? 1 : 0);
908 
909 	return(curp);
910 }
911 
912 void
mparse_reset(struct mparse * curp)913 mparse_reset(struct mparse *curp)
914 {
915 
916 	roff_reset(curp->roff);
917 
918 	if (curp->mdoc)
919 		mdoc_reset(curp->mdoc);
920 	if (curp->man)
921 		man_reset(curp->man);
922 	if (curp->secondary)
923 		curp->secondary->sz = 0;
924 
925 	curp->file_status = MANDOCLEVEL_OK;
926 	curp->mdoc = NULL;
927 	curp->man = NULL;
928 
929 	free(curp->sodest);
930 	curp->sodest = NULL;
931 }
932 
933 void
mparse_free(struct mparse * curp)934 mparse_free(struct mparse *curp)
935 {
936 
937 	if (curp->pmdoc)
938 		mdoc_free(curp->pmdoc);
939 	if (curp->pman)
940 		man_free(curp->pman);
941 	if (curp->roff)
942 		roff_free(curp->roff);
943 	if (curp->secondary)
944 		free(curp->secondary->buf);
945 
946 	free(curp->secondary);
947 	free(curp->sodest);
948 	free(curp);
949 }
950 
951 void
mparse_result(struct mparse * curp,struct mdoc ** mdoc,struct man ** man,char ** sodest)952 mparse_result(struct mparse *curp,
953 	struct mdoc **mdoc, struct man **man, char **sodest)
954 {
955 
956 	if (sodest && NULL != (*sodest = curp->sodest)) {
957 		*mdoc = NULL;
958 		*man = NULL;
959 		return;
960 	}
961 	if (mdoc)
962 		*mdoc = curp->mdoc;
963 	if (man)
964 		*man = curp->man;
965 }
966 
967 void
mandoc_vmsg(enum mandocerr t,struct mparse * m,int ln,int pos,const char * fmt,...)968 mandoc_vmsg(enum mandocerr t, struct mparse *m,
969 		int ln, int pos, const char *fmt, ...)
970 {
971 	char		 buf[256];
972 	va_list		 ap;
973 
974 	va_start(ap, fmt);
975 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
976 	va_end(ap);
977 
978 	mandoc_msg(t, m, ln, pos, buf);
979 }
980 
981 void
mandoc_msg(enum mandocerr er,struct mparse * m,int ln,int col,const char * msg)982 mandoc_msg(enum mandocerr er, struct mparse *m,
983 		int ln, int col, const char *msg)
984 {
985 	enum mandoclevel level;
986 
987 	level = MANDOCLEVEL_UNSUPP;
988 	while (er < mandoclimits[level])
989 		level--;
990 
991 	if (level < m->wlevel && er != MANDOCERR_FILE)
992 		return;
993 
994 	if (m->mmsg)
995 		(*m->mmsg)(er, level, m->file, ln, col, msg);
996 
997 	if (m->file_status < level)
998 		m->file_status = level;
999 }
1000 
1001 const char *
mparse_strerror(enum mandocerr er)1002 mparse_strerror(enum mandocerr er)
1003 {
1004 
1005 	return(mandocerrs[er]);
1006 }
1007 
1008 const char *
mparse_strlevel(enum mandoclevel lvl)1009 mparse_strlevel(enum mandoclevel lvl)
1010 {
1011 	return(mandoclevels[lvl]);
1012 }
1013 
1014 void
mparse_keep(struct mparse * p)1015 mparse_keep(struct mparse *p)
1016 {
1017 
1018 	assert(NULL == p->secondary);
1019 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
1020 }
1021 
1022 const char *
mparse_getkeep(const struct mparse * p)1023 mparse_getkeep(const struct mparse *p)
1024 {
1025 
1026 	assert(p->secondary);
1027 	return(p->secondary->sz ? p->secondary->buf : NULL);
1028 }
1029