1 /* $Id: read.c,v 1.220 2021/06/27 17:57:54 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org> 4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Top-level functions of the mandoc(3) parser: 20 * Parser and input encoding selection, decompression, 21 * handling of input bytes, characters, lines, and files, 22 * handling of roff(7) loops and file inclusion, 23 * and steering of the various parsers. 24 */ 25 #include "config.h" 26 27 #include <sys/types.h> 28 #include <sys/mman.h> 29 #include <sys/stat.h> 30 31 #include <assert.h> 32 #include <ctype.h> 33 #include <errno.h> 34 #include <fcntl.h> 35 #include <stdarg.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <string.h> 39 #include <unistd.h> 40 #include <zlib.h> 41 42 #include "mandoc_aux.h" 43 #include "mandoc.h" 44 #include "roff.h" 45 #include "mdoc.h" 46 #include "man.h" 47 #include "mandoc_parse.h" 48 #include "libmandoc.h" 49 #include "roff_int.h" 50 #include "tag.h" 51 52 #define REPARSE_LIMIT 1000 53 54 struct mparse { 55 struct roff *roff; /* roff parser (!NULL) */ 56 struct roff_man *man; /* man parser */ 57 struct buf *primary; /* buffer currently being parsed */ 58 struct buf *secondary; /* copy of top level input */ 59 struct buf *loop; /* open .while request line */ 60 const char *os_s; /* default operating system */ 61 int options; /* parser options */ 62 int gzip; /* current input file is gzipped */ 63 int filenc; /* encoding of the current file */ 64 int reparse_count; /* finite interp. stack */ 65 int line; /* line number in the file */ 66 }; 67 68 static void choose_parser(struct mparse *); 69 static void free_buf_list(struct buf *); 70 static void resize_buf(struct buf *, size_t); 71 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 72 static int read_whole_file(struct mparse *, int, struct buf *, int *); 73 static void mparse_end(struct mparse *); 74 75 76 static void 77 resize_buf(struct buf *buf, size_t initial) 78 { 79 80 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 81 buf->buf = mandoc_realloc(buf->buf, buf->sz); 82 } 83 84 static void 85 free_buf_list(struct buf *buf) 86 { 87 struct buf *tmp; 88 89 while (buf != NULL) { 90 tmp = buf; 91 buf = tmp->next; 92 free(tmp->buf); 93 free(tmp); 94 } 95 } 96 97 static void 98 choose_parser(struct mparse *curp) 99 { 100 char *cp, *ep; 101 int format; 102 103 /* 104 * If neither command line arguments -mdoc or -man select 105 * a parser nor the roff parser found a .Dd or .TH macro 106 * yet, look ahead in the main input buffer. 107 */ 108 109 if ((format = roff_getformat(curp->roff)) == 0) { 110 cp = curp->primary->buf; 111 ep = cp + curp->primary->sz; 112 while (cp < ep) { 113 if (*cp == '.' || *cp == '\'') { 114 cp++; 115 if (cp[0] == 'D' && cp[1] == 'd') { 116 format = MPARSE_MDOC; 117 break; 118 } 119 if (cp[0] == 'T' && cp[1] == 'H') { 120 format = MPARSE_MAN; 121 break; 122 } 123 } 124 cp = memchr(cp, '\n', ep - cp); 125 if (cp == NULL) 126 break; 127 cp++; 128 } 129 } 130 131 if (format == MPARSE_MDOC) { 132 curp->man->meta.macroset = MACROSET_MDOC; 133 if (curp->man->mdocmac == NULL) 134 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 135 } else { 136 curp->man->meta.macroset = MACROSET_MAN; 137 if (curp->man->manmac == NULL) 138 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 139 } 140 curp->man->meta.first->tok = TOKEN_NONE; 141 } 142 143 /* 144 * Main parse routine for a buffer. 145 * It assumes encoding and line numbering are already set up. 146 * It can recurse directly (for invocations of user-defined 147 * macros, inline equations, and input line traps) 148 * and indirectly (for .so file inclusion). 149 */ 150 static int 151 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 152 { 153 struct buf ln; 154 struct buf *firstln, *lastln, *thisln, *loop; 155 char *cp; 156 size_t pos; /* byte number in the ln buffer */ 157 size_t spos; /* at the start of the current line parse */ 158 int line_result, result; 159 int of; 160 int lnn; /* line number in the real file */ 161 int fd; 162 int inloop; /* Saw .while on this level. */ 163 unsigned char c; 164 165 ln.sz = 256; 166 ln.buf = mandoc_malloc(ln.sz); 167 ln.next = NULL; 168 firstln = lastln = loop = NULL; 169 lnn = curp->line; 170 pos = 0; 171 inloop = 0; 172 result = ROFF_CONT; 173 174 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 175 if (start) { 176 curp->line = lnn; 177 curp->reparse_count = 0; 178 179 if (lnn < 3 && 180 curp->filenc & MPARSE_UTF8 && 181 curp->filenc & MPARSE_LATIN1) 182 curp->filenc = preconv_cue(&blk, i); 183 } 184 spos = pos; 185 186 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 187 188 /* 189 * When finding an unescaped newline character, 190 * leave the character loop to process the line. 191 * Skip a preceding carriage return, if any. 192 */ 193 194 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 195 '\n' == blk.buf[i + 1]) 196 ++i; 197 if ('\n' == blk.buf[i]) { 198 ++i; 199 ++lnn; 200 break; 201 } 202 203 /* 204 * Make sure we have space for the worst 205 * case of 12 bytes: "\\[u10ffff]\n\0" 206 */ 207 208 if (pos + 12 > ln.sz) 209 resize_buf(&ln, 256); 210 211 /* 212 * Encode 8-bit input. 213 */ 214 215 c = blk.buf[i]; 216 if (c & 0x80) { 217 if ( ! (curp->filenc && preconv_encode( 218 &blk, &i, &ln, &pos, &curp->filenc))) { 219 mandoc_msg(MANDOCERR_CHAR_BAD, 220 curp->line, pos, "0x%x", c); 221 ln.buf[pos++] = '?'; 222 i++; 223 } 224 continue; 225 } 226 227 /* 228 * Exclude control characters. 229 */ 230 231 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 232 mandoc_msg(c == 0x00 || c == 0x04 || 233 c > 0x0a ? MANDOCERR_CHAR_BAD : 234 MANDOCERR_CHAR_UNSUPP, 235 curp->line, pos, "0x%x", c); 236 i++; 237 if (c != '\r') 238 ln.buf[pos++] = '?'; 239 continue; 240 } 241 242 ln.buf[pos++] = blk.buf[i++]; 243 } 244 ln.buf[pos] = '\0'; 245 246 /* 247 * Maintain a lookaside buffer of all lines. 248 * parsed from this input source. 249 */ 250 251 thisln = mandoc_malloc(sizeof(*thisln)); 252 thisln->buf = mandoc_strdup(ln.buf); 253 thisln->sz = strlen(ln.buf) + 1; 254 thisln->next = NULL; 255 if (firstln == NULL) { 256 firstln = lastln = thisln; 257 if (curp->secondary == NULL) 258 curp->secondary = firstln; 259 } else { 260 lastln->next = thisln; 261 lastln = thisln; 262 } 263 264 /* XXX Ugly hack to mark the end of the input. */ 265 266 if (i == blk.sz || blk.buf[i] == '\0') { 267 if (pos + 2 > ln.sz) 268 resize_buf(&ln, 256); 269 ln.buf[pos++] = '\n'; 270 ln.buf[pos] = '\0'; 271 } 272 273 /* 274 * A significant amount of complexity is contained by 275 * the roff preprocessor. It's line-oriented but can be 276 * expressed on one line, so we need at times to 277 * readjust our starting point and re-run it. The roff 278 * preprocessor can also readjust the buffers with new 279 * data, so we pass them in wholesale. 280 */ 281 282 of = 0; 283 rerun: 284 line_result = roff_parseln(curp->roff, curp->line, 285 &ln, &of, start && spos == 0 ? pos : 0); 286 287 /* Process options. */ 288 289 if (line_result & ROFF_APPEND) 290 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 291 292 if (line_result & ROFF_USERCALL) 293 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 294 295 if (line_result & ROFF_USERRET) { 296 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 297 if (start == 0) { 298 /* Return from the current macro. */ 299 result = ROFF_USERRET; 300 goto out; 301 } 302 } 303 304 switch (line_result & ROFF_LOOPMASK) { 305 case ROFF_IGN: 306 break; 307 case ROFF_WHILE: 308 if (curp->loop != NULL) { 309 if (loop == curp->loop) 310 break; 311 mandoc_msg(MANDOCERR_WHILE_NEST, 312 curp->line, pos, NULL); 313 } 314 curp->loop = thisln; 315 loop = NULL; 316 inloop = 1; 317 break; 318 case ROFF_LOOPCONT: 319 case ROFF_LOOPEXIT: 320 if (curp->loop == NULL) { 321 mandoc_msg(MANDOCERR_WHILE_FAIL, 322 curp->line, pos, NULL); 323 break; 324 } 325 if (inloop == 0) { 326 mandoc_msg(MANDOCERR_WHILE_INTO, 327 curp->line, pos, NULL); 328 curp->loop = loop = NULL; 329 break; 330 } 331 if (line_result & ROFF_LOOPCONT) 332 loop = curp->loop; 333 else { 334 curp->loop = loop = NULL; 335 inloop = 0; 336 } 337 break; 338 default: 339 abort(); 340 } 341 342 /* Process the main instruction from the roff parser. */ 343 344 switch (line_result & ROFF_MASK) { 345 case ROFF_IGN: 346 break; 347 case ROFF_CONT: 348 if (curp->man->meta.macroset == MACROSET_NONE) 349 choose_parser(curp); 350 if ((curp->man->meta.macroset == MACROSET_MDOC ? 351 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 352 man_parseln(curp->man, curp->line, ln.buf, of) 353 ) == 2) 354 goto out; 355 break; 356 case ROFF_RERUN: 357 goto rerun; 358 case ROFF_REPARSE: 359 if (++curp->reparse_count > REPARSE_LIMIT) { 360 /* Abort and return to the top level. */ 361 result = ROFF_IGN; 362 mandoc_msg(MANDOCERR_ROFFLOOP, 363 curp->line, pos, NULL); 364 goto out; 365 } 366 result = mparse_buf_r(curp, ln, of, 0); 367 if (line_result & ROFF_USERCALL) { 368 roff_userret(curp->roff); 369 /* Continue normally. */ 370 if (result & ROFF_USERRET) 371 result = ROFF_CONT; 372 } 373 if (start == 0 && result != ROFF_CONT) 374 goto out; 375 break; 376 case ROFF_SO: 377 if ( ! (curp->options & MPARSE_SO) && 378 (i >= blk.sz || blk.buf[i] == '\0')) { 379 curp->man->meta.sodest = 380 mandoc_strdup(ln.buf + of); 381 goto out; 382 } 383 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 384 mparse_readfd(curp, fd, ln.buf + of); 385 close(fd); 386 } else { 387 mandoc_msg(MANDOCERR_SO_FAIL, 388 curp->line, of, ".so %s: %s", 389 ln.buf + of, strerror(errno)); 390 ln.sz = mandoc_asprintf(&cp, 391 ".sp\nSee the file %s.\n.sp", 392 ln.buf + of); 393 free(ln.buf); 394 ln.buf = cp; 395 of = 0; 396 mparse_buf_r(curp, ln, of, 0); 397 } 398 break; 399 default: 400 abort(); 401 } 402 403 /* Start the next input line. */ 404 405 if (loop != NULL && 406 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 407 loop = loop->next; 408 409 if (loop != NULL) { 410 if ((line_result & ROFF_APPEND) == 0) 411 *ln.buf = '\0'; 412 if (ln.sz < loop->sz) 413 resize_buf(&ln, loop->sz); 414 (void)strlcat(ln.buf, loop->buf, ln.sz); 415 of = 0; 416 goto rerun; 417 } 418 419 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 420 } 421 out: 422 if (inloop) { 423 if (result != ROFF_USERRET) 424 mandoc_msg(MANDOCERR_WHILE_OUTOF, 425 curp->line, pos, NULL); 426 curp->loop = NULL; 427 } 428 free(ln.buf); 429 if (firstln != curp->secondary) 430 free_buf_list(firstln); 431 return result; 432 } 433 434 static int 435 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 436 { 437 struct stat st; 438 gzFile gz; 439 size_t off; 440 ssize_t ssz; 441 int gzerrnum, retval; 442 443 if (fstat(fd, &st) == -1) { 444 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 445 return -1; 446 } 447 448 /* 449 * If we're a regular file, try just reading in the whole entry 450 * via mmap(). This is faster than reading it into blocks, and 451 * since each file is only a few bytes to begin with, I'm not 452 * concerned that this is going to tank any machines. 453 */ 454 455 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 456 if (st.st_size > 0x7fffffff) { 457 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 458 return -1; 459 } 460 *with_mmap = 1; 461 fb->sz = (size_t)st.st_size; 462 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 463 if (fb->buf != MAP_FAILED) 464 return 0; 465 } 466 467 if (curp->gzip) { 468 /* 469 * Duplicating the file descriptor is required 470 * because we will have to call gzclose(3) 471 * to free memory used internally by zlib, 472 * but that will also close the file descriptor, 473 * which this function must not do. 474 */ 475 if ((fd = dup(fd)) == -1) { 476 mandoc_msg(MANDOCERR_DUP, 0, 0, 477 "%s", strerror(errno)); 478 return -1; 479 } 480 if ((gz = gzdopen(fd, "rb")) == NULL) { 481 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 482 "%s", strerror(errno)); 483 close(fd); 484 return -1; 485 } 486 } else 487 gz = NULL; 488 489 /* 490 * If this isn't a regular file (like, say, stdin), then we must 491 * go the old way and just read things in bit by bit. 492 */ 493 494 *with_mmap = 0; 495 off = 0; 496 retval = -1; 497 fb->sz = 0; 498 fb->buf = NULL; 499 for (;;) { 500 if (off == fb->sz) { 501 if (fb->sz == (1U << 31)) { 502 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 503 break; 504 } 505 resize_buf(fb, 65536); 506 } 507 ssz = curp->gzip ? 508 gzread(gz, fb->buf + (int)off, fb->sz - off) : 509 read(fd, fb->buf + (int)off, fb->sz - off); 510 if (ssz == 0) { 511 fb->sz = off; 512 retval = 0; 513 break; 514 } 515 if (ssz == -1) { 516 if (curp->gzip) 517 (void)gzerror(gz, &gzerrnum); 518 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 519 curp->gzip && gzerrnum != Z_ERRNO ? 520 zError(gzerrnum) : strerror(errno)); 521 break; 522 } 523 off += (size_t)ssz; 524 } 525 526 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 527 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 528 gzerrnum == Z_ERRNO ? strerror(errno) : 529 zError(gzerrnum)); 530 if (retval == -1) { 531 free(fb->buf); 532 fb->buf = NULL; 533 } 534 return retval; 535 } 536 537 static void 538 mparse_end(struct mparse *curp) 539 { 540 if (curp->man->meta.macroset == MACROSET_NONE) 541 curp->man->meta.macroset = MACROSET_MAN; 542 if (curp->man->meta.macroset == MACROSET_MDOC) 543 mdoc_endparse(curp->man); 544 else 545 man_endparse(curp->man); 546 roff_endparse(curp->roff); 547 } 548 549 /* 550 * Read the whole file into memory and call the parsers. 551 * Called recursively when an .so request is encountered. 552 */ 553 void 554 mparse_readfd(struct mparse *curp, int fd, const char *filename) 555 { 556 static int recursion_depth; 557 558 struct buf blk; 559 struct buf *save_primary; 560 const char *save_filename, *cp; 561 size_t offset; 562 int save_filenc, save_lineno; 563 int with_mmap; 564 565 if (recursion_depth > 64) { 566 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 567 return; 568 } else if (recursion_depth == 0 && 569 (cp = strrchr(filename, '.')) != NULL && 570 cp[1] >= '1' && cp[1] <= '9') 571 curp->man->filesec = cp[1]; 572 else 573 curp->man->filesec = '\0'; 574 575 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 576 return; 577 578 /* 579 * Save some properties of the parent file. 580 */ 581 582 save_primary = curp->primary; 583 save_filenc = curp->filenc; 584 save_lineno = curp->line; 585 save_filename = mandoc_msg_getinfilename(); 586 587 curp->primary = &blk; 588 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 589 curp->line = 1; 590 mandoc_msg_setinfilename(filename); 591 592 /* Skip an UTF-8 byte order mark. */ 593 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 594 (unsigned char)blk.buf[0] == 0xef && 595 (unsigned char)blk.buf[1] == 0xbb && 596 (unsigned char)blk.buf[2] == 0xbf) { 597 offset = 3; 598 curp->filenc &= ~MPARSE_LATIN1; 599 } else 600 offset = 0; 601 602 recursion_depth++; 603 mparse_buf_r(curp, blk, offset, 1); 604 if (--recursion_depth == 0) 605 mparse_end(curp); 606 607 /* 608 * Clean up and restore saved parent properties. 609 */ 610 611 if (with_mmap) 612 munmap(blk.buf, blk.sz); 613 else 614 free(blk.buf); 615 616 curp->primary = save_primary; 617 curp->filenc = save_filenc; 618 curp->line = save_lineno; 619 if (save_filename != NULL) 620 mandoc_msg_setinfilename(save_filename); 621 } 622 623 int 624 mparse_open(struct mparse *curp, const char *file) 625 { 626 char *cp; 627 int fd, save_errno; 628 629 cp = strrchr(file, '.'); 630 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 631 632 /* First try to use the filename as it is. */ 633 634 if ((fd = open(file, O_RDONLY)) != -1) 635 return fd; 636 637 /* 638 * If that doesn't work and the filename doesn't 639 * already end in .gz, try appending .gz. 640 */ 641 642 if ( ! curp->gzip) { 643 save_errno = errno; 644 mandoc_asprintf(&cp, "%s.gz", file); 645 fd = open(cp, O_RDONLY); 646 free(cp); 647 errno = save_errno; 648 if (fd != -1) { 649 curp->gzip = 1; 650 return fd; 651 } 652 } 653 654 /* Neither worked, give up. */ 655 656 return -1; 657 } 658 659 struct mparse * 660 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 661 { 662 struct mparse *curp; 663 664 curp = mandoc_calloc(1, sizeof(struct mparse)); 665 666 curp->options = options; 667 curp->os_s = os_s; 668 669 curp->roff = roff_alloc(options); 670 curp->man = roff_man_alloc(curp->roff, curp->os_s, 671 curp->options & MPARSE_QUICK ? 1 : 0); 672 if (curp->options & MPARSE_MDOC) { 673 curp->man->meta.macroset = MACROSET_MDOC; 674 if (curp->man->mdocmac == NULL) 675 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 676 } else if (curp->options & MPARSE_MAN) { 677 curp->man->meta.macroset = MACROSET_MAN; 678 if (curp->man->manmac == NULL) 679 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 680 } 681 curp->man->meta.first->tok = TOKEN_NONE; 682 curp->man->meta.os_e = os_e; 683 tag_alloc(); 684 return curp; 685 } 686 687 void 688 mparse_reset(struct mparse *curp) 689 { 690 tag_free(); 691 roff_reset(curp->roff); 692 roff_man_reset(curp->man); 693 free_buf_list(curp->secondary); 694 curp->secondary = NULL; 695 curp->gzip = 0; 696 tag_alloc(); 697 } 698 699 void 700 mparse_free(struct mparse *curp) 701 { 702 tag_free(); 703 roffhash_free(curp->man->mdocmac); 704 roffhash_free(curp->man->manmac); 705 roff_man_free(curp->man); 706 roff_free(curp->roff); 707 free_buf_list(curp->secondary); 708 free(curp); 709 } 710 711 struct roff_meta * 712 mparse_result(struct mparse *curp) 713 { 714 roff_state_reset(curp->man); 715 if (curp->options & MPARSE_VALIDATE) { 716 if (curp->man->meta.macroset == MACROSET_MDOC) 717 mdoc_validate(curp->man); 718 else 719 man_validate(curp->man); 720 tag_postprocess(curp->man, curp->man->meta.first); 721 } 722 return &curp->man->meta; 723 } 724 725 void 726 mparse_copy(const struct mparse *p) 727 { 728 struct buf *buf; 729 730 for (buf = p->secondary; buf != NULL; buf = buf->next) 731 puts(buf->buf); 732 } 733