1 /* $Id: read.c,v 1.221 2022/05/19 14:48:56 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org> 4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Top-level functions of the mandoc(3) parser: 20 * Parser and input encoding selection, decompression, 21 * handling of input bytes, characters, lines, and files, 22 * handling of roff(7) loops and file inclusion, 23 * and steering of the various parsers. 24 */ 25 #include "config.h" 26 27 #include <sys/types.h> 28 #include <sys/mman.h> 29 #include <sys/stat.h> 30 31 #include <assert.h> 32 #include <ctype.h> 33 #include <errno.h> 34 #include <fcntl.h> 35 #include <stdarg.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <string.h> 39 #include <unistd.h> 40 #include <zlib.h> 41 42 #include "mandoc_aux.h" 43 #include "mandoc.h" 44 #include "roff.h" 45 #include "mdoc.h" 46 #include "man.h" 47 #include "mandoc_parse.h" 48 #include "libmandoc.h" 49 #include "roff_int.h" 50 #include "tag.h" 51 52 #define REPARSE_LIMIT 1000 53 54 struct mparse { 55 struct roff *roff; /* roff parser (!NULL) */ 56 struct roff_man *man; /* man parser */ 57 struct buf *primary; /* buffer currently being parsed */ 58 struct buf *secondary; /* copy of top level input */ 59 struct buf *loop; /* open .while request line */ 60 const char *os_s; /* default operating system */ 61 int options; /* parser options */ 62 int gzip; /* current input file is gzipped */ 63 int filenc; /* encoding of the current file */ 64 int reparse_count; /* finite interp. stack */ 65 int line; /* line number in the file */ 66 }; 67 68 static void choose_parser(struct mparse *); 69 static void free_buf_list(struct buf *); 70 static void resize_buf(struct buf *, size_t); 71 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 72 static int read_whole_file(struct mparse *, int, struct buf *, int *); 73 static void mparse_end(struct mparse *); 74 75 76 static void 77 resize_buf(struct buf *buf, size_t initial) 78 { 79 80 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 81 buf->buf = mandoc_realloc(buf->buf, buf->sz); 82 } 83 84 static void 85 free_buf_list(struct buf *buf) 86 { 87 struct buf *tmp; 88 89 while (buf != NULL) { 90 tmp = buf; 91 buf = tmp->next; 92 free(tmp->buf); 93 free(tmp); 94 } 95 } 96 97 static void 98 choose_parser(struct mparse *curp) 99 { 100 char *cp, *ep; 101 int format; 102 103 /* 104 * If neither command line arguments -mdoc or -man select 105 * a parser nor the roff parser found a .Dd or .TH macro 106 * yet, look ahead in the main input buffer. 107 */ 108 109 if ((format = roff_getformat(curp->roff)) == 0) { 110 cp = curp->primary->buf; 111 ep = cp + curp->primary->sz; 112 while (cp < ep) { 113 if (*cp == '.' || *cp == '\'') { 114 cp++; 115 if (cp[0] == 'D' && cp[1] == 'd') { 116 format = MPARSE_MDOC; 117 break; 118 } 119 if (cp[0] == 'T' && cp[1] == 'H') { 120 format = MPARSE_MAN; 121 break; 122 } 123 } 124 cp = memchr(cp, '\n', ep - cp); 125 if (cp == NULL) 126 break; 127 cp++; 128 } 129 } 130 131 if (format == MPARSE_MDOC) { 132 curp->man->meta.macroset = MACROSET_MDOC; 133 if (curp->man->mdocmac == NULL) 134 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 135 } else { 136 curp->man->meta.macroset = MACROSET_MAN; 137 if (curp->man->manmac == NULL) 138 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 139 } 140 curp->man->meta.first->tok = TOKEN_NONE; 141 } 142 143 /* 144 * Main parse routine for a buffer. 145 * It assumes encoding and line numbering are already set up. 146 * It can recurse directly (for invocations of user-defined 147 * macros, inline equations, and input line traps) 148 * and indirectly (for .so file inclusion). 149 */ 150 static int 151 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 152 { 153 struct buf ln; 154 struct buf *firstln, *lastln, *thisln, *loop; 155 char *cp; 156 size_t pos; /* byte number in the ln buffer */ 157 size_t spos; /* at the start of the current line parse */ 158 int line_result, result; 159 int of; 160 int lnn; /* line number in the real file */ 161 int fd; 162 int inloop; /* Saw .while on this level. */ 163 unsigned char c; 164 165 ln.sz = 256; 166 ln.buf = mandoc_malloc(ln.sz); 167 ln.next = NULL; 168 firstln = lastln = loop = NULL; 169 lnn = curp->line; 170 pos = 0; 171 inloop = 0; 172 result = ROFF_CONT; 173 174 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 175 if (start) { 176 curp->line = lnn; 177 curp->reparse_count = 0; 178 179 if (lnn < 3 && 180 curp->filenc & MPARSE_UTF8 && 181 curp->filenc & MPARSE_LATIN1) 182 curp->filenc = preconv_cue(&blk, i); 183 } 184 spos = pos; 185 186 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 187 188 /* 189 * When finding an unescaped newline character, 190 * leave the character loop to process the line. 191 * Skip a preceding carriage return, if any. 192 */ 193 194 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 195 '\n' == blk.buf[i + 1]) 196 ++i; 197 if ('\n' == blk.buf[i]) { 198 ++i; 199 ++lnn; 200 break; 201 } 202 203 /* 204 * Make sure we have space for the worst 205 * case of 12 bytes: "\\[u10ffff]\n\0" 206 */ 207 208 if (pos + 12 > ln.sz) 209 resize_buf(&ln, 256); 210 211 /* 212 * Encode 8-bit input. 213 */ 214 215 c = blk.buf[i]; 216 if (c & 0x80) { 217 if ( ! (curp->filenc && preconv_encode( 218 &blk, &i, &ln, &pos, &curp->filenc))) { 219 mandoc_msg(MANDOCERR_CHAR_BAD, 220 curp->line, pos, "0x%x", c); 221 ln.buf[pos++] = '?'; 222 i++; 223 } 224 continue; 225 } 226 227 /* 228 * Exclude control characters. 229 */ 230 231 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 232 mandoc_msg(c == 0x00 || c == 0x04 || 233 c > 0x0a ? MANDOCERR_CHAR_BAD : 234 MANDOCERR_CHAR_UNSUPP, 235 curp->line, pos, "0x%x", c); 236 i++; 237 if (c != '\r') 238 ln.buf[pos++] = '?'; 239 continue; 240 } 241 242 ln.buf[pos++] = blk.buf[i++]; 243 } 244 ln.buf[pos] = '\0'; 245 246 /* 247 * Maintain a lookaside buffer of all lines. 248 * parsed from this input source. 249 */ 250 251 thisln = mandoc_malloc(sizeof(*thisln)); 252 thisln->buf = mandoc_strdup(ln.buf); 253 thisln->sz = strlen(ln.buf) + 1; 254 thisln->next = NULL; 255 if (firstln == NULL) { 256 firstln = lastln = thisln; 257 if (curp->secondary == NULL) 258 curp->secondary = firstln; 259 } else { 260 lastln->next = thisln; 261 lastln = thisln; 262 } 263 264 /* 265 * XXX Ugly hack to mark the end of the input, 266 * such that the function roff_parse_comment() 267 * doesn't attempt to append another line if the 268 * last input line ends with an escape character. 269 */ 270 271 if (i == blk.sz || blk.buf[i] == '\0') { 272 if (pos + 2 > ln.sz) 273 resize_buf(&ln, 256); 274 ln.buf[pos++] = '\n'; 275 ln.buf[pos] = '\0'; 276 } 277 278 /* 279 * A significant amount of complexity is contained by 280 * the roff preprocessor. It's line-oriented but can be 281 * expressed on one line, so we need at times to 282 * readjust our starting point and re-run it. The roff 283 * preprocessor can also readjust the buffers with new 284 * data, so we pass them in wholesale. 285 */ 286 287 of = 0; 288 rerun: 289 line_result = roff_parseln(curp->roff, curp->line, 290 &ln, &of, start && spos == 0 ? pos : 0); 291 292 /* Process options. */ 293 294 if (line_result & ROFF_APPEND) 295 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 296 297 if (line_result & ROFF_USERCALL) 298 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 299 300 if (line_result & ROFF_USERRET) { 301 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 302 if (start == 0) { 303 /* Return from the current macro. */ 304 result = ROFF_USERRET; 305 goto out; 306 } 307 } 308 309 switch (line_result & ROFF_LOOPMASK) { 310 case ROFF_IGN: 311 break; 312 case ROFF_WHILE: 313 if (curp->loop != NULL) { 314 if (loop == curp->loop) 315 break; 316 mandoc_msg(MANDOCERR_WHILE_NEST, 317 curp->line, pos, NULL); 318 } 319 curp->loop = thisln; 320 loop = NULL; 321 inloop = 1; 322 break; 323 case ROFF_LOOPCONT: 324 case ROFF_LOOPEXIT: 325 if (curp->loop == NULL) { 326 mandoc_msg(MANDOCERR_WHILE_FAIL, 327 curp->line, pos, NULL); 328 break; 329 } 330 if (inloop == 0) { 331 mandoc_msg(MANDOCERR_WHILE_INTO, 332 curp->line, pos, NULL); 333 curp->loop = loop = NULL; 334 break; 335 } 336 if (line_result & ROFF_LOOPCONT) 337 loop = curp->loop; 338 else { 339 curp->loop = loop = NULL; 340 inloop = 0; 341 } 342 break; 343 default: 344 abort(); 345 } 346 347 /* Process the main instruction from the roff parser. */ 348 349 switch (line_result & ROFF_MASK) { 350 case ROFF_IGN: 351 break; 352 case ROFF_CONT: 353 if (curp->man->meta.macroset == MACROSET_NONE) 354 choose_parser(curp); 355 if ((curp->man->meta.macroset == MACROSET_MDOC ? 356 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 357 man_parseln(curp->man, curp->line, ln.buf, of) 358 ) == 2) 359 goto out; 360 break; 361 case ROFF_RERUN: 362 goto rerun; 363 case ROFF_REPARSE: 364 if (++curp->reparse_count > REPARSE_LIMIT) { 365 /* Abort and return to the top level. */ 366 result = ROFF_IGN; 367 mandoc_msg(MANDOCERR_ROFFLOOP, 368 curp->line, pos, NULL); 369 goto out; 370 } 371 result = mparse_buf_r(curp, ln, of, 0); 372 if (line_result & ROFF_USERCALL) { 373 roff_userret(curp->roff); 374 /* Continue normally. */ 375 if (result & ROFF_USERRET) 376 result = ROFF_CONT; 377 } 378 if (start == 0 && result != ROFF_CONT) 379 goto out; 380 break; 381 case ROFF_SO: 382 if ( ! (curp->options & MPARSE_SO) && 383 (i >= blk.sz || blk.buf[i] == '\0')) { 384 curp->man->meta.sodest = 385 mandoc_strdup(ln.buf + of); 386 goto out; 387 } 388 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 389 mparse_readfd(curp, fd, ln.buf + of); 390 close(fd); 391 } else { 392 mandoc_msg(MANDOCERR_SO_FAIL, 393 curp->line, of, ".so %s: %s", 394 ln.buf + of, strerror(errno)); 395 ln.sz = mandoc_asprintf(&cp, 396 ".sp\nSee the file %s.\n.sp", 397 ln.buf + of); 398 free(ln.buf); 399 ln.buf = cp; 400 of = 0; 401 mparse_buf_r(curp, ln, of, 0); 402 } 403 break; 404 default: 405 abort(); 406 } 407 408 /* Start the next input line. */ 409 410 if (loop != NULL && 411 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 412 loop = loop->next; 413 414 if (loop != NULL) { 415 if ((line_result & ROFF_APPEND) == 0) 416 *ln.buf = '\0'; 417 if (ln.sz < loop->sz) 418 resize_buf(&ln, loop->sz); 419 (void)strlcat(ln.buf, loop->buf, ln.sz); 420 of = 0; 421 goto rerun; 422 } 423 424 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 425 } 426 out: 427 if (inloop) { 428 if (result != ROFF_USERRET) 429 mandoc_msg(MANDOCERR_WHILE_OUTOF, 430 curp->line, pos, NULL); 431 curp->loop = NULL; 432 } 433 free(ln.buf); 434 if (firstln != curp->secondary) 435 free_buf_list(firstln); 436 return result; 437 } 438 439 static int 440 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 441 { 442 struct stat st; 443 gzFile gz; 444 size_t off; 445 ssize_t ssz; 446 int gzerrnum, retval; 447 448 if (fstat(fd, &st) == -1) { 449 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 450 return -1; 451 } 452 453 /* 454 * If we're a regular file, try just reading in the whole entry 455 * via mmap(). This is faster than reading it into blocks, and 456 * since each file is only a few bytes to begin with, I'm not 457 * concerned that this is going to tank any machines. 458 */ 459 460 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 461 if (st.st_size > 0x7fffffff) { 462 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 463 return -1; 464 } 465 *with_mmap = 1; 466 fb->sz = (size_t)st.st_size; 467 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 468 if (fb->buf != MAP_FAILED) 469 return 0; 470 } 471 472 if (curp->gzip) { 473 /* 474 * Duplicating the file descriptor is required 475 * because we will have to call gzclose(3) 476 * to free memory used internally by zlib, 477 * but that will also close the file descriptor, 478 * which this function must not do. 479 */ 480 if ((fd = dup(fd)) == -1) { 481 mandoc_msg(MANDOCERR_DUP, 0, 0, 482 "%s", strerror(errno)); 483 return -1; 484 } 485 if ((gz = gzdopen(fd, "rb")) == NULL) { 486 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 487 "%s", strerror(errno)); 488 close(fd); 489 return -1; 490 } 491 } else 492 gz = NULL; 493 494 /* 495 * If this isn't a regular file (like, say, stdin), then we must 496 * go the old way and just read things in bit by bit. 497 */ 498 499 *with_mmap = 0; 500 off = 0; 501 retval = -1; 502 fb->sz = 0; 503 fb->buf = NULL; 504 for (;;) { 505 if (off == fb->sz) { 506 if (fb->sz == (1U << 31)) { 507 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 508 break; 509 } 510 resize_buf(fb, 65536); 511 } 512 ssz = curp->gzip ? 513 gzread(gz, fb->buf + (int)off, fb->sz - off) : 514 read(fd, fb->buf + (int)off, fb->sz - off); 515 if (ssz == 0) { 516 fb->sz = off; 517 retval = 0; 518 break; 519 } 520 if (ssz == -1) { 521 if (curp->gzip) 522 (void)gzerror(gz, &gzerrnum); 523 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 524 curp->gzip && gzerrnum != Z_ERRNO ? 525 zError(gzerrnum) : strerror(errno)); 526 break; 527 } 528 off += (size_t)ssz; 529 } 530 531 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 532 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 533 gzerrnum == Z_ERRNO ? strerror(errno) : 534 zError(gzerrnum)); 535 if (retval == -1) { 536 free(fb->buf); 537 fb->buf = NULL; 538 } 539 return retval; 540 } 541 542 static void 543 mparse_end(struct mparse *curp) 544 { 545 if (curp->man->meta.macroset == MACROSET_NONE) 546 curp->man->meta.macroset = MACROSET_MAN; 547 if (curp->man->meta.macroset == MACROSET_MDOC) 548 mdoc_endparse(curp->man); 549 else 550 man_endparse(curp->man); 551 roff_endparse(curp->roff); 552 } 553 554 /* 555 * Read the whole file into memory and call the parsers. 556 * Called recursively when an .so request is encountered. 557 */ 558 void 559 mparse_readfd(struct mparse *curp, int fd, const char *filename) 560 { 561 static int recursion_depth; 562 563 struct buf blk; 564 struct buf *save_primary; 565 const char *save_filename, *cp; 566 size_t offset; 567 int save_filenc, save_lineno; 568 int with_mmap; 569 570 if (recursion_depth > 64) { 571 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 572 return; 573 } else if (recursion_depth == 0 && 574 (cp = strrchr(filename, '.')) != NULL && 575 cp[1] >= '1' && cp[1] <= '9') 576 curp->man->filesec = cp[1]; 577 else 578 curp->man->filesec = '\0'; 579 580 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 581 return; 582 583 /* 584 * Save some properties of the parent file. 585 */ 586 587 save_primary = curp->primary; 588 save_filenc = curp->filenc; 589 save_lineno = curp->line; 590 save_filename = mandoc_msg_getinfilename(); 591 592 curp->primary = &blk; 593 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 594 curp->line = 1; 595 mandoc_msg_setinfilename(filename); 596 597 /* Skip an UTF-8 byte order mark. */ 598 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 599 (unsigned char)blk.buf[0] == 0xef && 600 (unsigned char)blk.buf[1] == 0xbb && 601 (unsigned char)blk.buf[2] == 0xbf) { 602 offset = 3; 603 curp->filenc &= ~MPARSE_LATIN1; 604 } else 605 offset = 0; 606 607 recursion_depth++; 608 mparse_buf_r(curp, blk, offset, 1); 609 if (--recursion_depth == 0) 610 mparse_end(curp); 611 612 /* 613 * Clean up and restore saved parent properties. 614 */ 615 616 if (with_mmap) 617 munmap(blk.buf, blk.sz); 618 else 619 free(blk.buf); 620 621 curp->primary = save_primary; 622 curp->filenc = save_filenc; 623 curp->line = save_lineno; 624 if (save_filename != NULL) 625 mandoc_msg_setinfilename(save_filename); 626 } 627 628 int 629 mparse_open(struct mparse *curp, const char *file) 630 { 631 char *cp; 632 int fd, save_errno; 633 634 cp = strrchr(file, '.'); 635 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 636 637 /* First try to use the filename as it is. */ 638 639 if ((fd = open(file, O_RDONLY)) != -1) 640 return fd; 641 642 /* 643 * If that doesn't work and the filename doesn't 644 * already end in .gz, try appending .gz. 645 */ 646 647 if ( ! curp->gzip) { 648 save_errno = errno; 649 mandoc_asprintf(&cp, "%s.gz", file); 650 fd = open(cp, O_RDONLY); 651 free(cp); 652 errno = save_errno; 653 if (fd != -1) { 654 curp->gzip = 1; 655 return fd; 656 } 657 } 658 659 /* Neither worked, give up. */ 660 661 return -1; 662 } 663 664 struct mparse * 665 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 666 { 667 struct mparse *curp; 668 669 curp = mandoc_calloc(1, sizeof(struct mparse)); 670 671 curp->options = options; 672 curp->os_s = os_s; 673 674 curp->roff = roff_alloc(options); 675 curp->man = roff_man_alloc(curp->roff, curp->os_s, 676 curp->options & MPARSE_QUICK ? 1 : 0); 677 if (curp->options & MPARSE_MDOC) { 678 curp->man->meta.macroset = MACROSET_MDOC; 679 if (curp->man->mdocmac == NULL) 680 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 681 } else if (curp->options & MPARSE_MAN) { 682 curp->man->meta.macroset = MACROSET_MAN; 683 if (curp->man->manmac == NULL) 684 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 685 } 686 curp->man->meta.first->tok = TOKEN_NONE; 687 curp->man->meta.os_e = os_e; 688 tag_alloc(); 689 return curp; 690 } 691 692 void 693 mparse_reset(struct mparse *curp) 694 { 695 tag_free(); 696 roff_reset(curp->roff); 697 roff_man_reset(curp->man); 698 free_buf_list(curp->secondary); 699 curp->secondary = NULL; 700 curp->gzip = 0; 701 tag_alloc(); 702 } 703 704 void 705 mparse_free(struct mparse *curp) 706 { 707 tag_free(); 708 roffhash_free(curp->man->mdocmac); 709 roffhash_free(curp->man->manmac); 710 roff_man_free(curp->man); 711 roff_free(curp->roff); 712 free_buf_list(curp->secondary); 713 free(curp); 714 } 715 716 struct roff_meta * 717 mparse_result(struct mparse *curp) 718 { 719 roff_state_reset(curp->man); 720 if (curp->options & MPARSE_VALIDATE) { 721 if (curp->man->meta.macroset == MACROSET_MDOC) 722 mdoc_validate(curp->man); 723 else 724 man_validate(curp->man); 725 tag_postprocess(curp->man, curp->man->meta.first); 726 } 727 return &curp->man->meta; 728 } 729 730 void 731 mparse_copy(const struct mparse *p) 732 { 733 struct buf *buf; 734 735 for (buf = p->secondary; buf != NULL; buf = buf->next) 736 puts(buf->buf); 737 } 738