1 /* $Id: read.c,v 1.214 2019/07/10 19:39:01 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 #include "config.h" 20 21 #include <sys/types.h> 22 #include <sys/mman.h> 23 #include <sys/stat.h> 24 25 #include <assert.h> 26 #include <ctype.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stdarg.h> 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <unistd.h> 34 #include <zlib.h> 35 36 #include "mandoc_aux.h" 37 #include "mandoc.h" 38 #include "roff.h" 39 #include "mdoc.h" 40 #include "man.h" 41 #include "mandoc_parse.h" 42 #include "libmandoc.h" 43 #include "roff_int.h" 44 45 #define REPARSE_LIMIT 1000 46 47 struct mparse { 48 struct roff *roff; /* roff parser (!NULL) */ 49 struct roff_man *man; /* man parser */ 50 struct buf *primary; /* buffer currently being parsed */ 51 struct buf *secondary; /* copy of top level input */ 52 struct buf *loop; /* open .while request line */ 53 const char *os_s; /* default operating system */ 54 int options; /* parser options */ 55 int gzip; /* current input file is gzipped */ 56 int filenc; /* encoding of the current file */ 57 int reparse_count; /* finite interp. stack */ 58 int line; /* line number in the file */ 59 }; 60 61 static void choose_parser(struct mparse *); 62 static void free_buf_list(struct buf *); 63 static void resize_buf(struct buf *, size_t); 64 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 65 static int read_whole_file(struct mparse *, int, struct buf *, int *); 66 static void mparse_end(struct mparse *); 67 68 69 static void 70 resize_buf(struct buf *buf, size_t initial) 71 { 72 73 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 74 buf->buf = mandoc_realloc(buf->buf, buf->sz); 75 } 76 77 static void 78 free_buf_list(struct buf *buf) 79 { 80 struct buf *tmp; 81 82 while (buf != NULL) { 83 tmp = buf; 84 buf = tmp->next; 85 free(tmp->buf); 86 free(tmp); 87 } 88 } 89 90 static void 91 choose_parser(struct mparse *curp) 92 { 93 char *cp, *ep; 94 int format; 95 96 /* 97 * If neither command line arguments -mdoc or -man select 98 * a parser nor the roff parser found a .Dd or .TH macro 99 * yet, look ahead in the main input buffer. 100 */ 101 102 if ((format = roff_getformat(curp->roff)) == 0) { 103 cp = curp->primary->buf; 104 ep = cp + curp->primary->sz; 105 while (cp < ep) { 106 if (*cp == '.' || *cp == '\'') { 107 cp++; 108 if (cp[0] == 'D' && cp[1] == 'd') { 109 format = MPARSE_MDOC; 110 break; 111 } 112 if (cp[0] == 'T' && cp[1] == 'H') { 113 format = MPARSE_MAN; 114 break; 115 } 116 } 117 cp = memchr(cp, '\n', ep - cp); 118 if (cp == NULL) 119 break; 120 cp++; 121 } 122 } 123 124 if (format == MPARSE_MDOC) { 125 curp->man->meta.macroset = MACROSET_MDOC; 126 if (curp->man->mdocmac == NULL) 127 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 128 } else { 129 curp->man->meta.macroset = MACROSET_MAN; 130 if (curp->man->manmac == NULL) 131 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 132 } 133 curp->man->meta.first->tok = TOKEN_NONE; 134 } 135 136 /* 137 * Main parse routine for a buffer. 138 * It assumes encoding and line numbering are already set up. 139 * It can recurse directly (for invocations of user-defined 140 * macros, inline equations, and input line traps) 141 * and indirectly (for .so file inclusion). 142 */ 143 static int 144 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 145 { 146 struct buf ln; 147 struct buf *firstln, *lastln, *thisln, *loop; 148 char *cp; 149 size_t pos; /* byte number in the ln buffer */ 150 int line_result, result; 151 int of; 152 int lnn; /* line number in the real file */ 153 int fd; 154 int inloop; /* Saw .while on this level. */ 155 unsigned char c; 156 157 ln.sz = 256; 158 ln.buf = mandoc_malloc(ln.sz); 159 ln.next = NULL; 160 firstln = lastln = loop = NULL; 161 lnn = curp->line; 162 pos = 0; 163 inloop = 0; 164 result = ROFF_CONT; 165 166 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 167 if (start) { 168 curp->line = lnn; 169 curp->reparse_count = 0; 170 171 if (lnn < 3 && 172 curp->filenc & MPARSE_UTF8 && 173 curp->filenc & MPARSE_LATIN1) 174 curp->filenc = preconv_cue(&blk, i); 175 } 176 177 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 178 179 /* 180 * When finding an unescaped newline character, 181 * leave the character loop to process the line. 182 * Skip a preceding carriage return, if any. 183 */ 184 185 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 186 '\n' == blk.buf[i + 1]) 187 ++i; 188 if ('\n' == blk.buf[i]) { 189 ++i; 190 ++lnn; 191 break; 192 } 193 194 /* 195 * Make sure we have space for the worst 196 * case of 12 bytes: "\\[u10ffff]\n\0" 197 */ 198 199 if (pos + 12 > ln.sz) 200 resize_buf(&ln, 256); 201 202 /* 203 * Encode 8-bit input. 204 */ 205 206 c = blk.buf[i]; 207 if (c & 0x80) { 208 if ( ! (curp->filenc && preconv_encode( 209 &blk, &i, &ln, &pos, &curp->filenc))) { 210 mandoc_msg(MANDOCERR_CHAR_BAD, 211 curp->line, pos, "0x%x", c); 212 ln.buf[pos++] = '?'; 213 i++; 214 } 215 continue; 216 } 217 218 /* 219 * Exclude control characters. 220 */ 221 222 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 223 mandoc_msg(c == 0x00 || c == 0x04 || 224 c > 0x0a ? MANDOCERR_CHAR_BAD : 225 MANDOCERR_CHAR_UNSUPP, 226 curp->line, pos, "0x%x", c); 227 i++; 228 if (c != '\r') 229 ln.buf[pos++] = '?'; 230 continue; 231 } 232 233 ln.buf[pos++] = blk.buf[i++]; 234 } 235 ln.buf[pos] = '\0'; 236 237 /* 238 * Maintain a lookaside buffer of all lines. 239 * parsed from this input source. 240 */ 241 242 thisln = mandoc_malloc(sizeof(*thisln)); 243 thisln->buf = mandoc_strdup(ln.buf); 244 thisln->sz = strlen(ln.buf) + 1; 245 thisln->next = NULL; 246 if (firstln == NULL) { 247 firstln = lastln = thisln; 248 if (curp->secondary == NULL) 249 curp->secondary = firstln; 250 } else { 251 lastln->next = thisln; 252 lastln = thisln; 253 } 254 255 /* XXX Ugly hack to mark the end of the input. */ 256 257 if (i == blk.sz || blk.buf[i] == '\0') { 258 if (pos + 2 > ln.sz) 259 resize_buf(&ln, 256); 260 ln.buf[pos++] = '\n'; 261 ln.buf[pos] = '\0'; 262 } 263 264 /* 265 * A significant amount of complexity is contained by 266 * the roff preprocessor. It's line-oriented but can be 267 * expressed on one line, so we need at times to 268 * readjust our starting point and re-run it. The roff 269 * preprocessor can also readjust the buffers with new 270 * data, so we pass them in wholesale. 271 */ 272 273 of = 0; 274 rerun: 275 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 276 277 /* Process options. */ 278 279 if (line_result & ROFF_APPEND) 280 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 281 282 if (line_result & ROFF_USERCALL) 283 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 284 285 if (line_result & ROFF_USERRET) { 286 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 287 if (start == 0) { 288 /* Return from the current macro. */ 289 result = ROFF_USERRET; 290 goto out; 291 } 292 } 293 294 switch (line_result & ROFF_LOOPMASK) { 295 case ROFF_IGN: 296 break; 297 case ROFF_WHILE: 298 if (curp->loop != NULL) { 299 if (loop == curp->loop) 300 break; 301 mandoc_msg(MANDOCERR_WHILE_NEST, 302 curp->line, pos, NULL); 303 } 304 curp->loop = thisln; 305 loop = NULL; 306 inloop = 1; 307 break; 308 case ROFF_LOOPCONT: 309 case ROFF_LOOPEXIT: 310 if (curp->loop == NULL) { 311 mandoc_msg(MANDOCERR_WHILE_FAIL, 312 curp->line, pos, NULL); 313 break; 314 } 315 if (inloop == 0) { 316 mandoc_msg(MANDOCERR_WHILE_INTO, 317 curp->line, pos, NULL); 318 curp->loop = loop = NULL; 319 break; 320 } 321 if (line_result & ROFF_LOOPCONT) 322 loop = curp->loop; 323 else { 324 curp->loop = loop = NULL; 325 inloop = 0; 326 } 327 break; 328 default: 329 abort(); 330 } 331 332 /* Process the main instruction from the roff parser. */ 333 334 switch (line_result & ROFF_MASK) { 335 case ROFF_IGN: 336 break; 337 case ROFF_CONT: 338 if (curp->man->meta.macroset == MACROSET_NONE) 339 choose_parser(curp); 340 if ((curp->man->meta.macroset == MACROSET_MDOC ? 341 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 342 man_parseln(curp->man, curp->line, ln.buf, of) 343 ) == 2) 344 goto out; 345 break; 346 case ROFF_RERUN: 347 goto rerun; 348 case ROFF_REPARSE: 349 if (++curp->reparse_count > REPARSE_LIMIT) { 350 /* Abort and return to the top level. */ 351 result = ROFF_IGN; 352 mandoc_msg(MANDOCERR_ROFFLOOP, 353 curp->line, pos, NULL); 354 goto out; 355 } 356 result = mparse_buf_r(curp, ln, of, 0); 357 if (line_result & ROFF_USERCALL) { 358 roff_userret(curp->roff); 359 /* Continue normally. */ 360 if (result & ROFF_USERRET) 361 result = ROFF_CONT; 362 } 363 if (start == 0 && result != ROFF_CONT) 364 goto out; 365 break; 366 case ROFF_SO: 367 if ( ! (curp->options & MPARSE_SO) && 368 (i >= blk.sz || blk.buf[i] == '\0')) { 369 curp->man->meta.sodest = 370 mandoc_strdup(ln.buf + of); 371 goto out; 372 } 373 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 374 mparse_readfd(curp, fd, ln.buf + of); 375 close(fd); 376 } else { 377 mandoc_msg(MANDOCERR_SO_FAIL, 378 curp->line, of, ".so %s: %s", 379 ln.buf + of, strerror(errno)); 380 ln.sz = mandoc_asprintf(&cp, 381 ".sp\nSee the file %s.\n.sp", 382 ln.buf + of); 383 free(ln.buf); 384 ln.buf = cp; 385 of = 0; 386 mparse_buf_r(curp, ln, of, 0); 387 } 388 break; 389 default: 390 abort(); 391 } 392 393 /* Start the next input line. */ 394 395 if (loop != NULL && 396 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 397 loop = loop->next; 398 399 if (loop != NULL) { 400 if ((line_result & ROFF_APPEND) == 0) 401 *ln.buf = '\0'; 402 if (ln.sz < loop->sz) 403 resize_buf(&ln, loop->sz); 404 (void)strlcat(ln.buf, loop->buf, ln.sz); 405 of = 0; 406 goto rerun; 407 } 408 409 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 410 } 411 out: 412 if (inloop) { 413 if (result != ROFF_USERRET) 414 mandoc_msg(MANDOCERR_WHILE_OUTOF, 415 curp->line, pos, NULL); 416 curp->loop = NULL; 417 } 418 free(ln.buf); 419 if (firstln != curp->secondary) 420 free_buf_list(firstln); 421 return result; 422 } 423 424 static int 425 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 426 { 427 struct stat st; 428 gzFile gz; 429 size_t off; 430 ssize_t ssz; 431 int gzerrnum, retval; 432 433 if (fstat(fd, &st) == -1) { 434 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 435 return -1; 436 } 437 438 /* 439 * If we're a regular file, try just reading in the whole entry 440 * via mmap(). This is faster than reading it into blocks, and 441 * since each file is only a few bytes to begin with, I'm not 442 * concerned that this is going to tank any machines. 443 */ 444 445 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 446 if (st.st_size > 0x7fffffff) { 447 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 448 return -1; 449 } 450 *with_mmap = 1; 451 fb->sz = (size_t)st.st_size; 452 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 453 if (fb->buf != MAP_FAILED) 454 return 0; 455 } 456 457 if (curp->gzip) { 458 /* 459 * Duplicating the file descriptor is required 460 * because we will have to call gzclose(3) 461 * to free memory used internally by zlib, 462 * but that will also close the file descriptor, 463 * which this function must not do. 464 */ 465 if ((fd = dup(fd)) == -1) { 466 mandoc_msg(MANDOCERR_DUP, 0, 0, 467 "%s", strerror(errno)); 468 return -1; 469 } 470 if ((gz = gzdopen(fd, "rb")) == NULL) { 471 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 472 "%s", strerror(errno)); 473 close(fd); 474 return -1; 475 } 476 } else 477 gz = NULL; 478 479 /* 480 * If this isn't a regular file (like, say, stdin), then we must 481 * go the old way and just read things in bit by bit. 482 */ 483 484 *with_mmap = 0; 485 off = 0; 486 retval = -1; 487 fb->sz = 0; 488 fb->buf = NULL; 489 for (;;) { 490 if (off == fb->sz) { 491 if (fb->sz == (1U << 31)) { 492 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 493 break; 494 } 495 resize_buf(fb, 65536); 496 } 497 ssz = curp->gzip ? 498 gzread(gz, fb->buf + (int)off, fb->sz - off) : 499 read(fd, fb->buf + (int)off, fb->sz - off); 500 if (ssz == 0) { 501 fb->sz = off; 502 retval = 0; 503 break; 504 } 505 if (ssz == -1) { 506 if (curp->gzip) 507 (void)gzerror(gz, &gzerrnum); 508 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 509 curp->gzip && gzerrnum != Z_ERRNO ? 510 zError(gzerrnum) : strerror(errno)); 511 break; 512 } 513 off += (size_t)ssz; 514 } 515 516 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 517 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 518 gzerrnum == Z_ERRNO ? strerror(errno) : 519 zError(gzerrnum)); 520 if (retval == -1) { 521 free(fb->buf); 522 fb->buf = NULL; 523 } 524 return retval; 525 } 526 527 static void 528 mparse_end(struct mparse *curp) 529 { 530 if (curp->man->meta.macroset == MACROSET_NONE) 531 curp->man->meta.macroset = MACROSET_MAN; 532 if (curp->man->meta.macroset == MACROSET_MDOC) 533 mdoc_endparse(curp->man); 534 else 535 man_endparse(curp->man); 536 roff_endparse(curp->roff); 537 } 538 539 /* 540 * Read the whole file into memory and call the parsers. 541 * Called recursively when an .so request is encountered. 542 */ 543 void 544 mparse_readfd(struct mparse *curp, int fd, const char *filename) 545 { 546 static int recursion_depth; 547 548 struct buf blk; 549 struct buf *save_primary; 550 const char *save_filename; 551 size_t offset; 552 int save_filenc, save_lineno; 553 int with_mmap; 554 555 if (recursion_depth > 64) { 556 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 557 return; 558 } 559 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 560 return; 561 562 /* 563 * Save some properties of the parent file. 564 */ 565 566 save_primary = curp->primary; 567 save_filenc = curp->filenc; 568 save_lineno = curp->line; 569 save_filename = mandoc_msg_getinfilename(); 570 571 curp->primary = &blk; 572 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 573 curp->line = 1; 574 mandoc_msg_setinfilename(filename); 575 576 /* Skip an UTF-8 byte order mark. */ 577 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 578 (unsigned char)blk.buf[0] == 0xef && 579 (unsigned char)blk.buf[1] == 0xbb && 580 (unsigned char)blk.buf[2] == 0xbf) { 581 offset = 3; 582 curp->filenc &= ~MPARSE_LATIN1; 583 } else 584 offset = 0; 585 586 recursion_depth++; 587 mparse_buf_r(curp, blk, offset, 1); 588 if (--recursion_depth == 0) 589 mparse_end(curp); 590 591 /* 592 * Clean up and restore saved parent properties. 593 */ 594 595 if (with_mmap) 596 munmap(blk.buf, blk.sz); 597 else 598 free(blk.buf); 599 600 curp->primary = save_primary; 601 curp->filenc = save_filenc; 602 curp->line = save_lineno; 603 if (save_filename != NULL) 604 mandoc_msg_setinfilename(save_filename); 605 } 606 607 int 608 mparse_open(struct mparse *curp, const char *file) 609 { 610 char *cp; 611 int fd, save_errno; 612 613 cp = strrchr(file, '.'); 614 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 615 616 /* First try to use the filename as it is. */ 617 618 if ((fd = open(file, O_RDONLY)) != -1) 619 return fd; 620 621 /* 622 * If that doesn't work and the filename doesn't 623 * already end in .gz, try appending .gz. 624 */ 625 626 if ( ! curp->gzip) { 627 save_errno = errno; 628 mandoc_asprintf(&cp, "%s.gz", file); 629 fd = open(cp, O_RDONLY); 630 free(cp); 631 errno = save_errno; 632 if (fd != -1) { 633 curp->gzip = 1; 634 return fd; 635 } 636 } 637 638 /* Neither worked, give up. */ 639 640 return -1; 641 } 642 643 struct mparse * 644 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 645 { 646 struct mparse *curp; 647 648 curp = mandoc_calloc(1, sizeof(struct mparse)); 649 650 curp->options = options; 651 curp->os_s = os_s; 652 653 curp->roff = roff_alloc(options); 654 curp->man = roff_man_alloc(curp->roff, curp->os_s, 655 curp->options & MPARSE_QUICK ? 1 : 0); 656 if (curp->options & MPARSE_MDOC) { 657 curp->man->meta.macroset = MACROSET_MDOC; 658 if (curp->man->mdocmac == NULL) 659 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 660 } else if (curp->options & MPARSE_MAN) { 661 curp->man->meta.macroset = MACROSET_MAN; 662 if (curp->man->manmac == NULL) 663 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 664 } 665 curp->man->meta.first->tok = TOKEN_NONE; 666 curp->man->meta.os_e = os_e; 667 return curp; 668 } 669 670 void 671 mparse_reset(struct mparse *curp) 672 { 673 roff_reset(curp->roff); 674 roff_man_reset(curp->man); 675 free_buf_list(curp->secondary); 676 curp->secondary = NULL; 677 curp->gzip = 0; 678 } 679 680 void 681 mparse_free(struct mparse *curp) 682 { 683 roffhash_free(curp->man->mdocmac); 684 roffhash_free(curp->man->manmac); 685 roff_man_free(curp->man); 686 roff_free(curp->roff); 687 free_buf_list(curp->secondary); 688 free(curp); 689 } 690 691 struct roff_meta * 692 mparse_result(struct mparse *curp) 693 { 694 roff_state_reset(curp->man); 695 if (curp->options & MPARSE_VALIDATE) { 696 if (curp->man->meta.macroset == MACROSET_MDOC) 697 mdoc_validate(curp->man); 698 else 699 man_validate(curp->man); 700 } 701 return &curp->man->meta; 702 } 703 704 void 705 mparse_copy(const struct mparse *p) 706 { 707 struct buf *buf; 708 709 for (buf = p->secondary; buf != NULL; buf = buf->next) 710 puts(buf->buf); 711 } 712