1 /* $Id: read.c,v 1.211 2019/01/11 17:04:44 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 #include "config.h" 20 21 #include <sys/types.h> 22 #include <sys/mman.h> 23 #include <sys/stat.h> 24 25 #include <assert.h> 26 #include <ctype.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stdarg.h> 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <unistd.h> 34 #include <zlib.h> 35 36 #include "mandoc_aux.h" 37 #include "mandoc.h" 38 #include "roff.h" 39 #include "mdoc.h" 40 #include "man.h" 41 #include "mandoc_parse.h" 42 #include "libmandoc.h" 43 #include "roff_int.h" 44 45 #define REPARSE_LIMIT 1000 46 47 struct mparse { 48 struct roff *roff; /* roff parser (!NULL) */ 49 struct roff_man *man; /* man parser */ 50 struct buf *primary; /* buffer currently being parsed */ 51 struct buf *secondary; /* copy of top level input */ 52 struct buf *loop; /* open .while request line */ 53 const char *os_s; /* default operating system */ 54 int options; /* parser options */ 55 int gzip; /* current input file is gzipped */ 56 int filenc; /* encoding of the current file */ 57 int reparse_count; /* finite interp. stack */ 58 int line; /* line number in the file */ 59 }; 60 61 static void choose_parser(struct mparse *); 62 static void free_buf_list(struct buf *); 63 static void resize_buf(struct buf *, size_t); 64 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 65 static int read_whole_file(struct mparse *, int, struct buf *, int *); 66 static void mparse_end(struct mparse *); 67 68 69 static void 70 resize_buf(struct buf *buf, size_t initial) 71 { 72 73 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 74 buf->buf = mandoc_realloc(buf->buf, buf->sz); 75 } 76 77 static void 78 free_buf_list(struct buf *buf) 79 { 80 struct buf *tmp; 81 82 while (buf != NULL) { 83 tmp = buf; 84 buf = tmp->next; 85 free(tmp->buf); 86 free(tmp); 87 } 88 } 89 90 static void 91 choose_parser(struct mparse *curp) 92 { 93 char *cp, *ep; 94 int format; 95 96 /* 97 * If neither command line arguments -mdoc or -man select 98 * a parser nor the roff parser found a .Dd or .TH macro 99 * yet, look ahead in the main input buffer. 100 */ 101 102 if ((format = roff_getformat(curp->roff)) == 0) { 103 cp = curp->primary->buf; 104 ep = cp + curp->primary->sz; 105 while (cp < ep) { 106 if (*cp == '.' || *cp == '\'') { 107 cp++; 108 if (cp[0] == 'D' && cp[1] == 'd') { 109 format = MPARSE_MDOC; 110 break; 111 } 112 if (cp[0] == 'T' && cp[1] == 'H') { 113 format = MPARSE_MAN; 114 break; 115 } 116 } 117 cp = memchr(cp, '\n', ep - cp); 118 if (cp == NULL) 119 break; 120 cp++; 121 } 122 } 123 124 if (format == MPARSE_MDOC) { 125 curp->man->meta.macroset = MACROSET_MDOC; 126 if (curp->man->mdocmac == NULL) 127 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 128 } else { 129 curp->man->meta.macroset = MACROSET_MAN; 130 if (curp->man->manmac == NULL) 131 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 132 } 133 curp->man->meta.first->tok = TOKEN_NONE; 134 } 135 136 /* 137 * Main parse routine for a buffer. 138 * It assumes encoding and line numbering are already set up. 139 * It can recurse directly (for invocations of user-defined 140 * macros, inline equations, and input line traps) 141 * and indirectly (for .so file inclusion). 142 */ 143 static int 144 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 145 { 146 struct buf ln; 147 struct buf *firstln, *lastln, *thisln, *loop; 148 char *cp; 149 size_t pos; /* byte number in the ln buffer */ 150 int line_result, result; 151 int of; 152 int lnn; /* line number in the real file */ 153 int fd; 154 int inloop; /* Saw .while on this level. */ 155 unsigned char c; 156 157 ln.sz = 256; 158 ln.buf = mandoc_malloc(ln.sz); 159 ln.next = NULL; 160 firstln = loop = NULL; 161 lnn = curp->line; 162 pos = 0; 163 inloop = 0; 164 result = ROFF_CONT; 165 166 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 167 if (start) { 168 curp->line = lnn; 169 curp->reparse_count = 0; 170 171 if (lnn < 3 && 172 curp->filenc & MPARSE_UTF8 && 173 curp->filenc & MPARSE_LATIN1) 174 curp->filenc = preconv_cue(&blk, i); 175 } 176 177 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 178 179 /* 180 * When finding an unescaped newline character, 181 * leave the character loop to process the line. 182 * Skip a preceding carriage return, if any. 183 */ 184 185 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 186 '\n' == blk.buf[i + 1]) 187 ++i; 188 if ('\n' == blk.buf[i]) { 189 ++i; 190 ++lnn; 191 break; 192 } 193 194 /* 195 * Make sure we have space for the worst 196 * case of 12 bytes: "\\[u10ffff]\n\0" 197 */ 198 199 if (pos + 12 > ln.sz) 200 resize_buf(&ln, 256); 201 202 /* 203 * Encode 8-bit input. 204 */ 205 206 c = blk.buf[i]; 207 if (c & 0x80) { 208 if ( ! (curp->filenc && preconv_encode( 209 &blk, &i, &ln, &pos, &curp->filenc))) { 210 mandoc_msg(MANDOCERR_CHAR_BAD, 211 curp->line, pos, "0x%x", c); 212 ln.buf[pos++] = '?'; 213 i++; 214 } 215 continue; 216 } 217 218 /* 219 * Exclude control characters. 220 */ 221 222 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 223 mandoc_msg(c == 0x00 || c == 0x04 || 224 c > 0x0a ? MANDOCERR_CHAR_BAD : 225 MANDOCERR_CHAR_UNSUPP, 226 curp->line, pos, "0x%x", c); 227 i++; 228 if (c != '\r') 229 ln.buf[pos++] = '?'; 230 continue; 231 } 232 233 ln.buf[pos++] = blk.buf[i++]; 234 } 235 ln.buf[pos] = '\0'; 236 237 /* 238 * Maintain a lookaside buffer of all lines. 239 * parsed from this input source. 240 */ 241 242 thisln = mandoc_malloc(sizeof(*thisln)); 243 thisln->buf = mandoc_strdup(ln.buf); 244 thisln->sz = strlen(ln.buf) + 1; 245 thisln->next = NULL; 246 if (firstln == NULL) { 247 firstln = lastln = thisln; 248 if (curp->secondary == NULL) 249 curp->secondary = firstln; 250 } else { 251 lastln->next = thisln; 252 lastln = thisln; 253 } 254 255 /* XXX Ugly hack to mark the end of the input. */ 256 257 if (i == blk.sz || blk.buf[i] == '\0') { 258 ln.buf[pos++] = '\n'; 259 ln.buf[pos] = '\0'; 260 } 261 262 /* 263 * A significant amount of complexity is contained by 264 * the roff preprocessor. It's line-oriented but can be 265 * expressed on one line, so we need at times to 266 * readjust our starting point and re-run it. The roff 267 * preprocessor can also readjust the buffers with new 268 * data, so we pass them in wholesale. 269 */ 270 271 of = 0; 272 rerun: 273 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 274 275 /* Process options. */ 276 277 if (line_result & ROFF_APPEND) 278 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 279 280 if (line_result & ROFF_USERCALL) 281 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 282 283 if (line_result & ROFF_USERRET) { 284 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 285 if (start == 0) { 286 /* Return from the current macro. */ 287 result = ROFF_USERRET; 288 goto out; 289 } 290 } 291 292 switch (line_result & ROFF_LOOPMASK) { 293 case ROFF_IGN: 294 break; 295 case ROFF_WHILE: 296 if (curp->loop != NULL) { 297 if (loop == curp->loop) 298 break; 299 mandoc_msg(MANDOCERR_WHILE_NEST, 300 curp->line, pos, NULL); 301 } 302 curp->loop = thisln; 303 loop = NULL; 304 inloop = 1; 305 break; 306 case ROFF_LOOPCONT: 307 case ROFF_LOOPEXIT: 308 if (curp->loop == NULL) { 309 mandoc_msg(MANDOCERR_WHILE_FAIL, 310 curp->line, pos, NULL); 311 break; 312 } 313 if (inloop == 0) { 314 mandoc_msg(MANDOCERR_WHILE_INTO, 315 curp->line, pos, NULL); 316 curp->loop = loop = NULL; 317 break; 318 } 319 if (line_result & ROFF_LOOPCONT) 320 loop = curp->loop; 321 else { 322 curp->loop = loop = NULL; 323 inloop = 0; 324 } 325 break; 326 default: 327 abort(); 328 } 329 330 /* Process the main instruction from the roff parser. */ 331 332 switch (line_result & ROFF_MASK) { 333 case ROFF_IGN: 334 break; 335 case ROFF_CONT: 336 if (curp->man->meta.macroset == MACROSET_NONE) 337 choose_parser(curp); 338 if ((curp->man->meta.macroset == MACROSET_MDOC ? 339 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 340 man_parseln(curp->man, curp->line, ln.buf, of) 341 ) == 2) 342 goto out; 343 break; 344 case ROFF_RERUN: 345 goto rerun; 346 case ROFF_REPARSE: 347 if (++curp->reparse_count > REPARSE_LIMIT) { 348 /* Abort and return to the top level. */ 349 result = ROFF_IGN; 350 mandoc_msg(MANDOCERR_ROFFLOOP, 351 curp->line, pos, NULL); 352 goto out; 353 } 354 result = mparse_buf_r(curp, ln, of, 0); 355 if (line_result & ROFF_USERCALL) { 356 roff_userret(curp->roff); 357 /* Continue normally. */ 358 if (result & ROFF_USERRET) 359 result = ROFF_CONT; 360 } 361 if (start == 0 && result != ROFF_CONT) 362 goto out; 363 break; 364 case ROFF_SO: 365 if ( ! (curp->options & MPARSE_SO) && 366 (i >= blk.sz || blk.buf[i] == '\0')) { 367 curp->man->meta.sodest = 368 mandoc_strdup(ln.buf + of); 369 goto out; 370 } 371 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 372 mparse_readfd(curp, fd, ln.buf + of); 373 close(fd); 374 } else { 375 mandoc_msg(MANDOCERR_SO_FAIL, 376 curp->line, of, ".so %s: %s", 377 ln.buf + of, strerror(errno)); 378 ln.sz = mandoc_asprintf(&cp, 379 ".sp\nSee the file %s.\n.sp", 380 ln.buf + of); 381 free(ln.buf); 382 ln.buf = cp; 383 of = 0; 384 mparse_buf_r(curp, ln, of, 0); 385 } 386 break; 387 default: 388 abort(); 389 } 390 391 /* Start the next input line. */ 392 393 if (loop != NULL && 394 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 395 loop = loop->next; 396 397 if (loop != NULL) { 398 if ((line_result & ROFF_APPEND) == 0) 399 *ln.buf = '\0'; 400 if (ln.sz < loop->sz) 401 resize_buf(&ln, loop->sz); 402 (void)strlcat(ln.buf, loop->buf, ln.sz); 403 of = 0; 404 goto rerun; 405 } 406 407 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 408 } 409 out: 410 if (inloop) { 411 if (result != ROFF_USERRET) 412 mandoc_msg(MANDOCERR_WHILE_OUTOF, 413 curp->line, pos, NULL); 414 curp->loop = NULL; 415 } 416 free(ln.buf); 417 if (firstln != curp->secondary) 418 free_buf_list(firstln); 419 return result; 420 } 421 422 static int 423 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 424 { 425 struct stat st; 426 gzFile gz; 427 size_t off; 428 ssize_t ssz; 429 int gzerrnum, retval; 430 431 if (fstat(fd, &st) == -1) { 432 mandoc_msg(MANDOCERR_FILE, 0, 0, 433 "fstat: %s", strerror(errno)); 434 return 0; 435 } 436 437 /* 438 * If we're a regular file, try just reading in the whole entry 439 * via mmap(). This is faster than reading it into blocks, and 440 * since each file is only a few bytes to begin with, I'm not 441 * concerned that this is going to tank any machines. 442 */ 443 444 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 445 if (st.st_size > 0x7fffffff) { 446 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 447 return 0; 448 } 449 *with_mmap = 1; 450 fb->sz = (size_t)st.st_size; 451 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 452 if (fb->buf != MAP_FAILED) 453 return 1; 454 } 455 456 if (curp->gzip) { 457 /* 458 * Duplicating the file descriptor is required 459 * because we will have to call gzclose(3) 460 * to free memory used internally by zlib, 461 * but that will also close the file descriptor, 462 * which this function must not do. 463 */ 464 if ((fd = dup(fd)) == -1) { 465 mandoc_msg(MANDOCERR_FILE, 0, 0, 466 "dup: %s", strerror(errno)); 467 return 0; 468 } 469 if ((gz = gzdopen(fd, "rb")) == NULL) { 470 mandoc_msg(MANDOCERR_FILE, 0, 0, 471 "gzdopen: %s", strerror(errno)); 472 close(fd); 473 return 0; 474 } 475 } else 476 gz = NULL; 477 478 /* 479 * If this isn't a regular file (like, say, stdin), then we must 480 * go the old way and just read things in bit by bit. 481 */ 482 483 *with_mmap = 0; 484 off = 0; 485 retval = 0; 486 fb->sz = 0; 487 fb->buf = NULL; 488 for (;;) { 489 if (off == fb->sz) { 490 if (fb->sz == (1U << 31)) { 491 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 492 break; 493 } 494 resize_buf(fb, 65536); 495 } 496 ssz = curp->gzip ? 497 gzread(gz, fb->buf + (int)off, fb->sz - off) : 498 read(fd, fb->buf + (int)off, fb->sz - off); 499 if (ssz == 0) { 500 fb->sz = off; 501 retval = 1; 502 break; 503 } 504 if (ssz == -1) { 505 if (curp->gzip) 506 (void)gzerror(gz, &gzerrnum); 507 mandoc_msg(MANDOCERR_FILE, 0, 0, "read: %s", 508 curp->gzip && gzerrnum != Z_ERRNO ? 509 zError(gzerrnum) : strerror(errno)); 510 break; 511 } 512 off += (size_t)ssz; 513 } 514 515 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 516 mandoc_msg(MANDOCERR_FILE, 0, 0, "gzclose: %s", 517 gzerrnum == Z_ERRNO ? strerror(errno) : 518 zError(gzerrnum)); 519 if (retval == 0) { 520 free(fb->buf); 521 fb->buf = NULL; 522 } 523 return retval; 524 } 525 526 static void 527 mparse_end(struct mparse *curp) 528 { 529 if (curp->man->meta.macroset == MACROSET_NONE) 530 curp->man->meta.macroset = MACROSET_MAN; 531 if (curp->man->meta.macroset == MACROSET_MDOC) 532 mdoc_endparse(curp->man); 533 else 534 man_endparse(curp->man); 535 roff_endparse(curp->roff); 536 } 537 538 /* 539 * Read the whole file into memory and call the parsers. 540 * Called recursively when an .so request is encountered. 541 */ 542 void 543 mparse_readfd(struct mparse *curp, int fd, const char *filename) 544 { 545 static int recursion_depth; 546 547 struct buf blk; 548 struct buf *save_primary; 549 const char *save_filename; 550 size_t offset; 551 int save_filenc, save_lineno; 552 int with_mmap; 553 554 if (recursion_depth > 64) { 555 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 556 return; 557 } 558 if (read_whole_file(curp, fd, &blk, &with_mmap) == 0) 559 return; 560 561 /* 562 * Save some properties of the parent file. 563 */ 564 565 save_primary = curp->primary; 566 save_filenc = curp->filenc; 567 save_lineno = curp->line; 568 save_filename = mandoc_msg_getinfilename(); 569 570 curp->primary = &blk; 571 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 572 curp->line = 1; 573 mandoc_msg_setinfilename(filename); 574 575 /* Skip an UTF-8 byte order mark. */ 576 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 577 (unsigned char)blk.buf[0] == 0xef && 578 (unsigned char)blk.buf[1] == 0xbb && 579 (unsigned char)blk.buf[2] == 0xbf) { 580 offset = 3; 581 curp->filenc &= ~MPARSE_LATIN1; 582 } else 583 offset = 0; 584 585 recursion_depth++; 586 mparse_buf_r(curp, blk, offset, 1); 587 if (--recursion_depth == 0) 588 mparse_end(curp); 589 590 /* 591 * Clean up and restore saved parent properties. 592 */ 593 594 if (with_mmap) 595 munmap(blk.buf, blk.sz); 596 else 597 free(blk.buf); 598 599 curp->primary = save_primary; 600 curp->filenc = save_filenc; 601 curp->line = save_lineno; 602 if (save_filename != NULL) 603 mandoc_msg_setinfilename(save_filename); 604 } 605 606 int 607 mparse_open(struct mparse *curp, const char *file) 608 { 609 char *cp; 610 int fd, save_errno; 611 612 cp = strrchr(file, '.'); 613 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 614 615 /* First try to use the filename as it is. */ 616 617 if ((fd = open(file, O_RDONLY)) != -1) 618 return fd; 619 620 /* 621 * If that doesn't work and the filename doesn't 622 * already end in .gz, try appending .gz. 623 */ 624 625 if ( ! curp->gzip) { 626 save_errno = errno; 627 mandoc_asprintf(&cp, "%s.gz", file); 628 fd = open(cp, O_RDONLY); 629 free(cp); 630 errno = save_errno; 631 if (fd != -1) { 632 curp->gzip = 1; 633 return fd; 634 } 635 } 636 637 /* Neither worked, give up. */ 638 639 return -1; 640 } 641 642 struct mparse * 643 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 644 { 645 struct mparse *curp; 646 647 curp = mandoc_calloc(1, sizeof(struct mparse)); 648 649 curp->options = options; 650 curp->os_s = os_s; 651 652 curp->roff = roff_alloc(options); 653 curp->man = roff_man_alloc(curp->roff, curp->os_s, 654 curp->options & MPARSE_QUICK ? 1 : 0); 655 if (curp->options & MPARSE_MDOC) { 656 curp->man->meta.macroset = MACROSET_MDOC; 657 if (curp->man->mdocmac == NULL) 658 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 659 } else if (curp->options & MPARSE_MAN) { 660 curp->man->meta.macroset = MACROSET_MAN; 661 if (curp->man->manmac == NULL) 662 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 663 } 664 curp->man->meta.first->tok = TOKEN_NONE; 665 curp->man->meta.os_e = os_e; 666 return curp; 667 } 668 669 void 670 mparse_reset(struct mparse *curp) 671 { 672 roff_reset(curp->roff); 673 roff_man_reset(curp->man); 674 free_buf_list(curp->secondary); 675 curp->secondary = NULL; 676 curp->gzip = 0; 677 } 678 679 void 680 mparse_free(struct mparse *curp) 681 { 682 roffhash_free(curp->man->mdocmac); 683 roffhash_free(curp->man->manmac); 684 roff_man_free(curp->man); 685 roff_free(curp->roff); 686 free_buf_list(curp->secondary); 687 free(curp); 688 } 689 690 struct roff_meta * 691 mparse_result(struct mparse *curp) 692 { 693 roff_state_reset(curp->man); 694 if (curp->options & MPARSE_VALIDATE) { 695 if (curp->man->meta.macroset == MACROSET_MDOC) 696 mdoc_validate(curp->man); 697 else 698 man_validate(curp->man); 699 } 700 return &curp->man->meta; 701 } 702 703 void 704 mparse_copy(const struct mparse *p) 705 { 706 struct buf *buf; 707 708 for (buf = p->secondary; buf != NULL; buf = buf->next) 709 puts(buf->buf); 710 } 711