1 /* $Id: mandoc.c,v 1.119 2021/08/10 12:55:03 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017-2021 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <limits.h> 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <time.h> 30 31 #include "mandoc_aux.h" 32 #include "mandoc.h" 33 #include "roff.h" 34 #include "libmandoc.h" 35 #include "roff_int.h" 36 37 static int a2time(time_t *, const char *, const char *); 38 static char *time2a(time_t); 39 40 41 enum mandoc_esc 42 mandoc_font(const char *cp, int sz) 43 { 44 switch (sz) { 45 case 0: 46 return ESCAPE_FONTPREV; 47 case 1: 48 switch (cp[0]) { 49 case 'B': 50 case '3': 51 return ESCAPE_FONTBOLD; 52 case 'I': 53 case '2': 54 return ESCAPE_FONTITALIC; 55 case 'P': 56 return ESCAPE_FONTPREV; 57 case 'R': 58 case '1': 59 return ESCAPE_FONTROMAN; 60 case '4': 61 return ESCAPE_FONTBI; 62 default: 63 return ESCAPE_ERROR; 64 } 65 case 2: 66 switch (cp[0]) { 67 case 'B': 68 switch (cp[1]) { 69 case 'I': 70 return ESCAPE_FONTBI; 71 default: 72 return ESCAPE_ERROR; 73 } 74 case 'C': 75 switch (cp[1]) { 76 case 'B': 77 return ESCAPE_FONTCB; 78 case 'I': 79 return ESCAPE_FONTCI; 80 case 'R': 81 case 'W': 82 return ESCAPE_FONTCR; 83 default: 84 return ESCAPE_ERROR; 85 } 86 default: 87 return ESCAPE_ERROR; 88 } 89 default: 90 return ESCAPE_ERROR; 91 } 92 } 93 94 enum mandoc_esc 95 mandoc_escape(const char **end, const char **start, int *sz) 96 { 97 const char *local_start; 98 int local_sz, c, i; 99 char term; 100 enum mandoc_esc gly; 101 102 /* 103 * When the caller doesn't provide return storage, 104 * use local storage. 105 */ 106 107 if (NULL == start) 108 start = &local_start; 109 if (NULL == sz) 110 sz = &local_sz; 111 112 /* 113 * Treat "\E" just like "\"; 114 * it only makes a difference in copy mode. 115 */ 116 117 if (**end == 'E') 118 ++*end; 119 120 /* 121 * Beyond the backslash, at least one input character 122 * is part of the escape sequence. With one exception 123 * (see below), that character won't be returned. 124 */ 125 126 gly = ESCAPE_ERROR; 127 *start = ++*end; 128 *sz = 0; 129 term = '\0'; 130 131 switch ((*start)[-1]) { 132 /* 133 * First the glyphs. There are several different forms of 134 * these, but each eventually returns a substring of the glyph 135 * name. 136 */ 137 case '(': 138 gly = ESCAPE_SPECIAL; 139 *sz = 2; 140 break; 141 case '[': 142 if (**start == ' ') { 143 ++*end; 144 return ESCAPE_ERROR; 145 } 146 gly = ESCAPE_SPECIAL; 147 term = ']'; 148 break; 149 case 'C': 150 if ('\'' != **start) 151 return ESCAPE_ERROR; 152 *start = ++*end; 153 gly = ESCAPE_SPECIAL; 154 term = '\''; 155 break; 156 157 /* 158 * Escapes taking no arguments at all. 159 */ 160 case '!': 161 case '?': 162 return ESCAPE_UNSUPP; 163 case '%': 164 case '&': 165 case ')': 166 case ',': 167 case '/': 168 case '^': 169 case 'a': 170 case 'd': 171 case 'r': 172 case 't': 173 case 'u': 174 case '{': 175 case '|': 176 case '}': 177 return ESCAPE_IGNORE; 178 case 'c': 179 return ESCAPE_NOSPACE; 180 case 'p': 181 return ESCAPE_BREAK; 182 183 /* 184 * The \z escape is supposed to output the following 185 * character without advancing the cursor position. 186 * Since we are mostly dealing with terminal mode, 187 * let us just skip the next character. 188 */ 189 case 'z': 190 return ESCAPE_SKIPCHAR; 191 192 /* 193 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 194 * 'X' is the trigger. These have opaque sub-strings. 195 */ 196 case 'F': 197 case 'f': 198 case 'g': 199 case 'k': 200 case 'M': 201 case 'm': 202 case 'n': 203 case 'O': 204 case 'V': 205 case 'Y': 206 case '*': 207 switch ((*start)[-1]) { 208 case 'f': 209 gly = ESCAPE_FONT; 210 break; 211 case '*': 212 gly = ESCAPE_DEVICE; 213 break; 214 default: 215 gly = ESCAPE_IGNORE; 216 break; 217 } 218 switch (**start) { 219 case '(': 220 if ((*start)[-1] == 'O') 221 gly = ESCAPE_ERROR; 222 *start = ++*end; 223 *sz = 2; 224 break; 225 case '[': 226 if ((*start)[-1] == 'O') 227 gly = (*start)[1] == '5' ? 228 ESCAPE_UNSUPP : ESCAPE_ERROR; 229 *start = ++*end; 230 term = ']'; 231 break; 232 default: 233 if ((*start)[-1] == 'O') { 234 switch (**start) { 235 case '0': 236 gly = ESCAPE_UNSUPP; 237 break; 238 case '1': 239 case '2': 240 case '3': 241 case '4': 242 break; 243 default: 244 gly = ESCAPE_ERROR; 245 break; 246 } 247 } 248 *sz = 1; 249 break; 250 } 251 break; 252 253 /* 254 * These escapes are of the form \X'Y', where 'X' is the trigger 255 * and 'Y' is any string. These have opaque sub-strings. 256 * The \B and \w escapes are handled in roff.c, roff_res(). 257 */ 258 case 'A': 259 case 'b': 260 case 'D': 261 case 'R': 262 case 'X': 263 case 'Z': 264 gly = ESCAPE_IGNORE; 265 /* FALLTHROUGH */ 266 case 'o': 267 if (**start == '\0') 268 return ESCAPE_ERROR; 269 if (gly == ESCAPE_ERROR) 270 gly = ESCAPE_OVERSTRIKE; 271 term = **start; 272 *start = ++*end; 273 break; 274 275 /* 276 * These escapes are of the form \X'N', where 'X' is the trigger 277 * and 'N' resolves to a numerical expression. 278 */ 279 case 'h': 280 case 'H': 281 case 'L': 282 case 'l': 283 case 'S': 284 case 'v': 285 case 'x': 286 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 287 if ('\0' != **start) 288 ++*end; 289 return ESCAPE_ERROR; 290 } 291 switch ((*start)[-1]) { 292 case 'h': 293 gly = ESCAPE_HORIZ; 294 break; 295 case 'l': 296 gly = ESCAPE_HLINE; 297 break; 298 default: 299 gly = ESCAPE_IGNORE; 300 break; 301 } 302 term = **start; 303 *start = ++*end; 304 break; 305 306 /* 307 * Special handling for the numbered character escape. 308 * XXX Do any other escapes need similar handling? 309 */ 310 case 'N': 311 if ('\0' == **start) 312 return ESCAPE_ERROR; 313 (*end)++; 314 if (isdigit((unsigned char)**start)) { 315 *sz = 1; 316 return ESCAPE_IGNORE; 317 } 318 (*start)++; 319 while (isdigit((unsigned char)**end)) 320 (*end)++; 321 *sz = *end - *start; 322 if ('\0' != **end) 323 (*end)++; 324 return ESCAPE_NUMBERED; 325 326 /* 327 * Sizes get a special category of their own. 328 */ 329 case 's': 330 gly = ESCAPE_IGNORE; 331 332 /* See +/- counts as a sign. */ 333 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 334 *start = ++*end; 335 336 switch (**end) { 337 case '(': 338 *start = ++*end; 339 *sz = 2; 340 break; 341 case '[': 342 *start = ++*end; 343 term = ']'; 344 break; 345 case '\'': 346 *start = ++*end; 347 term = '\''; 348 break; 349 case '3': 350 case '2': 351 case '1': 352 *sz = (*end)[-1] == 's' && 353 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 354 break; 355 default: 356 *sz = 1; 357 break; 358 } 359 360 break; 361 362 /* 363 * Several special characters can be encoded as 364 * one-byte escape sequences without using \[]. 365 */ 366 case ' ': 367 case '\'': 368 case '-': 369 case '.': 370 case '0': 371 case ':': 372 case '_': 373 case '`': 374 case 'e': 375 case '~': 376 gly = ESCAPE_SPECIAL; 377 /* FALLTHROUGH */ 378 default: 379 if (gly == ESCAPE_ERROR) 380 gly = ESCAPE_UNDEF; 381 *start = --*end; 382 *sz = 1; 383 break; 384 } 385 386 /* 387 * Read up to the terminating character, 388 * paying attention to nested escapes. 389 */ 390 391 if ('\0' != term) { 392 while (**end != term) { 393 switch (**end) { 394 case '\0': 395 return ESCAPE_ERROR; 396 case '\\': 397 (*end)++; 398 if (ESCAPE_ERROR == 399 mandoc_escape(end, NULL, NULL)) 400 return ESCAPE_ERROR; 401 break; 402 default: 403 (*end)++; 404 break; 405 } 406 } 407 *sz = (*end)++ - *start; 408 409 /* 410 * The file chars.c only provides one common list 411 * of character names, but \[-] == \- is the only 412 * one of the characters with one-byte names that 413 * allows enclosing the name in brackets. 414 */ 415 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 416 return ESCAPE_ERROR; 417 } else { 418 assert(*sz > 0); 419 if ((size_t)*sz > strlen(*start)) 420 return ESCAPE_ERROR; 421 *end += *sz; 422 } 423 424 /* Run post-processors. */ 425 426 switch (gly) { 427 case ESCAPE_FONT: 428 gly = mandoc_font(*start, *sz); 429 break; 430 case ESCAPE_SPECIAL: 431 if (**start == 'c') { 432 if (*sz < 6 || *sz > 7 || 433 strncmp(*start, "char", 4) != 0 || 434 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 435 break; 436 c = 0; 437 for (i = 4; i < *sz; i++) 438 c = 10 * c + ((*start)[i] - '0'); 439 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 440 break; 441 *start += 4; 442 *sz -= 4; 443 gly = ESCAPE_NUMBERED; 444 break; 445 } 446 447 /* 448 * Unicode escapes are defined in groff as \[u0000] 449 * to \[u10FFFF], where the contained value must be 450 * a valid Unicode codepoint. Here, however, only 451 * check the length and range. 452 */ 453 if (**start != 'u' || *sz < 5 || *sz > 7) 454 break; 455 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 456 break; 457 if (*sz == 6 && (*start)[1] == '0') 458 break; 459 if (*sz == 5 && (*start)[1] == 'D' && 460 strchr("89ABCDEF", (*start)[2]) != NULL) 461 break; 462 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 463 + 1 == *sz) 464 gly = ESCAPE_UNICODE; 465 break; 466 case ESCAPE_DEVICE: 467 assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T'); 468 break; 469 default: 470 break; 471 } 472 473 return gly; 474 } 475 476 static int 477 a2time(time_t *t, const char *fmt, const char *p) 478 { 479 struct tm tm; 480 char *pp; 481 482 memset(&tm, 0, sizeof(struct tm)); 483 484 pp = NULL; 485 #if HAVE_STRPTIME 486 pp = strptime(p, fmt, &tm); 487 #endif 488 if (NULL != pp && '\0' == *pp) { 489 *t = mktime(&tm); 490 return 1; 491 } 492 493 return 0; 494 } 495 496 static char * 497 time2a(time_t t) 498 { 499 struct tm *tm; 500 char *buf, *p; 501 size_t ssz; 502 int isz; 503 504 buf = NULL; 505 tm = localtime(&t); 506 if (tm == NULL) 507 goto fail; 508 509 /* 510 * Reserve space: 511 * up to 9 characters for the month (September) + blank 512 * up to 2 characters for the day + comma + blank 513 * 4 characters for the year and a terminating '\0' 514 */ 515 516 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 517 518 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 519 goto fail; 520 p += (int)ssz; 521 522 /* 523 * The output format is just "%d" here, not "%2d" or "%02d". 524 * That's also the reason why we can't just format the 525 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 526 * Besides, the present approach is less prone to buffer 527 * overflows, in case anybody should ever introduce the bug 528 * of looking at LC_TIME. 529 */ 530 531 isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday); 532 if (isz < 0 || isz > 4) 533 goto fail; 534 p += isz; 535 536 if (strftime(p, 4 + 1, "%Y", tm) == 0) 537 goto fail; 538 return buf; 539 540 fail: 541 free(buf); 542 return mandoc_strdup(""); 543 } 544 545 char * 546 mandoc_normdate(struct roff_node *nch, struct roff_node *nbl) 547 { 548 char *cp; 549 time_t t; 550 551 /* No date specified. */ 552 553 if (nch == NULL) { 554 if (nbl == NULL) 555 mandoc_msg(MANDOCERR_DATE_MISSING, 0, 0, NULL); 556 else 557 mandoc_msg(MANDOCERR_DATE_MISSING, nbl->line, 558 nbl->pos, "%s", roff_name[nbl->tok]); 559 return mandoc_strdup(""); 560 } 561 if (*nch->string == '\0') { 562 mandoc_msg(MANDOCERR_DATE_MISSING, nch->line, 563 nch->pos, "%s", roff_name[nbl->tok]); 564 return mandoc_strdup(""); 565 } 566 if (strcmp(nch->string, "$" "Mdocdate$") == 0) 567 return time2a(time(NULL)); 568 569 /* Valid mdoc(7) date format. */ 570 571 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", nch->string) || 572 a2time(&t, "%b %d, %Y", nch->string)) { 573 cp = time2a(t); 574 if (t > time(NULL) + 86400) 575 mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, 576 nch->pos, "%s %s", roff_name[nbl->tok], cp); 577 else if (*nch->string != '$' && 578 strcmp(nch->string, cp) != 0) 579 mandoc_msg(MANDOCERR_DATE_NORM, nch->line, 580 nch->pos, "%s %s", roff_name[nbl->tok], cp); 581 return cp; 582 } 583 584 /* In man(7), do not warn about the legacy format. */ 585 586 if (a2time(&t, "%Y-%m-%d", nch->string) == 0) 587 mandoc_msg(MANDOCERR_DATE_BAD, nch->line, nch->pos, 588 "%s %s", roff_name[nbl->tok], nch->string); 589 else if (t > time(NULL) + 86400) 590 mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, nch->pos, 591 "%s %s", roff_name[nbl->tok], nch->string); 592 else if (nbl->tok == MDOC_Dd) 593 mandoc_msg(MANDOCERR_DATE_LEGACY, nch->line, nch->pos, 594 "Dd %s", nch->string); 595 596 /* Use any non-mdoc(7) date verbatim. */ 597 598 return mandoc_strdup(nch->string); 599 } 600 601 int 602 mandoc_eos(const char *p, size_t sz) 603 { 604 const char *q; 605 int enclosed, found; 606 607 if (0 == sz) 608 return 0; 609 610 /* 611 * End-of-sentence recognition must include situations where 612 * some symbols, such as `)', allow prior EOS punctuation to 613 * propagate outward. 614 */ 615 616 enclosed = found = 0; 617 for (q = p + (int)sz - 1; q >= p; q--) { 618 switch (*q) { 619 case '\"': 620 case '\'': 621 case ']': 622 case ')': 623 if (0 == found) 624 enclosed = 1; 625 break; 626 case '.': 627 case '!': 628 case '?': 629 found = 1; 630 break; 631 default: 632 return found && 633 (!enclosed || isalnum((unsigned char)*q)); 634 } 635 } 636 637 return found && !enclosed; 638 } 639 640 /* 641 * Convert a string to a long that may not be <0. 642 * If the string is invalid, or is less than 0, return -1. 643 */ 644 int 645 mandoc_strntoi(const char *p, size_t sz, int base) 646 { 647 char buf[32]; 648 char *ep; 649 long v; 650 651 if (sz > 31) 652 return -1; 653 654 memcpy(buf, p, sz); 655 buf[(int)sz] = '\0'; 656 657 errno = 0; 658 v = strtol(buf, &ep, base); 659 660 if (buf[0] == '\0' || *ep != '\0') 661 return -1; 662 663 if (v > INT_MAX) 664 v = INT_MAX; 665 if (v < INT_MIN) 666 v = INT_MIN; 667 668 return (int)v; 669 } 670