1 /* $Id: mandoc.c,v 1.116 2019/06/27 15:07:30 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <limits.h> 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <time.h> 30 31 #include "mandoc_aux.h" 32 #include "mandoc.h" 33 #include "roff.h" 34 #include "libmandoc.h" 35 #include "roff_int.h" 36 37 static int a2time(time_t *, const char *, const char *); 38 static char *time2a(time_t); 39 40 41 enum mandoc_esc 42 mandoc_font(const char *cp, int sz) 43 { 44 switch (sz) { 45 case 0: 46 return ESCAPE_FONTPREV; 47 case 1: 48 switch (cp[0]) { 49 case 'B': 50 case '3': 51 return ESCAPE_FONTBOLD; 52 case 'I': 53 case '2': 54 return ESCAPE_FONTITALIC; 55 case 'P': 56 return ESCAPE_FONTPREV; 57 case 'R': 58 case '1': 59 return ESCAPE_FONTROMAN; 60 case '4': 61 return ESCAPE_FONTBI; 62 default: 63 return ESCAPE_ERROR; 64 } 65 case 2: 66 switch (cp[0]) { 67 case 'B': 68 switch (cp[1]) { 69 case 'I': 70 return ESCAPE_FONTBI; 71 default: 72 return ESCAPE_ERROR; 73 } 74 case 'C': 75 switch (cp[1]) { 76 case 'B': 77 return ESCAPE_FONTBOLD; 78 case 'I': 79 return ESCAPE_FONTITALIC; 80 case 'R': 81 case 'W': 82 return ESCAPE_FONTCW; 83 default: 84 return ESCAPE_ERROR; 85 } 86 default: 87 return ESCAPE_ERROR; 88 } 89 default: 90 return ESCAPE_ERROR; 91 } 92 } 93 94 enum mandoc_esc 95 mandoc_escape(const char **end, const char **start, int *sz) 96 { 97 const char *local_start; 98 int local_sz, c, i; 99 char term; 100 enum mandoc_esc gly; 101 102 /* 103 * When the caller doesn't provide return storage, 104 * use local storage. 105 */ 106 107 if (NULL == start) 108 start = &local_start; 109 if (NULL == sz) 110 sz = &local_sz; 111 112 /* 113 * Treat "\E" just like "\"; 114 * it only makes a difference in copy mode. 115 */ 116 117 if (**end == 'E') 118 ++*end; 119 120 /* 121 * Beyond the backslash, at least one input character 122 * is part of the escape sequence. With one exception 123 * (see below), that character won't be returned. 124 */ 125 126 gly = ESCAPE_ERROR; 127 *start = ++*end; 128 *sz = 0; 129 term = '\0'; 130 131 switch ((*start)[-1]) { 132 /* 133 * First the glyphs. There are several different forms of 134 * these, but each eventually returns a substring of the glyph 135 * name. 136 */ 137 case '(': 138 gly = ESCAPE_SPECIAL; 139 *sz = 2; 140 break; 141 case '[': 142 if (**start == ' ') { 143 ++*end; 144 return ESCAPE_ERROR; 145 } 146 gly = ESCAPE_SPECIAL; 147 term = ']'; 148 break; 149 case 'C': 150 if ('\'' != **start) 151 return ESCAPE_ERROR; 152 *start = ++*end; 153 gly = ESCAPE_SPECIAL; 154 term = '\''; 155 break; 156 157 /* 158 * Escapes taking no arguments at all. 159 */ 160 case '!': 161 case '?': 162 return ESCAPE_UNSUPP; 163 case '%': 164 case '&': 165 case ')': 166 case ',': 167 case '/': 168 case '^': 169 case 'a': 170 case 'd': 171 case 'r': 172 case 't': 173 case 'u': 174 case '{': 175 case '|': 176 case '}': 177 return ESCAPE_IGNORE; 178 case 'c': 179 return ESCAPE_NOSPACE; 180 case 'p': 181 return ESCAPE_BREAK; 182 183 /* 184 * The \z escape is supposed to output the following 185 * character without advancing the cursor position. 186 * Since we are mostly dealing with terminal mode, 187 * let us just skip the next character. 188 */ 189 case 'z': 190 return ESCAPE_SKIPCHAR; 191 192 /* 193 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 194 * 'X' is the trigger. These have opaque sub-strings. 195 */ 196 case 'F': 197 case 'f': 198 case 'g': 199 case 'k': 200 case 'M': 201 case 'm': 202 case 'n': 203 case 'O': 204 case 'V': 205 case 'Y': 206 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE; 207 switch (**start) { 208 case '(': 209 if ((*start)[-1] == 'O') 210 gly = ESCAPE_ERROR; 211 *start = ++*end; 212 *sz = 2; 213 break; 214 case '[': 215 if ((*start)[-1] == 'O') 216 gly = (*start)[1] == '5' ? 217 ESCAPE_UNSUPP : ESCAPE_ERROR; 218 *start = ++*end; 219 term = ']'; 220 break; 221 default: 222 if ((*start)[-1] == 'O') { 223 switch (**start) { 224 case '0': 225 gly = ESCAPE_UNSUPP; 226 break; 227 case '1': 228 case '2': 229 case '3': 230 case '4': 231 break; 232 default: 233 gly = ESCAPE_ERROR; 234 break; 235 } 236 } 237 *sz = 1; 238 break; 239 } 240 break; 241 case '*': 242 if (strncmp(*start, "(.T", 3) != 0) 243 abort(); 244 gly = ESCAPE_DEVICE; 245 *start = ++*end; 246 *sz = 2; 247 break; 248 249 /* 250 * These escapes are of the form \X'Y', where 'X' is the trigger 251 * and 'Y' is any string. These have opaque sub-strings. 252 * The \B and \w escapes are handled in roff.c, roff_res(). 253 */ 254 case 'A': 255 case 'b': 256 case 'D': 257 case 'R': 258 case 'X': 259 case 'Z': 260 gly = ESCAPE_IGNORE; 261 /* FALLTHROUGH */ 262 case 'o': 263 if (**start == '\0') 264 return ESCAPE_ERROR; 265 if (gly == ESCAPE_ERROR) 266 gly = ESCAPE_OVERSTRIKE; 267 term = **start; 268 *start = ++*end; 269 break; 270 271 /* 272 * These escapes are of the form \X'N', where 'X' is the trigger 273 * and 'N' resolves to a numerical expression. 274 */ 275 case 'h': 276 case 'H': 277 case 'L': 278 case 'l': 279 case 'S': 280 case 'v': 281 case 'x': 282 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 283 if ('\0' != **start) 284 ++*end; 285 return ESCAPE_ERROR; 286 } 287 switch ((*start)[-1]) { 288 case 'h': 289 gly = ESCAPE_HORIZ; 290 break; 291 case 'l': 292 gly = ESCAPE_HLINE; 293 break; 294 default: 295 gly = ESCAPE_IGNORE; 296 break; 297 } 298 term = **start; 299 *start = ++*end; 300 break; 301 302 /* 303 * Special handling for the numbered character escape. 304 * XXX Do any other escapes need similar handling? 305 */ 306 case 'N': 307 if ('\0' == **start) 308 return ESCAPE_ERROR; 309 (*end)++; 310 if (isdigit((unsigned char)**start)) { 311 *sz = 1; 312 return ESCAPE_IGNORE; 313 } 314 (*start)++; 315 while (isdigit((unsigned char)**end)) 316 (*end)++; 317 *sz = *end - *start; 318 if ('\0' != **end) 319 (*end)++; 320 return ESCAPE_NUMBERED; 321 322 /* 323 * Sizes get a special category of their own. 324 */ 325 case 's': 326 gly = ESCAPE_IGNORE; 327 328 /* See +/- counts as a sign. */ 329 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 330 *start = ++*end; 331 332 switch (**end) { 333 case '(': 334 *start = ++*end; 335 *sz = 2; 336 break; 337 case '[': 338 *start = ++*end; 339 term = ']'; 340 break; 341 case '\'': 342 *start = ++*end; 343 term = '\''; 344 break; 345 case '3': 346 case '2': 347 case '1': 348 *sz = (*end)[-1] == 's' && 349 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 350 break; 351 default: 352 *sz = 1; 353 break; 354 } 355 356 break; 357 358 /* 359 * Several special characters can be encoded as 360 * one-byte escape sequences without using \[]. 361 */ 362 case ' ': 363 case '\'': 364 case '-': 365 case '.': 366 case '0': 367 case ':': 368 case '_': 369 case '`': 370 case 'e': 371 case '~': 372 gly = ESCAPE_SPECIAL; 373 /* FALLTHROUGH */ 374 default: 375 if (gly == ESCAPE_ERROR) 376 gly = ESCAPE_UNDEF; 377 *start = --*end; 378 *sz = 1; 379 break; 380 } 381 382 /* 383 * Read up to the terminating character, 384 * paying attention to nested escapes. 385 */ 386 387 if ('\0' != term) { 388 while (**end != term) { 389 switch (**end) { 390 case '\0': 391 return ESCAPE_ERROR; 392 case '\\': 393 (*end)++; 394 if (ESCAPE_ERROR == 395 mandoc_escape(end, NULL, NULL)) 396 return ESCAPE_ERROR; 397 break; 398 default: 399 (*end)++; 400 break; 401 } 402 } 403 *sz = (*end)++ - *start; 404 405 /* 406 * The file chars.c only provides one common list 407 * of character names, but \[-] == \- is the only 408 * one of the characters with one-byte names that 409 * allows enclosing the name in brackets. 410 */ 411 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 412 return ESCAPE_ERROR; 413 } else { 414 assert(*sz > 0); 415 if ((size_t)*sz > strlen(*start)) 416 return ESCAPE_ERROR; 417 *end += *sz; 418 } 419 420 /* Run post-processors. */ 421 422 switch (gly) { 423 case ESCAPE_FONT: 424 gly = mandoc_font(*start, *sz); 425 break; 426 case ESCAPE_SPECIAL: 427 if (**start == 'c') { 428 if (*sz < 6 || *sz > 7 || 429 strncmp(*start, "char", 4) != 0 || 430 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 431 break; 432 c = 0; 433 for (i = 4; i < *sz; i++) 434 c = 10 * c + ((*start)[i] - '0'); 435 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 436 break; 437 *start += 4; 438 *sz -= 4; 439 gly = ESCAPE_NUMBERED; 440 break; 441 } 442 443 /* 444 * Unicode escapes are defined in groff as \[u0000] 445 * to \[u10FFFF], where the contained value must be 446 * a valid Unicode codepoint. Here, however, only 447 * check the length and range. 448 */ 449 if (**start != 'u' || *sz < 5 || *sz > 7) 450 break; 451 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 452 break; 453 if (*sz == 6 && (*start)[1] == '0') 454 break; 455 if (*sz == 5 && (*start)[1] == 'D' && 456 strchr("89ABCDEF", (*start)[2]) != NULL) 457 break; 458 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 459 + 1 == *sz) 460 gly = ESCAPE_UNICODE; 461 break; 462 default: 463 break; 464 } 465 466 return gly; 467 } 468 469 static int 470 a2time(time_t *t, const char *fmt, const char *p) 471 { 472 struct tm tm; 473 char *pp; 474 475 memset(&tm, 0, sizeof(struct tm)); 476 477 pp = NULL; 478 #if HAVE_STRPTIME 479 pp = strptime(p, fmt, &tm); 480 #endif 481 if (NULL != pp && '\0' == *pp) { 482 *t = mktime(&tm); 483 return 1; 484 } 485 486 return 0; 487 } 488 489 static char * 490 time2a(time_t t) 491 { 492 struct tm *tm; 493 char *buf, *p; 494 size_t ssz; 495 int isz; 496 497 buf = NULL; 498 tm = localtime(&t); 499 if (tm == NULL) 500 goto fail; 501 502 /* 503 * Reserve space: 504 * up to 9 characters for the month (September) + blank 505 * up to 2 characters for the day + comma + blank 506 * 4 characters for the year and a terminating '\0' 507 */ 508 509 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 510 511 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 512 goto fail; 513 p += (int)ssz; 514 515 /* 516 * The output format is just "%d" here, not "%2d" or "%02d". 517 * That's also the reason why we can't just format the 518 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 519 * Besides, the present approach is less prone to buffer 520 * overflows, in case anybody should ever introduce the bug 521 * of looking at LC_TIME. 522 */ 523 524 isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday); 525 if (isz < 0 || isz > 4) 526 goto fail; 527 p += isz; 528 529 if (strftime(p, 4 + 1, "%Y", tm) == 0) 530 goto fail; 531 return buf; 532 533 fail: 534 free(buf); 535 return mandoc_strdup(""); 536 } 537 538 char * 539 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 540 { 541 char *cp; 542 time_t t; 543 544 if (man->quick) 545 return mandoc_strdup(in == NULL ? "" : in); 546 547 /* No date specified: use today's date. */ 548 549 if (in == NULL || *in == '\0') 550 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL); 551 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) 552 return time2a(time(NULL)); 553 554 /* Valid mdoc(7) date format. */ 555 556 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 557 a2time(&t, "%b %d, %Y", in)) { 558 cp = time2a(t); 559 if (t > time(NULL) + 86400) 560 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp); 561 else if (*in != '$' && strcmp(in, cp) != 0) 562 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp); 563 return cp; 564 } 565 566 /* In man(7), do not warn about the legacy format. */ 567 568 if (a2time(&t, "%Y-%m-%d", in) == 0) 569 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in); 570 else if (t > time(NULL) + 86400) 571 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in); 572 else if (man->meta.macroset == MACROSET_MDOC) 573 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in); 574 575 /* Use any non-mdoc(7) date verbatim. */ 576 577 return mandoc_strdup(in); 578 } 579 580 int 581 mandoc_eos(const char *p, size_t sz) 582 { 583 const char *q; 584 int enclosed, found; 585 586 if (0 == sz) 587 return 0; 588 589 /* 590 * End-of-sentence recognition must include situations where 591 * some symbols, such as `)', allow prior EOS punctuation to 592 * propagate outward. 593 */ 594 595 enclosed = found = 0; 596 for (q = p + (int)sz - 1; q >= p; q--) { 597 switch (*q) { 598 case '\"': 599 case '\'': 600 case ']': 601 case ')': 602 if (0 == found) 603 enclosed = 1; 604 break; 605 case '.': 606 case '!': 607 case '?': 608 found = 1; 609 break; 610 default: 611 return found && 612 (!enclosed || isalnum((unsigned char)*q)); 613 } 614 } 615 616 return found && !enclosed; 617 } 618 619 /* 620 * Convert a string to a long that may not be <0. 621 * If the string is invalid, or is less than 0, return -1. 622 */ 623 int 624 mandoc_strntoi(const char *p, size_t sz, int base) 625 { 626 char buf[32]; 627 char *ep; 628 long v; 629 630 if (sz > 31) 631 return -1; 632 633 memcpy(buf, p, sz); 634 buf[(int)sz] = '\0'; 635 636 errno = 0; 637 v = strtol(buf, &ep, base); 638 639 if (buf[0] == '\0' || *ep != '\0') 640 return -1; 641 642 if (v > INT_MAX) 643 v = INT_MAX; 644 if (v < INT_MIN) 645 v = INT_MIN; 646 647 return (int)v; 648 } 649