1 /* $Id: mandoc.c,v 1.114 2018/12/30 00:49:55 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <limits.h> 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <time.h> 30 31 #include "mandoc_aux.h" 32 #include "mandoc.h" 33 #include "roff.h" 34 #include "libmandoc.h" 35 #include "roff_int.h" 36 37 static int a2time(time_t *, const char *, const char *); 38 static char *time2a(time_t); 39 40 41 enum mandoc_esc 42 mandoc_font(const char *cp, int sz) 43 { 44 switch (sz) { 45 case 0: 46 return ESCAPE_FONTPREV; 47 case 1: 48 switch (cp[0]) { 49 case 'B': 50 case '3': 51 return ESCAPE_FONTBOLD; 52 case 'I': 53 case '2': 54 return ESCAPE_FONTITALIC; 55 case 'P': 56 return ESCAPE_FONTPREV; 57 case 'R': 58 case '1': 59 return ESCAPE_FONTROMAN; 60 case '4': 61 return ESCAPE_FONTBI; 62 default: 63 return ESCAPE_ERROR; 64 } 65 case 2: 66 switch (cp[0]) { 67 case 'B': 68 switch (cp[1]) { 69 case 'I': 70 return ESCAPE_FONTBI; 71 default: 72 return ESCAPE_ERROR; 73 } 74 case 'C': 75 switch (cp[1]) { 76 case 'B': 77 return ESCAPE_FONTBOLD; 78 case 'I': 79 return ESCAPE_FONTITALIC; 80 case 'R': 81 case 'W': 82 return ESCAPE_FONTCW; 83 default: 84 return ESCAPE_ERROR; 85 } 86 default: 87 return ESCAPE_ERROR; 88 } 89 default: 90 return ESCAPE_ERROR; 91 } 92 } 93 94 enum mandoc_esc 95 mandoc_escape(const char **end, const char **start, int *sz) 96 { 97 const char *local_start; 98 int local_sz, c, i; 99 char term; 100 enum mandoc_esc gly; 101 102 /* 103 * When the caller doesn't provide return storage, 104 * use local storage. 105 */ 106 107 if (NULL == start) 108 start = &local_start; 109 if (NULL == sz) 110 sz = &local_sz; 111 112 /* 113 * Treat "\E" just like "\"; 114 * it only makes a difference in copy mode. 115 */ 116 117 if (**end == 'E') 118 ++*end; 119 120 /* 121 * Beyond the backslash, at least one input character 122 * is part of the escape sequence. With one exception 123 * (see below), that character won't be returned. 124 */ 125 126 gly = ESCAPE_ERROR; 127 *start = ++*end; 128 *sz = 0; 129 term = '\0'; 130 131 switch ((*start)[-1]) { 132 /* 133 * First the glyphs. There are several different forms of 134 * these, but each eventually returns a substring of the glyph 135 * name. 136 */ 137 case '(': 138 gly = ESCAPE_SPECIAL; 139 *sz = 2; 140 break; 141 case '[': 142 if (**start == ' ') { 143 ++*end; 144 return ESCAPE_ERROR; 145 } 146 gly = ESCAPE_SPECIAL; 147 term = ']'; 148 break; 149 case 'C': 150 if ('\'' != **start) 151 return ESCAPE_ERROR; 152 *start = ++*end; 153 gly = ESCAPE_SPECIAL; 154 term = '\''; 155 break; 156 157 /* 158 * Escapes taking no arguments at all. 159 */ 160 case '!': 161 case '?': 162 return ESCAPE_UNSUPP; 163 case '%': 164 case '&': 165 case ')': 166 case ',': 167 case '/': 168 case '^': 169 case 'a': 170 case 'd': 171 case 'r': 172 case 't': 173 case 'u': 174 case '{': 175 case '|': 176 case '}': 177 return ESCAPE_IGNORE; 178 case 'c': 179 return ESCAPE_NOSPACE; 180 case 'p': 181 return ESCAPE_BREAK; 182 183 /* 184 * The \z escape is supposed to output the following 185 * character without advancing the cursor position. 186 * Since we are mostly dealing with terminal mode, 187 * let us just skip the next character. 188 */ 189 case 'z': 190 return ESCAPE_SKIPCHAR; 191 192 /* 193 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 194 * 'X' is the trigger. These have opaque sub-strings. 195 */ 196 case 'F': 197 case 'f': 198 case 'g': 199 case 'k': 200 case 'M': 201 case 'm': 202 case 'n': 203 case 'O': 204 case 'V': 205 case 'Y': 206 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE; 207 switch (**start) { 208 case '(': 209 if ((*start)[-1] == 'O') 210 gly = ESCAPE_ERROR; 211 *start = ++*end; 212 *sz = 2; 213 break; 214 case '[': 215 if ((*start)[-1] == 'O') 216 gly = (*start)[1] == '5' ? 217 ESCAPE_UNSUPP : ESCAPE_ERROR; 218 *start = ++*end; 219 term = ']'; 220 break; 221 default: 222 if ((*start)[-1] == 'O') { 223 switch (**start) { 224 case '0': 225 gly = ESCAPE_UNSUPP; 226 break; 227 case '1': 228 case '2': 229 case '3': 230 case '4': 231 break; 232 default: 233 gly = ESCAPE_ERROR; 234 break; 235 } 236 } 237 *sz = 1; 238 break; 239 } 240 break; 241 case '*': 242 if (strncmp(*start, "(.T", 3) != 0) 243 abort(); 244 gly = ESCAPE_DEVICE; 245 *start = ++*end; 246 *sz = 2; 247 break; 248 249 /* 250 * These escapes are of the form \X'Y', where 'X' is the trigger 251 * and 'Y' is any string. These have opaque sub-strings. 252 * The \B and \w escapes are handled in roff.c, roff_res(). 253 */ 254 case 'A': 255 case 'b': 256 case 'D': 257 case 'R': 258 case 'X': 259 case 'Z': 260 gly = ESCAPE_IGNORE; 261 /* FALLTHROUGH */ 262 case 'o': 263 if (**start == '\0') 264 return ESCAPE_ERROR; 265 if (gly == ESCAPE_ERROR) 266 gly = ESCAPE_OVERSTRIKE; 267 term = **start; 268 *start = ++*end; 269 break; 270 271 /* 272 * These escapes are of the form \X'N', where 'X' is the trigger 273 * and 'N' resolves to a numerical expression. 274 */ 275 case 'h': 276 case 'H': 277 case 'L': 278 case 'l': 279 case 'S': 280 case 'v': 281 case 'x': 282 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 283 if ('\0' != **start) 284 ++*end; 285 return ESCAPE_ERROR; 286 } 287 switch ((*start)[-1]) { 288 case 'h': 289 gly = ESCAPE_HORIZ; 290 break; 291 case 'l': 292 gly = ESCAPE_HLINE; 293 break; 294 default: 295 gly = ESCAPE_IGNORE; 296 break; 297 } 298 term = **start; 299 *start = ++*end; 300 break; 301 302 /* 303 * Special handling for the numbered character escape. 304 * XXX Do any other escapes need similar handling? 305 */ 306 case 'N': 307 if ('\0' == **start) 308 return ESCAPE_ERROR; 309 (*end)++; 310 if (isdigit((unsigned char)**start)) { 311 *sz = 1; 312 return ESCAPE_IGNORE; 313 } 314 (*start)++; 315 while (isdigit((unsigned char)**end)) 316 (*end)++; 317 *sz = *end - *start; 318 if ('\0' != **end) 319 (*end)++; 320 return ESCAPE_NUMBERED; 321 322 /* 323 * Sizes get a special category of their own. 324 */ 325 case 's': 326 gly = ESCAPE_IGNORE; 327 328 /* See +/- counts as a sign. */ 329 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 330 *start = ++*end; 331 332 switch (**end) { 333 case '(': 334 *start = ++*end; 335 *sz = 2; 336 break; 337 case '[': 338 *start = ++*end; 339 term = ']'; 340 break; 341 case '\'': 342 *start = ++*end; 343 term = '\''; 344 break; 345 case '3': 346 case '2': 347 case '1': 348 *sz = (*end)[-1] == 's' && 349 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 350 break; 351 default: 352 *sz = 1; 353 break; 354 } 355 356 break; 357 358 /* 359 * Several special characters can be encoded as 360 * one-byte escape sequences without using \[]. 361 */ 362 case ' ': 363 case '\'': 364 case '-': 365 case '.': 366 case '0': 367 case ':': 368 case '_': 369 case '`': 370 case 'e': 371 case '~': 372 gly = ESCAPE_SPECIAL; 373 /* FALLTHROUGH */ 374 default: 375 if (gly == ESCAPE_ERROR) 376 gly = ESCAPE_UNDEF; 377 *start = --*end; 378 *sz = 1; 379 break; 380 } 381 382 /* 383 * Read up to the terminating character, 384 * paying attention to nested escapes. 385 */ 386 387 if ('\0' != term) { 388 while (**end != term) { 389 switch (**end) { 390 case '\0': 391 return ESCAPE_ERROR; 392 case '\\': 393 (*end)++; 394 if (ESCAPE_ERROR == 395 mandoc_escape(end, NULL, NULL)) 396 return ESCAPE_ERROR; 397 break; 398 default: 399 (*end)++; 400 break; 401 } 402 } 403 *sz = (*end)++ - *start; 404 405 /* 406 * The file chars.c only provides one common list 407 * of character names, but \[-] == \- is the only 408 * one of the characters with one-byte names that 409 * allows enclosing the name in brackets. 410 */ 411 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 412 return ESCAPE_ERROR; 413 } else { 414 assert(*sz > 0); 415 if ((size_t)*sz > strlen(*start)) 416 return ESCAPE_ERROR; 417 *end += *sz; 418 } 419 420 /* Run post-processors. */ 421 422 switch (gly) { 423 case ESCAPE_FONT: 424 gly = mandoc_font(*start, *sz); 425 break; 426 case ESCAPE_SPECIAL: 427 if (**start == 'c') { 428 if (*sz < 6 || *sz > 7 || 429 strncmp(*start, "char", 4) != 0 || 430 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 431 break; 432 c = 0; 433 for (i = 4; i < *sz; i++) 434 c = 10 * c + ((*start)[i] - '0'); 435 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 436 break; 437 *start += 4; 438 *sz -= 4; 439 gly = ESCAPE_NUMBERED; 440 break; 441 } 442 443 /* 444 * Unicode escapes are defined in groff as \[u0000] 445 * to \[u10FFFF], where the contained value must be 446 * a valid Unicode codepoint. Here, however, only 447 * check the length and range. 448 */ 449 if (**start != 'u' || *sz < 5 || *sz > 7) 450 break; 451 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 452 break; 453 if (*sz == 6 && (*start)[1] == '0') 454 break; 455 if (*sz == 5 && (*start)[1] == 'D' && 456 strchr("89ABCDEF", (*start)[2]) != NULL) 457 break; 458 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 459 + 1 == *sz) 460 gly = ESCAPE_UNICODE; 461 break; 462 default: 463 break; 464 } 465 466 return gly; 467 } 468 469 static int 470 a2time(time_t *t, const char *fmt, const char *p) 471 { 472 struct tm tm; 473 char *pp; 474 475 memset(&tm, 0, sizeof(struct tm)); 476 477 pp = NULL; 478 #if HAVE_STRPTIME 479 pp = strptime(p, fmt, &tm); 480 #endif 481 if (NULL != pp && '\0' == *pp) { 482 *t = mktime(&tm); 483 return 1; 484 } 485 486 return 0; 487 } 488 489 static char * 490 time2a(time_t t) 491 { 492 struct tm *tm; 493 char *buf, *p; 494 size_t ssz; 495 int isz; 496 497 tm = localtime(&t); 498 if (tm == NULL) 499 return NULL; 500 501 /* 502 * Reserve space: 503 * up to 9 characters for the month (September) + blank 504 * up to 2 characters for the day + comma + blank 505 * 4 characters for the year and a terminating '\0' 506 */ 507 508 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 509 510 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 511 goto fail; 512 p += (int)ssz; 513 514 /* 515 * The output format is just "%d" here, not "%2d" or "%02d". 516 * That's also the reason why we can't just format the 517 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 518 * Besides, the present approach is less prone to buffer 519 * overflows, in case anybody should ever introduce the bug 520 * of looking at LC_TIME. 521 */ 522 523 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 524 goto fail; 525 p += isz; 526 527 if (strftime(p, 4 + 1, "%Y", tm) == 0) 528 goto fail; 529 return buf; 530 531 fail: 532 free(buf); 533 return NULL; 534 } 535 536 char * 537 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 538 { 539 char *cp; 540 time_t t; 541 542 /* No date specified: use today's date. */ 543 544 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 545 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL); 546 return time2a(time(NULL)); 547 } 548 549 /* Valid mdoc(7) date format. */ 550 551 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 552 a2time(&t, "%b %d, %Y", in)) { 553 cp = time2a(t); 554 if (t > time(NULL) + 86400) 555 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp); 556 else if (*in != '$' && strcmp(in, cp) != 0) 557 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp); 558 return cp; 559 } 560 561 /* In man(7), do not warn about the legacy format. */ 562 563 if (a2time(&t, "%Y-%m-%d", in) == 0) 564 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in); 565 else if (t > time(NULL) + 86400) 566 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in); 567 else if (man->meta.macroset == MACROSET_MDOC) 568 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in); 569 570 /* Use any non-mdoc(7) date verbatim. */ 571 572 return mandoc_strdup(in); 573 } 574 575 int 576 mandoc_eos(const char *p, size_t sz) 577 { 578 const char *q; 579 int enclosed, found; 580 581 if (0 == sz) 582 return 0; 583 584 /* 585 * End-of-sentence recognition must include situations where 586 * some symbols, such as `)', allow prior EOS punctuation to 587 * propagate outward. 588 */ 589 590 enclosed = found = 0; 591 for (q = p + (int)sz - 1; q >= p; q--) { 592 switch (*q) { 593 case '\"': 594 case '\'': 595 case ']': 596 case ')': 597 if (0 == found) 598 enclosed = 1; 599 break; 600 case '.': 601 case '!': 602 case '?': 603 found = 1; 604 break; 605 default: 606 return found && 607 (!enclosed || isalnum((unsigned char)*q)); 608 } 609 } 610 611 return found && !enclosed; 612 } 613 614 /* 615 * Convert a string to a long that may not be <0. 616 * If the string is invalid, or is less than 0, return -1. 617 */ 618 int 619 mandoc_strntoi(const char *p, size_t sz, int base) 620 { 621 char buf[32]; 622 char *ep; 623 long v; 624 625 if (sz > 31) 626 return -1; 627 628 memcpy(buf, p, sz); 629 buf[(int)sz] = '\0'; 630 631 errno = 0; 632 v = strtol(buf, &ep, base); 633 634 if (buf[0] == '\0' || *ep != '\0') 635 return -1; 636 637 if (v > INT_MAX) 638 v = INT_MAX; 639 if (v < INT_MIN) 640 v = INT_MIN; 641 642 return (int)v; 643 } 644