1 /* $Id: mandoc.c,v 1.103 2017/07/03 13:40:19 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include "config.h" 19 20 #include <sys/types.h> 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <limits.h> 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <time.h> 30 31 #include "mandoc_aux.h" 32 #include "mandoc.h" 33 #include "roff.h" 34 #include "libmandoc.h" 35 36 static int a2time(time_t *, const char *, const char *); 37 static char *time2a(time_t); 38 39 40 enum mandoc_esc 41 mandoc_escape(const char **end, const char **start, int *sz) 42 { 43 const char *local_start; 44 int local_sz; 45 char term; 46 enum mandoc_esc gly; 47 48 /* 49 * When the caller doesn't provide return storage, 50 * use local storage. 51 */ 52 53 if (NULL == start) 54 start = &local_start; 55 if (NULL == sz) 56 sz = &local_sz; 57 58 /* 59 * Beyond the backslash, at least one input character 60 * is part of the escape sequence. With one exception 61 * (see below), that character won't be returned. 62 */ 63 64 gly = ESCAPE_ERROR; 65 *start = ++*end; 66 *sz = 0; 67 term = '\0'; 68 69 switch ((*start)[-1]) { 70 /* 71 * First the glyphs. There are several different forms of 72 * these, but each eventually returns a substring of the glyph 73 * name. 74 */ 75 case '(': 76 gly = ESCAPE_SPECIAL; 77 *sz = 2; 78 break; 79 case '[': 80 gly = ESCAPE_SPECIAL; 81 term = ']'; 82 break; 83 case 'C': 84 if ('\'' != **start) 85 return ESCAPE_ERROR; 86 *start = ++*end; 87 gly = ESCAPE_SPECIAL; 88 term = '\''; 89 break; 90 91 /* 92 * Escapes taking no arguments at all. 93 */ 94 case 'd': 95 case 'u': 96 case ',': 97 case '/': 98 return ESCAPE_IGNORE; 99 case 'p': 100 return ESCAPE_BREAK; 101 102 /* 103 * The \z escape is supposed to output the following 104 * character without advancing the cursor position. 105 * Since we are mostly dealing with terminal mode, 106 * let us just skip the next character. 107 */ 108 case 'z': 109 return ESCAPE_SKIPCHAR; 110 111 /* 112 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 113 * 'X' is the trigger. These have opaque sub-strings. 114 */ 115 case 'F': 116 case 'g': 117 case 'k': 118 case 'M': 119 case 'm': 120 case 'n': 121 case 'V': 122 case 'Y': 123 gly = ESCAPE_IGNORE; 124 /* FALLTHROUGH */ 125 case 'f': 126 if (ESCAPE_ERROR == gly) 127 gly = ESCAPE_FONT; 128 switch (**start) { 129 case '(': 130 *start = ++*end; 131 *sz = 2; 132 break; 133 case '[': 134 *start = ++*end; 135 term = ']'; 136 break; 137 default: 138 *sz = 1; 139 break; 140 } 141 break; 142 143 /* 144 * These escapes are of the form \X'Y', where 'X' is the trigger 145 * and 'Y' is any string. These have opaque sub-strings. 146 * The \B and \w escapes are handled in roff.c, roff_res(). 147 */ 148 case 'A': 149 case 'b': 150 case 'D': 151 case 'R': 152 case 'X': 153 case 'Z': 154 gly = ESCAPE_IGNORE; 155 /* FALLTHROUGH */ 156 case 'o': 157 if (**start == '\0') 158 return ESCAPE_ERROR; 159 if (gly == ESCAPE_ERROR) 160 gly = ESCAPE_OVERSTRIKE; 161 term = **start; 162 *start = ++*end; 163 break; 164 165 /* 166 * These escapes are of the form \X'N', where 'X' is the trigger 167 * and 'N' resolves to a numerical expression. 168 */ 169 case 'h': 170 case 'H': 171 case 'L': 172 case 'l': 173 case 'S': 174 case 'v': 175 case 'x': 176 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 177 if ('\0' != **start) 178 ++*end; 179 return ESCAPE_ERROR; 180 } 181 switch ((*start)[-1]) { 182 case 'h': 183 gly = ESCAPE_HORIZ; 184 break; 185 case 'l': 186 gly = ESCAPE_HLINE; 187 break; 188 default: 189 gly = ESCAPE_IGNORE; 190 break; 191 } 192 term = **start; 193 *start = ++*end; 194 break; 195 196 /* 197 * Special handling for the numbered character escape. 198 * XXX Do any other escapes need similar handling? 199 */ 200 case 'N': 201 if ('\0' == **start) 202 return ESCAPE_ERROR; 203 (*end)++; 204 if (isdigit((unsigned char)**start)) { 205 *sz = 1; 206 return ESCAPE_IGNORE; 207 } 208 (*start)++; 209 while (isdigit((unsigned char)**end)) 210 (*end)++; 211 *sz = *end - *start; 212 if ('\0' != **end) 213 (*end)++; 214 return ESCAPE_NUMBERED; 215 216 /* 217 * Sizes get a special category of their own. 218 */ 219 case 's': 220 gly = ESCAPE_IGNORE; 221 222 /* See +/- counts as a sign. */ 223 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 224 *start = ++*end; 225 226 switch (**end) { 227 case '(': 228 *start = ++*end; 229 *sz = 2; 230 break; 231 case '[': 232 *start = ++*end; 233 term = ']'; 234 break; 235 case '\'': 236 *start = ++*end; 237 term = '\''; 238 break; 239 case '3': 240 case '2': 241 case '1': 242 *sz = (*end)[-1] == 's' && 243 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 244 break; 245 default: 246 *sz = 1; 247 break; 248 } 249 250 break; 251 252 /* 253 * Anything else is assumed to be a glyph. 254 * In this case, pass back the character after the backslash. 255 */ 256 default: 257 gly = ESCAPE_SPECIAL; 258 *start = --*end; 259 *sz = 1; 260 break; 261 } 262 263 assert(ESCAPE_ERROR != gly); 264 265 /* 266 * Read up to the terminating character, 267 * paying attention to nested escapes. 268 */ 269 270 if ('\0' != term) { 271 while (**end != term) { 272 switch (**end) { 273 case '\0': 274 return ESCAPE_ERROR; 275 case '\\': 276 (*end)++; 277 if (ESCAPE_ERROR == 278 mandoc_escape(end, NULL, NULL)) 279 return ESCAPE_ERROR; 280 break; 281 default: 282 (*end)++; 283 break; 284 } 285 } 286 *sz = (*end)++ - *start; 287 } else { 288 assert(*sz > 0); 289 if ((size_t)*sz > strlen(*start)) 290 return ESCAPE_ERROR; 291 *end += *sz; 292 } 293 294 /* Run post-processors. */ 295 296 switch (gly) { 297 case ESCAPE_FONT: 298 if (2 == *sz) { 299 if ('C' == **start) { 300 /* 301 * Treat constant-width font modes 302 * just like regular font modes. 303 */ 304 (*start)++; 305 (*sz)--; 306 } else { 307 if ('B' == (*start)[0] && 'I' == (*start)[1]) 308 gly = ESCAPE_FONTBI; 309 break; 310 } 311 } else if (1 != *sz) 312 break; 313 314 switch (**start) { 315 case '3': 316 case 'B': 317 gly = ESCAPE_FONTBOLD; 318 break; 319 case '2': 320 case 'I': 321 gly = ESCAPE_FONTITALIC; 322 break; 323 case 'P': 324 gly = ESCAPE_FONTPREV; 325 break; 326 case '1': 327 case 'R': 328 gly = ESCAPE_FONTROMAN; 329 break; 330 } 331 break; 332 case ESCAPE_SPECIAL: 333 if (1 == *sz && 'c' == **start) 334 gly = ESCAPE_NOSPACE; 335 /* 336 * Unicode escapes are defined in groff as \[u0000] 337 * to \[u10FFFF], where the contained value must be 338 * a valid Unicode codepoint. Here, however, only 339 * check the length and range. 340 */ 341 if (**start != 'u' || *sz < 5 || *sz > 7) 342 break; 343 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 344 break; 345 if (*sz == 6 && (*start)[1] == '0') 346 break; 347 if (*sz == 5 && (*start)[1] == 'D' && 348 strchr("89ABCDEF", (*start)[2]) != NULL) 349 break; 350 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 351 + 1 == *sz) 352 gly = ESCAPE_UNICODE; 353 break; 354 default: 355 break; 356 } 357 358 return gly; 359 } 360 361 /* 362 * Parse a quoted or unquoted roff-style request or macro argument. 363 * Return a pointer to the parsed argument, which is either the original 364 * pointer or advanced by one byte in case the argument is quoted. 365 * NUL-terminate the argument in place. 366 * Collapse pairs of quotes inside quoted arguments. 367 * Advance the argument pointer to the next argument, 368 * or to the NUL byte terminating the argument line. 369 */ 370 char * 371 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 372 { 373 char *start, *cp; 374 int quoted, pairs, white; 375 376 /* Quoting can only start with a new word. */ 377 start = *cpp; 378 quoted = 0; 379 if ('"' == *start) { 380 quoted = 1; 381 start++; 382 } 383 384 pairs = 0; 385 white = 0; 386 for (cp = start; '\0' != *cp; cp++) { 387 388 /* 389 * Move the following text left 390 * after quoted quotes and after "\\" and "\t". 391 */ 392 if (pairs) 393 cp[-pairs] = cp[0]; 394 395 if ('\\' == cp[0]) { 396 /* 397 * In copy mode, translate double to single 398 * backslashes and backslash-t to literal tabs. 399 */ 400 switch (cp[1]) { 401 case 't': 402 cp[0] = '\t'; 403 /* FALLTHROUGH */ 404 case '\\': 405 pairs++; 406 cp++; 407 break; 408 case ' ': 409 /* Skip escaped blanks. */ 410 if (0 == quoted) 411 cp++; 412 break; 413 default: 414 break; 415 } 416 } else if (0 == quoted) { 417 if (' ' == cp[0]) { 418 /* Unescaped blanks end unquoted args. */ 419 white = 1; 420 break; 421 } 422 } else if ('"' == cp[0]) { 423 if ('"' == cp[1]) { 424 /* Quoted quotes collapse. */ 425 pairs++; 426 cp++; 427 } else { 428 /* Unquoted quotes end quoted args. */ 429 quoted = 2; 430 break; 431 } 432 } 433 } 434 435 /* Quoted argument without a closing quote. */ 436 if (1 == quoted) 437 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 438 439 /* NUL-terminate this argument and move to the next one. */ 440 if (pairs) 441 cp[-pairs] = '\0'; 442 if ('\0' != *cp) { 443 *cp++ = '\0'; 444 while (' ' == *cp) 445 cp++; 446 } 447 *pos += (int)(cp - start) + (quoted ? 1 : 0); 448 *cpp = cp; 449 450 if ('\0' == *cp && (white || ' ' == cp[-1])) 451 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 452 453 return start; 454 } 455 456 static int 457 a2time(time_t *t, const char *fmt, const char *p) 458 { 459 struct tm tm; 460 char *pp; 461 462 memset(&tm, 0, sizeof(struct tm)); 463 464 pp = NULL; 465 #if HAVE_STRPTIME 466 pp = strptime(p, fmt, &tm); 467 #endif 468 if (NULL != pp && '\0' == *pp) { 469 *t = mktime(&tm); 470 return 1; 471 } 472 473 return 0; 474 } 475 476 static char * 477 time2a(time_t t) 478 { 479 struct tm *tm; 480 char *buf, *p; 481 size_t ssz; 482 int isz; 483 484 tm = localtime(&t); 485 if (tm == NULL) 486 return NULL; 487 488 /* 489 * Reserve space: 490 * up to 9 characters for the month (September) + blank 491 * up to 2 characters for the day + comma + blank 492 * 4 characters for the year and a terminating '\0' 493 */ 494 495 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 496 497 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 498 goto fail; 499 p += (int)ssz; 500 501 /* 502 * The output format is just "%d" here, not "%2d" or "%02d". 503 * That's also the reason why we can't just format the 504 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 505 * Besides, the present approach is less prone to buffer 506 * overflows, in case anybody should ever introduce the bug 507 * of looking at LC_TIME. 508 */ 509 510 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 511 goto fail; 512 p += isz; 513 514 if (strftime(p, 4 + 1, "%Y", tm) == 0) 515 goto fail; 516 return buf; 517 518 fail: 519 free(buf); 520 return NULL; 521 } 522 523 char * 524 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 525 { 526 char *cp; 527 time_t t; 528 529 /* No date specified: use today's date. */ 530 531 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 532 mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL); 533 return time2a(time(NULL)); 534 } 535 536 /* Valid mdoc(7) date format. */ 537 538 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 539 a2time(&t, "%b %d, %Y", in)) { 540 cp = time2a(t); 541 if (t > time(NULL) + 86400) 542 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, 543 ln, pos, cp); 544 return cp; 545 } 546 547 /* In man(7), do not warn about the legacy format. */ 548 549 if (a2time(&t, "%Y-%m-%d", in) == 0) 550 mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in); 551 else if (t > time(NULL) + 86400) 552 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in); 553 else if (man->macroset == MACROSET_MDOC) 554 mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse, 555 ln, pos, "Dd %s", in); 556 557 /* Use any non-mdoc(7) date verbatim. */ 558 559 return mandoc_strdup(in); 560 } 561 562 int 563 mandoc_eos(const char *p, size_t sz) 564 { 565 const char *q; 566 int enclosed, found; 567 568 if (0 == sz) 569 return 0; 570 571 /* 572 * End-of-sentence recognition must include situations where 573 * some symbols, such as `)', allow prior EOS punctuation to 574 * propagate outward. 575 */ 576 577 enclosed = found = 0; 578 for (q = p + (int)sz - 1; q >= p; q--) { 579 switch (*q) { 580 case '\"': 581 case '\'': 582 case ']': 583 case ')': 584 if (0 == found) 585 enclosed = 1; 586 break; 587 case '.': 588 case '!': 589 case '?': 590 found = 1; 591 break; 592 default: 593 return found && 594 (!enclosed || isalnum((unsigned char)*q)); 595 } 596 } 597 598 return found && !enclosed; 599 } 600 601 /* 602 * Convert a string to a long that may not be <0. 603 * If the string is invalid, or is less than 0, return -1. 604 */ 605 int 606 mandoc_strntoi(const char *p, size_t sz, int base) 607 { 608 char buf[32]; 609 char *ep; 610 long v; 611 612 if (sz > 31) 613 return -1; 614 615 memcpy(buf, p, sz); 616 buf[(int)sz] = '\0'; 617 618 errno = 0; 619 v = strtol(buf, &ep, base); 620 621 if (buf[0] == '\0' || *ep != '\0') 622 return -1; 623 624 if (v > INT_MAX) 625 v = INT_MAX; 626 if (v < INT_MIN) 627 v = INT_MIN; 628 629 return (int)v; 630 } 631