1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1985 Sun Microsystems, Inc. 5 * Copyright (c) 1980, 1993 6 * The Regents of the University of California. All rights reserved. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #if 0 39 #ifndef lint 40 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 41 #endif /* not lint */ 42 #endif 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 /* 47 * Here we have the token scanner for indent. It scans off one token and puts 48 * it in the global variable "token". It returns a code, indicating the type 49 * of token scanned. 50 */ 51 52 #include <err.h> 53 #include <stdio.h> 54 #include <ctype.h> 55 #include <stdlib.h> 56 #include <string.h> 57 #include "indent_globs.h" 58 #include "indent_codes.h" 59 #include "indent.h" 60 61 #define alphanum 1 62 #ifdef undef 63 #define opchar 3 64 #endif 65 66 struct templ { 67 const char *rwd; 68 int rwcode; 69 }; 70 71 /* 72 * This table has to be sorted alphabetically, because it'll be used in binary 73 * search. For the same reason, string must be the first thing in struct templ. 74 */ 75 struct templ specials[] = 76 { 77 {"_Bool", 4}, 78 {"_Complex", 4}, 79 {"_Imaginary", 4}, 80 {"auto", 10}, 81 {"bool", 4}, 82 {"break", 9}, 83 {"case", 8}, 84 {"char", 4}, 85 {"complex", 4}, 86 {"const", 4}, 87 {"continue", 12}, 88 {"default", 8}, 89 {"do", 6}, 90 {"double", 4}, 91 {"else", 6}, 92 {"enum", 3}, 93 {"extern", 10}, 94 {"float", 4}, 95 {"for", 5}, 96 {"global", 4}, 97 {"goto", 9}, 98 {"if", 5}, 99 {"imaginary", 4}, 100 {"inline", 12}, 101 {"int", 4}, 102 {"long", 4}, 103 {"offsetof", 1}, 104 {"register", 10}, 105 {"restrict", 12}, 106 {"return", 9}, 107 {"short", 4}, 108 {"signed", 4}, 109 {"sizeof", 2}, 110 {"static", 10}, 111 {"struct", 3}, 112 {"switch", 7}, 113 {"typedef", 11}, 114 {"union", 3}, 115 {"unsigned", 4}, 116 {"void", 4}, 117 {"volatile", 4}, 118 {"while", 5} 119 }; 120 121 const char **typenames; 122 int typename_count; 123 int typename_top = -1; 124 125 char chartype[128] = 126 { /* this is used to facilitate the decision of 127 * what type (alphanumeric, operator) each 128 * character is */ 129 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 3, 0, 0, 1, 3, 3, 0, 134 0, 0, 3, 3, 0, 3, 0, 3, 135 1, 1, 1, 1, 1, 1, 1, 1, 136 1, 1, 0, 0, 3, 3, 3, 3, 137 0, 1, 1, 1, 1, 1, 1, 1, 138 1, 1, 1, 1, 1, 1, 1, 1, 139 1, 1, 1, 1, 1, 1, 1, 1, 140 1, 1, 1, 0, 0, 0, 3, 1, 141 0, 1, 1, 1, 1, 1, 1, 1, 142 1, 1, 1, 1, 1, 1, 1, 1, 143 1, 1, 1, 1, 1, 1, 1, 1, 144 1, 1, 1, 0, 3, 0, 3, 0 145 }; 146 147 static int 148 strcmp_type(const void *e1, const void *e2) 149 { 150 return (strcmp(e1, *(const char * const *)e2)); 151 } 152 153 int 154 lexi(struct parser_state *state) 155 { 156 int unary_delim; /* this is set to 1 if the current token 157 * forces a following operator to be unary */ 158 int code; /* internal code to be returned */ 159 char qchar; /* the delimiter character for a string */ 160 161 e_token = s_token; /* point to start of place to save token */ 162 unary_delim = false; 163 state->col_1 = state->last_nl; /* tell world that this token started 164 * in column 1 iff the last thing 165 * scanned was a newline */ 166 state->last_nl = false; 167 168 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 169 state->col_1 = false; /* leading blanks imply token is not in column 170 * 1 */ 171 if (++buf_ptr >= buf_end) 172 fill_buffer(); 173 } 174 175 /* Scan an alphanumeric token */ 176 if (chartype[*buf_ptr & 127] == alphanum || 177 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 178 /* 179 * we have a character or number 180 */ 181 struct templ *p; 182 183 if (isdigit((unsigned char)*buf_ptr) || 184 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 185 int seendot = 0, 186 seenexp = 0, 187 seensfx = 0; 188 189 /* 190 * base 2, base 8, base 16: 191 */ 192 if (buf_ptr[0] == '0' && buf_ptr[1] != '.') { 193 int len; 194 195 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B') 196 len = strspn(buf_ptr + 2, "01") + 2; 197 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X') 198 len = strspn(buf_ptr + 2, "0123456789ABCDEFabcdef") + 2; 199 else 200 len = strspn(buf_ptr + 1, "012345678") + 1; 201 if (len > 0) { 202 CHECK_SIZE_TOKEN(len); 203 memcpy(e_token, buf_ptr, len); 204 e_token += len; 205 buf_ptr += len; 206 } 207 else 208 diag2(1, "Unterminated literal"); 209 } 210 else /* base 10: */ 211 while (1) { 212 if (*buf_ptr == '.') { 213 if (seendot) 214 break; 215 else 216 seendot++; 217 } 218 CHECK_SIZE_TOKEN(3); 219 *e_token++ = *buf_ptr++; 220 if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') { 221 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 222 break; 223 else { 224 seenexp++; 225 seendot++; 226 *e_token++ = *buf_ptr++; 227 if (*buf_ptr == '+' || *buf_ptr == '-') 228 *e_token++ = *buf_ptr++; 229 } 230 } 231 } 232 233 while (1) { 234 CHECK_SIZE_TOKEN(2); 235 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 236 *e_token++ = *buf_ptr++; 237 seensfx |= 1; 238 continue; 239 } 240 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 241 if (buf_ptr[1] == buf_ptr[0]) 242 *e_token++ = *buf_ptr++; 243 *e_token++ = *buf_ptr++; 244 seensfx |= 2; 245 continue; 246 } 247 break; 248 } 249 } 250 else 251 while (chartype[*buf_ptr & 127] == alphanum || *buf_ptr == BACKSLASH) { 252 /* fill_buffer() terminates buffer with newline */ 253 if (*buf_ptr == BACKSLASH) { 254 if (*(buf_ptr + 1) == '\n') { 255 buf_ptr += 2; 256 if (buf_ptr >= buf_end) 257 fill_buffer(); 258 } else 259 break; 260 } 261 CHECK_SIZE_TOKEN(1); 262 /* copy it over */ 263 *e_token++ = *buf_ptr++; 264 if (buf_ptr >= buf_end) 265 fill_buffer(); 266 } 267 *e_token = '\0'; 268 269 if (s_token[0] == 'L' && s_token[1] == '\0' && 270 (*buf_ptr == '"' || *buf_ptr == '\'')) 271 return (strpfx); 272 273 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 274 if (++buf_ptr >= buf_end) 275 fill_buffer(); 276 } 277 state->keyword = 0; 278 if (state->last_token == structure && !state->p_l_follow) { 279 /* if last token was 'struct' and we're not 280 * in parentheses, then this token 281 * should be treated as a declaration */ 282 state->last_u_d = true; 283 return (decl); 284 } 285 /* 286 * Operator after identifier is binary unless last token was 'struct' 287 */ 288 state->last_u_d = (state->last_token == structure); 289 290 p = bsearch(s_token, 291 specials, 292 sizeof(specials) / sizeof(specials[0]), 293 sizeof(specials[0]), 294 strcmp_type); 295 if (p == NULL) { /* not a special keyword... */ 296 char *u; 297 298 /* ... so maybe a type_t or a typedef */ 299 if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 300 strcmp(u, "_t") == 0) || (typename_top >= 0 && 301 bsearch(s_token, typenames, typename_top + 1, 302 sizeof(typenames[0]), strcmp_type))) { 303 state->keyword = 4; /* a type name */ 304 state->last_u_d = true; 305 goto found_typename; 306 } 307 } else { /* we have a keyword */ 308 state->keyword = p->rwcode; 309 state->last_u_d = true; 310 switch (p->rwcode) { 311 case 7: /* it is a switch */ 312 return (swstmt); 313 case 8: /* a case or default */ 314 return (casestmt); 315 316 case 3: /* a "struct" */ 317 /* FALLTHROUGH */ 318 case 4: /* one of the declaration keywords */ 319 found_typename: 320 if (state->p_l_follow) { 321 /* inside parens: cast, param list, offsetof or sizeof */ 322 state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask; 323 } 324 if (p != NULL && p->rwcode == 3) 325 return (structure); 326 if (state->p_l_follow) 327 break; 328 return (decl); 329 330 case 5: /* if, while, for */ 331 return (sp_paren); 332 333 case 6: /* do, else */ 334 return (sp_nparen); 335 336 case 10: /* storage class specifier */ 337 return (storage); 338 339 case 11: /* typedef */ 340 return (type_def); 341 342 default: /* all others are treated like any other 343 * identifier */ 344 return (ident); 345 } /* end of switch */ 346 } /* end of if (found_it) */ 347 if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 && 348 state->in_parameter_declaration == 0 && state->block_init == 0) { 349 char *tp = buf_ptr; 350 while (tp < buf_end) 351 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 352 goto not_proc; 353 strncpy(state->procname, token, sizeof state->procname - 1); 354 if (state->in_decl) 355 state->in_parameter_declaration = 1; 356 return (funcname); 357 not_proc:; 358 } 359 /* 360 * The following hack attempts to guess whether or not the current 361 * token is in fact a declaration keyword -- one that has been 362 * typedefd 363 */ 364 else if (!state->p_l_follow && !state->block_init && 365 !state->in_stmt && 366 ((*buf_ptr == '*' && buf_ptr[1] != '=') || 367 isalpha((unsigned char)*buf_ptr)) && 368 (state->last_token == semicolon || state->last_token == lbrace || 369 state->last_token == rbrace)) { 370 state->keyword = 4; /* a type name */ 371 state->last_u_d = true; 372 return decl; 373 } 374 if (state->last_token == decl) /* if this is a declared variable, 375 * then following sign is unary */ 376 state->last_u_d = true; /* will make "int a -1" work */ 377 return (ident); /* the ident is not in the list */ 378 } /* end of procesing for alpanum character */ 379 380 /* Scan a non-alphanumeric token */ 381 382 CHECK_SIZE_TOKEN(3); /* things like "<<=" */ 383 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 384 * moved here */ 385 *e_token = '\0'; 386 if (++buf_ptr >= buf_end) 387 fill_buffer(); 388 389 switch (*token) { 390 case '\n': 391 unary_delim = state->last_u_d; 392 state->last_nl = true; /* remember that we just had a newline */ 393 code = (had_eof ? 0 : newline); 394 395 /* 396 * if data has been exhausted, the newline is a dummy, and we should 397 * return code to stop 398 */ 399 break; 400 401 case '\'': /* start of quoted character */ 402 case '"': /* start of string */ 403 qchar = *token; 404 do { /* copy the string */ 405 while (1) { /* move one character or [/<char>]<char> */ 406 if (*buf_ptr == '\n') { 407 diag2(1, "Unterminated literal"); 408 goto stop_lit; 409 } 410 CHECK_SIZE_TOKEN(2); 411 *e_token = *buf_ptr++; 412 if (buf_ptr >= buf_end) 413 fill_buffer(); 414 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 415 if (*buf_ptr == '\n') /* check for escaped newline */ 416 ++line_no; 417 *++e_token = *buf_ptr++; 418 ++e_token; /* we must increment this again because we 419 * copied two chars */ 420 if (buf_ptr >= buf_end) 421 fill_buffer(); 422 } 423 else 424 break; /* we copied one character */ 425 } /* end of while (1) */ 426 } while (*e_token++ != qchar); 427 stop_lit: 428 code = ident; 429 break; 430 431 case ('('): 432 case ('['): 433 unary_delim = true; 434 code = lparen; 435 break; 436 437 case (')'): 438 case (']'): 439 code = rparen; 440 break; 441 442 case '#': 443 unary_delim = state->last_u_d; 444 code = preesc; 445 break; 446 447 case '?': 448 unary_delim = true; 449 code = question; 450 break; 451 452 case (':'): 453 code = colon; 454 unary_delim = true; 455 break; 456 457 case (';'): 458 unary_delim = true; 459 code = semicolon; 460 break; 461 462 case ('{'): 463 unary_delim = true; 464 465 /* 466 * if (state->in_or_st) state->block_init = 1; 467 */ 468 /* ? code = state->block_init ? lparen : lbrace; */ 469 code = lbrace; 470 break; 471 472 case ('}'): 473 unary_delim = true; 474 /* ? code = state->block_init ? rparen : rbrace; */ 475 code = rbrace; 476 break; 477 478 case 014: /* a form feed */ 479 unary_delim = state->last_u_d; 480 state->last_nl = true; /* remember this so we can set 'state->col_1' 481 * right */ 482 code = form_feed; 483 break; 484 485 case (','): 486 unary_delim = true; 487 code = comma; 488 break; 489 490 case '.': 491 unary_delim = false; 492 code = period; 493 break; 494 495 case '-': 496 case '+': /* check for -, +, --, ++ */ 497 code = (state->last_u_d ? unary_op : binary_op); 498 unary_delim = true; 499 500 if (*buf_ptr == token[0]) { 501 /* check for doubled character */ 502 *e_token++ = *buf_ptr++; 503 /* buffer overflow will be checked at end of loop */ 504 if (state->last_token == ident || state->last_token == rparen) { 505 code = (state->last_u_d ? unary_op : postop); 506 /* check for following ++ or -- */ 507 unary_delim = false; 508 } 509 } 510 else if (*buf_ptr == '=') 511 /* check for operator += */ 512 *e_token++ = *buf_ptr++; 513 else if (*buf_ptr == '>') { 514 /* check for operator -> */ 515 *e_token++ = *buf_ptr++; 516 unary_delim = false; 517 code = unary_op; 518 state->want_blank = false; 519 } 520 break; /* buffer overflow will be checked at end of 521 * switch */ 522 523 case '=': 524 if (state->in_or_st) 525 state->block_init = 1; 526 #ifdef undef 527 if (chartype[*buf_ptr & 127] == opchar) { /* we have two char assignment */ 528 e_token[-1] = *buf_ptr++; 529 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 530 *e_token++ = *buf_ptr++; 531 *e_token++ = '='; /* Flip =+ to += */ 532 *e_token = 0; 533 } 534 #else 535 if (*buf_ptr == '=') {/* == */ 536 *e_token++ = '='; /* Flip =+ to += */ 537 buf_ptr++; 538 *e_token = 0; 539 } 540 #endif 541 code = binary_op; 542 unary_delim = true; 543 break; 544 /* can drop thru!!! */ 545 546 case '>': 547 case '<': 548 case '!': /* ops like <, <<, <=, !=, etc */ 549 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 550 *e_token++ = *buf_ptr; 551 if (++buf_ptr >= buf_end) 552 fill_buffer(); 553 } 554 if (*buf_ptr == '=') 555 *e_token++ = *buf_ptr++; 556 code = (state->last_u_d ? unary_op : binary_op); 557 unary_delim = true; 558 break; 559 560 case '*': 561 unary_delim = true; 562 if (!state->last_u_d) { 563 if (*buf_ptr == '=') 564 *e_token++ = *buf_ptr++; 565 code = binary_op; 566 break; 567 } 568 while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) { 569 if (*buf_ptr == '*') { 570 CHECK_SIZE_TOKEN(1); 571 *e_token++ = *buf_ptr; 572 } 573 if (++buf_ptr >= buf_end) 574 fill_buffer(); 575 } 576 if (ps.in_decl) { 577 char *tp = buf_ptr; 578 579 while (isalpha((unsigned char)*tp) || 580 isspace((unsigned char)*tp)) { 581 if (++tp >= buf_end) 582 fill_buffer(); 583 } 584 if (*tp == '(') 585 ps.procname[0] = ' '; 586 } 587 code = unary_op; 588 break; 589 590 default: 591 if (token[0] == '/' && *buf_ptr == '*') { 592 /* it is start of comment */ 593 *e_token++ = '*'; 594 595 if (++buf_ptr >= buf_end) 596 fill_buffer(); 597 598 code = comment; 599 unary_delim = state->last_u_d; 600 break; 601 } 602 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 603 /* 604 * handle ||, &&, etc, and also things as in int *****i 605 */ 606 CHECK_SIZE_TOKEN(1); 607 *e_token++ = *buf_ptr; 608 if (++buf_ptr >= buf_end) 609 fill_buffer(); 610 } 611 code = (state->last_u_d ? unary_op : binary_op); 612 unary_delim = true; 613 614 615 } /* end of switch */ 616 if (buf_ptr >= buf_end) /* check for input buffer empty */ 617 fill_buffer(); 618 state->last_u_d = unary_delim; 619 CHECK_SIZE_TOKEN(1); 620 *e_token = '\0'; /* null terminate the token */ 621 return (code); 622 } 623 624 void 625 alloc_typenames(void) 626 { 627 628 typenames = (const char **)malloc(sizeof(typenames[0]) * 629 (typename_count = 16)); 630 if (typenames == NULL) 631 err(1, NULL); 632 } 633 634 void 635 add_typename(const char *key) 636 { 637 int comparison; 638 const char *copy; 639 640 if (typename_top + 1 >= typename_count) { 641 typenames = realloc((void *)typenames, 642 sizeof(typenames[0]) * (typename_count *= 2)); 643 if (typenames == NULL) 644 err(1, NULL); 645 } 646 if (typename_top == -1) 647 typenames[++typename_top] = copy = strdup(key); 648 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 649 /* take advantage of sorted input */ 650 if (comparison == 0) /* remove duplicates */ 651 return; 652 typenames[++typename_top] = copy = strdup(key); 653 } 654 else { 655 int p; 656 657 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 658 /* find place for the new key */; 659 if (comparison == 0) /* remove duplicates */ 660 return; 661 memmove(&typenames[p + 1], &typenames[p], 662 sizeof(typenames[0]) * (++typename_top - p)); 663 typenames[p] = copy = strdup(key); 664 } 665 666 if (copy == NULL) 667 err(1, NULL); 668 } 669