1 /* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #if 0 37 #ifndef lint 38 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 39 #endif /* not lint */ 40 #endif 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 /* 45 * Here we have the token scanner for indent. It scans off one token and puts 46 * it in the global variable "token". It returns a code, indicating the type 47 * of token scanned. 48 */ 49 50 #include <err.h> 51 #include <stdio.h> 52 #include <ctype.h> 53 #include <stdlib.h> 54 #include <string.h> 55 #include "indent_globs.h" 56 #include "indent_codes.h" 57 #include "indent.h" 58 59 #define alphanum 1 60 #define opchar 3 61 62 struct templ { 63 const char *rwd; 64 int rwcode; 65 }; 66 67 struct templ specials[1000] = 68 { 69 {"switch", 1}, 70 {"case", 2}, 71 {"break", 0}, 72 {"struct", 3}, 73 {"union", 3}, 74 {"enum", 3}, 75 {"default", 2}, 76 {"int", 4}, 77 {"char", 4}, 78 {"float", 4}, 79 {"double", 4}, 80 {"long", 4}, 81 {"short", 4}, 82 {"typedef", 4}, 83 {"unsigned", 4}, 84 {"register", 4}, 85 {"static", 4}, 86 {"global", 4}, 87 {"extern", 4}, 88 {"void", 4}, 89 {"const", 4}, 90 {"volatile", 4}, 91 {"goto", 0}, 92 {"return", 0}, 93 {"if", 5}, 94 {"while", 5}, 95 {"for", 5}, 96 {"else", 6}, 97 {"do", 6}, 98 {"sizeof", 7}, 99 {0, 0} 100 }; 101 102 char chartype[128] = 103 { /* this is used to facilitate the decision of 104 * what type (alphanumeric, operator) each 105 * character is */ 106 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0, 0, 0, 0, 0, 110 0, 3, 0, 0, 1, 3, 3, 0, 111 0, 0, 3, 3, 0, 3, 0, 3, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 0, 0, 3, 3, 3, 3, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 0, 0, 3, 1, 118 0, 1, 1, 1, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 120 1, 1, 1, 1, 1, 1, 1, 1, 121 1, 1, 1, 0, 3, 0, 3, 0 122 }; 123 124 int 125 lexi(void) 126 { 127 int unary_delim; /* this is set to 1 if the current token 128 * forces a following operator to be unary */ 129 static int last_code; /* the last token type returned */ 130 static int l_struct; /* set to 1 if the last token was 'struct' */ 131 int code; /* internal code to be returned */ 132 char qchar; /* the delimiter character for a string */ 133 134 e_token = s_token; /* point to start of place to save token */ 135 unary_delim = false; 136 ps.col_1 = ps.last_nl; /* tell world that this token started in 137 * column 1 iff the last thing scanned was nl */ 138 ps.last_nl = false; 139 140 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 141 ps.col_1 = false; /* leading blanks imply token is not in column 142 * 1 */ 143 if (++buf_ptr >= buf_end) 144 fill_buffer(); 145 } 146 147 /* Scan an alphanumeric token */ 148 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 149 /* 150 * we have a character or number 151 */ 152 const char *j; /* used for searching thru list of 153 * 154 * reserved words */ 155 struct templ *p; 156 157 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 158 int seendot = 0, 159 seenexp = 0, 160 seensfx = 0; 161 if (*buf_ptr == '0' && 162 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 163 *e_token++ = *buf_ptr++; 164 *e_token++ = *buf_ptr++; 165 while (isxdigit(*buf_ptr)) { 166 CHECK_SIZE_TOKEN; 167 *e_token++ = *buf_ptr++; 168 } 169 } 170 else 171 while (1) { 172 if (*buf_ptr == '.') { 173 if (seendot) 174 break; 175 else 176 seendot++; 177 } 178 CHECK_SIZE_TOKEN; 179 *e_token++ = *buf_ptr++; 180 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 181 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 182 break; 183 else { 184 seenexp++; 185 seendot++; 186 CHECK_SIZE_TOKEN; 187 *e_token++ = *buf_ptr++; 188 if (*buf_ptr == '+' || *buf_ptr == '-') 189 *e_token++ = *buf_ptr++; 190 } 191 } 192 } 193 while (1) { 194 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 195 CHECK_SIZE_TOKEN; 196 *e_token++ = *buf_ptr++; 197 seensfx |= 1; 198 continue; 199 } 200 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 201 CHECK_SIZE_TOKEN; 202 if (buf_ptr[1] == buf_ptr[0]) 203 *e_token++ = *buf_ptr++; 204 *e_token++ = *buf_ptr++; 205 seensfx |= 2; 206 continue; 207 } 208 break; 209 } 210 } 211 else 212 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 213 /* fill_buffer() terminates buffer with newline */ 214 if (*buf_ptr == BACKSLASH) { 215 if (*(buf_ptr + 1) == '\n') { 216 buf_ptr += 2; 217 if (buf_ptr >= buf_end) 218 fill_buffer(); 219 } else 220 break; 221 } 222 CHECK_SIZE_TOKEN; 223 /* copy it over */ 224 *e_token++ = *buf_ptr++; 225 if (buf_ptr >= buf_end) 226 fill_buffer(); 227 } 228 *e_token++ = '\0'; 229 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 230 if (++buf_ptr >= buf_end) 231 fill_buffer(); 232 } 233 ps.its_a_keyword = false; 234 ps.sizeof_keyword = false; 235 if (l_struct && !ps.p_l_follow) { 236 /* if last token was 'struct' and we're not 237 * in parentheses, then this token 238 * should be treated as a declaration */ 239 l_struct = false; 240 last_code = ident; 241 ps.last_u_d = true; 242 return (decl); 243 } 244 ps.last_u_d = l_struct; /* Operator after identifier is binary 245 * unless last token was 'struct' */ 246 l_struct = false; 247 last_code = ident; /* Remember that this is the code we will 248 * return */ 249 250 if (auto_typedefs) { 251 const char *q = s_token; 252 size_t q_len = strlen(q); 253 /* Check if we have an "_t" in the end */ 254 if (q_len > 2 && 255 (strcmp(q + q_len - 2, "_t") == 0)) { 256 ps.its_a_keyword = true; 257 ps.last_u_d = true; 258 goto found_auto_typedef; 259 } 260 } 261 262 /* 263 * This loop will check if the token is a keyword. 264 */ 265 for (p = specials; (j = p->rwd) != NULL; p++) { 266 const char *q = s_token; /* point at scanned token */ 267 if (*j++ != *q++ || *j++ != *q++) 268 continue; /* This test depends on the fact that 269 * identifiers are always at least 1 character 270 * long (ie. the first two bytes of the 271 * identifier are always meaningful) */ 272 if (q[-1] == 0) 273 break; /* If its a one-character identifier */ 274 while (*q++ == *j) 275 if (*j++ == 0) 276 goto found_keyword; /* I wish that C had a multi-level 277 * break... */ 278 } 279 if (p->rwd) { /* we have a keyword */ 280 found_keyword: 281 ps.its_a_keyword = true; 282 ps.last_u_d = true; 283 switch (p->rwcode) { 284 case 1: /* it is a switch */ 285 return (swstmt); 286 case 2: /* a case or default */ 287 return (casestmt); 288 289 case 3: /* a "struct" */ 290 /* 291 * Next time around, we will want to know that we have had a 292 * 'struct' 293 */ 294 l_struct = true; 295 /* FALLTHROUGH */ 296 297 case 4: /* one of the declaration keywords */ 298 found_auto_typedef: 299 if (ps.p_l_follow) { 300 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask; 301 break; /* inside parens: cast, param list or sizeof */ 302 } 303 last_code = decl; 304 return (decl); 305 306 case 5: /* if, while, for */ 307 return (sp_paren); 308 309 case 6: /* do, else */ 310 return (sp_nparen); 311 312 case 7: 313 ps.sizeof_keyword = true; 314 default: /* all others are treated like any other 315 * identifier */ 316 return (ident); 317 } /* end of switch */ 318 } /* end of if (found_it) */ 319 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 320 char *tp = buf_ptr; 321 while (tp < buf_end) 322 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 323 goto not_proc; 324 strncpy(ps.procname, token, sizeof ps.procname - 1); 325 ps.in_parameter_declaration = 1; 326 rparen_count = 1; 327 not_proc:; 328 } 329 /* 330 * The following hack attempts to guess whether or not the current 331 * token is in fact a declaration keyword -- one that has been 332 * typedefd 333 */ 334 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 335 && !ps.p_l_follow 336 && !ps.block_init 337 && (ps.last_token == rparen || ps.last_token == semicolon || 338 ps.last_token == decl || 339 ps.last_token == lbrace || ps.last_token == rbrace)) { 340 ps.its_a_keyword = true; 341 ps.last_u_d = true; 342 last_code = decl; 343 return decl; 344 } 345 if (last_code == decl) /* if this is a declared variable, then 346 * following sign is unary */ 347 ps.last_u_d = true; /* will make "int a -1" work */ 348 last_code = ident; 349 return (ident); /* the ident is not in the list */ 350 } /* end of procesing for alpanum character */ 351 352 /* Scan a non-alphanumeric token */ 353 354 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 355 * moved here */ 356 *e_token = '\0'; 357 if (++buf_ptr >= buf_end) 358 fill_buffer(); 359 360 switch (*token) { 361 case '\n': 362 unary_delim = ps.last_u_d; 363 ps.last_nl = true; /* remember that we just had a newline */ 364 code = (had_eof ? 0 : newline); 365 366 /* 367 * if data has been exhausted, the newline is a dummy, and we should 368 * return code to stop 369 */ 370 break; 371 372 case '\'': /* start of quoted character */ 373 case '"': /* start of string */ 374 qchar = *token; 375 if (troff) { 376 e_token[-1] = '`'; 377 if (qchar == '"') 378 *e_token++ = '`'; 379 e_token = chfont(&bodyf, &stringf, e_token); 380 } 381 do { /* copy the string */ 382 while (1) { /* move one character or [/<char>]<char> */ 383 if (*buf_ptr == '\n') { 384 diag2(1, "Unterminated literal"); 385 goto stop_lit; 386 } 387 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 388 * since CHECK_SIZE guarantees that there 389 * are at least 5 entries left */ 390 *e_token = *buf_ptr++; 391 if (buf_ptr >= buf_end) 392 fill_buffer(); 393 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 394 if (*buf_ptr == '\n') /* check for escaped newline */ 395 ++line_no; 396 if (troff) { 397 *++e_token = BACKSLASH; 398 if (*buf_ptr == BACKSLASH) 399 *++e_token = BACKSLASH; 400 } 401 *++e_token = *buf_ptr++; 402 ++e_token; /* we must increment this again because we 403 * copied two chars */ 404 if (buf_ptr >= buf_end) 405 fill_buffer(); 406 } 407 else 408 break; /* we copied one character */ 409 } /* end of while (1) */ 410 } while (*e_token++ != qchar); 411 if (troff) { 412 e_token = chfont(&stringf, &bodyf, e_token - 1); 413 if (qchar == '"') 414 *e_token++ = '\''; 415 } 416 stop_lit: 417 code = ident; 418 break; 419 420 case ('('): 421 case ('['): 422 unary_delim = true; 423 code = lparen; 424 break; 425 426 case (')'): 427 case (']'): 428 code = rparen; 429 break; 430 431 case '#': 432 unary_delim = ps.last_u_d; 433 code = preesc; 434 break; 435 436 case '?': 437 unary_delim = true; 438 code = question; 439 break; 440 441 case (':'): 442 code = colon; 443 unary_delim = true; 444 break; 445 446 case (';'): 447 unary_delim = true; 448 code = semicolon; 449 break; 450 451 case ('{'): 452 unary_delim = true; 453 454 /* 455 * if (ps.in_or_st) ps.block_init = 1; 456 */ 457 /* ? code = ps.block_init ? lparen : lbrace; */ 458 code = lbrace; 459 break; 460 461 case ('}'): 462 unary_delim = true; 463 /* ? code = ps.block_init ? rparen : rbrace; */ 464 code = rbrace; 465 break; 466 467 case 014: /* a form feed */ 468 unary_delim = ps.last_u_d; 469 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 470 * right */ 471 code = form_feed; 472 break; 473 474 case (','): 475 unary_delim = true; 476 code = comma; 477 break; 478 479 case '.': 480 unary_delim = false; 481 code = period; 482 break; 483 484 case '-': 485 case '+': /* check for -, +, --, ++ */ 486 code = (ps.last_u_d ? unary_op : binary_op); 487 unary_delim = true; 488 489 if (*buf_ptr == token[0]) { 490 /* check for doubled character */ 491 *e_token++ = *buf_ptr++; 492 /* buffer overflow will be checked at end of loop */ 493 if (last_code == ident || last_code == rparen) { 494 code = (ps.last_u_d ? unary_op : postop); 495 /* check for following ++ or -- */ 496 unary_delim = false; 497 } 498 } 499 else if (*buf_ptr == '=') 500 /* check for operator += */ 501 *e_token++ = *buf_ptr++; 502 else if (*buf_ptr == '>') { 503 /* check for operator -> */ 504 *e_token++ = *buf_ptr++; 505 if (!pointer_as_binop) { 506 unary_delim = false; 507 code = unary_op; 508 ps.want_blank = false; 509 } 510 } 511 break; /* buffer overflow will be checked at end of 512 * switch */ 513 514 case '=': 515 if (ps.in_or_st) 516 ps.block_init = 1; 517 #ifdef undef 518 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 519 e_token[-1] = *buf_ptr++; 520 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 521 *e_token++ = *buf_ptr++; 522 *e_token++ = '='; /* Flip =+ to += */ 523 *e_token = 0; 524 } 525 #else 526 if (*buf_ptr == '=') {/* == */ 527 *e_token++ = '='; /* Flip =+ to += */ 528 buf_ptr++; 529 *e_token = 0; 530 } 531 #endif 532 code = binary_op; 533 unary_delim = true; 534 break; 535 /* can drop thru!!! */ 536 537 case '>': 538 case '<': 539 case '!': /* ops like <, <<, <=, !=, etc */ 540 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 541 *e_token++ = *buf_ptr; 542 if (++buf_ptr >= buf_end) 543 fill_buffer(); 544 } 545 if (*buf_ptr == '=') 546 *e_token++ = *buf_ptr++; 547 code = (ps.last_u_d ? unary_op : binary_op); 548 unary_delim = true; 549 break; 550 551 default: 552 if (token[0] == '/' && *buf_ptr == '*') { 553 /* it is start of comment */ 554 *e_token++ = '*'; 555 556 if (++buf_ptr >= buf_end) 557 fill_buffer(); 558 559 code = comment; 560 unary_delim = ps.last_u_d; 561 break; 562 } 563 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 564 /* 565 * handle ||, &&, etc, and also things as in int *****i 566 */ 567 *e_token++ = *buf_ptr; 568 if (++buf_ptr >= buf_end) 569 fill_buffer(); 570 } 571 code = (ps.last_u_d ? unary_op : binary_op); 572 unary_delim = true; 573 574 575 } /* end of switch */ 576 if (code != newline) { 577 l_struct = false; 578 last_code = code; 579 } 580 if (buf_ptr >= buf_end) /* check for input buffer empty */ 581 fill_buffer(); 582 ps.last_u_d = unary_delim; 583 *e_token = '\0'; /* null terminate the token */ 584 return (code); 585 } 586 587 /* 588 * Add the given keyword to the keyword table, using val as the keyword type 589 */ 590 void 591 addkey(char *key, int val) 592 { 593 struct templ *p = specials; 594 while (p->rwd) 595 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 596 return; 597 else 598 p++; 599 if (p >= specials + sizeof specials / sizeof specials[0]) 600 return; /* For now, table overflows are silently 601 * ignored */ 602 p->rwd = key; 603 p->rwcode = val; 604 p[1].rwd = NULL; 605 p[1].rwcode = 0; 606 } 607