1 /* Generate assembler source containing symbol information 2 * 3 * Copyright 2002 by Kai Germaschewski 4 * 5 * This software may be used and distributed according to the terms 6 * of the GNU General Public License, incorporated herein by reference. 7 * 8 * Usage: nm -n vmlinux | scripts/kallsyms [--all-symbols] > symbols.S 9 * 10 * ChangeLog: 11 * 12 * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> 13 * Changed the compression method from stem compression to "table lookup" 14 * compression 15 * 16 * Table compression uses all the unused char codes on the symbols and 17 * maps these to the most used substrings (tokens). For instance, it might 18 * map char code 0xF7 to represent "write_" and then in every symbol where 19 * "write_" appears it can be replaced by 0xF7, saving 5 bytes. 20 * The used codes themselves are also placed in the table so that the 21 * decompresion can work without "special cases". 22 * Applied to kernel symbols, this usually produces a compression ratio 23 * of about 50%. 24 * 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <ctype.h> 31 32 /* maximum token length used. It doesn't pay to increase it a lot, because 33 * very long substrings probably don't repeat themselves too often. */ 34 #define MAX_TOK_SIZE 11 35 #define KSYM_NAME_LEN 127 36 37 /* we use only a subset of the complete symbol table to gather the token count, 38 * to speed up compression, at the expense of a little compression ratio */ 39 #define WORKING_SET 1024 40 41 /* first find the best token only on the list of tokens that would profit more 42 * than GOOD_BAD_THRESHOLD. Only if this list is empty go to the "bad" list. 43 * Increasing this value will put less tokens on the "good" list, so the search 44 * is faster. However, if the good list runs out of tokens, we must painfully 45 * search the bad list. */ 46 #define GOOD_BAD_THRESHOLD 10 47 48 /* token hash parameters */ 49 #define HASH_BITS 18 50 #define HASH_TABLE_SIZE (1 << HASH_BITS) 51 #define HASH_MASK (HASH_TABLE_SIZE - 1) 52 #define HASH_BASE_OFFSET 2166136261U 53 #define HASH_FOLD(a) ((a)&(HASH_MASK)) 54 55 /* flags to mark symbols */ 56 #define SYM_FLAG_VALID 1 57 #define SYM_FLAG_SAMPLED 2 58 59 struct sym_entry { 60 unsigned long long addr; 61 char type; 62 unsigned char flags; 63 unsigned char len; 64 unsigned char *sym; 65 }; 66 67 68 static struct sym_entry *table; 69 static int size, cnt; 70 static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; 71 static int all_symbols = 0; 72 static char symbol_prefix_char = '\0'; 73 74 struct token { 75 unsigned char data[MAX_TOK_SIZE]; 76 unsigned char len; 77 /* profit: the number of bytes that could be saved by inserting this 78 * token into the table */ 79 int profit; 80 struct token *next; /* next token on the hash list */ 81 struct token *right; /* next token on the good/bad list */ 82 struct token *left; /* previous token on the good/bad list */ 83 struct token *smaller; /* token that is less one letter than this one */ 84 }; 85 86 struct token bad_head, good_head; 87 struct token *hash_table[HASH_TABLE_SIZE]; 88 89 /* the table that holds the result of the compression */ 90 unsigned char best_table[256][MAX_TOK_SIZE+1]; 91 unsigned char best_table_len[256]; 92 93 94 static void 95 usage(void) 96 { 97 fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); 98 exit(1); 99 } 100 101 /* 102 * This ignores the intensely annoying "mapping symbols" found 103 * in ARM ELF files: $a, $t and $d. 104 */ 105 static inline int 106 is_arm_mapping_symbol(const char *str) 107 { 108 return str[0] == '$' && strchr("atd", str[1]) 109 && (str[2] == '\0' || str[2] == '.'); 110 } 111 112 static int 113 read_symbol(FILE *in, struct sym_entry *s) 114 { 115 char str[500]; 116 char *sym; 117 int rc; 118 119 rc = fscanf(in, "%llx %c %499s\n", &s->addr, &s->type, str); 120 if (rc != 3) { 121 if (rc != EOF) { 122 /* skip line */ 123 fgets(str, 500, in); 124 } 125 return -1; 126 } 127 128 sym = str; 129 /* skip prefix char */ 130 if (symbol_prefix_char && str[0] == symbol_prefix_char) 131 sym++; 132 133 /* Ignore most absolute/undefined (?) symbols. */ 134 if (strcmp(sym, "_stext") == 0) 135 _stext = s->addr; 136 else if (strcmp(sym, "_etext") == 0) 137 _etext = s->addr; 138 else if (strcmp(sym, "_sinittext") == 0) 139 _sinittext = s->addr; 140 else if (strcmp(sym, "_einittext") == 0) 141 _einittext = s->addr; 142 else if (strcmp(sym, "_sextratext") == 0) 143 _sextratext = s->addr; 144 else if (strcmp(sym, "_eextratext") == 0) 145 _eextratext = s->addr; 146 else if (toupper(s->type) == 'A') 147 { 148 /* Keep these useful absolute symbols */ 149 if (strcmp(sym, "__kernel_syscall_via_break") && 150 strcmp(sym, "__kernel_syscall_via_epc") && 151 strcmp(sym, "__kernel_sigtramp") && 152 strcmp(sym, "__gp")) 153 return -1; 154 155 } 156 else if (toupper(s->type) == 'U' || 157 is_arm_mapping_symbol(sym)) 158 return -1; 159 160 /* include the type field in the symbol name, so that it gets 161 * compressed together */ 162 s->len = strlen(str) + 1; 163 s->sym = (char *) malloc(s->len + 1); 164 strcpy(s->sym + 1, str); 165 s->sym[0] = s->type; 166 167 return 0; 168 } 169 170 static int 171 symbol_valid(struct sym_entry *s) 172 { 173 /* Symbols which vary between passes. Passes 1 and 2 must have 174 * identical symbol lists. The kallsyms_* symbols below are only added 175 * after pass 1, they would be included in pass 2 when --all-symbols is 176 * specified so exclude them to get a stable symbol list. 177 */ 178 static char *special_symbols[] = { 179 "kallsyms_addresses", 180 "kallsyms_num_syms", 181 "kallsyms_names", 182 "kallsyms_markers", 183 "kallsyms_token_table", 184 "kallsyms_token_index", 185 186 /* Exclude linker generated symbols which vary between passes */ 187 "_SDA_BASE_", /* ppc */ 188 "_SDA2_BASE_", /* ppc */ 189 NULL }; 190 int i; 191 int offset = 1; 192 193 /* skip prefix char */ 194 if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) 195 offset++; 196 197 /* if --all-symbols is not specified, then symbols outside the text 198 * and inittext sections are discarded */ 199 if (!all_symbols) { 200 if ((s->addr < _stext || s->addr > _etext) 201 && (s->addr < _sinittext || s->addr > _einittext) 202 && (s->addr < _sextratext || s->addr > _eextratext)) 203 return 0; 204 /* Corner case. Discard any symbols with the same value as 205 * _etext _einittext or _eextratext; they can move between pass 206 * 1 and 2 when the kallsyms data are added. If these symbols 207 * move then they may get dropped in pass 2, which breaks the 208 * kallsyms rules. 209 */ 210 if ((s->addr == _etext && strcmp(s->sym + offset, "_etext")) || 211 (s->addr == _einittext && strcmp(s->sym + offset, "_einittext")) || 212 (s->addr == _eextratext && strcmp(s->sym + offset, "_eextratext"))) 213 return 0; 214 } 215 216 /* Exclude symbols which vary between passes. */ 217 if (strstr(s->sym + offset, "_compiled.")) 218 return 0; 219 220 for (i = 0; special_symbols[i]; i++) 221 if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) 222 return 0; 223 224 return 1; 225 } 226 227 static void 228 read_map(FILE *in) 229 { 230 while (!feof(in)) { 231 if (cnt >= size) { 232 size += 10000; 233 table = realloc(table, sizeof(*table) * size); 234 if (!table) { 235 fprintf(stderr, "out of memory\n"); 236 exit (1); 237 } 238 } 239 if (read_symbol(in, &table[cnt]) == 0) 240 cnt++; 241 } 242 } 243 244 static void output_label(char *label) 245 { 246 if (symbol_prefix_char) 247 printf(".globl %c%s\n", symbol_prefix_char, label); 248 else 249 printf(".globl %s\n", label); 250 printf("\tALGN\n"); 251 if (symbol_prefix_char) 252 printf("%c%s:\n", symbol_prefix_char, label); 253 else 254 printf("%s:\n", label); 255 } 256 257 /* uncompress a compressed symbol. When this function is called, the best table 258 * might still be compressed itself, so the function needs to be recursive */ 259 static int expand_symbol(unsigned char *data, int len, char *result) 260 { 261 int c, rlen, total=0; 262 263 while (len) { 264 c = *data; 265 /* if the table holds a single char that is the same as the one 266 * we are looking for, then end the search */ 267 if (best_table[c][0]==c && best_table_len[c]==1) { 268 *result++ = c; 269 total++; 270 } else { 271 /* if not, recurse and expand */ 272 rlen = expand_symbol(best_table[c], best_table_len[c], result); 273 total += rlen; 274 result += rlen; 275 } 276 data++; 277 len--; 278 } 279 *result=0; 280 281 return total; 282 } 283 284 static void 285 write_src(void) 286 { 287 int i, k, off, valid; 288 unsigned int best_idx[256]; 289 unsigned int *markers; 290 char buf[KSYM_NAME_LEN+1]; 291 292 printf("#include <asm/types.h>\n"); 293 printf("#if BITS_PER_LONG == 64\n"); 294 printf("#define PTR .quad\n"); 295 printf("#define ALGN .align 8\n"); 296 printf("#else\n"); 297 printf("#define PTR .long\n"); 298 printf("#define ALGN .align 4\n"); 299 printf("#endif\n"); 300 301 printf(".data\n"); 302 303 output_label("kallsyms_addresses"); 304 valid = 0; 305 for (i = 0; i < cnt; i++) { 306 if (table[i].flags & SYM_FLAG_VALID) { 307 printf("\tPTR\t%#llx\n", table[i].addr); 308 valid++; 309 } 310 } 311 printf("\n"); 312 313 output_label("kallsyms_num_syms"); 314 printf("\tPTR\t%d\n", valid); 315 printf("\n"); 316 317 /* table of offset markers, that give the offset in the compressed stream 318 * every 256 symbols */ 319 markers = (unsigned int *) malloc(sizeof(unsigned int)*((valid + 255) / 256)); 320 321 output_label("kallsyms_names"); 322 valid = 0; 323 off = 0; 324 for (i = 0; i < cnt; i++) { 325 326 if (!table[i].flags & SYM_FLAG_VALID) 327 continue; 328 329 if ((valid & 0xFF) == 0) 330 markers[valid >> 8] = off; 331 332 printf("\t.byte 0x%02x", table[i].len); 333 for (k = 0; k < table[i].len; k++) 334 printf(", 0x%02x", table[i].sym[k]); 335 printf("\n"); 336 337 off += table[i].len + 1; 338 valid++; 339 } 340 printf("\n"); 341 342 output_label("kallsyms_markers"); 343 for (i = 0; i < ((valid + 255) >> 8); i++) 344 printf("\tPTR\t%d\n", markers[i]); 345 printf("\n"); 346 347 free(markers); 348 349 output_label("kallsyms_token_table"); 350 off = 0; 351 for (i = 0; i < 256; i++) { 352 best_idx[i] = off; 353 expand_symbol(best_table[i],best_table_len[i],buf); 354 printf("\t.asciz\t\"%s\"\n", buf); 355 off += strlen(buf) + 1; 356 } 357 printf("\n"); 358 359 output_label("kallsyms_token_index"); 360 for (i = 0; i < 256; i++) 361 printf("\t.short\t%d\n", best_idx[i]); 362 printf("\n"); 363 } 364 365 366 /* table lookup compression functions */ 367 368 static inline unsigned int rehash_token(unsigned int hash, unsigned char data) 369 { 370 return ((hash * 16777619) ^ data); 371 } 372 373 static unsigned int hash_token(unsigned char *data, int len) 374 { 375 unsigned int hash=HASH_BASE_OFFSET; 376 int i; 377 378 for (i = 0; i < len; i++) 379 hash = rehash_token(hash, data[i]); 380 381 return HASH_FOLD(hash); 382 } 383 384 /* find a token given its data and hash value */ 385 static struct token *find_token_hash(unsigned char *data, int len, unsigned int hash) 386 { 387 struct token *ptr; 388 389 ptr = hash_table[hash]; 390 391 while (ptr) { 392 if ((ptr->len == len) && (memcmp(ptr->data, data, len) == 0)) 393 return ptr; 394 ptr=ptr->next; 395 } 396 397 return NULL; 398 } 399 400 static inline void insert_token_in_group(struct token *head, struct token *ptr) 401 { 402 ptr->right = head->right; 403 ptr->right->left = ptr; 404 head->right = ptr; 405 ptr->left = head; 406 } 407 408 static inline void remove_token_from_group(struct token *ptr) 409 { 410 ptr->left->right = ptr->right; 411 ptr->right->left = ptr->left; 412 } 413 414 415 /* build the counts for all the tokens that start with "data", and have lenghts 416 * from 2 to "len" */ 417 static void learn_token(unsigned char *data, int len) 418 { 419 struct token *ptr,*last_ptr; 420 int i, newprofit; 421 unsigned int hash = HASH_BASE_OFFSET; 422 unsigned int hashes[MAX_TOK_SIZE + 1]; 423 424 if (len > MAX_TOK_SIZE) 425 len = MAX_TOK_SIZE; 426 427 /* calculate and store the hash values for all the sub-tokens */ 428 hash = rehash_token(hash, data[0]); 429 for (i = 2; i <= len; i++) { 430 hash = rehash_token(hash, data[i-1]); 431 hashes[i] = HASH_FOLD(hash); 432 } 433 434 last_ptr = NULL; 435 ptr = NULL; 436 437 for (i = len; i >= 2; i--) { 438 hash = hashes[i]; 439 440 if (!ptr) ptr = find_token_hash(data, i, hash); 441 442 if (!ptr) { 443 /* create a new token entry */ 444 ptr = (struct token *) malloc(sizeof(*ptr)); 445 446 memcpy(ptr->data, data, i); 447 ptr->len = i; 448 449 /* when we create an entry, it's profit is 0 because 450 * we also take into account the size of the token on 451 * the compressed table. We then subtract GOOD_BAD_THRESHOLD 452 * so that the test to see if this token belongs to 453 * the good or bad list, is a comparison to zero */ 454 ptr->profit = -GOOD_BAD_THRESHOLD; 455 456 ptr->next = hash_table[hash]; 457 hash_table[hash] = ptr; 458 459 insert_token_in_group(&bad_head, ptr); 460 461 ptr->smaller = NULL; 462 } else { 463 newprofit = ptr->profit + (ptr->len - 1); 464 /* check to see if this token needs to be moved to a 465 * different list */ 466 if((ptr->profit < 0) && (newprofit >= 0)) { 467 remove_token_from_group(ptr); 468 insert_token_in_group(&good_head,ptr); 469 } 470 ptr->profit = newprofit; 471 } 472 473 if (last_ptr) last_ptr->smaller = ptr; 474 last_ptr = ptr; 475 476 ptr = ptr->smaller; 477 } 478 } 479 480 /* decrease the counts for all the tokens that start with "data", and have lenghts 481 * from 2 to "len". This function is much simpler than learn_token because we have 482 * more guarantees (tho tokens exist, the ->smaller pointer is set, etc.) 483 * The two separate functions exist only because of compression performance */ 484 static void forget_token(unsigned char *data, int len) 485 { 486 struct token *ptr; 487 int i, newprofit; 488 unsigned int hash=0; 489 490 if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE; 491 492 hash = hash_token(data, len); 493 ptr = find_token_hash(data, len, hash); 494 495 for (i = len; i >= 2; i--) { 496 497 newprofit = ptr->profit - (ptr->len - 1); 498 if ((ptr->profit >= 0) && (newprofit < 0)) { 499 remove_token_from_group(ptr); 500 insert_token_in_group(&bad_head, ptr); 501 } 502 ptr->profit=newprofit; 503 504 ptr=ptr->smaller; 505 } 506 } 507 508 /* count all the possible tokens in a symbol */ 509 static void learn_symbol(unsigned char *symbol, int len) 510 { 511 int i; 512 513 for (i = 0; i < len - 1; i++) 514 learn_token(symbol + i, len - i); 515 } 516 517 /* decrease the count for all the possible tokens in a symbol */ 518 static void forget_symbol(unsigned char *symbol, int len) 519 { 520 int i; 521 522 for (i = 0; i < len - 1; i++) 523 forget_token(symbol + i, len - i); 524 } 525 526 /* set all the symbol flags and do the initial token count */ 527 static void build_initial_tok_table(void) 528 { 529 int i, use_it, valid; 530 531 valid = 0; 532 for (i = 0; i < cnt; i++) { 533 table[i].flags = 0; 534 if ( symbol_valid(&table[i]) ) { 535 table[i].flags |= SYM_FLAG_VALID; 536 valid++; 537 } 538 } 539 540 use_it = 0; 541 for (i = 0; i < cnt; i++) { 542 543 /* subsample the available symbols. This method is almost like 544 * a Bresenham's algorithm to get uniformly distributed samples 545 * across the symbol table */ 546 if (table[i].flags & SYM_FLAG_VALID) { 547 548 use_it += WORKING_SET; 549 550 if (use_it >= valid) { 551 table[i].flags |= SYM_FLAG_SAMPLED; 552 use_it -= valid; 553 } 554 } 555 if (table[i].flags & SYM_FLAG_SAMPLED) 556 learn_symbol(table[i].sym, table[i].len); 557 } 558 } 559 560 /* replace a given token in all the valid symbols. Use the sampled symbols 561 * to update the counts */ 562 static void compress_symbols(unsigned char *str, int tlen, int idx) 563 { 564 int i, len, learn, size; 565 unsigned char *p; 566 567 for (i = 0; i < cnt; i++) { 568 569 if (!(table[i].flags & SYM_FLAG_VALID)) continue; 570 571 len = table[i].len; 572 learn = 0; 573 p = table[i].sym; 574 575 do { 576 /* find the token on the symbol */ 577 p = (unsigned char *) strstr((char *) p, (char *) str); 578 if (!p) break; 579 580 if (!learn) { 581 /* if this symbol was used to count, decrease it */ 582 if (table[i].flags & SYM_FLAG_SAMPLED) 583 forget_symbol(table[i].sym, len); 584 learn = 1; 585 } 586 587 *p = idx; 588 size = (len - (p - table[i].sym)) - tlen + 1; 589 memmove(p + 1, p + tlen, size); 590 p++; 591 len -= tlen - 1; 592 593 } while (size >= tlen); 594 595 if(learn) { 596 table[i].len = len; 597 /* if this symbol was used to count, learn it again */ 598 if(table[i].flags & SYM_FLAG_SAMPLED) 599 learn_symbol(table[i].sym, len); 600 } 601 } 602 } 603 604 /* search the token with the maximum profit */ 605 static struct token *find_best_token(void) 606 { 607 struct token *ptr,*best,*head; 608 int bestprofit; 609 610 bestprofit=-10000; 611 612 /* failsafe: if the "good" list is empty search from the "bad" list */ 613 if(good_head.right == &good_head) head = &bad_head; 614 else head = &good_head; 615 616 ptr = head->right; 617 best = NULL; 618 while (ptr != head) { 619 if (ptr->profit > bestprofit) { 620 bestprofit = ptr->profit; 621 best = ptr; 622 } 623 ptr = ptr->right; 624 } 625 626 return best; 627 } 628 629 /* this is the core of the algorithm: calculate the "best" table */ 630 static void optimize_result(void) 631 { 632 struct token *best; 633 int i; 634 635 /* using the '\0' symbol last allows compress_symbols to use standard 636 * fast string functions */ 637 for (i = 255; i >= 0; i--) { 638 639 /* if this table slot is empty (it is not used by an actual 640 * original char code */ 641 if (!best_table_len[i]) { 642 643 /* find the token with the breates profit value */ 644 best = find_best_token(); 645 646 /* place it in the "best" table */ 647 best_table_len[i] = best->len; 648 memcpy(best_table[i], best->data, best_table_len[i]); 649 /* zero terminate the token so that we can use strstr 650 in compress_symbols */ 651 best_table[i][best_table_len[i]]='\0'; 652 653 /* replace this token in all the valid symbols */ 654 compress_symbols(best_table[i], best_table_len[i], i); 655 } 656 } 657 } 658 659 /* start by placing the symbols that are actually used on the table */ 660 static void insert_real_symbols_in_table(void) 661 { 662 int i, j, c; 663 664 memset(best_table, 0, sizeof(best_table)); 665 memset(best_table_len, 0, sizeof(best_table_len)); 666 667 for (i = 0; i < cnt; i++) { 668 if (table[i].flags & SYM_FLAG_VALID) { 669 for (j = 0; j < table[i].len; j++) { 670 c = table[i].sym[j]; 671 best_table[c][0]=c; 672 best_table_len[c]=1; 673 } 674 } 675 } 676 } 677 678 static void optimize_token_table(void) 679 { 680 memset(hash_table, 0, sizeof(hash_table)); 681 682 good_head.left = &good_head; 683 good_head.right = &good_head; 684 685 bad_head.left = &bad_head; 686 bad_head.right = &bad_head; 687 688 build_initial_tok_table(); 689 690 insert_real_symbols_in_table(); 691 692 /* When valid symbol is not registered, exit to error */ 693 if (good_head.left == good_head.right && 694 bad_head.left == bad_head.right) { 695 fprintf(stderr, "No valid symbol.\n"); 696 exit(1); 697 } 698 699 optimize_result(); 700 } 701 702 703 int 704 main(int argc, char **argv) 705 { 706 if (argc >= 2) { 707 int i; 708 for (i = 1; i < argc; i++) { 709 if(strcmp(argv[i], "--all-symbols") == 0) 710 all_symbols = 1; 711 else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) { 712 char *p = &argv[i][16]; 713 /* skip quote */ 714 if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) 715 p++; 716 symbol_prefix_char = *p; 717 } else 718 usage(); 719 } 720 } else if (argc != 1) 721 usage(); 722 723 read_map(stdin); 724 optimize_token_table(); 725 write_src(); 726 727 return 0; 728 } 729