1 /* Generate assembler source containing symbol information 2 * 3 * Copyright 2002 by Kai Germaschewski 4 * 5 * This software may be used and distributed according to the terms 6 * of the GNU General Public License, incorporated herein by reference. 7 * 8 * Usage: kallsyms [--all-symbols] in.map > out.S 9 * 10 * Table compression uses all the unused char codes on the symbols and 11 * maps these to the most used substrings (tokens). For instance, it might 12 * map char code 0xF7 to represent "write_" and then in every symbol where 13 * "write_" appears it can be replaced by 0xF7, saving 5 bytes. 14 * The used codes themselves are also placed in the table so that the 15 * decompresion can work without "special cases". 16 * Applied to kernel symbols, this usually produces a compression ratio 17 * of about 50%. 18 * 19 */ 20 21 #include <errno.h> 22 #include <getopt.h> 23 #include <stdbool.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <ctype.h> 28 #include <limits.h> 29 30 #include <xalloc.h> 31 32 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 33 34 #define KSYM_NAME_LEN 512 35 36 struct sym_entry { 37 unsigned long long addr; 38 unsigned int len; 39 unsigned int seq; 40 unsigned char sym[]; 41 }; 42 43 struct addr_range { 44 const char *start_sym, *end_sym; 45 unsigned long long start, end; 46 }; 47 48 static unsigned long long _text; 49 static struct addr_range text_ranges[] = { 50 { "_stext", "_etext" }, 51 { "_sinittext", "_einittext" }, 52 }; 53 #define text_range_text (&text_ranges[0]) 54 #define text_range_inittext (&text_ranges[1]) 55 56 static struct sym_entry **table; 57 static unsigned int table_size, table_cnt; 58 static int all_symbols; 59 static int pc_relative; 60 61 static int token_profit[0x10000]; 62 63 /* the table that holds the result of the compression */ 64 static unsigned char best_table[256][2]; 65 static unsigned char best_table_len[256]; 66 67 68 static void usage(void) 69 { 70 fprintf(stderr, "Usage: kallsyms [--all-symbols] in.map > out.S\n"); 71 exit(1); 72 } 73 74 static char *sym_name(const struct sym_entry *s) 75 { 76 return (char *)s->sym + 1; 77 } 78 79 static bool is_ignored_symbol(const char *name, char type) 80 { 81 if (type == 'u' || type == 'n') 82 return true; 83 84 if (toupper(type) == 'A') { 85 /* Keep these useful absolute symbols */ 86 if (strcmp(name, "__kernel_syscall_via_break") && 87 strcmp(name, "__kernel_syscall_via_epc") && 88 strcmp(name, "__kernel_sigtramp") && 89 strcmp(name, "__gp")) 90 return true; 91 } 92 93 return false; 94 } 95 96 static void check_symbol_range(const char *sym, unsigned long long addr, 97 struct addr_range *ranges, int entries) 98 { 99 size_t i; 100 struct addr_range *ar; 101 102 for (i = 0; i < entries; ++i) { 103 ar = &ranges[i]; 104 105 if (strcmp(sym, ar->start_sym) == 0) { 106 ar->start = addr; 107 return; 108 } else if (strcmp(sym, ar->end_sym) == 0) { 109 ar->end = addr; 110 return; 111 } 112 } 113 } 114 115 static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) 116 { 117 char *name, type, *p; 118 unsigned long long addr; 119 size_t len; 120 ssize_t readlen; 121 struct sym_entry *sym; 122 123 errno = 0; 124 readlen = getline(buf, buf_len, in); 125 if (readlen < 0) { 126 if (errno) { 127 perror("read_symbol"); 128 exit(EXIT_FAILURE); 129 } 130 return NULL; 131 } 132 133 if ((*buf)[readlen - 1] == '\n') 134 (*buf)[readlen - 1] = 0; 135 136 addr = strtoull(*buf, &p, 16); 137 138 if (*buf == p || *p++ != ' ' || !isascii((type = *p++)) || *p++ != ' ') { 139 fprintf(stderr, "line format error\n"); 140 exit(EXIT_FAILURE); 141 } 142 143 name = p; 144 len = strlen(name); 145 146 if (len >= KSYM_NAME_LEN) { 147 fprintf(stderr, "Symbol %s too long for kallsyms (%zu >= %d).\n" 148 "Please increase KSYM_NAME_LEN both in kernel and kallsyms.c\n", 149 name, len, KSYM_NAME_LEN); 150 return NULL; 151 } 152 153 if (strcmp(name, "_text") == 0) 154 _text = addr; 155 156 /* Ignore most absolute/undefined (?) symbols. */ 157 if (is_ignored_symbol(name, type)) 158 return NULL; 159 160 check_symbol_range(name, addr, text_ranges, ARRAY_SIZE(text_ranges)); 161 162 /* include the type field in the symbol name, so that it gets 163 * compressed together */ 164 len++; 165 166 sym = xmalloc(sizeof(*sym) + len + 1); 167 sym->addr = addr; 168 sym->len = len; 169 sym->sym[0] = type; 170 strcpy(sym_name(sym), name); 171 172 return sym; 173 } 174 175 static int symbol_in_range(const struct sym_entry *s, 176 const struct addr_range *ranges, int entries) 177 { 178 size_t i; 179 const struct addr_range *ar; 180 181 for (i = 0; i < entries; ++i) { 182 ar = &ranges[i]; 183 184 if (s->addr >= ar->start && s->addr <= ar->end) 185 return 1; 186 } 187 188 return 0; 189 } 190 191 static bool string_starts_with(const char *s, const char *prefix) 192 { 193 return strncmp(s, prefix, strlen(prefix)) == 0; 194 } 195 196 static int symbol_valid(const struct sym_entry *s) 197 { 198 const char *name = sym_name(s); 199 200 /* if --all-symbols is not specified, then symbols outside the text 201 * and inittext sections are discarded */ 202 if (!all_symbols) { 203 /* 204 * Symbols starting with __start and __stop are used to denote 205 * section boundaries, and should always be included: 206 */ 207 if (string_starts_with(name, "__start_") || 208 string_starts_with(name, "__stop_")) 209 return 1; 210 211 if (symbol_in_range(s, text_ranges, 212 ARRAY_SIZE(text_ranges)) == 0) 213 return 0; 214 /* Corner case. Discard any symbols with the same value as 215 * _etext _einittext; they can move between pass 1 and 2 when 216 * the kallsyms data are added. If these symbols move then 217 * they may get dropped in pass 2, which breaks the kallsyms 218 * rules. 219 */ 220 if ((s->addr == text_range_text->end && 221 strcmp(name, text_range_text->end_sym)) || 222 (s->addr == text_range_inittext->end && 223 strcmp(name, text_range_inittext->end_sym))) 224 return 0; 225 } 226 227 return 1; 228 } 229 230 /* remove all the invalid symbols from the table */ 231 static void shrink_table(void) 232 { 233 unsigned int i, pos; 234 235 pos = 0; 236 for (i = 0; i < table_cnt; i++) { 237 if (symbol_valid(table[i])) { 238 if (pos != i) 239 table[pos] = table[i]; 240 pos++; 241 } else { 242 free(table[i]); 243 } 244 } 245 table_cnt = pos; 246 } 247 248 static void read_map(const char *in) 249 { 250 FILE *fp; 251 struct sym_entry *sym; 252 char *buf = NULL; 253 size_t buflen = 0; 254 255 fp = fopen(in, "r"); 256 if (!fp) { 257 perror(in); 258 exit(1); 259 } 260 261 while (!feof(fp)) { 262 sym = read_symbol(fp, &buf, &buflen); 263 if (!sym) 264 continue; 265 266 sym->seq = table_cnt; 267 268 if (table_cnt >= table_size) { 269 table_size += 10000; 270 table = xrealloc(table, sizeof(*table) * table_size); 271 } 272 273 table[table_cnt++] = sym; 274 } 275 276 free(buf); 277 fclose(fp); 278 } 279 280 static void output_label(const char *label) 281 { 282 printf(".globl %s\n", label); 283 printf("\t.balign 4\n"); 284 printf("%s:\n", label); 285 } 286 287 /* uncompress a compressed symbol. When this function is called, the best table 288 * might still be compressed itself, so the function needs to be recursive */ 289 static int expand_symbol(const unsigned char *data, int len, char *result) 290 { 291 int c, rlen, total=0; 292 293 while (len) { 294 c = *data; 295 /* if the table holds a single char that is the same as the one 296 * we are looking for, then end the search */ 297 if (best_table[c][0]==c && best_table_len[c]==1) { 298 *result++ = c; 299 total++; 300 } else { 301 /* if not, recurse and expand */ 302 rlen = expand_symbol(best_table[c], best_table_len[c], result); 303 total += rlen; 304 result += rlen; 305 } 306 data++; 307 len--; 308 } 309 *result=0; 310 311 return total; 312 } 313 314 static int compare_names(const void *a, const void *b) 315 { 316 int ret; 317 const struct sym_entry *sa = *(const struct sym_entry **)a; 318 const struct sym_entry *sb = *(const struct sym_entry **)b; 319 320 ret = strcmp(sym_name(sa), sym_name(sb)); 321 if (!ret) { 322 if (sa->addr > sb->addr) 323 return 1; 324 else if (sa->addr < sb->addr) 325 return -1; 326 327 /* keep old order */ 328 return (int)(sa->seq - sb->seq); 329 } 330 331 return ret; 332 } 333 334 static void sort_symbols_by_name(void) 335 { 336 qsort(table, table_cnt, sizeof(table[0]), compare_names); 337 } 338 339 static void write_src(void) 340 { 341 unsigned int i, k, off; 342 unsigned int best_idx[256]; 343 unsigned int *markers, markers_cnt; 344 char buf[KSYM_NAME_LEN]; 345 346 printf("\t.section .rodata, \"a\"\n"); 347 348 output_label("kallsyms_num_syms"); 349 printf("\t.long\t%u\n", table_cnt); 350 printf("\n"); 351 352 /* table of offset markers, that give the offset in the compressed stream 353 * every 256 symbols */ 354 markers_cnt = (table_cnt + 255) / 256; 355 markers = xmalloc(sizeof(*markers) * markers_cnt); 356 357 output_label("kallsyms_names"); 358 off = 0; 359 for (i = 0; i < table_cnt; i++) { 360 if ((i & 0xFF) == 0) 361 markers[i >> 8] = off; 362 table[i]->seq = i; 363 364 /* There cannot be any symbol of length zero. */ 365 if (table[i]->len == 0) { 366 fprintf(stderr, "kallsyms failure: " 367 "unexpected zero symbol length\n"); 368 exit(EXIT_FAILURE); 369 } 370 371 /* Only lengths that fit in up-to-two-byte ULEB128 are supported. */ 372 if (table[i]->len > 0x3FFF) { 373 fprintf(stderr, "kallsyms failure: " 374 "unexpected huge symbol length\n"); 375 exit(EXIT_FAILURE); 376 } 377 378 /* Encode length with ULEB128. */ 379 if (table[i]->len <= 0x7F) { 380 /* Most symbols use a single byte for the length. */ 381 printf("\t.byte 0x%02x", table[i]->len); 382 off += table[i]->len + 1; 383 } else { 384 /* "Big" symbols use two bytes. */ 385 printf("\t.byte 0x%02x, 0x%02x", 386 (table[i]->len & 0x7F) | 0x80, 387 (table[i]->len >> 7) & 0x7F); 388 off += table[i]->len + 2; 389 } 390 for (k = 0; k < table[i]->len; k++) 391 printf(", 0x%02x", table[i]->sym[k]); 392 393 /* 394 * Now that we wrote out the compressed symbol name, restore the 395 * original name and print it in the comment. 396 */ 397 expand_symbol(table[i]->sym, table[i]->len, buf); 398 strcpy((char *)table[i]->sym, buf); 399 printf("\t/* %s */\n", table[i]->sym); 400 } 401 printf("\n"); 402 403 output_label("kallsyms_markers"); 404 for (i = 0; i < markers_cnt; i++) 405 printf("\t.long\t%u\n", markers[i]); 406 printf("\n"); 407 408 free(markers); 409 410 output_label("kallsyms_token_table"); 411 off = 0; 412 for (i = 0; i < 256; i++) { 413 best_idx[i] = off; 414 expand_symbol(best_table[i], best_table_len[i], buf); 415 printf("\t.asciz\t\"%s\"\n", buf); 416 off += strlen(buf) + 1; 417 } 418 printf("\n"); 419 420 output_label("kallsyms_token_index"); 421 for (i = 0; i < 256; i++) 422 printf("\t.short\t%d\n", best_idx[i]); 423 printf("\n"); 424 425 output_label("kallsyms_offsets"); 426 427 for (i = 0; i < table_cnt; i++) { 428 if (pc_relative) { 429 long long offset = table[i]->addr - _text; 430 431 if (offset < INT_MIN || offset > INT_MAX) { 432 fprintf(stderr, "kallsyms failure: " 433 "relative symbol value %#llx out of range\n", 434 table[i]->addr); 435 exit(EXIT_FAILURE); 436 } 437 printf("\t.long\t_text - . + (%d)\t/* %s */\n", 438 (int)offset, table[i]->sym); 439 } else { 440 printf("\t.long\t%#x\t/* %s */\n", 441 (unsigned int)table[i]->addr, table[i]->sym); 442 } 443 } 444 printf("\n"); 445 446 sort_symbols_by_name(); 447 output_label("kallsyms_seqs_of_names"); 448 for (i = 0; i < table_cnt; i++) 449 printf("\t.byte 0x%02x, 0x%02x, 0x%02x\t/* %s */\n", 450 (unsigned char)(table[i]->seq >> 16), 451 (unsigned char)(table[i]->seq >> 8), 452 (unsigned char)(table[i]->seq >> 0), 453 table[i]->sym); 454 printf("\n"); 455 } 456 457 458 /* table lookup compression functions */ 459 460 /* count all the possible tokens in a symbol */ 461 static void learn_symbol(const unsigned char *symbol, int len) 462 { 463 int i; 464 465 for (i = 0; i < len - 1; i++) 466 token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++; 467 } 468 469 /* decrease the count for all the possible tokens in a symbol */ 470 static void forget_symbol(const unsigned char *symbol, int len) 471 { 472 int i; 473 474 for (i = 0; i < len - 1; i++) 475 token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--; 476 } 477 478 /* do the initial token count */ 479 static void build_initial_token_table(void) 480 { 481 unsigned int i; 482 483 for (i = 0; i < table_cnt; i++) 484 learn_symbol(table[i]->sym, table[i]->len); 485 } 486 487 static unsigned char *find_token(unsigned char *str, int len, 488 const unsigned char *token) 489 { 490 int i; 491 492 for (i = 0; i < len - 1; i++) { 493 if (str[i] == token[0] && str[i+1] == token[1]) 494 return &str[i]; 495 } 496 return NULL; 497 } 498 499 /* replace a given token in all the valid symbols. Use the sampled symbols 500 * to update the counts */ 501 static void compress_symbols(const unsigned char *str, int idx) 502 { 503 unsigned int i, len, size; 504 unsigned char *p1, *p2; 505 506 for (i = 0; i < table_cnt; i++) { 507 508 len = table[i]->len; 509 p1 = table[i]->sym; 510 511 /* find the token on the symbol */ 512 p2 = find_token(p1, len, str); 513 if (!p2) continue; 514 515 /* decrease the counts for this symbol's tokens */ 516 forget_symbol(table[i]->sym, len); 517 518 size = len; 519 520 do { 521 *p2 = idx; 522 p2++; 523 size -= (p2 - p1); 524 memmove(p2, p2 + 1, size); 525 p1 = p2; 526 len--; 527 528 if (size < 2) break; 529 530 /* find the token on the symbol */ 531 p2 = find_token(p1, size, str); 532 533 } while (p2); 534 535 table[i]->len = len; 536 537 /* increase the counts for this symbol's new tokens */ 538 learn_symbol(table[i]->sym, len); 539 } 540 } 541 542 /* search the token with the maximum profit */ 543 static int find_best_token(void) 544 { 545 int i, best, bestprofit; 546 547 bestprofit=-10000; 548 best = 0; 549 550 for (i = 0; i < 0x10000; i++) { 551 if (token_profit[i] > bestprofit) { 552 best = i; 553 bestprofit = token_profit[i]; 554 } 555 } 556 return best; 557 } 558 559 /* this is the core of the algorithm: calculate the "best" table */ 560 static void optimize_result(void) 561 { 562 int i, best; 563 564 /* using the '\0' symbol last allows compress_symbols to use standard 565 * fast string functions */ 566 for (i = 255; i >= 0; i--) { 567 568 /* if this table slot is empty (it is not used by an actual 569 * original char code */ 570 if (!best_table_len[i]) { 571 572 /* find the token with the best profit value */ 573 best = find_best_token(); 574 if (token_profit[best] == 0) 575 break; 576 577 /* place it in the "best" table */ 578 best_table_len[i] = 2; 579 best_table[i][0] = best & 0xFF; 580 best_table[i][1] = (best >> 8) & 0xFF; 581 582 /* replace this token in all the valid symbols */ 583 compress_symbols(best_table[i], i); 584 } 585 } 586 } 587 588 /* start by placing the symbols that are actually used on the table */ 589 static void insert_real_symbols_in_table(void) 590 { 591 unsigned int i, j, c; 592 593 for (i = 0; i < table_cnt; i++) { 594 for (j = 0; j < table[i]->len; j++) { 595 c = table[i]->sym[j]; 596 best_table[c][0]=c; 597 best_table_len[c]=1; 598 } 599 } 600 } 601 602 static void optimize_token_table(void) 603 { 604 build_initial_token_table(); 605 606 insert_real_symbols_in_table(); 607 608 optimize_result(); 609 } 610 611 /* guess for "linker script provide" symbol */ 612 static int may_be_linker_script_provide_symbol(const struct sym_entry *se) 613 { 614 const char *symbol = sym_name(se); 615 int len = se->len - 1; 616 617 if (len < 8) 618 return 0; 619 620 if (symbol[0] != '_' || symbol[1] != '_') 621 return 0; 622 623 /* __start_XXXXX */ 624 if (!memcmp(symbol + 2, "start_", 6)) 625 return 1; 626 627 /* __stop_XXXXX */ 628 if (!memcmp(symbol + 2, "stop_", 5)) 629 return 1; 630 631 /* __end_XXXXX */ 632 if (!memcmp(symbol + 2, "end_", 4)) 633 return 1; 634 635 /* __XXXXX_start */ 636 if (!memcmp(symbol + len - 6, "_start", 6)) 637 return 1; 638 639 /* __XXXXX_end */ 640 if (!memcmp(symbol + len - 4, "_end", 4)) 641 return 1; 642 643 return 0; 644 } 645 646 static int compare_symbols(const void *a, const void *b) 647 { 648 const struct sym_entry *sa = *(const struct sym_entry **)a; 649 const struct sym_entry *sb = *(const struct sym_entry **)b; 650 int wa, wb; 651 652 /* sort by address first */ 653 if (sa->addr > sb->addr) 654 return 1; 655 if (sa->addr < sb->addr) 656 return -1; 657 658 /* sort by "weakness" type */ 659 wa = (sa->sym[0] == 'w') || (sa->sym[0] == 'W'); 660 wb = (sb->sym[0] == 'w') || (sb->sym[0] == 'W'); 661 if (wa != wb) 662 return wa - wb; 663 664 /* sort by "linker script provide" type */ 665 wa = may_be_linker_script_provide_symbol(sa); 666 wb = may_be_linker_script_provide_symbol(sb); 667 if (wa != wb) 668 return wa - wb; 669 670 /* sort by the number of prefix underscores */ 671 wa = strspn(sym_name(sa), "_"); 672 wb = strspn(sym_name(sb), "_"); 673 if (wa != wb) 674 return wa - wb; 675 676 /* sort by initial order, so that other symbols are left undisturbed */ 677 return sa->seq - sb->seq; 678 } 679 680 static void sort_symbols(void) 681 { 682 qsort(table, table_cnt, sizeof(table[0]), compare_symbols); 683 } 684 685 int main(int argc, char **argv) 686 { 687 while (1) { 688 static const struct option long_options[] = { 689 {"all-symbols", no_argument, &all_symbols, 1}, 690 {"pc-relative", no_argument, &pc_relative, 1}, 691 {}, 692 }; 693 694 int c = getopt_long(argc, argv, "", long_options, NULL); 695 696 if (c == -1) 697 break; 698 if (c != 0) 699 usage(); 700 } 701 702 if (optind >= argc) 703 usage(); 704 705 read_map(argv[optind]); 706 shrink_table(); 707 sort_symbols(); 708 optimize_token_table(); 709 write_src(); 710 711 return 0; 712 } 713