1 // SPDX-License-Identifier: GPL-2.0 2 #include <errno.h> 3 #include <signal.h> 4 #include <inttypes.h> 5 #include <linux/err.h> 6 #include <linux/kernel.h> 7 #include <linux/zalloc.h> 8 #include <api/fs/fs.h> 9 10 #include <byteswap.h> 11 #include <unistd.h> 12 #include <sys/types.h> 13 #include <sys/mman.h> 14 #include <perf/cpumap.h> 15 #include <perf/event.h> 16 17 #include "map_symbol.h" 18 #include "branch.h" 19 #include "debug.h" 20 #include "dwarf-regs.h" 21 #include "env.h" 22 #include "evlist.h" 23 #include "evsel.h" 24 #include "memswap.h" 25 #include "map.h" 26 #include "symbol.h" 27 #include "session.h" 28 #include "tool.h" 29 #include "perf_regs.h" 30 #include "asm/bug.h" 31 #include "auxtrace.h" 32 #include "thread.h" 33 #include "thread-stack.h" 34 #include "sample-raw.h" 35 #include "stat.h" 36 #include "tsc.h" 37 #include "ui/progress.h" 38 #include "util.h" 39 #include "arch/common.h" 40 #include "units.h" 41 #include "annotate.h" 42 #include "perf.h" 43 #include <internal/lib.h> 44 45 static int perf_session__deliver_event(struct perf_session *session, 46 union perf_event *event, 47 const struct perf_tool *tool, 48 u64 file_offset, 49 const char *file_path); 50 51 static int perf_session__open(struct perf_session *session) 52 { 53 struct perf_data *data = session->data; 54 55 if (perf_session__read_header(session) < 0) { 56 pr_err("incompatible file format (rerun with -v to learn more)\n"); 57 return -1; 58 } 59 60 if (perf_header__has_feat(&session->header, HEADER_AUXTRACE)) { 61 /* Auxiliary events may reference exited threads, hold onto dead ones. 
*/ 62 symbol_conf.keep_exited_threads = true; 63 } 64 65 if (perf_data__is_pipe(data)) 66 return 0; 67 68 if (perf_header__has_feat(&session->header, HEADER_STAT)) 69 return 0; 70 71 if (!evlist__valid_sample_type(session->evlist)) { 72 pr_err("non matching sample_type\n"); 73 return -1; 74 } 75 76 if (!evlist__valid_sample_id_all(session->evlist)) { 77 pr_err("non matching sample_id_all\n"); 78 return -1; 79 } 80 81 if (!evlist__valid_read_format(session->evlist)) { 82 pr_err("non matching read_format\n"); 83 return -1; 84 } 85 86 return 0; 87 } 88 89 void perf_session__set_id_hdr_size(struct perf_session *session) 90 { 91 u16 id_hdr_size = evlist__id_hdr_size(session->evlist); 92 93 machines__set_id_hdr_size(&session->machines, id_hdr_size); 94 } 95 96 int perf_session__create_kernel_maps(struct perf_session *session) 97 { 98 int ret = machine__create_kernel_maps(&session->machines.host); 99 100 if (ret >= 0) 101 ret = machines__create_guest_kernel_maps(&session->machines); 102 return ret; 103 } 104 105 static void perf_session__destroy_kernel_maps(struct perf_session *session) 106 { 107 machines__destroy_kernel_maps(&session->machines); 108 } 109 110 static bool perf_session__has_comm_exec(struct perf_session *session) 111 { 112 struct evsel *evsel; 113 114 evlist__for_each_entry(session->evlist, evsel) { 115 if (evsel->core.attr.comm_exec) 116 return true; 117 } 118 119 return false; 120 } 121 122 static void perf_session__set_comm_exec(struct perf_session *session) 123 { 124 bool comm_exec = perf_session__has_comm_exec(session); 125 126 machines__set_comm_exec(&session->machines, comm_exec); 127 } 128 129 static int ordered_events__deliver_event(struct ordered_events *oe, 130 struct ordered_event *event) 131 { 132 struct perf_session *session = container_of(oe, struct perf_session, 133 ordered_events); 134 int ret = perf_session__deliver_event(session, event->event, 135 session->tool, event->file_offset, 136 event->file_path); 137 138 if (ret) { 139 
pr_err("%#" PRIx64 " [%#x]: ordered event processing failed (%d) for event of type: %s (%d)\n", 140 event->file_offset, event->event->header.size, ret, 141 perf_event__name(event->event->header.type), 142 event->event->header.type); 143 } 144 return ret; 145 } 146 147 struct perf_session *__perf_session__new(struct perf_data *data, 148 struct perf_tool *tool, 149 bool trace_event_repipe, 150 struct perf_env *host_env) 151 { 152 int ret = -ENOMEM; 153 struct perf_session *session = zalloc(sizeof(*session)); 154 155 if (!session) 156 goto out; 157 158 session->trace_event_repipe = trace_event_repipe; 159 session->tool = tool; 160 session->decomp_data.zstd_decomp = &session->zstd_data; 161 session->active_decomp = &session->decomp_data; 162 INIT_LIST_HEAD(&session->auxtrace_index); 163 machines__init(&session->machines); 164 ordered_events__init(&session->ordered_events, 165 ordered_events__deliver_event, NULL); 166 167 perf_env__init(&session->header.env); 168 if (data) { 169 ret = perf_data__open(data); 170 if (ret < 0) 171 goto out_delete; 172 173 session->data = data; 174 175 if (perf_data__is_read(data)) { 176 ret = perf_session__open(session); 177 if (ret < 0) 178 goto out_delete; 179 180 /* 181 * set session attributes that are present in perf.data 182 * but not in pipe-mode. 183 */ 184 if (!data->is_pipe) { 185 perf_session__set_id_hdr_size(session); 186 perf_session__set_comm_exec(session); 187 } 188 189 evlist__init_trace_event_sample_raw(session->evlist, &session->header.env); 190 191 /* Open the directory data. 
*/ 192 if (data->is_dir) { 193 ret = perf_data__open_dir(data); 194 if (ret) 195 goto out_delete; 196 } 197 198 if (!symbol_conf.kallsyms_name && 199 !symbol_conf.vmlinux_name) 200 symbol_conf.kallsyms_name = perf_data__kallsyms_name(data); 201 } 202 } else { 203 assert(host_env != NULL); 204 session->machines.host.env = host_env; 205 } 206 if (session->evlist) 207 session->evlist->session = session; 208 209 session->machines.host.single_address_space = 210 perf_env__single_address_space(session->machines.host.env); 211 212 if (!data || perf_data__is_write(data)) { 213 /* 214 * In O_RDONLY mode this will be performed when reading the 215 * kernel MMAP event, in perf_event__process_mmap(). 216 */ 217 if (perf_session__create_kernel_maps(session) < 0) 218 pr_warning("Cannot read kernel map\n"); 219 } 220 221 /* 222 * In pipe-mode, evlist is empty until PERF_RECORD_HEADER_ATTR is 223 * processed, so evlist__sample_id_all is not meaningful here. 224 */ 225 if ((!data || !data->is_pipe) && tool && tool->ordering_requires_timestamps && 226 tool->ordered_events && !evlist__sample_id_all(session->evlist)) { 227 dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); 228 tool->ordered_events = false; 229 } 230 231 return session; 232 233 out_delete: 234 perf_session__delete(session); 235 out: 236 return ERR_PTR(ret); 237 } 238 239 static void perf_decomp__release_events(struct decomp *next) 240 { 241 struct decomp *decomp; 242 size_t mmap_len; 243 244 do { 245 decomp = next; 246 if (decomp == NULL) 247 break; 248 next = decomp->next; 249 mmap_len = decomp->mmap_len; 250 munmap(decomp, mmap_len); 251 } while (1); 252 } 253 254 void perf_session__delete(struct perf_session *session) 255 { 256 if (session == NULL) 257 return; 258 auxtrace__free(session); 259 auxtrace_index__free(&session->auxtrace_index); 260 debuginfo_cache__delete(); 261 perf_session__destroy_kernel_maps(session); 262 perf_decomp__release_events(session->decomp_data.decomp); 
263 perf_env__exit(&session->header.env); 264 machines__exit(&session->machines); 265 if (session->data) { 266 if (perf_data__is_read(session->data)) 267 evlist__delete(session->evlist); 268 perf_data__close(session->data); 269 } 270 #ifdef HAVE_LIBTRACEEVENT 271 trace_event__cleanup(&session->tevent); 272 #endif 273 free(session); 274 } 275 276 static void swap_sample_id_all(union perf_event *event, void *data) 277 { 278 void *end = (void *) event + event->header.size; 279 int size; 280 281 if (data >= end) 282 return; 283 284 size = end - data; 285 if (size % sizeof(u64)) { 286 pr_warning("swap_sample_id_all: unaligned sample_id_all remainder (%d), skipping swap\n", size); 287 return; 288 } 289 if (size > 0) 290 mem_bswap_64(data, size); 291 } 292 293 static int perf_event__all64_swap(union perf_event *event, 294 bool sample_id_all __maybe_unused) 295 { 296 struct perf_event_header *hdr = &event->header; 297 size_t size = event->header.size - sizeof(*hdr); 298 299 /* mem_bswap_64 rounds up to 8-byte chunks — unaligned size overruns the buffer */ 300 if (size % sizeof(u64)) 301 return -1; 302 mem_bswap_64(hdr + 1, size); 303 return 0; 304 } 305 306 static int perf_event__comm_swap(union perf_event *event, bool sample_id_all) 307 { 308 event->comm.pid = bswap_32(event->comm.pid); 309 event->comm.tid = bswap_32(event->comm.tid); 310 311 if (sample_id_all) { 312 void *data = &event->comm.comm; 313 void *end = (void *)event + event->header.size; 314 size_t len = strnlen(data, end - data); 315 316 /* 317 * No NUL within the event boundary — can't locate where 318 * sample_id_all starts. Reject so the event is skipped 319 * rather than swapping garbage. 
320 */ 321 if (len == (size_t)(end - data)) 322 return -1; 323 data += PERF_ALIGN(len + 1, sizeof(u64)); 324 swap_sample_id_all(event, data); 325 } 326 return 0; 327 } 328 329 static int perf_event__mmap_swap(union perf_event *event, 330 bool sample_id_all) 331 { 332 event->mmap.pid = bswap_32(event->mmap.pid); 333 event->mmap.tid = bswap_32(event->mmap.tid); 334 event->mmap.start = bswap_64(event->mmap.start); 335 event->mmap.len = bswap_64(event->mmap.len); 336 event->mmap.pgoff = bswap_64(event->mmap.pgoff); 337 338 if (sample_id_all) { 339 void *data = &event->mmap.filename; 340 void *end = (void *)event + event->header.size; 341 size_t len = strnlen(data, end - data); 342 343 /* See comment in perf_event__comm_swap() */ 344 if (len == (size_t)(end - data)) 345 return -1; 346 data += PERF_ALIGN(len + 1, sizeof(u64)); 347 swap_sample_id_all(event, data); 348 } 349 return 0; 350 } 351 352 static int perf_event__mmap2_swap(union perf_event *event, 353 bool sample_id_all) 354 { 355 event->mmap2.pid = bswap_32(event->mmap2.pid); 356 event->mmap2.tid = bswap_32(event->mmap2.tid); 357 event->mmap2.start = bswap_64(event->mmap2.start); 358 event->mmap2.len = bswap_64(event->mmap2.len); 359 event->mmap2.pgoff = bswap_64(event->mmap2.pgoff); 360 361 if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { 362 event->mmap2.maj = bswap_32(event->mmap2.maj); 363 event->mmap2.min = bswap_32(event->mmap2.min); 364 event->mmap2.ino = bswap_64(event->mmap2.ino); 365 event->mmap2.ino_generation = bswap_64(event->mmap2.ino_generation); 366 } 367 368 if (sample_id_all) { 369 void *data = &event->mmap2.filename; 370 void *end = (void *)event + event->header.size; 371 size_t len = strnlen(data, end - data); 372 373 /* See comment in perf_event__comm_swap() */ 374 if (len == (size_t)(end - data)) 375 return -1; 376 data += PERF_ALIGN(len + 1, sizeof(u64)); 377 swap_sample_id_all(event, data); 378 } 379 return 0; 380 } 381 382 static int perf_event__task_swap(union perf_event 
*event, bool sample_id_all) 383 { 384 event->fork.pid = bswap_32(event->fork.pid); 385 event->fork.tid = bswap_32(event->fork.tid); 386 event->fork.ppid = bswap_32(event->fork.ppid); 387 event->fork.ptid = bswap_32(event->fork.ptid); 388 event->fork.time = bswap_64(event->fork.time); 389 390 if (sample_id_all) 391 swap_sample_id_all(event, &event->fork + 1); 392 return 0; 393 } 394 395 static int perf_event__read_swap(union perf_event *event, 396 bool sample_id_all __maybe_unused) 397 { 398 size_t tail; 399 400 event->read.pid = bswap_32(event->read.pid); 401 event->read.tid = bswap_32(event->read.tid); 402 /* 403 * Everything after pid/tid is u64: the read values (variable 404 * set determined by attr.read_format, which we don't have 405 * here) optionally followed by sample_id_all fields. 406 * Since all are u64, swap the entire remaining tail at once. 407 */ 408 tail = event->header.size - offsetof(struct perf_record_read, value); 409 /* mem_bswap_64 rounds up to 8-byte chunks — unaligned tail overruns the buffer */ 410 if (tail % sizeof(u64)) 411 return -1; 412 mem_bswap_64(&event->read.value, tail); 413 return 0; 414 } 415 416 static int perf_event__aux_swap(union perf_event *event, bool sample_id_all) 417 { 418 event->aux.aux_offset = bswap_64(event->aux.aux_offset); 419 event->aux.aux_size = bswap_64(event->aux.aux_size); 420 event->aux.flags = bswap_64(event->aux.flags); 421 422 if (sample_id_all) 423 swap_sample_id_all(event, &event->aux + 1); 424 return 0; 425 } 426 427 static int perf_event__itrace_start_swap(union perf_event *event, 428 bool sample_id_all) 429 { 430 event->itrace_start.pid = bswap_32(event->itrace_start.pid); 431 event->itrace_start.tid = bswap_32(event->itrace_start.tid); 432 433 if (sample_id_all) 434 swap_sample_id_all(event, &event->itrace_start + 1); 435 return 0; 436 } 437 438 static int perf_event__switch_swap(union perf_event *event, bool sample_id_all) 439 { 440 if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) { 441 
event->context_switch.next_prev_pid = 442 bswap_32(event->context_switch.next_prev_pid); 443 event->context_switch.next_prev_tid = 444 bswap_32(event->context_switch.next_prev_tid); 445 } 446 447 if (sample_id_all) { 448 /* 449 * PERF_RECORD_SWITCH has no fields beyond the header; 450 * SWITCH_CPU_WIDE adds pid/tid. Use the right offset 451 * so sample_id starts at the correct position. 452 */ 453 if (event->header.type == PERF_RECORD_SWITCH) 454 swap_sample_id_all(event, (void *)event + sizeof(event->header)); 455 else 456 swap_sample_id_all(event, &event->context_switch + 1); 457 } 458 return 0; 459 } 460 461 static int perf_event__text_poke_swap(union perf_event *event, bool sample_id_all) 462 { 463 event->text_poke.addr = bswap_64(event->text_poke.addr); 464 event->text_poke.old_len = bswap_16(event->text_poke.old_len); 465 event->text_poke.new_len = bswap_16(event->text_poke.new_len); 466 467 if (sample_id_all) { 468 void *data = &event->text_poke.old_len; 469 void *end = (void *)event + event->header.size; 470 size_t len = sizeof(event->text_poke.old_len) + 471 sizeof(event->text_poke.new_len) + 472 event->text_poke.old_len + 473 event->text_poke.new_len; 474 475 /* old_len + new_len exceeds event — can't find sample_id_all */ 476 if (data + len > end) 477 return -1; 478 data += PERF_ALIGN(len, sizeof(u64)); 479 swap_sample_id_all(event, data); 480 } 481 return 0; 482 } 483 484 static int perf_event__throttle_swap(union perf_event *event, 485 bool sample_id_all) 486 { 487 event->throttle.time = bswap_64(event->throttle.time); 488 event->throttle.id = bswap_64(event->throttle.id); 489 event->throttle.stream_id = bswap_64(event->throttle.stream_id); 490 491 if (sample_id_all) 492 swap_sample_id_all(event, &event->throttle + 1); 493 return 0; 494 } 495 496 static int perf_event__namespaces_swap(union perf_event *event, 497 bool sample_id_all) 498 { 499 u64 i, nr, max_nr; 500 501 event->namespaces.pid = bswap_32(event->namespaces.pid); 502 event->namespaces.tid = 
bswap_32(event->namespaces.tid); 503 event->namespaces.nr_namespaces = bswap_64(event->namespaces.nr_namespaces); 504 505 nr = event->namespaces.nr_namespaces; 506 /* 507 * Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof. 508 * When sample_id_all is present max_nr slightly overestimates the 509 * array space because header.size includes the trailing sample_id. 510 * Harmless: both the per-element bswap_64 loop and swap_sample_id_all() 511 * perform the same u64 byte swap, so the result is correct regardless 512 * of where the boundary between array and sample_id falls. 513 */ 514 max_nr = (event->header.size - sizeof(event->namespaces)) / 515 sizeof(event->namespaces.link_info[0]); 516 /* 517 * Safe to clamp: each namespace entry is indexed by type; 518 * missing entries just won't be resolved. 519 */ 520 if (nr > max_nr) { 521 pr_warning("WARNING: PERF_RECORD_NAMESPACES: nr_namespaces %" PRIu64 " exceeds payload (max %" PRIu64 "), clamping\n", 522 nr, max_nr); 523 nr = max_nr; 524 event->namespaces.nr_namespaces = nr; 525 } 526 527 for (i = 0; i < nr; i++) { 528 struct perf_ns_link_info *ns = &event->namespaces.link_info[i]; 529 530 ns->dev = bswap_64(ns->dev); 531 ns->ino = bswap_64(ns->ino); 532 } 533 534 if (sample_id_all) 535 swap_sample_id_all(event, &event->namespaces.link_info[i]); 536 return 0; 537 } 538 539 static int perf_event__cgroup_swap(union perf_event *event, bool sample_id_all) 540 { 541 event->cgroup.id = bswap_64(event->cgroup.id); 542 543 if (sample_id_all) { 544 void *data = &event->cgroup.path; 545 void *end = (void *)event + event->header.size; 546 size_t len = strnlen(data, end - data); 547 548 /* See comment in perf_event__comm_swap() */ 549 if (len == (size_t)(end - data)) 550 return -1; 551 data += PERF_ALIGN(len + 1, sizeof(u64)); 552 swap_sample_id_all(event, data); 553 } 554 return 0; 555 } 556 557 static u8 revbyte(u8 b) 558 { 559 int rev = (b >> 4) | ((b & 0xf) << 4); 560 rev = ((rev & 0xcc) >> 2) | ((rev & 
0x33) << 2); 561 rev = ((rev & 0xaa) >> 1) | ((rev & 0x55) << 1); 562 return (u8) rev; 563 } 564 565 /* 566 * XXX this is hack in attempt to carry flags bitfield 567 * through endian village. ABI says: 568 * 569 * Bit-fields are allocated from right to left (least to most significant) 570 * on little-endian implementations and from left to right (most to least 571 * significant) on big-endian implementations. 572 * 573 * The above seems to be byte specific, so we need to reverse each 574 * byte of the bitfield. 'Internet' also says this might be implementation 575 * specific and we probably need proper fix and carry perf_event_attr 576 * bitfield flags in separate data file FEAT_ section. Thought this seems 577 * to work for now. 578 */ 579 static void swap_bitfield(u8 *p, unsigned len) 580 { 581 unsigned i; 582 583 for (i = 0; i < len; i++) { 584 *p = revbyte(*p); 585 p++; 586 } 587 } 588 589 /* exported for swapping attributes in file header */ 590 void perf_event__attr_swap(struct perf_event_attr *attr) 591 { 592 attr->type = bswap_32(attr->type); 593 attr->size = bswap_32(attr->size); 594 595 /* 596 * ABI0: size == 0 means the producer didn't set it. 597 * Assume PERF_ATTR_SIZE_VER0 so bswap_safe() below 598 * correctly swaps the VER0 fields instead of skipping 599 * everything. Same convention as read_attr(). 
600 */ 601 if (!attr->size) 602 attr->size = PERF_ATTR_SIZE_VER0; 603 604 /* Verify the full field extent fits, not just its start offset */ 605 #define bswap_safe(f, n) \ 606 (attr->size >= (offsetof(struct perf_event_attr, f) + \ 607 sizeof(attr->f) * ((n) + 1))) 608 #define bswap_field(f, sz) \ 609 do { \ 610 if (bswap_safe(f, 0)) \ 611 attr->f = bswap_##sz(attr->f); \ 612 } while(0) 613 #define bswap_field_16(f) bswap_field(f, 16) 614 #define bswap_field_32(f) bswap_field(f, 32) 615 #define bswap_field_64(f) bswap_field(f, 64) 616 617 bswap_field_64(config); 618 bswap_field_64(sample_period); 619 bswap_field_64(sample_type); 620 bswap_field_64(read_format); 621 bswap_field_32(wakeup_events); 622 bswap_field_32(bp_type); 623 bswap_field_64(bp_addr); 624 bswap_field_64(bp_len); 625 bswap_field_64(branch_sample_type); 626 bswap_field_64(sample_regs_user); 627 bswap_field_32(sample_stack_user); 628 bswap_field_32(aux_watermark); 629 bswap_field_16(sample_max_stack); 630 bswap_field_32(aux_sample_size); 631 632 /* 633 * After read_format are bitfields. Check read_format because 634 * we are unable to use offsetof on bitfield. 635 */ 636 if (bswap_safe(read_format, 1)) 637 swap_bitfield((u8 *) (&attr->read_format + 1), 638 sizeof(u64)); 639 #undef bswap_field_64 640 #undef bswap_field_32 641 #undef bswap_field 642 #undef bswap_safe 643 } 644 645 static int perf_event__hdr_attr_swap(union perf_event *event, 646 bool sample_id_all __maybe_unused) 647 { 648 u32 attr_size, payload_size; 649 size_t size; 650 651 /* 652 * Validate attr.size (still foreign-endian) before calling 653 * perf_event__attr_swap(), which uses it via bswap_safe() 654 * to decide which fields to swap. A crafted attr.size 655 * larger than the event payload would swap past the event 656 * boundary and corrupt adjacent memory. 657 * 658 * header.size alignment is already validated by 659 * perf_session__process_event(). 
The min_size table 660 * guarantees header.size >= sizeof(header) + 661 * PERF_ATTR_SIZE_VER0, so attr.size is safe to access. 662 */ 663 attr_size = bswap_32(event->attr.attr.size); 664 /* 665 * ABI0: size field not set. This only happens in pipe/inject 666 * mode where HEADER_ATTR events carry their own attr. For 667 * regular perf.data files, read_attr() uses f_header.attr_size 668 * from the file header instead. Assume PERF_ATTR_SIZE_VER0. 669 */ 670 if (!attr_size) 671 attr_size = PERF_ATTR_SIZE_VER0; 672 payload_size = event->header.size - sizeof(event->header); 673 674 if (attr_size < PERF_ATTR_SIZE_VER0 || attr_size % sizeof(u64) || 675 attr_size > payload_size) { 676 pr_err("PERF_RECORD_HEADER_ATTR: invalid attr.size %u (min: %d, max: %u, 8-byte aligned)\n", 677 attr_size, PERF_ATTR_SIZE_VER0, payload_size); 678 return -1; 679 } 680 681 perf_event__attr_swap(&event->attr.attr); 682 683 size = event->header.size; 684 size -= perf_record_header_attr_id(event) - (void *)event; 685 mem_bswap_64(perf_record_header_attr_id(event), size); 686 return 0; 687 } 688 689 static int perf_event__build_id_swap(union perf_event *event, 690 bool sample_id_all) 691 { 692 event->build_id.pid = bswap_32(event->build_id.pid); 693 694 if (sample_id_all) { 695 void *data = &event->build_id.filename; 696 void *end = (void *)event + event->header.size; 697 size_t len = strnlen(data, end - data); 698 699 /* See comment in perf_event__comm_swap() */ 700 if (len == (size_t)(end - data)) 701 return -1; 702 data += PERF_ALIGN(len + 1, sizeof(u64)); 703 swap_sample_id_all(event, data); 704 } 705 return 0; 706 } 707 708 static int perf_event__event_update_swap(union perf_event *event, 709 bool sample_id_all __maybe_unused) 710 { 711 struct perf_record_event_update *ev = &event->event_update; 712 713 ev->type = bswap_64(ev->type); 714 ev->id = bswap_64(ev->id); 715 716 /* 717 * Swap variant-specific fields so the processing path 718 * sees native byte order. 
719 */ 720 if (ev->type == PERF_EVENT_UPDATE__SCALE) { 721 if (event->header.size < offsetof(struct perf_record_event_update, scale) + 722 sizeof(ev->scale)) 723 return -1; 724 mem_bswap_64(&ev->scale.scale, sizeof(ev->scale.scale)); 725 } else if (ev->type == PERF_EVENT_UPDATE__CPUS) { 726 u32 cpus_payload; 727 struct perf_record_cpu_map_data *data = &ev->cpus.cpus; 728 729 /* CPUS fields start at the same offset as scale (union) */ 730 if (event->header.size < offsetof(struct perf_record_event_update, cpus) + 731 sizeof(__u16) + sizeof(struct perf_record_range_cpu_map)) 732 return -1; 733 cpus_payload = event->header.size - offsetof(struct perf_record_event_update, cpus); 734 data->type = bswap_16(data->type); 735 /* 736 * Full swap including array elements — same logic as 737 * perf_event__cpu_map_swap() but scoped to the 738 * embedded cpu_map_data within EVENT_UPDATE. 739 */ 740 switch (data->type) { 741 case PERF_CPU_MAP__CPUS: { 742 u16 nr, max_nr; 743 744 data->cpus_data.nr = bswap_16(data->cpus_data.nr); 745 nr = data->cpus_data.nr; 746 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 747 cpus_data.cpu)) / 748 sizeof(data->cpus_data.cpu[0]); 749 if (nr > max_nr) { 750 nr = max_nr; 751 data->cpus_data.nr = nr; 752 } 753 for (unsigned int i = 0; i < nr; i++) 754 data->cpus_data.cpu[i] = bswap_16(data->cpus_data.cpu[i]); 755 break; 756 } 757 case PERF_CPU_MAP__MASK: 758 data->mask32_data.long_size = bswap_16(data->mask32_data.long_size); 759 switch (data->mask32_data.long_size) { 760 case 4: { 761 u16 nr, max_nr; 762 763 data->mask32_data.nr = bswap_16(data->mask32_data.nr); 764 nr = data->mask32_data.nr; 765 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 766 mask32_data.mask)) / 767 sizeof(data->mask32_data.mask[0]); 768 if (nr > max_nr) { 769 nr = max_nr; 770 data->mask32_data.nr = nr; 771 } 772 for (unsigned int i = 0; i < nr; i++) 773 data->mask32_data.mask[i] = bswap_32(data->mask32_data.mask[i]); 774 break; 775 } 
776 case 8: { 777 u16 nr, max_nr; 778 779 data->mask64_data.nr = bswap_16(data->mask64_data.nr); 780 nr = data->mask64_data.nr; 781 if (cpus_payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 782 data->mask64_data.nr = 0; 783 break; 784 } 785 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 786 mask64_data.mask)) / 787 sizeof(data->mask64_data.mask[0]); 788 if (nr > max_nr) { 789 nr = max_nr; 790 data->mask64_data.nr = nr; 791 } 792 for (unsigned int i = 0; i < nr; i++) 793 data->mask64_data.mask[i] = bswap_64(data->mask64_data.mask[i]); 794 break; 795 } 796 default: 797 break; 798 } 799 break; 800 case PERF_CPU_MAP__RANGE_CPUS: 801 data->range_cpu_data.start_cpu = bswap_16(data->range_cpu_data.start_cpu); 802 data->range_cpu_data.end_cpu = bswap_16(data->range_cpu_data.end_cpu); 803 break; 804 default: 805 break; 806 } 807 } 808 return 0; 809 } 810 811 static int perf_event__event_type_swap(union perf_event *event, 812 bool sample_id_all __maybe_unused) 813 { 814 event->event_type.event_type.event_id = 815 bswap_64(event->event_type.event_type.event_id); 816 return 0; 817 } 818 819 static int perf_event__tracing_data_swap(union perf_event *event, 820 bool sample_id_all __maybe_unused) 821 { 822 event->tracing_data.size = bswap_32(event->tracing_data.size); 823 return 0; 824 } 825 826 static int perf_event__auxtrace_info_swap(union perf_event *event, 827 bool sample_id_all __maybe_unused) 828 { 829 size_t size; 830 831 event->auxtrace_info.type = bswap_32(event->auxtrace_info.type); 832 833 size = event->header.size; 834 size -= (void *)&event->auxtrace_info.priv - (void *)event; 835 mem_bswap_64(event->auxtrace_info.priv, size); 836 return 0; 837 } 838 839 static int perf_event__auxtrace_swap(union perf_event *event, 840 bool sample_id_all __maybe_unused) 841 { 842 event->auxtrace.size = bswap_64(event->auxtrace.size); 843 event->auxtrace.offset = bswap_64(event->auxtrace.offset); 844 event->auxtrace.reference = 
bswap_64(event->auxtrace.reference); 845 event->auxtrace.idx = bswap_32(event->auxtrace.idx); 846 event->auxtrace.tid = bswap_32(event->auxtrace.tid); 847 event->auxtrace.cpu = bswap_32(event->auxtrace.cpu); 848 return 0; 849 } 850 851 static int perf_event__auxtrace_error_swap(union perf_event *event, 852 bool sample_id_all __maybe_unused) 853 { 854 event->auxtrace_error.type = bswap_32(event->auxtrace_error.type); 855 event->auxtrace_error.code = bswap_32(event->auxtrace_error.code); 856 event->auxtrace_error.cpu = bswap_32(event->auxtrace_error.cpu); 857 event->auxtrace_error.pid = bswap_32(event->auxtrace_error.pid); 858 event->auxtrace_error.tid = bswap_32(event->auxtrace_error.tid); 859 event->auxtrace_error.fmt = bswap_32(event->auxtrace_error.fmt); 860 event->auxtrace_error.ip = bswap_64(event->auxtrace_error.ip); 861 if (event->auxtrace_error.fmt) 862 event->auxtrace_error.time = bswap_64(event->auxtrace_error.time); 863 if (event->auxtrace_error.fmt >= 2) { 864 /* 865 * fmt >= 2 adds machine_pid and vcpu after msg[64]. 866 * Older files may have fmt >= 2 but an event size 867 * that doesn't include these fields — downgrade to 868 * avoid swapping out of bounds. 
869 */ 870 if (event->header.size < offsetof(typeof(event->auxtrace_error), vcpu) + 871 sizeof(event->auxtrace_error.vcpu)) { 872 pr_warning("WARNING: PERF_RECORD_AUXTRACE_ERROR: fmt %u but event too small for machine_pid/vcpu (%u bytes), downgrading fmt\n", 873 event->auxtrace_error.fmt, 874 event->header.size); 875 event->auxtrace_error.fmt = 1; 876 } else { 877 event->auxtrace_error.machine_pid = bswap_32(event->auxtrace_error.machine_pid); 878 event->auxtrace_error.vcpu = bswap_32(event->auxtrace_error.vcpu); 879 } 880 } 881 return 0; 882 } 883 884 static int perf_event__thread_map_swap(union perf_event *event, 885 bool sample_id_all __maybe_unused) 886 { 887 unsigned int i; 888 u64 nr; 889 890 event->thread_map.nr = bswap_64(event->thread_map.nr); 891 892 /* 893 * Reject rather than clamp: unlike namespaces (indexed by type) 894 * or stat_config (self-describing tags), a truncated thread map 895 * is structurally broken — downstream would get a wrong map. 896 */ 897 /* Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof */ 898 nr = event->thread_map.nr; 899 if (nr > (event->header.size - sizeof(event->thread_map)) / 900 sizeof(event->thread_map.entries[0])) 901 return -1; 902 903 for (i = 0; i < nr; i++) 904 event->thread_map.entries[i].pid = bswap_64(event->thread_map.entries[i].pid); 905 return 0; 906 } 907 908 static int perf_event__cpu_map_swap(union perf_event *event, 909 bool sample_id_all __maybe_unused) 910 { 911 struct perf_record_cpu_map_data *data = &event->cpu_map.data; 912 u32 payload = event->header.size - sizeof(event->header); 913 914 data->type = bswap_16(data->type); 915 916 /* 917 * Safe to clamp: a shorter CPU map just means some CPUs 918 * are absent; tools process the CPUs that are present. 
919 */ 920 switch (data->type) { 921 case PERF_CPU_MAP__CPUS: { 922 u16 nr, max_nr; 923 924 data->cpus_data.nr = bswap_16(data->cpus_data.nr); 925 nr = data->cpus_data.nr; 926 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 927 cpus_data.cpu)) / 928 sizeof(data->cpus_data.cpu[0]); 929 if (nr > max_nr) { 930 pr_warning("WARNING: PERF_RECORD_CPU_MAP: nr %u exceeds payload (max %u), clamping\n", 931 nr, max_nr); 932 nr = max_nr; 933 data->cpus_data.nr = nr; 934 } 935 for (unsigned int i = 0; i < nr; i++) 936 data->cpus_data.cpu[i] = bswap_16(data->cpus_data.cpu[i]); 937 break; 938 } 939 case PERF_CPU_MAP__MASK: 940 data->mask32_data.long_size = bswap_16(data->mask32_data.long_size); 941 942 switch (data->mask32_data.long_size) { 943 case 4: { 944 u16 nr, max_nr; 945 946 data->mask32_data.nr = bswap_16(data->mask32_data.nr); 947 nr = data->mask32_data.nr; 948 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 949 mask32_data.mask)) / 950 sizeof(data->mask32_data.mask[0]); 951 if (nr > max_nr) { 952 pr_warning("WARNING: PERF_RECORD_CPU_MAP mask32: nr %u exceeds payload (max %u), clamping\n", 953 nr, max_nr); 954 nr = max_nr; 955 data->mask32_data.nr = nr; 956 } 957 for (unsigned int i = 0; i < nr; i++) 958 data->mask32_data.mask[i] = bswap_32(data->mask32_data.mask[i]); 959 break; 960 } 961 case 8: { 962 u16 nr, max_nr; 963 964 data->mask64_data.nr = bswap_16(data->mask64_data.nr); 965 nr = data->mask64_data.nr; 966 if (payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 967 data->mask64_data.nr = 0; 968 break; 969 } 970 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 971 mask64_data.mask)) / 972 sizeof(data->mask64_data.mask[0]); 973 if (nr > max_nr) { 974 pr_warning("WARNING: PERF_RECORD_CPU_MAP mask64: nr %u exceeds payload (max %u), clamping\n", 975 nr, max_nr); 976 nr = max_nr; 977 data->mask64_data.nr = nr; 978 } 979 for (unsigned int i = 0; i < nr; i++) 980 data->mask64_data.mask[i] = 
bswap_64(data->mask64_data.mask[i]); 981 break; 982 } 983 default: 984 pr_err("cpu_map swap: unsupported long size %u\n", 985 data->mask32_data.long_size); 986 } 987 break; 988 case PERF_CPU_MAP__RANGE_CPUS: 989 data->range_cpu_data.start_cpu = bswap_16(data->range_cpu_data.start_cpu); 990 data->range_cpu_data.end_cpu = bswap_16(data->range_cpu_data.end_cpu); 991 break; 992 default: 993 break; 994 } 995 return 0; 996 } 997 998 static int perf_event__stat_config_swap(union perf_event *event, 999 bool sample_id_all __maybe_unused) 1000 { 1001 u64 nr, max_nr, size; 1002 1003 nr = bswap_64(event->stat_config.nr); 1004 /* Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof */ 1005 max_nr = (event->header.size - sizeof(event->stat_config)) / 1006 sizeof(event->stat_config.data[0]); 1007 /* 1008 * Safe to clamp: each config entry is self-describing 1009 * via its tag; missing entries keep their defaults. 1010 */ 1011 if (nr > max_nr) { 1012 pr_warning("WARNING: PERF_RECORD_STAT_CONFIG: nr %" PRIu64 " exceeds payload (max %" PRIu64 "), clamping\n", 1013 nr, max_nr); 1014 nr = max_nr; 1015 } 1016 size = nr * sizeof(event->stat_config.data[0]); 1017 /* The swap starts at &nr, so add its size to cover the full range */ 1018 size += sizeof(event->stat_config.nr); 1019 mem_bswap_64(&event->stat_config.nr, size); 1020 /* Persist the clamped value in native byte order */ 1021 event->stat_config.nr = nr; 1022 return 0; 1023 } 1024 1025 static int perf_event__stat_swap(union perf_event *event, 1026 bool sample_id_all __maybe_unused) 1027 { 1028 event->stat.id = bswap_64(event->stat.id); 1029 event->stat.thread = bswap_32(event->stat.thread); 1030 event->stat.cpu = bswap_32(event->stat.cpu); 1031 event->stat.val = bswap_64(event->stat.val); 1032 event->stat.ena = bswap_64(event->stat.ena); 1033 event->stat.run = bswap_64(event->stat.run); 1034 return 0; 1035 } 1036 1037 static int perf_event__stat_round_swap(union perf_event *event, 1038 bool sample_id_all 
__maybe_unused) 1039 { 1040 event->stat_round.type = bswap_64(event->stat_round.type); 1041 event->stat_round.time = bswap_64(event->stat_round.time); 1042 return 0; 1043 } 1044 1045 static int perf_event__time_conv_swap(union perf_event *event, 1046 bool sample_id_all __maybe_unused) 1047 { 1048 event->time_conv.time_shift = bswap_64(event->time_conv.time_shift); 1049 event->time_conv.time_mult = bswap_64(event->time_conv.time_mult); 1050 event->time_conv.time_zero = bswap_64(event->time_conv.time_zero); 1051 1052 if (event_contains(event->time_conv, time_cycles)) 1053 event->time_conv.time_cycles = bswap_64(event->time_conv.time_cycles); 1054 if (event_contains(event->time_conv, time_mask)) 1055 event->time_conv.time_mask = bswap_64(event->time_conv.time_mask); 1056 return 0; 1057 } 1058 1059 static int perf_event__compressed2_swap(union perf_event *event, 1060 bool sample_id_all __maybe_unused) 1061 { 1062 /* Only data_size needs swapping — compressed payload is a raw byte stream */ 1063 event->pack2.data_size = bswap_64(event->pack2.data_size); 1064 return 0; 1065 } 1066 1067 static int perf_event__bpf_metadata_swap(union perf_event *event, 1068 bool sample_id_all __maybe_unused) 1069 { 1070 u64 i, nr, max_nr; 1071 1072 /* Fixed header must fit before accessing nr_entries or prog_name */ 1073 if (event->header.size < sizeof(event->bpf_metadata)) 1074 return -1; 1075 1076 event->bpf_metadata.nr_entries = bswap_64(event->bpf_metadata.nr_entries); 1077 1078 /* 1079 * Ensure NUL-termination on the cross-endian path where the 1080 * mapping is writable (MAP_PRIVATE + PROT_WRITE). Fixing 1081 * the string in place is preferred over rejecting because it 1082 * preserves the event for downstream processing — only the 1083 * last byte is lost. 1084 * 1085 * The native-endian path (MAP_SHARED + PROT_READ) cannot 1086 * write, so it validates and skips unterminated events in 1087 * perf_session__process_user_event() instead. 
The two 1088 * strategies produce different outcomes for the same 1089 * malformed input (fix vs skip), which is inherent in the 1090 * writable-vs-read-only mapping model. 1091 */ 1092 event->bpf_metadata.prog_name[BPF_PROG_NAME_LEN - 1] = '\0'; 1093 1094 nr = event->bpf_metadata.nr_entries; 1095 max_nr = (event->header.size - sizeof(event->bpf_metadata)) / 1096 sizeof(event->bpf_metadata.entries[0]); 1097 if (nr > max_nr) { 1098 /* Persist clamped value so the native path processes entries, not skips */ 1099 nr = max_nr; 1100 event->bpf_metadata.nr_entries = nr; 1101 } 1102 1103 for (i = 0; i < nr; i++) { 1104 event->bpf_metadata.entries[i].key[BPF_METADATA_KEY_LEN - 1] = '\0'; 1105 event->bpf_metadata.entries[i].value[BPF_METADATA_VALUE_LEN - 1] = '\0'; 1106 } 1107 return 0; 1108 } 1109 static int 1110 perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused, 1111 bool sample_id_all __maybe_unused) 1112 { 1113 /* FIXME */ 1114 return 0; 1115 } 1116 1117 static int 1118 perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused, 1119 bool sample_id_all __maybe_unused) 1120 { 1121 /* FIXME */ 1122 return 0; 1123 } 1124 1125 static int perf_event__ksymbol_swap(union perf_event *event, 1126 bool sample_id_all) 1127 { 1128 event->ksymbol.addr = bswap_64(event->ksymbol.addr); 1129 event->ksymbol.len = bswap_32(event->ksymbol.len); 1130 event->ksymbol.ksym_type = bswap_16(event->ksymbol.ksym_type); 1131 event->ksymbol.flags = bswap_16(event->ksymbol.flags); 1132 1133 if (sample_id_all) { 1134 void *data = &event->ksymbol.name; 1135 void *end = (void *)event + event->header.size; 1136 size_t len = strnlen(data, end - data); 1137 1138 /* See comment in perf_event__comm_swap() */ 1139 if (len == (size_t)(end - data)) 1140 return -1; 1141 data += PERF_ALIGN(len + 1, sizeof(u64)); 1142 swap_sample_id_all(event, data); 1143 } 1144 return 0; 1145 } 1146 1147 static int perf_event__bpf_event_swap(union perf_event *event, 1148 bool sample_id_all) 1149 
{ 1150 event->bpf.type = bswap_16(event->bpf.type); 1151 event->bpf.flags = bswap_16(event->bpf.flags); 1152 event->bpf.id = bswap_32(event->bpf.id); 1153 1154 if (sample_id_all) 1155 swap_sample_id_all(event, &event->bpf + 1); 1156 return 0; 1157 } 1158 1159 static int perf_event__header_feature_swap(union perf_event *event, 1160 bool sample_id_all __maybe_unused) 1161 { 1162 event->feat.feat_id = bswap_64(event->feat.feat_id); 1163 return 0; 1164 } 1165 1166 typedef int (*perf_event__swap_op)(union perf_event *event, 1167 bool sample_id_all); 1168 1169 static perf_event__swap_op perf_event__swap_ops[] = { 1170 [PERF_RECORD_MMAP] = perf_event__mmap_swap, 1171 [PERF_RECORD_MMAP2] = perf_event__mmap2_swap, 1172 [PERF_RECORD_COMM] = perf_event__comm_swap, 1173 [PERF_RECORD_FORK] = perf_event__task_swap, 1174 [PERF_RECORD_EXIT] = perf_event__task_swap, 1175 [PERF_RECORD_LOST] = perf_event__all64_swap, 1176 [PERF_RECORD_READ] = perf_event__read_swap, 1177 [PERF_RECORD_THROTTLE] = perf_event__throttle_swap, 1178 [PERF_RECORD_UNTHROTTLE] = perf_event__throttle_swap, 1179 [PERF_RECORD_SAMPLE] = perf_event__all64_swap, 1180 [PERF_RECORD_AUX] = perf_event__aux_swap, 1181 [PERF_RECORD_ITRACE_START] = perf_event__itrace_start_swap, 1182 [PERF_RECORD_LOST_SAMPLES] = perf_event__all64_swap, 1183 [PERF_RECORD_SWITCH] = perf_event__switch_swap, 1184 [PERF_RECORD_SWITCH_CPU_WIDE] = perf_event__switch_swap, 1185 [PERF_RECORD_NAMESPACES] = perf_event__namespaces_swap, 1186 [PERF_RECORD_CGROUP] = perf_event__cgroup_swap, 1187 [PERF_RECORD_KSYMBOL] = perf_event__ksymbol_swap, 1188 [PERF_RECORD_BPF_EVENT] = perf_event__bpf_event_swap, 1189 [PERF_RECORD_TEXT_POKE] = perf_event__text_poke_swap, 1190 [PERF_RECORD_AUX_OUTPUT_HW_ID] = perf_event__all64_swap, 1191 [PERF_RECORD_CALLCHAIN_DEFERRED] = perf_event__all64_swap, 1192 [PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap, 1193 [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap, 1194 [PERF_RECORD_HEADER_TRACING_DATA] = 
perf_event__tracing_data_swap, 1195 [PERF_RECORD_HEADER_BUILD_ID] = perf_event__build_id_swap, 1196 [PERF_RECORD_HEADER_FEATURE] = perf_event__header_feature_swap, 1197 [PERF_RECORD_ID_INDEX] = perf_event__all64_swap, 1198 [PERF_RECORD_AUXTRACE_INFO] = perf_event__auxtrace_info_swap, 1199 [PERF_RECORD_AUXTRACE] = perf_event__auxtrace_swap, 1200 [PERF_RECORD_AUXTRACE_ERROR] = perf_event__auxtrace_error_swap, 1201 [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, 1202 [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, 1203 [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, 1204 [PERF_RECORD_STAT] = perf_event__stat_swap, 1205 [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, 1206 [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, 1207 [PERF_RECORD_TIME_CONV] = perf_event__time_conv_swap, 1208 [PERF_RECORD_COMPRESSED2] = perf_event__compressed2_swap, 1209 [PERF_RECORD_BPF_METADATA] = perf_event__bpf_metadata_swap, 1210 [PERF_RECORD_SCHEDSTAT_CPU] = perf_event__schedstat_cpu_swap, 1211 [PERF_RECORD_SCHEDSTAT_DOMAIN] = perf_event__schedstat_domain_swap, 1212 [PERF_RECORD_HEADER_MAX] = NULL, 1213 }; 1214 1215 /* 1216 * When perf record finishes a pass on every buffers, it records this pseudo 1217 * event. 1218 * We record the max timestamp t found in the pass n. 1219 * Assuming these timestamps are monotonic across cpus, we know that if 1220 * a buffer still has events with timestamps below t, they will be all 1221 * available and then read in the pass n + 1. 1222 * Hence when we start to read the pass n + 2, we can safely flush every 1223 * events with timestamps below t. 
1224 * 1225 * ============ PASS n ================= 1226 * CPU 0 | CPU 1 1227 * | 1228 * cnt1 timestamps | cnt2 timestamps 1229 * 1 | 2 1230 * 2 | 3 1231 * - | 4 <--- max recorded 1232 * 1233 * ============ PASS n + 1 ============== 1234 * CPU 0 | CPU 1 1235 * | 1236 * cnt1 timestamps | cnt2 timestamps 1237 * 3 | 5 1238 * 4 | 6 1239 * 5 | 7 <---- max recorded 1240 * 1241 * Flush every events below timestamp 4 1242 * 1243 * ============ PASS n + 2 ============== 1244 * CPU 0 | CPU 1 1245 * | 1246 * cnt1 timestamps | cnt2 timestamps 1247 * 6 | 8 1248 * 7 | 9 1249 * - | 10 1250 * 1251 * Flush every events below timestamp 7 1252 * etc... 1253 */ 1254 int perf_event__process_finished_round(const struct perf_tool *tool __maybe_unused, 1255 union perf_event *event __maybe_unused, 1256 struct ordered_events *oe) 1257 { 1258 if (dump_trace) 1259 fprintf(stdout, "\n"); 1260 return ordered_events__flush(oe, OE_FLUSH__ROUND); 1261 } 1262 1263 int perf_session__queue_event(struct perf_session *s, union perf_event *event, 1264 u64 timestamp, u64 file_offset, const char *file_path) 1265 { 1266 return ordered_events__queue(&s->ordered_events, event, timestamp, file_offset, file_path); 1267 } 1268 1269 static void callchain__lbr_callstack_printf(struct perf_sample *sample) 1270 { 1271 struct ip_callchain *callchain = sample->callchain; 1272 struct branch_stack *lbr_stack = sample->branch_stack; 1273 struct branch_entry *entries = perf_sample__branch_entries(sample); 1274 u64 kernel_callchain_nr = callchain->nr; 1275 unsigned int i; 1276 1277 for (i = 0; i < kernel_callchain_nr; i++) { 1278 if (callchain->ips[i] == PERF_CONTEXT_USER) 1279 break; 1280 } 1281 1282 if ((i != kernel_callchain_nr) && lbr_stack->nr) { 1283 u64 total_nr; 1284 /* 1285 * LBR callstack can only get user call chain, 1286 * i is kernel call chain number, 1287 * 1 is PERF_CONTEXT_USER. 1288 * 1289 * The user call chain is stored in LBR registers. 1290 * LBR are pair registers. 
The caller is stored 1291 * in "from" register, while the callee is stored 1292 * in "to" register. 1293 * For example, there is a call stack 1294 * "A"->"B"->"C"->"D". 1295 * The LBR registers will be recorded like 1296 * "C"->"D", "B"->"C", "A"->"B". 1297 * So only the first "to" register and all "from" 1298 * registers are needed to construct the whole stack. 1299 */ 1300 total_nr = i + 1 + lbr_stack->nr + 1; 1301 kernel_callchain_nr = i + 1; 1302 1303 printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr); 1304 1305 for (i = 0; i < kernel_callchain_nr; i++) 1306 printf("..... %2d: %016" PRIx64 "\n", 1307 i, callchain->ips[i]); 1308 1309 printf("..... %2d: %016" PRIx64 "\n", 1310 (int)(kernel_callchain_nr), entries[0].to); 1311 for (i = 0; i < lbr_stack->nr; i++) 1312 printf("..... %2d: %016" PRIx64 "\n", 1313 (int)(i + kernel_callchain_nr + 1), entries[i].from); 1314 } 1315 } 1316 1317 static const char *callchain_context_str(u64 ip) 1318 { 1319 switch (ip) { 1320 case PERF_CONTEXT_HV: 1321 return " (PERF_CONTEXT_HV)"; 1322 case PERF_CONTEXT_KERNEL: 1323 return " (PERF_CONTEXT_KERNEL)"; 1324 case PERF_CONTEXT_USER: 1325 return " (PERF_CONTEXT_USER)"; 1326 case PERF_CONTEXT_GUEST: 1327 return " (PERF_CONTEXT_GUEST)"; 1328 case PERF_CONTEXT_GUEST_KERNEL: 1329 return " (PERF_CONTEXT_GUEST_KERNEL)"; 1330 case PERF_CONTEXT_GUEST_USER: 1331 return " (PERF_CONTEXT_GUEST_USER)"; 1332 case PERF_CONTEXT_USER_DEFERRED: 1333 return " (PERF_CONTEXT_USER_DEFERRED)"; 1334 default: 1335 return ""; 1336 } 1337 } 1338 1339 static void callchain__printf(struct evsel *evsel, 1340 struct perf_sample *sample) 1341 { 1342 unsigned int i; 1343 struct ip_callchain *callchain = sample->callchain; 1344 1345 if (evsel__has_branch_callstack(evsel)) 1346 callchain__lbr_callstack_printf(sample); 1347 1348 printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); 1349 1350 for (i = 0; i < callchain->nr; i++) 1351 printf("..... 
%2d: %016" PRIx64 "%s\n", 1352 i, callchain->ips[i], 1353 callchain_context_str(callchain->ips[i])); 1354 1355 if (sample->deferred_callchain) 1356 printf("...... (deferred)\n"); 1357 } 1358 1359 static void branch_stack__printf(struct perf_sample *sample, 1360 struct evsel *evsel) 1361 { 1362 struct branch_entry *entries = perf_sample__branch_entries(sample); 1363 bool callstack = evsel__has_branch_callstack(evsel); 1364 u64 *branch_stack_cntr = sample->branch_stack_cntr; 1365 uint64_t i; 1366 1367 if (!callstack) { 1368 printf("%s: nr:%" PRIu64 "\n", "... branch stack", sample->branch_stack->nr); 1369 } else { 1370 /* the reason of adding 1 to nr is because after expanding 1371 * branch stack it generates nr + 1 callstack records. e.g., 1372 * B()->C() 1373 * A()->B() 1374 * the final callstack should be: 1375 * C() 1376 * B() 1377 * A() 1378 */ 1379 printf("%s: nr:%" PRIu64 "\n", "... branch callstack", sample->branch_stack->nr+1); 1380 } 1381 1382 for (i = 0; i < sample->branch_stack->nr; i++) { 1383 struct branch_entry *e = &entries[i]; 1384 1385 if (!callstack) { 1386 printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x %s %s\n", 1387 i, e->from, e->to, 1388 (unsigned short)e->flags.cycles, 1389 e->flags.mispred ? "M" : " ", 1390 e->flags.predicted ? "P" : " ", 1391 e->flags.abort ? "A" : " ", 1392 e->flags.in_tx ? "T" : " ", 1393 (unsigned)e->flags.reserved, 1394 get_branch_type(e), 1395 e->flags.spec ? branch_spec_desc(e->flags.spec) : ""); 1396 } else { 1397 if (i == 0) { 1398 printf("..... %2"PRIu64": %016" PRIx64 "\n" 1399 "..... %2"PRIu64": %016" PRIx64 "\n", 1400 i, e->to, i+1, e->from); 1401 } else { 1402 printf("..... %2"PRIu64": %016" PRIx64 "\n", i+1, e->from); 1403 } 1404 } 1405 } 1406 1407 if (branch_stack_cntr) { 1408 unsigned int br_cntr_width, br_cntr_nr; 1409 1410 perf_env__find_br_cntr_info(evsel__env(evsel), &br_cntr_nr, &br_cntr_width); 1411 printf("... 
branch stack counters: nr:%" PRIu64 " (counter width: %u max counter nr:%u)\n", 1412 sample->branch_stack->nr, br_cntr_width, br_cntr_nr); 1413 for (i = 0; i < sample->branch_stack->nr; i++) 1414 printf("..... %2"PRIu64": %016" PRIx64 "\n", i, branch_stack_cntr[i]); 1415 } 1416 } 1417 1418 static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags) 1419 { 1420 unsigned rid, i = 0; 1421 1422 for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) { 1423 u64 val = regs[i++]; 1424 1425 printf(".... %-5s 0x%016" PRIx64 "\n", 1426 perf_reg_name(rid, e_machine, e_flags), val); 1427 } 1428 } 1429 1430 static const char *regs_abi[] = { 1431 [PERF_SAMPLE_REGS_ABI_NONE] = "none", 1432 [PERF_SAMPLE_REGS_ABI_32] = "32-bit", 1433 [PERF_SAMPLE_REGS_ABI_64] = "64-bit", 1434 }; 1435 1436 static inline const char *regs_dump_abi(struct regs_dump *d) 1437 { 1438 if (d->abi > PERF_SAMPLE_REGS_ABI_64) 1439 return "unknown"; 1440 1441 return regs_abi[d->abi]; 1442 } 1443 1444 static void regs__printf(const char *type, struct regs_dump *regs, 1445 uint16_t e_machine, uint32_t e_flags) 1446 { 1447 u64 mask = regs->mask; 1448 1449 printf("... 
%s regs: mask 0x%" PRIx64 " ABI %s\n", 1450 type, 1451 mask, 1452 regs_dump_abi(regs)); 1453 1454 regs_dump__printf(mask, regs->regs, e_machine, e_flags); 1455 } 1456 1457 static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) 1458 { 1459 struct regs_dump *user_regs; 1460 1461 if (!sample->user_regs) 1462 return; 1463 1464 user_regs = perf_sample__user_regs(sample); 1465 1466 if (user_regs->regs) 1467 regs__printf("user", user_regs, e_machine, e_flags); 1468 } 1469 1470 static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) 1471 { 1472 struct regs_dump *intr_regs; 1473 1474 if (!sample->intr_regs) 1475 return; 1476 1477 intr_regs = perf_sample__intr_regs(sample); 1478 1479 if (intr_regs->regs) 1480 regs__printf("intr", intr_regs, e_machine, e_flags); 1481 } 1482 1483 static void stack_user__printf(struct stack_dump *dump) 1484 { 1485 printf("... ustack: size %" PRIu64 ", offset 0x%x\n", 1486 dump->size, dump->offset); 1487 } 1488 1489 static void evlist__print_tstamp(struct evlist *evlist, union perf_event *event, struct perf_sample *sample) 1490 { 1491 u64 sample_type = __evlist__combined_sample_type(evlist); 1492 1493 if (event->header.type != PERF_RECORD_SAMPLE && 1494 !evlist__sample_id_all(evlist)) { 1495 fputs("-1 -1 ", stdout); 1496 return; 1497 } 1498 1499 if ((sample_type & PERF_SAMPLE_CPU)) 1500 printf("%u ", sample->cpu); 1501 1502 if (sample_type & PERF_SAMPLE_TIME) 1503 printf("%" PRIu64 " ", sample->time); 1504 } 1505 1506 static void sample_read__printf(struct perf_sample *sample, u64 read_format) 1507 { 1508 printf("... sample_read:\n"); 1509 1510 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1511 printf("...... time enabled %016" PRIx64 "\n", 1512 sample->read.time_enabled); 1513 1514 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1515 printf("...... 
time running %016" PRIx64 "\n", 1516 sample->read.time_running); 1517 1518 if (read_format & PERF_FORMAT_GROUP) { 1519 struct sample_read_value *value = sample->read.group.values; 1520 1521 printf(".... group nr %" PRIu64 "\n", sample->read.group.nr); 1522 1523 sample_read_group__for_each(value, sample->read.group.nr, read_format) { 1524 printf("..... id %016" PRIx64 1525 ", value %016" PRIx64, 1526 value->id, value->value); 1527 if (read_format & PERF_FORMAT_LOST) 1528 printf(", lost %" PRIu64, value->lost); 1529 printf("\n"); 1530 } 1531 } else { 1532 printf("..... id %016" PRIx64 ", value %016" PRIx64, 1533 sample->read.one.id, sample->read.one.value); 1534 if (read_format & PERF_FORMAT_LOST) 1535 printf(", lost %" PRIu64, sample->read.one.lost); 1536 printf("\n"); 1537 } 1538 } 1539 1540 static void dump_event(struct evlist *evlist, union perf_event *event, 1541 u64 file_offset, struct perf_sample *sample, 1542 const char *file_path) 1543 { 1544 if (!dump_trace) 1545 return; 1546 1547 printf("\n%#" PRIx64 "@%s [%#x]: event: %d\n", 1548 file_offset, file_path, event->header.size, event->header.type); 1549 1550 trace_event(event); 1551 if (event->header.type == PERF_RECORD_SAMPLE && evlist->trace_event_sample_raw) 1552 evlist->trace_event_sample_raw(evlist, event, sample); 1553 1554 if (sample) 1555 evlist__print_tstamp(evlist, event, sample); 1556 1557 printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset, 1558 event->header.size, perf_event__name(event->header.type)); 1559 } 1560 1561 char *get_page_size_name(u64 size, char *str) 1562 { 1563 if (!size || !unit_number__scnprintf(str, PAGE_SIZE_NAME_LEN, size)) 1564 snprintf(str, PAGE_SIZE_NAME_LEN, "%s", "N/A"); 1565 1566 return str; 1567 } 1568 1569 static void dump_sample(struct machine *machine, union perf_event *event, 1570 struct perf_sample *sample) 1571 { 1572 struct evsel *evsel = sample->evsel; 1573 u64 sample_type; 1574 char str[PAGE_SIZE_NAME_LEN]; 1575 uint16_t e_machine = EM_NONE; 1576 uint32_t 
e_flags = 0; 1577 1578 if (!dump_trace) 1579 return; 1580 1581 sample_type = evsel->core.attr.sample_type; 1582 1583 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) { 1584 struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid); 1585 1586 e_machine = thread__e_machine(thread, machine, &e_flags); 1587 } 1588 1589 printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", 1590 event->header.misc, sample->pid, sample->tid, sample->ip, 1591 sample->period, sample->addr); 1592 1593 if (evsel__has_callchain(evsel)) 1594 callchain__printf(evsel, sample); 1595 1596 if (evsel__has_br_stack(evsel)) 1597 branch_stack__printf(sample, evsel); 1598 1599 if (sample_type & PERF_SAMPLE_REGS_USER) 1600 regs_user__printf(sample, e_machine, e_flags); 1601 1602 if (sample_type & PERF_SAMPLE_REGS_INTR) 1603 regs_intr__printf(sample, e_machine, e_flags); 1604 1605 if (sample_type & PERF_SAMPLE_STACK_USER) 1606 stack_user__printf(&sample->user_stack); 1607 1608 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 1609 printf("... weight: %" PRIu64 "", sample->weight); 1610 if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 1611 printf(",0x%"PRIx16"", sample->ins_lat); 1612 printf(",0x%"PRIx16"", sample->weight3); 1613 } 1614 printf("\n"); 1615 } 1616 1617 if (sample_type & PERF_SAMPLE_DATA_SRC) 1618 printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); 1619 1620 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 1621 printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr); 1622 1623 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) 1624 printf(" .. data page size: %s\n", get_page_size_name(sample->data_page_size, str)); 1625 1626 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) 1627 printf(" .. code page size: %s\n", get_page_size_name(sample->code_page_size, str)); 1628 1629 if (sample_type & PERF_SAMPLE_TRANSACTION) 1630 printf("... 
transaction: %" PRIx64 "\n", sample->transaction); 1631 1632 if (sample_type & PERF_SAMPLE_READ) 1633 sample_read__printf(sample, evsel->core.attr.read_format); 1634 } 1635 1636 static void dump_deferred_callchain(union perf_event *event, struct perf_sample *sample) 1637 { 1638 struct evsel *evsel = sample->evsel; 1639 1640 if (!dump_trace) 1641 return; 1642 1643 printf("(IP, 0x%x): %d/%d: %#" PRIx64 "\n", 1644 event->header.misc, sample->pid, sample->tid, sample->deferred_cookie); 1645 1646 if (evsel__has_callchain(evsel)) 1647 callchain__printf(evsel, sample); 1648 } 1649 1650 static void dump_read(struct evsel *evsel, union perf_event *event) 1651 { 1652 u64 read_format; 1653 __u64 *array; 1654 void *end; 1655 1656 if (!dump_trace) 1657 return; 1658 1659 printf(": %d %d %s %" PRI_lu64 "\n", event->read.pid, event->read.tid, 1660 evsel__name(evsel), event->read.value); 1661 1662 if (!evsel) 1663 return; 1664 1665 read_format = evsel->core.attr.read_format; 1666 /* 1667 * The kernel packs only the enabled read_format fields 1668 * after value, with no gaps. Walk the packed array 1669 * instead of using fixed struct offsets. 1670 */ 1671 array = &event->read.value + 1; 1672 end = (void *)event + event->header.size; 1673 1674 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1675 if ((void *)(array + 1) > end) 1676 return; 1677 printf("... time enabled : %" PRI_lu64 "\n", *array++); 1678 } 1679 1680 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1681 if ((void *)(array + 1) > end) 1682 return; 1683 printf("... time running : %" PRI_lu64 "\n", *array++); 1684 } 1685 1686 if (read_format & PERF_FORMAT_ID) { 1687 if ((void *)(array + 1) > end) 1688 return; 1689 printf("... id : %" PRI_lu64 "\n", *array++); 1690 } 1691 1692 if (read_format & PERF_FORMAT_LOST) { 1693 if ((void *)(array + 1) > end) 1694 return; 1695 printf("... 
lost : %" PRI_lu64 "\n", *array++); 1696 } 1697 } 1698 1699 static struct machine *machines__find_for_cpumode(struct machines *machines, 1700 union perf_event *event, 1701 struct perf_sample *sample) 1702 { 1703 if (perf_guest && 1704 ((sample->cpumode == PERF_RECORD_MISC_GUEST_KERNEL) || 1705 (sample->cpumode == PERF_RECORD_MISC_GUEST_USER))) { 1706 u32 pid; 1707 1708 if (sample->machine_pid) 1709 pid = sample->machine_pid; 1710 else if (event->header.type == PERF_RECORD_MMAP 1711 || event->header.type == PERF_RECORD_MMAP2) 1712 pid = event->mmap.pid; 1713 else 1714 pid = sample->pid; 1715 1716 /* 1717 * Guest code machine is created as needed and does not use 1718 * DEFAULT_GUEST_KERNEL_ID. 1719 */ 1720 if (symbol_conf.guest_code) 1721 return machines__findnew(machines, pid); 1722 1723 return machines__find_guest(machines, pid); 1724 } 1725 1726 return &machines->host; 1727 } 1728 1729 static int deliver_sample_value(struct evlist *evlist, 1730 const struct perf_tool *tool, 1731 union perf_event *event, 1732 struct perf_sample *sample, 1733 struct sample_read_value *v, 1734 struct machine *machine, 1735 bool per_thread) 1736 { 1737 struct perf_sample_id *sid = evlist__id2sid(evlist, v->id); 1738 struct evsel *saved_evsel = sample->evsel; 1739 u64 *storage = NULL; 1740 int ret; 1741 1742 if (sid) { 1743 storage = perf_sample_id__get_period_storage(sid, sample->tid, per_thread); 1744 } 1745 1746 if (storage) { 1747 sample->id = v->id; 1748 sample->period = v->value - *storage; 1749 *storage = v->value; 1750 } 1751 1752 if (!storage || sid->evsel == NULL) { 1753 ++evlist->stats.nr_unknown_id; 1754 return 0; 1755 } 1756 1757 /* 1758 * There's no reason to deliver sample 1759 * for zero period, bail out. 
1760 */ 1761 if (!sample->period) 1762 return 0; 1763 1764 sample->evsel = container_of(sid->evsel, struct evsel, core); 1765 ret = tool->sample(tool, event, sample, machine); 1766 sample->evsel = saved_evsel; 1767 return ret; 1768 } 1769 1770 static int deliver_sample_group(struct evlist *evlist, 1771 const struct perf_tool *tool, 1772 union perf_event *event, 1773 struct perf_sample *sample, 1774 struct machine *machine, 1775 u64 read_format, 1776 bool per_thread) 1777 { 1778 int ret = -EINVAL; 1779 struct sample_read_value *v = sample->read.group.values; 1780 1781 if (tool->dont_split_sample_group) 1782 return deliver_sample_value(evlist, tool, event, sample, v, machine, 1783 per_thread); 1784 1785 sample_read_group__for_each(v, sample->read.group.nr, read_format) { 1786 ret = deliver_sample_value(evlist, tool, event, sample, v, 1787 machine, per_thread); 1788 if (ret) 1789 break; 1790 } 1791 1792 return ret; 1793 } 1794 1795 static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool *tool, 1796 union perf_event *event, struct perf_sample *sample, 1797 struct machine *machine) 1798 { 1799 struct evsel *evsel = sample->evsel; 1800 /* We know evsel != NULL. */ 1801 u64 sample_type = evsel->core.attr.sample_type; 1802 u64 read_format = evsel->core.attr.read_format; 1803 bool per_thread = perf_evsel__attr_has_per_thread_sample_period(&evsel->core); 1804 1805 /* Standard sample delivery. */ 1806 if (!(sample_type & PERF_SAMPLE_READ)) 1807 return tool->sample(tool, event, sample, machine); 1808 1809 /* For PERF_SAMPLE_READ we have either single or group mode. 
*/ 1810 if (read_format & PERF_FORMAT_GROUP) 1811 return deliver_sample_group(evlist, tool, event, sample, 1812 machine, read_format, per_thread); 1813 else 1814 return deliver_sample_value(evlist, tool, event, sample, 1815 &sample->read.one, machine, 1816 per_thread); 1817 } 1818 1819 /* 1820 * Samples with deferred callchains should wait for the next matching 1821 * PERF_RECORD_CALLCHAIN_RECORD entries. Keep the events in a list and 1822 * deliver them once it finds the callchains. 1823 */ 1824 struct deferred_event { 1825 struct list_head list; 1826 union perf_event *event; 1827 u64 file_offset; 1828 }; 1829 1830 /* 1831 * This is called when a deferred callchain record comes up. Find all matching 1832 * samples, merge the callchains and process them. 1833 */ 1834 static int evlist__deliver_deferred_callchain(struct evlist *evlist, 1835 const struct perf_tool *tool, 1836 union perf_event *event, 1837 struct perf_sample *sample, 1838 struct machine *machine) 1839 { 1840 struct deferred_event *de, *tmp; 1841 int ret = 0; 1842 1843 if (!tool->merge_deferred_callchains) { 1844 struct evsel *saved_evsel = sample->evsel; 1845 1846 sample->evsel = evlist__id2evsel(evlist, sample->id); 1847 ret = tool->callchain_deferred(tool, event, sample, machine); 1848 sample->evsel = saved_evsel; 1849 return ret; 1850 } 1851 1852 list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) { 1853 struct perf_sample orig_sample; 1854 1855 perf_sample__init(&orig_sample, /*all=*/false); 1856 ret = evlist__parse_sample(evlist, de->event, &orig_sample); 1857 if (ret < 0) { 1858 pr_err("failed to parse original sample\n"); 1859 perf_sample__exit(&orig_sample); 1860 break; 1861 } 1862 orig_sample.file_offset = de->file_offset; 1863 1864 if (sample->tid != orig_sample.tid) { 1865 perf_sample__exit(&orig_sample); 1866 continue; 1867 } 1868 1869 if (event->callchain_deferred.cookie == orig_sample.deferred_cookie) 1870 sample__merge_deferred_callchain(&orig_sample, sample); 1871 else 
1872 orig_sample.deferred_callchain = false; 1873 1874 orig_sample.evsel = evlist__id2evsel(evlist, orig_sample.id); 1875 ret = evlist__deliver_sample(evlist, tool, de->event, 1876 &orig_sample, machine); 1877 1878 perf_sample__exit(&orig_sample); 1879 list_del(&de->list); 1880 free(de->event); 1881 free(de); 1882 1883 if (ret) 1884 break; 1885 } 1886 return ret; 1887 } 1888 1889 /* 1890 * This is called at the end of the data processing for the session. Flush the 1891 * remaining samples as there's no hope for matching deferred callchains. 1892 */ 1893 static int session__flush_deferred_samples(struct perf_session *session, 1894 const struct perf_tool *tool) 1895 { 1896 struct evlist *evlist = session->evlist; 1897 struct machine *machine = &session->machines.host; 1898 struct deferred_event *de, *tmp; 1899 int ret = 0; 1900 1901 list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) { 1902 struct perf_sample sample; 1903 1904 perf_sample__init(&sample, /*all=*/false); 1905 ret = evlist__parse_sample(evlist, de->event, &sample); 1906 if (ret < 0) { 1907 pr_err("failed to parse original sample\n"); 1908 perf_sample__exit(&sample); 1909 break; 1910 } 1911 sample.file_offset = de->file_offset; 1912 1913 sample.evsel = evlist__id2evsel(evlist, sample.id); 1914 ret = evlist__deliver_sample(evlist, tool, de->event, 1915 &sample, machine); 1916 1917 perf_sample__exit(&sample); 1918 list_del(&de->list); 1919 free(de->event); 1920 free(de); 1921 1922 if (ret) 1923 break; 1924 } 1925 return ret; 1926 } 1927 1928 /* 1929 * Return true if the string field is properly null-terminated 1930 * within the event boundary. Native-endian files are mapped 1931 * read-only (MAP_SHARED + PROT_READ) so we cannot write a 1932 * null byte in place; skip the event instead. 
1933 */ 1934 static bool perf_event__check_nul(const char *str, const void *end, 1935 const char *event_name, u64 file_offset) 1936 { 1937 size_t max_len = (const char *)end - str; 1938 1939 if (max_len == 0 || strnlen(str, max_len) == max_len) { 1940 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_%s: string not null-terminated, skipping event\n", 1941 file_offset, event_name); 1942 return false; 1943 } 1944 1945 return true; 1946 } 1947 1948 static int machines__deliver_event(struct machines *machines, 1949 struct evlist *evlist, 1950 union perf_event *event, 1951 struct perf_sample *sample, 1952 const struct perf_tool *tool, u64 file_offset, 1953 const char *file_path) 1954 { 1955 struct machine *machine; 1956 1957 dump_event(evlist, event, file_offset, sample, file_path); 1958 1959 if (!sample->evsel) 1960 sample->evsel = evlist__id2evsel(evlist, sample->id); 1961 else 1962 assert(sample->evsel == evlist__id2evsel(evlist, sample->id)); 1963 1964 machine = machines__find_for_cpumode(machines, event, sample); 1965 1966 switch (event->header.type) { 1967 case PERF_RECORD_SAMPLE: 1968 if (sample->evsel == NULL) { 1969 ++evlist->stats.nr_unknown_id; 1970 return 0; 1971 } 1972 if (machine == NULL) { 1973 ++evlist->stats.nr_unprocessable_samples; 1974 dump_sample(machine, event, sample); 1975 return 0; 1976 } 1977 dump_sample(machine, event, sample); 1978 if (sample->deferred_callchain && tool->merge_deferred_callchains) { 1979 struct deferred_event *de = malloc(sizeof(*de)); 1980 size_t sz = event->header.size; 1981 1982 if (de == NULL) 1983 return -ENOMEM; 1984 1985 de->event = malloc(sz); 1986 if (de->event == NULL) { 1987 free(de); 1988 return -ENOMEM; 1989 } 1990 memcpy(de->event, event, sz); 1991 de->file_offset = sample->file_offset; 1992 list_add_tail(&de->list, &evlist->deferred_samples); 1993 return 0; 1994 } 1995 return evlist__deliver_sample(evlist, tool, event, sample, machine); 1996 case PERF_RECORD_MMAP: 1997 if 
(!perf_event__check_nul(event->mmap.filename, 1998 (void *)event + event->header.size, 1999 "MMAP", file_offset)) 2000 return 0; 2001 return tool->mmap(tool, event, sample, machine); 2002 case PERF_RECORD_MMAP2: 2003 if (event->header.misc & PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT) 2004 ++evlist->stats.nr_proc_map_timeout; 2005 if (!perf_event__check_nul(event->mmap2.filename, 2006 (void *)event + event->header.size, 2007 "MMAP2", file_offset)) 2008 return 0; 2009 return tool->mmap2(tool, event, sample, machine); 2010 case PERF_RECORD_COMM: 2011 if (!perf_event__check_nul(event->comm.comm, 2012 (void *)event + event->header.size, 2013 "COMM", file_offset)) 2014 return 0; 2015 return tool->comm(tool, event, sample, machine); 2016 case PERF_RECORD_NAMESPACES: { 2017 /* 2018 * Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof. 2019 * Includes trailing sample_id space when present, but prevents OOB. 2020 */ 2021 u64 max_nr = (event->header.size - sizeof(event->namespaces)) / 2022 sizeof(event->namespaces.link_info[0]); 2023 2024 /* 2025 * Native-endian events are mmap'd read-only, so we 2026 * cannot clamp nr in place. Skip the event instead. 2027 * The swap handler already clamps on the writable 2028 * cross-endian path. 
2029 */ 2030 if (event->namespaces.nr_namespaces > max_nr) { 2031 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_NAMESPACES: nr_namespaces %" PRIu64 " exceeds payload (max %" PRIu64 "), skipping\n", 2032 file_offset, (u64)event->namespaces.nr_namespaces, max_nr); 2033 return 0; 2034 } 2035 return tool->namespaces(tool, event, sample, machine); 2036 } 2037 case PERF_RECORD_CGROUP: 2038 if (!perf_event__check_nul(event->cgroup.path, 2039 (void *)event + event->header.size, 2040 "CGROUP", file_offset)) 2041 return 0; 2042 return tool->cgroup(tool, event, sample, machine); 2043 case PERF_RECORD_FORK: 2044 return tool->fork(tool, event, sample, machine); 2045 case PERF_RECORD_EXIT: 2046 return tool->exit(tool, event, sample, machine); 2047 case PERF_RECORD_LOST: 2048 if (tool->lost == perf_event__process_lost) 2049 evlist->stats.total_lost += event->lost.lost; 2050 return tool->lost(tool, event, sample, machine); 2051 case PERF_RECORD_LOST_SAMPLES: 2052 if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) 2053 evlist->stats.total_dropped_samples += event->lost_samples.lost; 2054 else if (tool->lost_samples == perf_event__process_lost_samples) 2055 evlist->stats.total_lost_samples += event->lost_samples.lost; 2056 return tool->lost_samples(tool, event, sample, machine); 2057 case PERF_RECORD_READ: 2058 dump_read(sample->evsel, event); 2059 return tool->read(tool, event, sample, machine); 2060 case PERF_RECORD_THROTTLE: 2061 return tool->throttle(tool, event, sample, machine); 2062 case PERF_RECORD_UNTHROTTLE: 2063 return tool->unthrottle(tool, event, sample, machine); 2064 case PERF_RECORD_AUX: 2065 if (tool->aux == perf_event__process_aux) { 2066 if (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) 2067 evlist->stats.total_aux_lost += 1; 2068 if (event->aux.flags & PERF_AUX_FLAG_PARTIAL) 2069 evlist->stats.total_aux_partial += 1; 2070 if (event->aux.flags & PERF_AUX_FLAG_COLLISION) 2071 evlist->stats.total_aux_collision += 1; 2072 } 2073 return 
tool->aux(tool, event, sample, machine); 2074 case PERF_RECORD_ITRACE_START: 2075 return tool->itrace_start(tool, event, sample, machine); 2076 case PERF_RECORD_SWITCH: 2077 case PERF_RECORD_SWITCH_CPU_WIDE: 2078 return tool->context_switch(tool, event, sample, machine); 2079 case PERF_RECORD_KSYMBOL: 2080 if (!perf_event__check_nul(event->ksymbol.name, 2081 (void *)event + event->header.size, 2082 "KSYMBOL", file_offset)) 2083 return 0; 2084 return tool->ksymbol(tool, event, sample, machine); 2085 case PERF_RECORD_BPF_EVENT: 2086 return tool->bpf(tool, event, sample, machine); 2087 case PERF_RECORD_TEXT_POKE: { 2088 /* offsetof(bytes), not sizeof — sizeof includes padding past the flexible array */ 2089 size_t text_poke_len = offsetof(struct perf_record_text_poke_event, bytes) + 2090 event->text_poke.old_len + 2091 event->text_poke.new_len; 2092 2093 if (event->header.size < text_poke_len) { 2094 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_TEXT_POKE: old_len+new_len exceeds event, skipping\n", 2095 file_offset); 2096 return 0; 2097 } 2098 return tool->text_poke(tool, event, sample, machine); 2099 } 2100 case PERF_RECORD_AUX_OUTPUT_HW_ID: 2101 return tool->aux_output_hw_id(tool, event, sample, machine); 2102 case PERF_RECORD_CALLCHAIN_DEFERRED: 2103 dump_deferred_callchain(event, sample); 2104 return evlist__deliver_deferred_callchain(evlist, tool, event, 2105 sample, machine); 2106 default: 2107 ++evlist->stats.nr_unknown_events; 2108 return -1; 2109 } 2110 } 2111 2112 static int perf_session__deliver_event(struct perf_session *session, 2113 union perf_event *event, 2114 const struct perf_tool *tool, 2115 u64 file_offset, 2116 const char *file_path) 2117 { 2118 struct perf_sample sample; 2119 struct evsel *evsel; 2120 int ret; 2121 2122 perf_sample__init(&sample, /*all=*/false); 2123 evsel = evlist__event2evsel(session->evlist, event); 2124 if (!evsel) { 2125 pr_err("ERROR: at offset %#" PRIx64 ": no evsel found for %s (%u) event\n", 2126 file_offset, 
perf_event__name(event->header.type), 2127 event->header.type); 2128 ret = -EFAULT; 2129 goto out; 2130 } 2131 ret = evsel__parse_sample(evsel, event, &sample); 2132 if (ret) { 2133 pr_err("ERROR: at offset %#" PRIx64 ": can't parse %s (%u) sample, err = %d\n", 2134 file_offset, perf_event__name(event->header.type), 2135 event->header.type, ret); 2136 goto out; 2137 } 2138 sample.file_offset = file_offset; 2139 /* 2140 * evsel__parse_sample() doesn't populate machine_pid/vcpu, 2141 * which are needed by machines__find_for_cpumode() to 2142 * attribute samples to guest VMs. The SID table maps 2143 * sample IDs to the guest that owns the event. 2144 */ 2145 if (perf_guest && sample.id) { 2146 struct perf_sample_id *sid = evlist__id2sid(session->evlist, sample.id); 2147 2148 if (sid) { 2149 sample.machine_pid = sid->machine_pid; 2150 sample.vcpu = sid->vcpu.cpu; 2151 } 2152 } 2153 2154 /* 2155 * Validate sample.cpu before any callback can use it as an 2156 * array index (kwork cpus_runtime, timechart cpus_cstate_*, 2157 * sched cpu_last_switched). 2158 * 2159 * When PERF_SAMPLE_CPU is absent, evsel__parse_sample() leaves 2160 * sample.cpu as (u32)-1 — a sentinel that downstream tools 2161 * (script, inject) check to identify events without CPU info. 2162 * Only check when sample.cpu was actually populated from event 2163 * data: PERF_RECORD_SAMPLE always has it when PERF_SAMPLE_CPU 2164 * is set; non-sample events only have it when sample_id_all is 2165 * enabled. Otherwise sample.cpu is the (u32)-1 sentinel from 2166 * evsel__parse_sample() and must not be validated or clamped. 2167 */ 2168 if ((evsel->core.attr.sample_type & PERF_SAMPLE_CPU) && 2169 (event->header.type == PERF_RECORD_SAMPLE || 2170 evsel->core.attr.sample_id_all)) { 2171 int nr_cpus_avail = perf_session__env(session)->nr_cpus_avail; 2172 2173 /* 2174 * For perf.data files the MAX_NR_CPUS fallback in 2175 * perf_session__read_header() guarantees this is set. 
2176 * For pipe mode, HEADER_NRCPUS may arrive late or not 2177 * at all (pre-2017 perf, third-party tools). Fall 2178 * back to MAX_NR_CPUS so the bounds check still works 2179 * against fixed-size downstream arrays. 2180 * 2181 * Do NOT write back to env: this function runs during 2182 * recording (synthesized events) when nr_cpus_avail is 2183 * legitimately 0. Writing MAX_NR_CPUS would cause 2184 * write_cpu_topology() to emit 4096 core_id/socket_id 2185 * pairs instead of the real CPU count, corrupting the 2186 * topology section in the generated perf.data. 2187 */ 2188 if (nr_cpus_avail <= 0) 2189 nr_cpus_avail = MAX_NR_CPUS; 2190 /* 2191 * Cap at MAX_NR_CPUS for the bounds check — downstream 2192 * consumers use fixed-size arrays of that size. Keep 2193 * the true nr_cpus_avail in env for header parsing 2194 * (e.g. process_cpu_topology) which needs the real count. 2195 */ 2196 if (nr_cpus_avail > MAX_NR_CPUS) 2197 nr_cpus_avail = MAX_NR_CPUS; 2198 if (sample.cpu >= (u32)nr_cpus_avail && 2199 sample.cpu != (u32)-1) { 2200 /* 2201 * Warn rather than abort: synthesized events 2202 * (MMAP, COMM) lack sample_id_all data, so 2203 * parse_id_sample reads garbage from the event 2204 * payload. Clamping to 0 protects downstream 2205 * array indexing while keeping the session alive. 2206 * 2207 * Preserve (u32)-1: perf script and perf inject 2208 * use it as a sentinel for "CPU not applicable." 2209 * Downstream array users (timechart, kwork) have 2210 * their own per-callback bounds checks. 
2211 */ 2212 pr_warning_once("WARNING: at offset %#" PRIx64 ": sample CPU %u >= nr_cpus_avail %u, clamping to 0\n", 2213 file_offset, sample.cpu, nr_cpus_avail); 2214 sample.cpu = 0; 2215 } 2216 } 2217 2218 ret = auxtrace__process_event(session, event, &sample, tool); 2219 if (ret < 0) 2220 goto out; 2221 if (ret > 0) { 2222 ret = 0; 2223 goto out; 2224 } 2225 2226 ret = machines__deliver_event(&session->machines, session->evlist, 2227 event, &sample, tool, file_offset, file_path); 2228 2229 if (dump_trace && sample.aux_sample.size) 2230 auxtrace__dump_auxtrace_sample(session, &sample); 2231 out: 2232 perf_sample__exit(&sample); 2233 return ret; 2234 } 2235 2236 static s64 perf_session__process_user_event(struct perf_session *session, 2237 union perf_event *event, 2238 u64 file_offset, 2239 const char *file_path) 2240 { 2241 struct ordered_events *oe = &session->ordered_events; 2242 const struct perf_tool *tool = session->tool; 2243 const u32 event_size = READ_ONCE(event->header.size); 2244 struct perf_sample sample; 2245 int fd = perf_data__fd(session->data); 2246 s64 err; 2247 2248 perf_sample__init(&sample, /*all=*/true); 2249 if ((event->header.type != PERF_RECORD_COMPRESSED && 2250 event->header.type != PERF_RECORD_COMPRESSED2) || 2251 perf_tool__compressed_is_stub(tool)) 2252 dump_event(session->evlist, event, file_offset, &sample, file_path); 2253 2254 /* These events are processed right away */ 2255 switch (event->header.type) { 2256 case PERF_RECORD_HEADER_ATTR: 2257 err = tool->attr(tool, event, &session->evlist); 2258 if (err == 0) { 2259 perf_session__set_id_hdr_size(session); 2260 perf_session__set_comm_exec(session); 2261 } 2262 break; 2263 case PERF_RECORD_EVENT_UPDATE: 2264 err = tool->event_update(tool, event, &session->evlist); 2265 break; 2266 case PERF_RECORD_HEADER_EVENT_TYPE: 2267 /* 2268 * Deprecated, but we need to handle it for sake 2269 * of old data files create in pipe mode. 
2270 */ 2271 err = 0; 2272 break; 2273 case PERF_RECORD_HEADER_TRACING_DATA: 2274 /* 2275 * Setup for reading amidst mmap, but only when we 2276 * are in 'file' mode. The 'pipe' fd is in proper 2277 * place already. 2278 */ 2279 if (!perf_data__is_pipe(session->data)) 2280 lseek(fd, file_offset, SEEK_SET); 2281 err = tool->tracing_data(tool, session, event); 2282 break; 2283 case PERF_RECORD_HEADER_BUILD_ID: 2284 if (!perf_event__check_nul(event->build_id.filename, 2285 (void *)event + event_size, 2286 "HEADER_BUILD_ID", file_offset)) { 2287 err = 0; 2288 break; 2289 } 2290 err = tool->build_id(tool, session, event); 2291 break; 2292 case PERF_RECORD_FINISHED_ROUND: 2293 err = tool->finished_round(tool, event, oe); 2294 break; 2295 case PERF_RECORD_ID_INDEX: 2296 err = tool->id_index(tool, session, event); 2297 break; 2298 case PERF_RECORD_AUXTRACE_INFO: 2299 err = tool->auxtrace_info(tool, session, event); 2300 break; 2301 case PERF_RECORD_AUXTRACE: 2302 /* 2303 * Setup for reading amidst mmap, but only when we 2304 * are in 'file' mode. The 'pipe' fd is in proper 2305 * place already. 
2306 */ 2307 if (!perf_data__is_pipe(session->data)) 2308 lseek(fd, file_offset + event_size, SEEK_SET); 2309 err = tool->auxtrace(tool, session, event); 2310 break; 2311 case PERF_RECORD_AUXTRACE_ERROR: 2312 perf_session__auxtrace_error_inc(session, event); 2313 err = tool->auxtrace_error(tool, session, event); 2314 break; 2315 case PERF_RECORD_THREAD_MAP: { 2316 u64 max_nr; 2317 2318 if (event_size < sizeof(event->thread_map)) { 2319 pr_err("ERROR: at offset %#" PRIx64 ": PERF_RECORD_THREAD_MAP: header.size (%u) too small\n", 2320 file_offset, event_size); 2321 err = -EINVAL; 2322 break; 2323 } 2324 2325 max_nr = (event_size - sizeof(event->thread_map)) / 2326 sizeof(event->thread_map.entries[0]); 2327 if (event->thread_map.nr > max_nr) { 2328 pr_err("ERROR: at offset %#" PRIx64 ": PERF_RECORD_THREAD_MAP: nr %" PRIu64 " exceeds max %" PRIu64 "\n", 2329 file_offset, (u64)event->thread_map.nr, max_nr); 2330 err = -EINVAL; 2331 break; 2332 } 2333 2334 err = tool->thread_map(tool, session, event); 2335 break; 2336 } 2337 case PERF_RECORD_CPU_MAP: { 2338 struct perf_record_cpu_map_data *data = &event->cpu_map.data; 2339 u32 payload = event_size - sizeof(event->header); 2340 2341 /* 2342 * Native-endian events are mmap'd read-only, so we 2343 * cannot clamp nr fields in place. Skip the event 2344 * if any variant overflows. 
2345 */ 2346 switch (data->type) { 2347 case PERF_CPU_MAP__CPUS: { 2348 u16 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2349 cpus_data.cpu)) / 2350 sizeof(data->cpus_data.cpu[0]); 2351 2352 if (data->cpus_data.nr > max_nr) { 2353 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP: nr %u exceeds payload (max %u), skipping\n", 2354 file_offset, data->cpus_data.nr, max_nr); 2355 err = 0; 2356 goto out; 2357 } 2358 break; 2359 } 2360 case PERF_CPU_MAP__MASK: 2361 if (data->mask32_data.long_size == 4) { 2362 u16 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2363 mask32_data.mask)) / 2364 sizeof(data->mask32_data.mask[0]); 2365 2366 if (data->mask32_data.nr > max_nr) { 2367 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP mask32: nr %u exceeds payload (max %u), skipping\n", 2368 file_offset, data->mask32_data.nr, max_nr); 2369 err = 0; 2370 goto out; 2371 } 2372 } else if (data->mask64_data.long_size == 8) { 2373 u16 max_nr; 2374 2375 if (payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 2376 err = 0; 2377 goto out; 2378 } 2379 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2380 mask64_data.mask)) / 2381 sizeof(data->mask64_data.mask[0]); 2382 if (data->mask64_data.nr > max_nr) { 2383 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP mask64: nr %u exceeds payload (max %u), skipping\n", 2384 file_offset, data->mask64_data.nr, max_nr); 2385 err = 0; 2386 goto out; 2387 } 2388 } else { 2389 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP: unsupported long_size %u, skipping\n", 2390 file_offset, data->mask32_data.long_size); 2391 err = 0; 2392 goto out; 2393 } 2394 break; 2395 default: 2396 break; 2397 } 2398 2399 err = tool->cpu_map(tool, session, event); 2400 break; 2401 } 2402 case PERF_RECORD_STAT_CONFIG: { 2403 /* Cannot underflow: perf_event__min_size[] guarantees event_size >= sizeof */ 2404 u64 max_nr = (event_size - 
sizeof(event->stat_config)) / 2405 sizeof(event->stat_config.data[0]); 2406 2407 /* 2408 * Native-endian events are mmap'd read-only, so we 2409 * cannot clamp nr in place. Skip the event instead. 2410 */ 2411 if (event->stat_config.nr > max_nr) { 2412 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_STAT_CONFIG: nr %" PRIu64 " exceeds payload (max %" PRIu64 "), skipping\n", 2413 file_offset, (u64)event->stat_config.nr, max_nr); 2414 err = 0; 2415 goto out; 2416 } 2417 2418 err = tool->stat_config(tool, session, event); 2419 break; 2420 } 2421 case PERF_RECORD_STAT: 2422 err = tool->stat(tool, session, event); 2423 break; 2424 case PERF_RECORD_STAT_ROUND: 2425 err = tool->stat_round(tool, session, event); 2426 break; 2427 case PERF_RECORD_TIME_CONV: 2428 /* 2429 * Bounded copy: older kernels emit a shorter struct 2430 * without time_cycles/time_mask/cap_user_time_*. 2431 * Zero the rest so extended fields default to off. 2432 */ 2433 memset(&session->time_conv, 0, sizeof(session->time_conv)); 2434 memcpy(&session->time_conv, &event->time_conv, 2435 min((size_t)event_size, sizeof(session->time_conv))); 2436 err = tool->time_conv(tool, session, event); 2437 break; 2438 case PERF_RECORD_HEADER_FEATURE: 2439 err = tool->feature(tool, session, event); 2440 break; 2441 case PERF_RECORD_COMPRESSED: 2442 case PERF_RECORD_COMPRESSED2: 2443 err = tool->compressed(tool, session, event, file_offset, file_path); 2444 if (err) 2445 dump_event(session->evlist, event, file_offset, &sample, file_path); 2446 break; 2447 case PERF_RECORD_FINISHED_INIT: 2448 err = tool->finished_init(tool, session, event); 2449 break; 2450 case PERF_RECORD_BPF_METADATA: { 2451 u64 nr_entries, max_entries; 2452 2453 if (event_size < sizeof(event->bpf_metadata)) { 2454 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: header.size (%u) too small, skipping\n", 2455 file_offset, event_size); 2456 err = 0; 2457 break; 2458 } 2459 2460 /* 2461 * Native-endian files are mmap'd 
read-only — validate 2462 * NUL-termination instead of writing. 2463 */ 2464 if (strnlen(event->bpf_metadata.prog_name, 2465 BPF_PROG_NAME_LEN) == BPF_PROG_NAME_LEN) { 2466 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: prog_name not null-terminated, skipping\n", 2467 file_offset); 2468 err = 0; 2469 break; 2470 } 2471 2472 nr_entries = READ_ONCE(event->bpf_metadata.nr_entries); 2473 max_entries = (event_size - sizeof(event->bpf_metadata)) / 2474 sizeof(event->bpf_metadata.entries[0]); 2475 if (nr_entries > max_entries) { 2476 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: nr_entries %" PRIu64 " exceeds max %" PRIu64 ", skipping\n", 2477 file_offset, nr_entries, max_entries); 2478 err = 0; 2479 break; 2480 } 2481 2482 for (u64 i = 0; i < nr_entries; i++) { 2483 if (strnlen(event->bpf_metadata.entries[i].key, 2484 BPF_METADATA_KEY_LEN) == BPF_METADATA_KEY_LEN || 2485 strnlen(event->bpf_metadata.entries[i].value, 2486 BPF_METADATA_VALUE_LEN) == BPF_METADATA_VALUE_LEN) { 2487 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: entry %" PRIu64 " key/value not null-terminated, skipping\n", 2488 file_offset, i); 2489 err = 0; 2490 goto out; 2491 } 2492 } 2493 2494 err = tool->bpf_metadata(tool, session, event); 2495 break; 2496 } 2497 case PERF_RECORD_SCHEDSTAT_CPU: 2498 err = tool->schedstat_cpu(tool, session, event); 2499 break; 2500 case PERF_RECORD_SCHEDSTAT_DOMAIN: 2501 err = tool->schedstat_domain(tool, session, event); 2502 break; 2503 default: 2504 err = -EINVAL; 2505 break; 2506 } 2507 out: 2508 perf_sample__exit(&sample); 2509 return err; 2510 } 2511 2512 int perf_session__deliver_synth_event(struct perf_session *session, 2513 union perf_event *event, 2514 struct perf_sample *sample) 2515 { 2516 struct evlist *evlist = session->evlist; 2517 const struct perf_tool *tool = session->tool; 2518 2519 events_stats__inc(&evlist->stats, event->header.type); 2520 2521 if (event->header.type >= 
PERF_RECORD_USER_TYPE_START) 2522 return perf_session__process_user_event(session, event, 0, NULL); 2523 2524 return machines__deliver_event(&session->machines, evlist, event, sample, tool, 0, NULL); 2525 } 2526 2527 int perf_session__deliver_synth_attr_event(struct perf_session *session, 2528 const struct perf_event_attr *attr, 2529 u64 id) 2530 { 2531 union { 2532 struct { 2533 struct perf_record_header_attr attr; 2534 u64 ids[1]; 2535 } attr_id; 2536 union perf_event ev; 2537 } ev = { 2538 .attr_id.attr.header.type = PERF_RECORD_HEADER_ATTR, 2539 .attr_id.attr.header.size = sizeof(ev.attr_id), 2540 .attr_id.ids[0] = id, 2541 }; 2542 2543 if (attr->size != sizeof(ev.attr_id.attr.attr)) { 2544 pr_debug("Unexpected perf_event_attr size\n"); 2545 return -EINVAL; 2546 } 2547 ev.attr_id.attr.attr = *attr; 2548 return perf_session__deliver_synth_event(session, &ev.ev, NULL); 2549 } 2550 2551 /* Caller must ensure event->header.type < PERF_RECORD_HEADER_MAX */ 2552 static int event_swap(union perf_event *event, bool sample_id_all) 2553 { 2554 perf_event__swap_op swap = perf_event__swap_ops[event->header.type]; 2555 2556 if (swap) 2557 return swap(event, sample_id_all); 2558 return 0; 2559 } 2560 2561 /* 2562 * Minimum event sizes indexed by type. Checked before swap and 2563 * processing so that both cross-endian and native-endian paths 2564 * are protected from accessing fields past the event boundary. 2565 * Zero means no minimum beyond the 8-byte header (already 2566 * enforced by the reader). 2567 * 2568 * These values represent the smallest event the kernel has ever 2569 * emitted for each type, so they do not reject legitimate legacy 2570 * perf.data files from older kernels. Variable-length events 2571 * use offsetof() to the first variable field; the variable 2572 * content is validated separately (e.g., perf_event__check_nul). 
 */
static const u32 perf_event__min_size[PERF_RECORD_HEADER_MAX] = {
	/*
	 * offsetof() + 1 for types with a trailing variable-length
	 * string (filename, comm, path, name, msg): the +1 ensures
	 * room for at least a null terminator. Full null-termination
	 * within the event boundary is checked separately.
	 *
	 * PERF_RECORD_SAMPLE is omitted: all64_swap is bounded by
	 * header.size, and the internal layout varies by sample_type
	 * so a fixed minimum is not meaningful.
	 */
	[PERF_RECORD_MMAP]			= offsetof(struct perf_record_mmap, filename) + 1,
	[PERF_RECORD_LOST]			= sizeof(struct perf_record_lost),
	[PERF_RECORD_COMM]			= offsetof(struct perf_record_comm, comm) + 1,
	/* EXIT is sized with the fork record struct, same as FORK below. */
	[PERF_RECORD_EXIT]			= sizeof(struct perf_record_fork),
	[PERF_RECORD_THROTTLE]			= sizeof(struct perf_record_throttle),
	[PERF_RECORD_UNTHROTTLE]		= sizeof(struct perf_record_throttle),
	[PERF_RECORD_FORK]			= sizeof(struct perf_record_fork),
	/*
	 * The kernel dynamically sizes PERF_RECORD_READ based on
	 * attr.read_format — only the enabled fields are emitted,
	 * packed with no gaps. The minimum valid event has just
	 * pid + tid + one u64 value (no optional fields).
	 */
	[PERF_RECORD_READ]			= offsetof(struct perf_record_read, time_enabled),
	[PERF_RECORD_MMAP2]			= offsetof(struct perf_record_mmap2, filename) + 1,
	[PERF_RECORD_LOST_SAMPLES]		= sizeof(struct perf_record_lost_samples),
	[PERF_RECORD_AUX]			= sizeof(struct perf_record_aux),
	[PERF_RECORD_ITRACE_START]		= sizeof(struct perf_record_itrace_start),
	/* SWITCH only requires the bare header; SWITCH_CPU_WIDE needs the full switch record. */
	[PERF_RECORD_SWITCH]			= sizeof(struct perf_event_header),
	[PERF_RECORD_SWITCH_CPU_WIDE]		= sizeof(struct perf_record_switch),
	[PERF_RECORD_NAMESPACES]		= sizeof(struct perf_record_namespaces),
	[PERF_RECORD_CGROUP]			= offsetof(struct perf_record_cgroup, path) + 1,
	[PERF_RECORD_TEXT_POKE]			= sizeof(struct perf_record_text_poke_event),
	[PERF_RECORD_KSYMBOL]			= offsetof(struct perf_record_ksymbol, name) + 1,
	[PERF_RECORD_BPF_EVENT]			= sizeof(struct perf_record_bpf_event),
	[PERF_RECORD_HEADER_ATTR]		= sizeof(struct perf_event_header) + PERF_ATTR_SIZE_VER0,
	[PERF_RECORD_HEADER_EVENT_TYPE]		= sizeof(struct perf_record_header_event_type),
	/* Legacy events predate the __u32 pad field, accept 12-byte records */
	[PERF_RECORD_HEADER_TRACING_DATA]	= offsetof(struct perf_record_header_tracing_data, pad),
	[PERF_RECORD_AUX_OUTPUT_HW_ID]		= sizeof(struct perf_record_aux_output_hw_id),
	[PERF_RECORD_AUXTRACE_INFO]		= sizeof(struct perf_record_auxtrace_info),
	[PERF_RECORD_AUXTRACE]			= sizeof(struct perf_record_auxtrace),
	[PERF_RECORD_AUXTRACE_ERROR]		= offsetof(struct perf_record_auxtrace_error, msg) + 1,
	[PERF_RECORD_THREAD_MAP]		= sizeof(struct perf_record_thread_map),
	/*
	 * sizeof(perf_record_cpu_map) is 20 because the outer struct
	 * isn't packed and GCC adds 2 bytes of trailing padding.
	 * The smallest valid variant (RANGE_CPUS) is only 16 bytes:
	 * header(8) + type(2) + range_cpu_data(6). Per-variant
	 * bounds are checked in the swap handler via payload.
	 * Native-endian events get the same per-variant checks in
	 * perf_session__process_user_event().
	 */
	[PERF_RECORD_CPU_MAP]			= sizeof(struct perf_event_header) +
						  sizeof(__u16) +
						  sizeof(struct perf_record_range_cpu_map),
	[PERF_RECORD_STAT_CONFIG]		= sizeof(struct perf_record_stat_config),
	[PERF_RECORD_STAT]			= sizeof(struct perf_record_stat),
	[PERF_RECORD_STAT_ROUND]		= sizeof(struct perf_record_stat_round),
	/*
	 * EVENT_UPDATE has a union whose largest member (cpus)
	 * inflates sizeof to 40, but SCALE events are only 32
	 * and UNIT/NAME events can be even smaller. Use the
	 * fixed header fields (header + type + id) as minimum.
	 */
	[PERF_RECORD_EVENT_UPDATE]		= offsetof(struct perf_record_event_update, scale),
	/* Accept the short legacy layout that ends before time_cycles. */
	[PERF_RECORD_TIME_CONV]			= offsetof(struct perf_record_time_conv, time_cycles),
	[PERF_RECORD_ID_INDEX]			= sizeof(struct perf_record_id_index),
	[PERF_RECORD_HEADER_BUILD_ID]		= sizeof(struct perf_record_header_build_id),
	[PERF_RECORD_HEADER_FEATURE]		= sizeof(struct perf_record_header_feature),
	[PERF_RECORD_COMPRESSED2]		= sizeof(struct perf_record_compressed2),
	[PERF_RECORD_BPF_METADATA]		= sizeof(struct perf_record_bpf_metadata),
	[PERF_RECORD_CALLCHAIN_DEFERRED]	= sizeof(struct perf_event_header) + sizeof(__u64),
	/*
	 * SCHEDSTAT events have a version-dependent union after the
	 * fixed header fields; the minimum is the base (pre-union)
	 * portion so old and new versions both pass.
	 */
	[PERF_RECORD_SCHEDSTAT_CPU]		= offsetof(struct perf_record_schedstat_cpu, v15),
	[PERF_RECORD_SCHEDSTAT_DOMAIN]		= offsetof(struct perf_record_schedstat_domain, v15),
};

/*
 * Return true if the event is too small for its declared type.
 * Caller must ensure event->header.type < PERF_RECORD_HEADER_MAX.
 * If min is non-NULL, stores the required minimum on failure.
2659 */ 2660 static bool perf_event__too_small(const union perf_event *event, u32 *min) 2661 { 2662 u32 min_sz = perf_event__min_size[event->header.type]; 2663 2664 if (min_sz && event->header.size < min_sz) { 2665 if (min) 2666 *min = min_sz; 2667 return true; 2668 } 2669 2670 return false; 2671 } 2672 2673 /* 2674 * Read and validate the event at @file_offset. 2675 * 2676 * Returns: 2677 * 0 — success: *event_ptr is set and safe to access. 2678 * -1 — error; check *event_ptr to decide whether to advance or abort: 2679 * *event_ptr set — event header was read but the event is 2680 * malformed (too small for its type, or byte-swap 2681 * failed). header.size is still valid, so the 2682 * caller can advance past the event. 2683 * *event_ptr NULL — fatal: couldn't read the header at all 2684 * (I/O error, offset out of range, pipe mode). 2685 * Caller must abort. 2686 */ 2687 int perf_session__peek_event(struct perf_session *session, off_t file_offset, 2688 void *buf, size_t buf_sz, 2689 union perf_event **event_ptr, 2690 struct perf_sample *sample) 2691 { 2692 union perf_event *event; 2693 size_t hdr_sz, rest; 2694 u32 min_sz; 2695 int fd; 2696 2697 *event_ptr = NULL; 2698 2699 if (session->one_mmap && !session->header.needs_swap) { 2700 u64 offset_in_mmap; 2701 2702 /* Validate offset with integer arithmetic to avoid pointer UB */ 2703 if ((u64)file_offset < session->one_mmap_offset) 2704 return -1; 2705 2706 offset_in_mmap = (u64)file_offset - session->one_mmap_offset; 2707 2708 /* Use subtraction to avoid addition overflow */ 2709 if (offset_in_mmap >= session->one_mmap_size || 2710 session->one_mmap_size - offset_in_mmap < sizeof(struct perf_event_header)) 2711 return -1; 2712 2713 event = session->one_mmap_addr + offset_in_mmap; 2714 2715 if (event->header.size < sizeof(struct perf_event_header)) 2716 return -1; 2717 2718 /* Ensure full event is within the mmap region */ 2719 if (session->one_mmap_size - offset_in_mmap < event->header.size) 2720 return -1; 2721 
} else { 2722 if (perf_data__is_pipe(session->data)) 2723 return -1; 2724 2725 fd = perf_data__fd(session->data); 2726 hdr_sz = sizeof(struct perf_event_header); 2727 2728 if (buf_sz < hdr_sz) 2729 return -1; 2730 2731 if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 || 2732 readn(fd, buf, hdr_sz) != (ssize_t)hdr_sz) 2733 return -1; 2734 2735 event = (union perf_event *)buf; 2736 2737 if (session->header.needs_swap) 2738 perf_event_header__bswap(&event->header); 2739 2740 if (event->header.size < hdr_sz || event->header.size > buf_sz) 2741 return -1; 2742 2743 buf += hdr_sz; 2744 rest = event->header.size - hdr_sz; 2745 2746 if (readn(fd, buf, rest) != (ssize_t)rest) 2747 return -1; 2748 } 2749 2750 /* Event data is fully loaded — expose so callers can advance */ 2751 *event_ptr = event; 2752 2753 /* 2754 * Check alignment before type: an unaligned size misaligns the 2755 * stream for all subsequent reads regardless of event type. 2756 * Three legacy user events predate the 8-byte rule — exempt them. 
2757 */ 2758 if (event->header.size % sizeof(u64) && 2759 event->header.type != PERF_RECORD_HEADER_TRACING_DATA && 2760 event->header.type != PERF_RECORD_COMPRESSED && 2761 event->header.type != PERF_RECORD_HEADER_FEATURE) { 2762 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u not aligned to %zu\n", 2763 (u64)file_offset, perf_event__name(event->header.type), 2764 event->header.type, event->header.size, sizeof(u64)); 2765 return -1; 2766 } 2767 2768 if (event->header.type >= PERF_RECORD_HEADER_MAX) { 2769 pr_warning("WARNING: at offset %#" PRIx64 ": unsupported event type %u, skipping\n", 2770 (u64)file_offset, event->header.type); 2771 return 0; 2772 } 2773 2774 if (perf_event__too_small(event, &min_sz)) { 2775 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u too small (min %u)\n", 2776 (u64)file_offset, perf_event__name(event->header.type), 2777 event->header.type, event->header.size, min_sz); 2778 return -1; 2779 } 2780 2781 if (session->header.needs_swap && 2782 event_swap(event, evlist__sample_id_all(session->evlist))) { 2783 /* 2784 * The header was already swapped so header.size is 2785 * valid — expose the event so callers can advance 2786 * past this malformed entry instead of aborting. 
2787 */ 2788 *event_ptr = event; 2789 return -1; 2790 } 2791 2792 if (sample && event->header.type < PERF_RECORD_USER_TYPE_START && 2793 evlist__parse_sample(session->evlist, event, sample)) 2794 return -1; 2795 2796 return 0; 2797 } 2798 2799 int perf_session__peek_events(struct perf_session *session, u64 offset, 2800 u64 size, peek_events_cb_t cb, void *data) 2801 { 2802 u64 max_offset = offset + size; 2803 char buf[PERF_SAMPLE_MAX_SIZE]; 2804 union perf_event *event; 2805 int err; 2806 2807 do { 2808 event = NULL; 2809 err = perf_session__peek_event(session, offset, buf, 2810 PERF_SAMPLE_MAX_SIZE, &event, 2811 NULL); 2812 if (err) { 2813 /* 2814 * Recoverable error: peek_event returns -1 but 2815 * sets event_ptr when the header was read 2816 * successfully but the event is malformed (too 2817 * small or swap failed). Skip past it using 2818 * header.size — don't invoke the callback since 2819 * type-specific fields may be truncated. 2820 * 2821 * Must abort if: event_ptr is NULL (I/O error), 2822 * size is 0 (can't advance), type is AUXTRACE 2823 * (payload extends beyond header.size), or size 2824 * is unaligned (would misalign all subsequent reads). 2825 * 2826 * Direct callers (auxtrace, cs-etm) treat any 2827 * non-zero return as fatal — only this loop skips. 
2828 */ 2829 if (event && event->header.size && 2830 event->header.type != PERF_RECORD_AUXTRACE && 2831 event->header.size % sizeof(u64) == 0) { 2832 offset += event->header.size; 2833 err = 0; 2834 } else { 2835 return err; 2836 } 2837 continue; 2838 } 2839 2840 err = cb(session, event, offset, data); 2841 if (err) 2842 return err; 2843 2844 offset += event->header.size; 2845 if (event->header.type == PERF_RECORD_AUXTRACE) 2846 offset += event->auxtrace.size; 2847 2848 } while (offset < max_offset); 2849 2850 return err; 2851 } 2852 2853 static s64 perf_session__process_event(struct perf_session *session, 2854 union perf_event *event, u64 file_offset, 2855 const char *file_path) 2856 { 2857 struct evlist *evlist = session->evlist; 2858 const struct perf_tool *tool = session->tool; 2859 u32 min_sz; 2860 int ret; 2861 2862 /* 2863 * The kernel aligns all event sizes to sizeof(u64) — see 2864 * perf_event_comm_event() (ALIGN), perf_event_mmap_event(), 2865 * perf_event_cgroup(), perf_event_ksymbol() (IS_ALIGNED loops), 2866 * and perf_event_text_poke() (ALIGN) in kernel/events/core.c. 2867 * 2868 * An unaligned size means the file is corrupted or crafted. 2869 * Abort: there is no point continuing to read unaligned records 2870 * because the caller advances rd->head by event->header.size, 2871 * so every subsequent read would start at a misaligned offset, 2872 * producing garbage headers for the rest of the file. 2873 * 2874 * Exempt three legacy user events that predate the alignment rule: 2875 * 2876 * TRACING_DATA (66): struct tracing_data_event was 12 bytes before 2877 * b39c915a4f36 ("libperf event: Ensure tracing data is multiple 2878 * of 8 sized") added __u32 pad; old perf.data files still contain 2879 * 12-byte records. 2880 * TODO: introduce HEADER_TRACING_DATA2 with guaranteed alignment. 2881 * 2882 * COMPRESSED (81): raw ZSTD output, arbitrary length. Already 2883 * superseded by COMPRESSED2 (83) with PERF_ALIGN. 
2884 * 2885 * HEADER_FEATURE (80): do_write_string() uses a 4-byte length 2886 * prefix with no padding to 8-byte total. 2887 * TODO: introduce HEADER_FEATURE2 with guaranteed alignment. 2888 */ 2889 if (event->header.size % sizeof(u64) && 2890 event->header.type != PERF_RECORD_HEADER_TRACING_DATA && 2891 event->header.type != PERF_RECORD_COMPRESSED && 2892 event->header.type != PERF_RECORD_HEADER_FEATURE) { 2893 pr_err("ERROR: at offset %#" PRIx64 ": %s (%u) event size %u is not 8-byte aligned, aborting\n", 2894 file_offset, perf_event__name(event->header.type), 2895 event->header.type, event->header.size); 2896 return -EINVAL; 2897 } 2898 2899 if (event->header.type >= PERF_RECORD_HEADER_MAX) { 2900 /* This perf is outdated and does not support the latest event type. */ 2901 ui__warning("Unsupported header type %u, please consider updating perf.\n", 2902 event->header.type); 2903 /* 2904 * Return 0 to skip: the caller (reader__read_event) 2905 * already advances by event->header.size. 2906 */ 2907 return 0; 2908 } 2909 2910 /* 2911 * Skip rather than abort: a too-small-but-aligned event 2912 * can be safely stepped over without misaligning the stream. 
2913 */ 2914 if (perf_event__too_small(event, &min_sz)) { 2915 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u too small (min %u), skipping\n", 2916 file_offset, perf_event__name(event->header.type), 2917 event->header.type, event->header.size, min_sz); 2918 return 0; 2919 } 2920 2921 if (session->header.needs_swap && 2922 event_swap(event, evlist__sample_id_all(evlist))) { 2923 pr_warning("WARNING: at offset %#" PRIx64 ": swap failed for %s (%u) event, skipping\n", 2924 file_offset, perf_event__name(event->header.type), 2925 event->header.type); 2926 return 0; 2927 } 2928 2929 events_stats__inc(&evlist->stats, event->header.type); 2930 2931 if (event->header.type >= PERF_RECORD_USER_TYPE_START) 2932 return perf_session__process_user_event(session, event, file_offset, file_path); 2933 2934 if (tool->ordered_events) { 2935 u64 timestamp = -1ULL; 2936 2937 ret = evlist__parse_sample_timestamp(evlist, event, ×tamp); 2938 if (ret && ret != -1) 2939 return ret; 2940 2941 ret = perf_session__queue_event(session, event, timestamp, file_offset, file_path); 2942 if (ret != -ETIME) 2943 return ret; 2944 } 2945 2946 return perf_session__deliver_event(session, event, tool, file_offset, file_path); 2947 } 2948 2949 void perf_event_header__bswap(struct perf_event_header *hdr) 2950 { 2951 hdr->type = bswap_32(hdr->type); 2952 hdr->misc = bswap_16(hdr->misc); 2953 hdr->size = bswap_16(hdr->size); 2954 } 2955 2956 struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) 2957 { 2958 return machine__findnew_thread(&session->machines.host, -1, pid); 2959 } 2960 2961 int perf_session__register_idle_thread(struct perf_session *session) 2962 { 2963 struct thread *thread = machine__idle_thread(&session->machines.host); 2964 2965 /* machine__idle_thread() got the thread, so put it */ 2966 thread__put(thread); 2967 return thread ? 
0 : -1; 2968 } 2969 2970 static void 2971 perf_session__warn_order(const struct perf_session *session) 2972 { 2973 const struct ordered_events *oe = &session->ordered_events; 2974 struct evsel *evsel; 2975 bool should_warn = true; 2976 2977 evlist__for_each_entry(session->evlist, evsel) { 2978 if (evsel->core.attr.write_backward) 2979 should_warn = false; 2980 } 2981 2982 if (!should_warn) 2983 return; 2984 if (oe->nr_unordered_events != 0) 2985 ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events); 2986 } 2987 2988 static void perf_session__warn_about_errors(const struct perf_session *session) 2989 { 2990 const struct events_stats *stats = &session->evlist->stats; 2991 2992 if (session->tool->lost == perf_event__process_lost && 2993 stats->nr_events[PERF_RECORD_LOST] != 0) { 2994 ui__warning("Processed %d events and lost %d chunks!\n\n" 2995 "Check IO/CPU overload!\n\n", 2996 stats->nr_events[0], 2997 stats->nr_events[PERF_RECORD_LOST]); 2998 } 2999 3000 if (session->tool->lost_samples == perf_event__process_lost_samples) { 3001 double drop_rate; 3002 3003 drop_rate = (double)stats->total_lost_samples / 3004 (double) (stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples); 3005 if (drop_rate > 0.05) { 3006 ui__warning("Processed %" PRIu64 " samples and lost %3.2f%%!\n\n", 3007 stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples, 3008 drop_rate * 100.0); 3009 } 3010 } 3011 3012 if (session->tool->aux == perf_event__process_aux && 3013 stats->total_aux_lost != 0) { 3014 ui__warning("AUX data lost %" PRIu64 " times out of %u!\n\n", 3015 stats->total_aux_lost, 3016 stats->nr_events[PERF_RECORD_AUX]); 3017 } 3018 3019 if (session->tool->aux == perf_event__process_aux && 3020 stats->total_aux_partial != 0) { 3021 bool vmm_exclusive = false; 3022 3023 (void)sysfs__read_bool("module/kvm_intel/parameters/vmm_exclusive", 3024 &vmm_exclusive); 3025 3026 ui__warning("AUX data had gaps in it %" PRIu64 " times out of %u!\n\n" 3027 
"Are you running a KVM guest in the background?%s\n\n", 3028 stats->total_aux_partial, 3029 stats->nr_events[PERF_RECORD_AUX], 3030 vmm_exclusive ? 3031 "\nReloading kvm_intel module with vmm_exclusive=0\n" 3032 "will reduce the gaps to only guest's timeslices." : 3033 ""); 3034 } 3035 3036 if (session->tool->aux == perf_event__process_aux && 3037 stats->total_aux_collision != 0) { 3038 ui__warning("AUX data detected collision %" PRIu64 " times out of %u!\n\n", 3039 stats->total_aux_collision, 3040 stats->nr_events[PERF_RECORD_AUX]); 3041 } 3042 3043 if (stats->nr_unknown_events != 0) { 3044 ui__warning("Found %u unknown events!\n\n" 3045 "Is this an older tool processing a perf.data " 3046 "file generated by a more recent tool?\n\n" 3047 "If that is not the case, consider " 3048 "reporting to linux-kernel@vger.kernel.org.\n\n", 3049 stats->nr_unknown_events); 3050 } 3051 3052 if (stats->nr_unknown_id != 0) { 3053 ui__warning("%u samples with id not present in the header\n", 3054 stats->nr_unknown_id); 3055 } 3056 3057 if (stats->nr_invalid_chains != 0) { 3058 ui__warning("Found invalid callchains!\n\n" 3059 "%u out of %u events were discarded for this reason.\n\n" 3060 "Consider reporting to linux-kernel@vger.kernel.org.\n\n", 3061 stats->nr_invalid_chains, 3062 stats->nr_events[PERF_RECORD_SAMPLE]); 3063 } 3064 3065 if (stats->nr_unprocessable_samples != 0) { 3066 ui__warning("%u unprocessable samples recorded.\n" 3067 "Do you have a KVM guest running and not using 'perf kvm'?\n", 3068 stats->nr_unprocessable_samples); 3069 } 3070 3071 perf_session__warn_order(session); 3072 3073 events_stats__auxtrace_error_warn(stats); 3074 3075 if (stats->nr_proc_map_timeout != 0) { 3076 ui__warning("%d map information files for pre-existing threads were\n" 3077 "not processed, if there are samples for addresses they\n" 3078 "will not be resolved, you may find out which are these\n" 3079 "threads by running with -v and redirecting the output\n" 3080 "to a file.\n" 3081 "The 
time limit to process proc map is too short?\n" 3082 "Increase it by --proc-map-timeout\n", 3083 stats->nr_proc_map_timeout); 3084 } 3085 } 3086 3087 static int perf_session__flush_thread_stack(struct thread *thread, 3088 void *p __maybe_unused) 3089 { 3090 return thread_stack__flush(thread); 3091 } 3092 3093 static int perf_session__flush_thread_stacks(struct perf_session *session) 3094 { 3095 return machines__for_each_thread(&session->machines, 3096 perf_session__flush_thread_stack, 3097 NULL); 3098 } 3099 3100 volatile sig_atomic_t session_done; 3101 3102 static int __perf_session__process_decomp_events(struct perf_session *session); 3103 3104 static int __perf_session__process_pipe_events(struct perf_session *session) 3105 { 3106 struct ordered_events *oe = &session->ordered_events; 3107 const struct perf_tool *tool = session->tool; 3108 struct ui_progress prog; 3109 union perf_event *event; 3110 uint32_t size, cur_size = 0; 3111 void *buf = NULL; 3112 s64 skip = 0; 3113 u64 head; 3114 ssize_t err; 3115 void *p; 3116 bool update_prog = false; 3117 3118 /* 3119 * If it's from a file saving pipe data (by redirection), it would have 3120 * a file name other than "-". Then we can get the total size and show 3121 * the progress. 
3122 */ 3123 if (strcmp(session->data->path, "-") && session->data->file.size) { 3124 ui_progress__init_size(&prog, session->data->file.size, 3125 "Processing events..."); 3126 update_prog = true; 3127 } 3128 3129 head = 0; 3130 cur_size = sizeof(union perf_event); 3131 3132 buf = malloc(cur_size); 3133 if (!buf) 3134 return -errno; 3135 ordered_events__set_copy_on_queue(oe, true); 3136 more: 3137 event = buf; 3138 err = perf_data__read(session->data, event, 3139 sizeof(struct perf_event_header)); 3140 if (err <= 0) { 3141 if (err == 0) 3142 goto done; 3143 3144 pr_err("failed to read event header\n"); 3145 goto out_err; 3146 } 3147 3148 if (session->header.needs_swap) 3149 perf_event_header__bswap(&event->header); 3150 3151 size = event->header.size; 3152 if (size < sizeof(struct perf_event_header)) { 3153 pr_err("bad event header size\n"); 3154 goto out_err; 3155 } 3156 3157 if (size > cur_size) { 3158 void *new = realloc(buf, size); 3159 if (!new) { 3160 pr_err("failed to allocate memory to read event\n"); 3161 goto out_err; 3162 } 3163 buf = new; 3164 cur_size = size; 3165 event = buf; 3166 } 3167 p = event; 3168 p += sizeof(struct perf_event_header); 3169 3170 if (size - sizeof(struct perf_event_header)) { 3171 err = perf_data__read(session->data, p, 3172 size - sizeof(struct perf_event_header)); 3173 if (err <= 0) { 3174 if (err == 0) { 3175 pr_err("unexpected end of event stream\n"); 3176 goto done; 3177 } 3178 3179 pr_err("failed to read event data\n"); 3180 goto out_err; 3181 } 3182 } 3183 3184 if ((skip = perf_session__process_event(session, event, head, "pipe")) < 0) { 3185 pr_err("%#" PRIx64 " [%#x]: piped event processing failed for event of type: %s (%d)\n", 3186 head, event->header.size, 3187 perf_event__name(event->header.type), 3188 event->header.type); 3189 err = -EINVAL; 3190 goto out_err; 3191 } 3192 3193 head += size; 3194 3195 if (skip > 0) 3196 head += skip; 3197 3198 err = __perf_session__process_decomp_events(session); 3199 if (err) 3200 
goto out_err; 3201 3202 if (update_prog) 3203 ui_progress__update(&prog, size); 3204 3205 if (!session_done()) 3206 goto more; 3207 done: 3208 /* do the final flush for ordered samples */ 3209 err = ordered_events__flush(oe, OE_FLUSH__FINAL); 3210 if (err) 3211 goto out_err; 3212 err = session__flush_deferred_samples(session, tool); 3213 if (err) 3214 goto out_err; 3215 err = auxtrace__flush_events(session, tool); 3216 if (err) 3217 goto out_err; 3218 err = perf_session__flush_thread_stacks(session); 3219 out_err: 3220 free(buf); 3221 if (update_prog) 3222 ui_progress__finish(); 3223 if (!tool->no_warn) 3224 perf_session__warn_about_errors(session); 3225 ordered_events__free(&session->ordered_events); 3226 auxtrace__free_events(session); 3227 return err; 3228 } 3229 3230 static union perf_event * 3231 prefetch_event(char *buf, u64 head, size_t mmap_size, 3232 bool needs_swap, union perf_event *error) 3233 { 3234 union perf_event *event; 3235 u16 event_size; 3236 3237 /* 3238 * Ensure we have enough space remaining to read 3239 * the size of the event in the headers. 3240 */ 3241 if (head + sizeof(event->header) > mmap_size) 3242 return NULL; 3243 3244 event = (union perf_event *)(buf + head); 3245 if (needs_swap) 3246 perf_event_header__bswap(&event->header); 3247 3248 event_size = event->header.size; 3249 if (head + event_size <= mmap_size) 3250 return event; 3251 3252 /* We're not fetching the event so swap back again */ 3253 if (needs_swap) 3254 perf_event_header__bswap(&event->header); 3255 3256 /* Check if the event fits into the next mmapped buf. */ 3257 if (event_size <= mmap_size - head % page_size) { 3258 /* Remap buf and fetch again. */ 3259 return NULL; 3260 } 3261 3262 /* Invalid input. Event size should never exceed mmap_size. 
*/ 3263 pr_debug("%s: head=%#" PRIx64 " event->header.size=%#x, mmap_size=%#zx:" 3264 " fuzzed or compressed perf.data?\n", __func__, head, event_size, mmap_size); 3265 3266 return error; 3267 } 3268 3269 static union perf_event * 3270 fetch_mmaped_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) 3271 { 3272 return prefetch_event(buf, head, mmap_size, needs_swap, ERR_PTR(-EINVAL)); 3273 } 3274 3275 static union perf_event * 3276 fetch_decomp_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) 3277 { 3278 return prefetch_event(buf, head, mmap_size, needs_swap, NULL); 3279 } 3280 3281 static int __perf_session__process_decomp_events(struct perf_session *session) 3282 { 3283 s64 skip; 3284 u64 size; 3285 struct decomp *decomp = session->active_decomp->decomp_last; 3286 3287 if (!decomp) 3288 return 0; 3289 3290 while (decomp->head < decomp->size && !session_done()) { 3291 union perf_event *event = fetch_decomp_event(decomp->head, decomp->size, decomp->data, 3292 session->header.needs_swap); 3293 3294 if (!event) 3295 break; 3296 3297 size = event->header.size; 3298 3299 if (size < sizeof(struct perf_event_header) || 3300 (skip = perf_session__process_event(session, event, decomp->file_pos, 3301 decomp->file_path)) < 0) { 3302 pr_err("%#" PRIx64 " [%#x]: decompress event processing failed for event of type: %s (%d)\n", 3303 decomp->file_pos + decomp->head, event->header.size, 3304 perf_event__name(event->header.type), 3305 event->header.type); 3306 return -EINVAL; 3307 } 3308 3309 if (skip) 3310 size += skip; 3311 3312 decomp->head += size; 3313 } 3314 3315 return 0; 3316 } 3317 3318 /* 3319 * On 64bit we can mmap the data file in one go. No need for tiny mmap 3320 * slices. On 32bit we use 32MB. 
3321 */ 3322 #if BITS_PER_LONG == 64 3323 #define MMAP_SIZE ULLONG_MAX 3324 #define NUM_MMAPS 1 3325 #else 3326 #define MMAP_SIZE (32 * 1024 * 1024ULL) 3327 #define NUM_MMAPS 128 3328 #endif 3329 3330 struct reader; 3331 3332 typedef s64 (*reader_cb_t)(struct perf_session *session, 3333 union perf_event *event, 3334 u64 file_offset, 3335 const char *file_path); 3336 3337 struct reader { 3338 int fd; 3339 const char *path; 3340 u64 data_size; 3341 u64 data_offset; 3342 reader_cb_t process; 3343 bool in_place_update; 3344 char *mmaps[NUM_MMAPS]; 3345 size_t mmap_size; 3346 int mmap_idx; 3347 char *mmap_cur; 3348 u64 file_pos; 3349 u64 file_offset; 3350 u64 head; 3351 u64 size; 3352 bool done; 3353 struct zstd_data zstd_data; 3354 struct decomp_data decomp_data; 3355 }; 3356 3357 static int 3358 reader__init(struct reader *rd, bool *one_mmap) 3359 { 3360 u64 data_size = rd->data_size; 3361 char **mmaps = rd->mmaps; 3362 3363 rd->head = rd->data_offset; 3364 data_size += rd->data_offset; 3365 3366 rd->mmap_size = MMAP_SIZE; 3367 if (rd->mmap_size > data_size) { 3368 rd->mmap_size = data_size; 3369 if (one_mmap) 3370 *one_mmap = true; 3371 } 3372 3373 memset(mmaps, 0, sizeof(rd->mmaps)); 3374 3375 if (zstd_init(&rd->zstd_data, 0)) 3376 return -1; 3377 rd->decomp_data.zstd_decomp = &rd->zstd_data; 3378 3379 return 0; 3380 } 3381 3382 static void 3383 reader__release_decomp(struct reader *rd) 3384 { 3385 perf_decomp__release_events(rd->decomp_data.decomp); 3386 zstd_fini(&rd->zstd_data); 3387 } 3388 3389 static int 3390 reader__mmap(struct reader *rd, struct perf_session *session) 3391 { 3392 int mmap_prot, mmap_flags; 3393 char *buf, **mmaps = rd->mmaps; 3394 u64 page_offset; 3395 3396 /* 3397 * Native-endian: MAP_SHARED + PROT_READ — the kernel 3398 * guarantees page-level coherence but a concurrent writer 3399 * could modify the file between validation and use. 
This 3400 * is a theoretical TOCTOU that affects the entire perf.data 3401 * processing pipeline; fixing it would require copying each 3402 * event to a private buffer before processing. 3403 * 3404 * Cross-endian: MAP_PRIVATE + PROT_WRITE — swap handlers 3405 * get a copy-on-write snapshot immune to concurrent writes. 3406 */ 3407 mmap_prot = PROT_READ; 3408 mmap_flags = MAP_SHARED; 3409 3410 if (rd->in_place_update) { 3411 mmap_prot |= PROT_WRITE; 3412 } else if (session->header.needs_swap) { 3413 mmap_prot |= PROT_WRITE; 3414 mmap_flags = MAP_PRIVATE; 3415 } 3416 3417 if (mmaps[rd->mmap_idx]) { 3418 munmap(mmaps[rd->mmap_idx], rd->mmap_size); 3419 mmaps[rd->mmap_idx] = NULL; 3420 } 3421 3422 page_offset = page_size * (rd->head / page_size); 3423 rd->file_offset += page_offset; 3424 rd->head -= page_offset; 3425 3426 buf = mmap(NULL, rd->mmap_size, mmap_prot, mmap_flags, rd->fd, 3427 rd->file_offset); 3428 if (buf == MAP_FAILED) { 3429 pr_err("failed to mmap file\n"); 3430 return -errno; 3431 } 3432 mmaps[rd->mmap_idx] = rd->mmap_cur = buf; 3433 rd->mmap_idx = (rd->mmap_idx + 1) & (ARRAY_SIZE(rd->mmaps) - 1); 3434 rd->file_pos = rd->file_offset + rd->head; 3435 if (session->one_mmap) { 3436 session->one_mmap_addr = buf; 3437 session->one_mmap_offset = rd->file_offset; 3438 /* 3439 * mmap_size was set to the full file extent (data_offset + 3440 * data_size) but file_offset was shifted forward by 3441 * page_offset for page alignment. Reduce by page_offset 3442 * so the bounds check reflects the file-backed portion 3443 * of the mapping — pages beyond the file cause SIGBUS. 
3444 */ 3445 session->one_mmap_size = rd->mmap_size - page_offset; 3446 } 3447 3448 return 0; 3449 } 3450 3451 enum { 3452 READER_OK, 3453 READER_NODATA, 3454 }; 3455 3456 static int 3457 reader__read_event(struct reader *rd, struct perf_session *session, 3458 struct ui_progress *prog) 3459 { 3460 u64 size; 3461 int err = READER_OK; 3462 union perf_event *event; 3463 s64 skip; 3464 3465 event = fetch_mmaped_event(rd->head, rd->mmap_size, rd->mmap_cur, 3466 session->header.needs_swap); 3467 if (IS_ERR(event)) 3468 return PTR_ERR(event); 3469 3470 if (!event) 3471 return READER_NODATA; 3472 3473 size = event->header.size; 3474 3475 skip = -EINVAL; 3476 3477 if (size < sizeof(struct perf_event_header) || 3478 (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) { 3479 errno = -skip; 3480 pr_err("%#" PRIx64 " [%#x]: processing failed for event of type: %s (%d) [%m]\n", 3481 rd->file_offset + rd->head, event->header.size, 3482 perf_event__name(event->header.type), 3483 event->header.type); 3484 err = skip; 3485 goto out; 3486 } 3487 3488 if (skip) 3489 size += skip; 3490 3491 rd->size += size; 3492 rd->head += size; 3493 rd->file_pos += size; 3494 3495 err = __perf_session__process_decomp_events(session); 3496 if (err) 3497 goto out; 3498 3499 ui_progress__update(prog, size); 3500 3501 out: 3502 return err; 3503 } 3504 3505 static inline bool 3506 reader__eof(struct reader *rd) 3507 { 3508 return (rd->file_pos >= rd->data_size + rd->data_offset); 3509 } 3510 3511 static int 3512 reader__process_events(struct reader *rd, struct perf_session *session, 3513 struct ui_progress *prog) 3514 { 3515 int err; 3516 3517 err = reader__init(rd, &session->one_mmap); 3518 if (err) 3519 goto out; 3520 3521 session->active_decomp = &rd->decomp_data; 3522 3523 remap: 3524 err = reader__mmap(rd, session); 3525 if (err) 3526 goto out; 3527 3528 more: 3529 err = reader__read_event(rd, session, prog); 3530 if (err < 0) 3531 goto out; 3532 else if (err == READER_NODATA) 3533 
goto remap; 3534 3535 if (session_done()) 3536 goto out; 3537 3538 if (!reader__eof(rd)) 3539 goto more; 3540 3541 out: 3542 session->active_decomp = &session->decomp_data; 3543 return err; 3544 } 3545 3546 static s64 process_simple(struct perf_session *session, 3547 union perf_event *event, 3548 u64 file_offset, 3549 const char *file_path) 3550 { 3551 return perf_session__process_event(session, event, file_offset, file_path); 3552 } 3553 3554 static int __perf_session__process_events(struct perf_session *session) 3555 { 3556 struct reader rd = { 3557 .fd = perf_data__fd(session->data), 3558 .path = session->data->file.path, 3559 .data_size = session->header.data_size, 3560 .data_offset = session->header.data_offset, 3561 .process = process_simple, 3562 .in_place_update = session->data->in_place_update, 3563 }; 3564 struct ordered_events *oe = &session->ordered_events; 3565 const struct perf_tool *tool = session->tool; 3566 struct ui_progress prog; 3567 int err; 3568 3569 if (rd.data_size == 0) 3570 return -1; 3571 3572 ui_progress__init_size(&prog, rd.data_size, "Processing events..."); 3573 3574 err = reader__process_events(&rd, session, &prog); 3575 if (err) 3576 goto out_err; 3577 /* do the final flush for ordered samples */ 3578 err = ordered_events__flush(oe, OE_FLUSH__FINAL); 3579 if (err) 3580 goto out_err; 3581 err = auxtrace__flush_events(session, tool); 3582 if (err) 3583 goto out_err; 3584 err = session__flush_deferred_samples(session, tool); 3585 if (err) 3586 goto out_err; 3587 err = perf_session__flush_thread_stacks(session); 3588 out_err: 3589 ui_progress__finish(); 3590 if (!tool->no_warn) 3591 perf_session__warn_about_errors(session); 3592 /* 3593 * We may switching perf.data output, make ordered_events 3594 * reusable. 
3595 */ 3596 ordered_events__reinit(&session->ordered_events); 3597 auxtrace__free_events(session); 3598 reader__release_decomp(&rd); 3599 session->one_mmap = false; 3600 return err; 3601 } 3602 3603 /* 3604 * Processing 2 MB of data from each reader in sequence, 3605 * because that's the way the ordered events sorting works 3606 * most efficiently. 3607 */ 3608 #define READER_MAX_SIZE (2 * 1024 * 1024) 3609 3610 /* 3611 * This function reads, merge and process directory data. 3612 * It assumens the version 1 of directory data, where each 3613 * data file holds per-cpu data, already sorted by kernel. 3614 */ 3615 static int __perf_session__process_dir_events(struct perf_session *session) 3616 { 3617 struct perf_data *data = session->data; 3618 const struct perf_tool *tool = session->tool; 3619 int i, ret, readers, nr_readers; 3620 struct ui_progress prog; 3621 u64 total_size = perf_data__size(session->data); 3622 struct reader *rd; 3623 3624 ui_progress__init_size(&prog, total_size, "Processing events..."); 3625 3626 nr_readers = 1; 3627 for (i = 0; i < data->dir.nr; i++) { 3628 if (data->dir.files[i].size) 3629 nr_readers++; 3630 } 3631 3632 rd = calloc(nr_readers, sizeof(struct reader)); 3633 if (!rd) 3634 return -ENOMEM; 3635 3636 rd[0] = (struct reader) { 3637 .fd = perf_data__fd(session->data), 3638 .path = session->data->file.path, 3639 .data_size = session->header.data_size, 3640 .data_offset = session->header.data_offset, 3641 .process = process_simple, 3642 .in_place_update = session->data->in_place_update, 3643 }; 3644 ret = reader__init(&rd[0], NULL); 3645 if (ret) 3646 goto out_err; 3647 ret = reader__mmap(&rd[0], session); 3648 if (ret) 3649 goto out_err; 3650 readers = 1; 3651 3652 for (i = 0; i < data->dir.nr; i++) { 3653 if (!data->dir.files[i].size) 3654 continue; 3655 rd[readers] = (struct reader) { 3656 .fd = perf_data_file__fd(&data->dir.files[i]), 3657 .path = data->dir.files[i].path, 3658 .data_size = data->dir.files[i].size, 3659 .data_offset 
= 0, 3660 .process = process_simple, 3661 .in_place_update = session->data->in_place_update, 3662 }; 3663 ret = reader__init(&rd[readers], NULL); 3664 if (ret) 3665 goto out_err; 3666 ret = reader__mmap(&rd[readers], session); 3667 if (ret) 3668 goto out_err; 3669 readers++; 3670 } 3671 3672 i = 0; 3673 while (readers) { 3674 if (session_done()) 3675 break; 3676 3677 if (rd[i].done) { 3678 i = (i + 1) % nr_readers; 3679 continue; 3680 } 3681 if (reader__eof(&rd[i])) { 3682 rd[i].done = true; 3683 readers--; 3684 continue; 3685 } 3686 3687 session->active_decomp = &rd[i].decomp_data; 3688 ret = reader__read_event(&rd[i], session, &prog); 3689 if (ret < 0) { 3690 goto out_err; 3691 } else if (ret == READER_NODATA) { 3692 ret = reader__mmap(&rd[i], session); 3693 if (ret) 3694 goto out_err; 3695 } 3696 3697 if (rd[i].size >= READER_MAX_SIZE) { 3698 rd[i].size = 0; 3699 i = (i + 1) % nr_readers; 3700 } 3701 } 3702 3703 ret = ordered_events__flush(&session->ordered_events, OE_FLUSH__FINAL); 3704 if (ret) 3705 goto out_err; 3706 3707 ret = session__flush_deferred_samples(session, tool); 3708 if (ret) 3709 goto out_err; 3710 3711 ret = perf_session__flush_thread_stacks(session); 3712 out_err: 3713 ui_progress__finish(); 3714 3715 if (!tool->no_warn) 3716 perf_session__warn_about_errors(session); 3717 3718 /* 3719 * We may switching perf.data output, make ordered_events 3720 * reusable. 
3721 */ 3722 ordered_events__reinit(&session->ordered_events); 3723 3724 session->one_mmap = false; 3725 3726 session->active_decomp = &session->decomp_data; 3727 for (i = 0; i < nr_readers; i++) 3728 reader__release_decomp(&rd[i]); 3729 zfree(&rd); 3730 3731 return ret; 3732 } 3733 3734 int perf_session__process_events(struct perf_session *session) 3735 { 3736 if (perf_session__register_idle_thread(session) < 0) 3737 return -ENOMEM; 3738 3739 if (perf_data__is_pipe(session->data)) 3740 return __perf_session__process_pipe_events(session); 3741 3742 if (perf_data__is_dir(session->data) && session->data->dir.nr) 3743 return __perf_session__process_dir_events(session); 3744 3745 return __perf_session__process_events(session); 3746 } 3747 3748 bool perf_session__has_traces(struct perf_session *session, const char *msg) 3749 { 3750 struct evsel *evsel; 3751 3752 evlist__for_each_entry(session->evlist, evsel) { 3753 if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT) 3754 return true; 3755 } 3756 3757 pr_err("No trace sample to read. 
Did you call 'perf %s'?\n", msg); 3758 return false; 3759 } 3760 3761 bool perf_session__has_switch_events(struct perf_session *session) 3762 { 3763 struct evsel *evsel; 3764 3765 evlist__for_each_entry(session->evlist, evsel) { 3766 if (evsel->core.attr.context_switch) 3767 return true; 3768 } 3769 3770 return false; 3771 } 3772 3773 int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr) 3774 { 3775 char *bracket, *name; 3776 struct ref_reloc_sym *ref; 3777 struct kmap *kmap; 3778 3779 ref = zalloc(sizeof(struct ref_reloc_sym)); 3780 if (ref == NULL) 3781 return -ENOMEM; 3782 3783 ref->name = name = strdup(symbol_name); 3784 if (ref->name == NULL) { 3785 free(ref); 3786 return -ENOMEM; 3787 } 3788 3789 bracket = strchr(name, ']'); 3790 if (bracket) 3791 *bracket = '\0'; 3792 3793 ref->addr = addr; 3794 3795 kmap = map__kmap(map); 3796 if (kmap) 3797 kmap->ref_reloc_sym = ref; 3798 3799 return 0; 3800 } 3801 3802 size_t perf_session__fprintf_dsos(struct perf_session *session, FILE *fp) 3803 { 3804 return machines__fprintf_dsos(&session->machines, fp); 3805 } 3806 3807 size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp, 3808 bool (skip)(struct dso *dso, int parm), int parm) 3809 { 3810 return machines__fprintf_dsos_buildid(&session->machines, fp, skip, parm); 3811 } 3812 3813 size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) 3814 { 3815 size_t ret; 3816 const char *msg = ""; 3817 3818 if (perf_header__has_feat(&session->header, HEADER_AUXTRACE)) 3819 msg = " (excludes AUX area (e.g. 
instruction trace) decoded / synthesized events)"; 3820 3821 ret = fprintf(fp, "\nAggregated stats:%s\n", msg); 3822 3823 ret += events_stats__fprintf(&session->evlist->stats, fp); 3824 return ret; 3825 } 3826 3827 size_t perf_session__fprintf(struct perf_session *session, FILE *fp) 3828 { 3829 size_t ret = machine__fprintf(&session->machines.host, fp); 3830 3831 for (struct rb_node *nd = rb_first_cached(&session->machines.guests); nd; nd = rb_next(nd)) { 3832 struct machine *pos = rb_entry(nd, struct machine, rb_node); 3833 3834 ret += machine__fprintf(pos, fp); 3835 } 3836 return ret; 3837 } 3838 3839 void perf_session__dump_kmaps(struct perf_session *session) 3840 { 3841 int save_verbose = verbose; 3842 3843 fflush(stdout); 3844 fprintf(stderr, "Kernel and module maps:\n"); 3845 verbose = 0; /* Suppress verbose to print a summary only */ 3846 maps__fprintf(machine__kernel_maps(&session->machines.host), stderr); 3847 verbose = save_verbose; 3848 } 3849 3850 struct evsel *perf_session__find_first_evtype(struct perf_session *session, 3851 unsigned int type) 3852 { 3853 struct evsel *pos; 3854 3855 evlist__for_each_entry(session->evlist, pos) { 3856 if (pos->core.attr.type == type) 3857 return pos; 3858 } 3859 return NULL; 3860 } 3861 3862 int perf_session__cpu_bitmap(struct perf_session *session, 3863 const char *cpu_list, unsigned long *cpu_bitmap) 3864 { 3865 unsigned int i; 3866 int err = -1; 3867 struct perf_cpu_map *map; 3868 int nr_cpus = min(perf_session__env(session)->nr_cpus_avail, MAX_NR_CPUS); 3869 struct perf_cpu cpu; 3870 3871 for (i = 0; i < PERF_TYPE_MAX; ++i) { 3872 struct evsel *evsel; 3873 3874 evsel = perf_session__find_first_evtype(session, i); 3875 if (!evsel) 3876 continue; 3877 3878 if (!(evsel->core.attr.sample_type & PERF_SAMPLE_CPU)) { 3879 pr_err("File does not contain CPU events. 
" 3880 "Remove -C option to proceed.\n"); 3881 return -1; 3882 } 3883 } 3884 3885 map = perf_cpu_map__new(cpu_list); 3886 if (map == NULL) { 3887 pr_err("Invalid cpu_list\n"); 3888 return -1; 3889 } 3890 3891 perf_cpu_map__for_each_cpu(cpu, i, map) { 3892 if (cpu.cpu >= nr_cpus) { 3893 pr_err("Requested CPU %d too large. " 3894 "Consider raising MAX_NR_CPUS\n", cpu.cpu); 3895 goto out_delete_map; 3896 } 3897 3898 __set_bit(cpu.cpu, cpu_bitmap); 3899 } 3900 3901 err = 0; 3902 3903 out_delete_map: 3904 perf_cpu_map__put(map); 3905 return err; 3906 } 3907 3908 void perf_session__fprintf_info(struct perf_session *session, FILE *fp, 3909 bool full) 3910 { 3911 if (session == NULL || fp == NULL) 3912 return; 3913 3914 fprintf(fp, "# ========\n"); 3915 perf_header__fprintf_info(session, fp, full); 3916 fprintf(fp, "# ========\n#\n"); 3917 } 3918 3919 static int perf_session__register_guest(struct perf_session *session, pid_t machine_pid) 3920 { 3921 struct machine *machine = machines__findnew(&session->machines, machine_pid); 3922 struct thread *thread; 3923 3924 if (!machine) 3925 return -ENOMEM; 3926 3927 machine->single_address_space = session->machines.host.single_address_space; 3928 3929 thread = machine__idle_thread(machine); 3930 if (!thread) 3931 return -ENOMEM; 3932 thread__put(thread); 3933 3934 machine->kallsyms_filename = perf_data__guest_kallsyms_name(session->data, machine_pid); 3935 3936 return 0; 3937 } 3938 3939 static int perf_session__set_guest_cpu(struct perf_session *session, pid_t pid, 3940 pid_t tid, int guest_cpu) 3941 { 3942 struct machine *machine = &session->machines.host; 3943 struct thread *thread = machine__findnew_thread(machine, pid, tid); 3944 3945 if (!thread) 3946 return -ENOMEM; 3947 thread__set_guest_cpu(thread, guest_cpu); 3948 thread__put(thread); 3949 3950 return 0; 3951 } 3952 3953 int perf_event__process_id_index(const struct perf_tool *tool __maybe_unused, 3954 struct perf_session *session, 3955 union perf_event *event) 3956 { 
3957 struct evlist *evlist = session->evlist; 3958 struct perf_record_id_index *ie = &event->id_index; 3959 size_t sz = ie->header.size - sizeof(*ie); 3960 size_t i, nr, max_nr; 3961 size_t e1_sz = sizeof(struct id_index_entry); 3962 size_t e2_sz = sizeof(struct id_index_entry_2); 3963 size_t etot_sz = e1_sz + e2_sz; 3964 struct id_index_entry_2 *e2; 3965 pid_t last_pid = 0; 3966 3967 max_nr = sz / e1_sz; 3968 nr = ie->nr; 3969 if (nr > max_nr) { 3970 printf("Too big: nr %zu max_nr %zu\n", nr, max_nr); 3971 return -EINVAL; 3972 } 3973 3974 if (sz >= nr * etot_sz) { 3975 max_nr = sz / etot_sz; 3976 if (nr > max_nr) { 3977 printf("Too big2: nr %zu max_nr %zu\n", nr, max_nr); 3978 return -EINVAL; 3979 } 3980 e2 = (void *)ie + sizeof(*ie) + nr * e1_sz; 3981 } else { 3982 e2 = NULL; 3983 } 3984 3985 if (dump_trace) 3986 fprintf(stdout, " nr: %zu\n", nr); 3987 3988 for (i = 0; i < nr; i++, (e2 ? e2++ : 0)) { 3989 struct id_index_entry *e = &ie->entries[i]; 3990 struct perf_sample_id *sid; 3991 int ret; 3992 3993 if (dump_trace) { 3994 fprintf(stdout, " ... 
id: %"PRI_lu64, e->id); 3995 fprintf(stdout, " idx: %"PRI_lu64, e->idx); 3996 fprintf(stdout, " cpu: %"PRI_ld64, e->cpu); 3997 fprintf(stdout, " tid: %"PRI_ld64, e->tid); 3998 if (e2) { 3999 fprintf(stdout, " machine_pid: %"PRI_ld64, e2->machine_pid); 4000 fprintf(stdout, " vcpu: %"PRI_lu64"\n", e2->vcpu); 4001 } else { 4002 fprintf(stdout, "\n"); 4003 } 4004 } 4005 4006 sid = evlist__id2sid(evlist, e->id); 4007 if (!sid) 4008 return -ENOENT; 4009 4010 sid->idx = e->idx; 4011 sid->cpu.cpu = e->cpu; 4012 sid->tid = e->tid; 4013 4014 if (!e2) 4015 continue; 4016 4017 sid->machine_pid = e2->machine_pid; 4018 sid->vcpu.cpu = e2->vcpu; 4019 4020 if (!sid->machine_pid) 4021 continue; 4022 4023 if (sid->machine_pid != last_pid) { 4024 ret = perf_session__register_guest(session, sid->machine_pid); 4025 if (ret) 4026 return ret; 4027 last_pid = sid->machine_pid; 4028 perf_guest = true; 4029 } 4030 4031 ret = perf_session__set_guest_cpu(session, sid->machine_pid, e->tid, e2->vcpu); 4032 if (ret) 4033 return ret; 4034 } 4035 return 0; 4036 } 4037 4038 int perf_session__dsos_hit_all(struct perf_session *session) 4039 { 4040 struct rb_node *nd; 4041 int err; 4042 4043 err = machine__hit_all_dsos(&session->machines.host); 4044 if (err) 4045 return err; 4046 4047 for (nd = rb_first_cached(&session->machines.guests); nd; 4048 nd = rb_next(nd)) { 4049 struct machine *pos = rb_entry(nd, struct machine, rb_node); 4050 4051 err = machine__hit_all_dsos(pos); 4052 if (err) 4053 return err; 4054 } 4055 4056 return 0; 4057 } 4058 4059 struct perf_env *perf_session__env(struct perf_session *session) 4060 { 4061 return &session->header.env; 4062 } 4063 4064 struct perf_session__e_machine_cb_args { 4065 uint32_t e_flags; 4066 uint16_t e_machine; 4067 }; 4068 4069 static int perf_session__e_machine_cb(struct thread *thread, void *_args) 4070 { 4071 struct perf_session__e_machine_cb_args *args = _args; 4072 4073 args->e_machine = thread__e_machine(thread, /*machine=*/NULL, &args->e_flags); 
4074 return args->e_machine != EM_NONE ? 1 : 0; 4075 } 4076 4077 /* 4078 * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed 4079 * e_machines. Use thread__e_machine when this matters. 4080 */ 4081 uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags) 4082 { 4083 struct perf_session__e_machine_cb_args args = { 4084 .e_machine = EM_NONE, 4085 }; 4086 struct perf_env *env; 4087 4088 if (!session) { 4089 /* Default to assuming a host machine. */ 4090 if (e_flags) 4091 *e_flags = EF_HOST; 4092 4093 return EM_HOST; 4094 } 4095 4096 /* 4097 * Is the env caching an e_machine? If not we want to compute from the 4098 * more accurate threads. 4099 */ 4100 env = perf_session__env(session); 4101 if (env && env->e_machine != EM_NONE) 4102 return perf_env__e_machine(env, e_flags); 4103 4104 /* 4105 * Compute from threads, note this is more accurate than 4106 * perf_env__e_machine that falls back on EM_HOST and doesn't consider 4107 * mixed 32-bit and 64-bit threads. 4108 */ 4109 machines__for_each_thread(&session->machines, 4110 perf_session__e_machine_cb, 4111 &args); 4112 4113 if (args.e_machine != EM_NONE) { 4114 if (env) { 4115 env->e_machine = args.e_machine; 4116 env->e_flags = args.e_flags; 4117 } 4118 if (e_flags) 4119 *e_flags = args.e_flags; 4120 4121 return args.e_machine; 4122 } 4123 4124 /* 4125 * Couldn't determine from the perf_env or current set of 4126 * threads. Potentially use logic that uses the arch string otherwise 4127 * default to the host. Don't cache in the perf_env in case later 4128 * threads indicate a better ELF machine type. 4129 */ 4130 return perf_env__e_machine_nocache(env, e_flags); 4131 } 4132