1 // SPDX-License-Identifier: GPL-2.0 2 #include <errno.h> 3 #include <signal.h> 4 #include <inttypes.h> 5 #include <linux/err.h> 6 #include <linux/kernel.h> 7 #include <linux/zalloc.h> 8 #include <api/fs/fs.h> 9 10 #include <byteswap.h> 11 #include <unistd.h> 12 #include <sys/types.h> 13 #include <sys/mman.h> 14 #include <perf/cpumap.h> 15 #include <perf/event.h> 16 17 #include "map_symbol.h" 18 #include "branch.h" 19 #include "debug.h" 20 #include "dwarf-regs.h" 21 #include "env.h" 22 #include "evlist.h" 23 #include "evsel.h" 24 #include "memswap.h" 25 #include "map.h" 26 #include "symbol.h" 27 #include "session.h" 28 #include "tool.h" 29 #include "perf_regs.h" 30 #include "asm/bug.h" 31 #include "auxtrace.h" 32 #include "thread.h" 33 #include "thread-stack.h" 34 #include "sample-raw.h" 35 #include "stat.h" 36 #include "tsc.h" 37 #include "ui/progress.h" 38 #include "util.h" 39 #include "arch/common.h" 40 #include "units.h" 41 #include "annotate.h" 42 #include "perf.h" 43 #include <internal/lib.h> 44 45 static int perf_session__deliver_event(struct perf_session *session, 46 union perf_event *event, 47 const struct perf_tool *tool, 48 u64 file_offset, 49 const char *file_path); 50 51 static int perf_session__open(struct perf_session *session) 52 { 53 struct perf_data *data = session->data; 54 55 if (perf_session__read_header(session) < 0) { 56 pr_err("incompatible file format (rerun with -v to learn more)\n"); 57 return -1; 58 } 59 60 if (perf_header__has_feat(&session->header, HEADER_AUXTRACE)) { 61 /* Auxiliary events may reference exited threads, hold onto dead ones. */ 62 symbol_conf.keep_exited_threads = true; 63 } 64 65 if (perf_data__is_pipe(data)) 66 return 0; 67 68 if (perf_header__has_feat(&session->header, HEADER_STAT)) 69 return 0; 70 71 if (!evlist__valid_sample_type(session->evlist)) { 72 pr_err("non matching sample_type\n"); 73 return -1; 74 } 75 76 if (!evlist__valid_sample_id_all(session->evlist)) { 77 pr_err("non matching sample_id_all\n"); 78 return -1; 79 } 80 81 if (!evlist__valid_read_format(session->evlist)) { 82 pr_err("non matching read_format\n"); 83 return -1; 84 } 85 86 return 0; 87 } 88 89 void perf_session__set_id_hdr_size(struct perf_session *session) 90 { 91 u16 id_hdr_size = evlist__id_hdr_size(session->evlist); 92 93 machines__set_id_hdr_size(&session->machines, id_hdr_size); 94 } 95 96 int perf_session__create_kernel_maps(struct perf_session *session) 97 { 98 int ret = machine__create_kernel_maps(&session->machines.host); 99 100 if (ret >= 0) 101 ret = machines__create_guest_kernel_maps(&session->machines); 102 return ret; 103 } 104 105 static void perf_session__destroy_kernel_maps(struct perf_session *session) 106 { 107 machines__destroy_kernel_maps(&session->machines); 108 } 109 110 static bool perf_session__has_comm_exec(struct perf_session *session) 111 { 112 struct evsel *evsel; 113 114 evlist__for_each_entry(session->evlist, evsel) { 115 if (evsel->core.attr.comm_exec) 116 return true; 117 } 118 119 return false; 120 } 121 122 static void perf_session__set_comm_exec(struct perf_session *session) 123 { 124 bool comm_exec = perf_session__has_comm_exec(session); 125 126 machines__set_comm_exec(&session->machines, comm_exec); 127 } 128 129 static int ordered_events__deliver_event(struct ordered_events *oe, 130 struct ordered_event *event) 131 { 132 struct perf_session *session = container_of(oe, struct perf_session, 133 ordered_events); 134 int ret = perf_session__deliver_event(session, event->event, 135 session->tool, event->file_offset, 136 event->file_path); 137 138 if (ret) { 139 pr_err("%#" PRIx64 " [%#x]: ordered event processing failed (%d) for event of type: %s (%d)\n", 140 event->file_offset, event->event->header.size, ret, 141 perf_event__name(event->event->header.type), 142 event->event->header.type); 143 } 144 return ret; 145 } 146 147 struct perf_session *__perf_session__new(struct perf_data *data, 148 struct perf_tool *tool, 149 bool trace_event_repipe, 150 struct perf_env *host_env) 151 { 152 int ret = -ENOMEM; 153 struct perf_session *session = zalloc(sizeof(*session)); 154 155 if (!session) 156 goto out; 157 158 session->trace_event_repipe = trace_event_repipe; 159 session->tool = tool; 160 session->decomp_data.zstd_decomp = &session->zstd_data; 161 session->active_decomp = &session->decomp_data; 162 INIT_LIST_HEAD(&session->auxtrace_index); 163 perf_env__init(&session->header.env); 164 if (machines__init(&session->machines)) 165 goto out_delete; 166 167 ordered_events__init(&session->ordered_events, 168 ordered_events__deliver_event, NULL); 169 if (data) { 170 ret = perf_data__open(data); 171 if (ret < 0) 172 goto out_delete; 173 174 session->data = data; 175 176 if (perf_data__is_read(data)) { 177 ret = perf_session__open(session); 178 if (ret < 0) 179 goto out_delete; 180 181 /* 182 * set session attributes that are present in perf.data 183 * but not in pipe-mode. 184 */ 185 if (!data->is_pipe) { 186 perf_session__set_id_hdr_size(session); 187 perf_session__set_comm_exec(session); 188 } 189 190 evlist__init_trace_event_sample_raw(session->evlist, &session->header.env); 191 192 /* Open the directory data. */ 193 if (data->is_dir) { 194 ret = perf_data__open_dir(data); 195 if (ret) 196 goto out_delete; 197 } 198 199 if (!symbol_conf.kallsyms_name && 200 !symbol_conf.vmlinux_name) 201 symbol_conf.kallsyms_name = perf_data__kallsyms_name(data); 202 } 203 } else { 204 assert(host_env != NULL); 205 session->machines.host.env = host_env; 206 } 207 if (session->evlist) 208 session->evlist->session = session; 209 210 session->machines.host.single_address_space = 211 perf_env__single_address_space(session->machines.host.env); 212 213 if (!data || perf_data__is_write(data)) { 214 /* 215 * In O_RDONLY mode this will be performed when reading the 216 * kernel MMAP event, in perf_event__process_mmap(). 217 */ 218 if (perf_session__create_kernel_maps(session) < 0) 219 pr_warning("Cannot read kernel map\n"); 220 } 221 222 /* 223 * In pipe-mode, evlist is empty until PERF_RECORD_HEADER_ATTR is 224 * processed, so evlist__sample_id_all is not meaningful here. 225 */ 226 if ((!data || !data->is_pipe) && tool && tool->ordering_requires_timestamps && 227 tool->ordered_events && !evlist__sample_id_all(session->evlist)) { 228 dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); 229 tool->ordered_events = false; 230 } 231 232 return session; 233 234 out_delete: 235 perf_session__delete(session); 236 out: 237 return ERR_PTR(ret); 238 } 239 240 static void perf_decomp__release_events(struct decomp *next) 241 { 242 struct decomp *decomp; 243 size_t mmap_len; 244 245 do { 246 decomp = next; 247 if (decomp == NULL) 248 break; 249 next = decomp->next; 250 mmap_len = decomp->mmap_len; 251 munmap(decomp, mmap_len); 252 } while (1); 253 } 254 255 void perf_session__delete(struct perf_session *session) 256 { 257 if (session == NULL) 258 return; 259 auxtrace__free(session); 260 auxtrace_index__free(&session->auxtrace_index); 261 debuginfo_cache__delete(); 262 perf_session__destroy_kernel_maps(session); 263 perf_decomp__release_events(session->decomp_data.decomp); 264 perf_env__exit(&session->header.env); 265 machines__exit(&session->machines); 266 if (session->data) { 267 if (perf_data__is_read(session->data)) 268 evlist__delete(session->evlist); 269 perf_data__close(session->data); 270 } 271 #ifdef HAVE_LIBTRACEEVENT 272 trace_event__cleanup(&session->tevent); 273 #endif 274 free(session); 275 } 276 277 static void swap_sample_id_all(union perf_event *event, void *data) 278 { 279 void *end = (void *) event + event->header.size; 280 int size; 281 282 if (data >= end) 283 return; 284 285 size = end - data; 286 if (size % sizeof(u64)) { 287 pr_warning("swap_sample_id_all: unaligned sample_id_all remainder (%d), skipping swap\n", size); 288 return; 289 } 290 if (size > 0) 291 mem_bswap_64(data, size); 292 } 293 294 static int perf_event__all64_swap(union perf_event *event, 295 bool sample_id_all __maybe_unused) 296 { 297 struct perf_event_header *hdr = &event->header; 298 size_t size = event->header.size - sizeof(*hdr); 299 300 /* mem_bswap_64 rounds up to 8-byte chunks — unaligned size overruns the buffer */ 301 if (size % sizeof(u64)) 302 return -1; 303 mem_bswap_64(hdr + 1, size); 304 return 0; 305 } 306 307 static int perf_event__comm_swap(union perf_event *event, bool sample_id_all) 308 { 309 event->comm.pid = bswap_32(event->comm.pid); 310 event->comm.tid = bswap_32(event->comm.tid); 311 312 if (sample_id_all) { 313 void *data = &event->comm.comm; 314 void *end = (void *)event + event->header.size; 315 size_t len = strnlen(data, end - data); 316 317 /* 318 * No NUL within the event boundary — can't locate where 319 * sample_id_all starts. Reject so the event is skipped 320 * rather than swapping garbage. 321 */ 322 if (len == (size_t)(end - data)) 323 return -1; 324 data += PERF_ALIGN(len + 1, sizeof(u64)); 325 swap_sample_id_all(event, data); 326 } 327 return 0; 328 } 329 330 static int perf_event__mmap_swap(union perf_event *event, 331 bool sample_id_all) 332 { 333 event->mmap.pid = bswap_32(event->mmap.pid); 334 event->mmap.tid = bswap_32(event->mmap.tid); 335 event->mmap.start = bswap_64(event->mmap.start); 336 event->mmap.len = bswap_64(event->mmap.len); 337 event->mmap.pgoff = bswap_64(event->mmap.pgoff); 338 339 if (sample_id_all) { 340 void *data = &event->mmap.filename; 341 void *end = (void *)event + event->header.size; 342 size_t len = strnlen(data, end - data); 343 344 /* See comment in perf_event__comm_swap() */ 345 if (len == (size_t)(end - data)) 346 return -1; 347 data += PERF_ALIGN(len + 1, sizeof(u64)); 348 swap_sample_id_all(event, data); 349 } 350 return 0; 351 } 352 353 static int perf_event__mmap2_swap(union perf_event *event, 354 bool sample_id_all) 355 { 356 event->mmap2.pid = bswap_32(event->mmap2.pid); 357 event->mmap2.tid = bswap_32(event->mmap2.tid); 358 event->mmap2.start = bswap_64(event->mmap2.start); 359 event->mmap2.len = bswap_64(event->mmap2.len); 360 event->mmap2.pgoff = bswap_64(event->mmap2.pgoff); 361 362 if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { 363 event->mmap2.maj = bswap_32(event->mmap2.maj); 364 event->mmap2.min = bswap_32(event->mmap2.min); 365 event->mmap2.ino = bswap_64(event->mmap2.ino); 366 event->mmap2.ino_generation = bswap_64(event->mmap2.ino_generation); 367 } 368 369 if (sample_id_all) { 370 void *data = &event->mmap2.filename; 371 void *end = (void *)event + event->header.size; 372 size_t len = strnlen(data, end - data); 373 374 /* See comment in perf_event__comm_swap() */ 375 if (len == (size_t)(end - data)) 376 return -1; 377 data += PERF_ALIGN(len + 1, sizeof(u64)); 378 swap_sample_id_all(event, data); 379 } 380 return 0; 381 } 382 383 static int perf_event__task_swap(union perf_event *event, bool sample_id_all) 384 { 385 event->fork.pid = bswap_32(event->fork.pid); 386 event->fork.tid = bswap_32(event->fork.tid); 387 event->fork.ppid = bswap_32(event->fork.ppid); 388 event->fork.ptid = bswap_32(event->fork.ptid); 389 event->fork.time = bswap_64(event->fork.time); 390 391 if (sample_id_all) 392 swap_sample_id_all(event, &event->fork + 1); 393 return 0; 394 } 395 396 static int perf_event__read_swap(union perf_event *event, 397 bool sample_id_all __maybe_unused) 398 { 399 size_t tail; 400 401 event->read.pid = bswap_32(event->read.pid); 402 event->read.tid = bswap_32(event->read.tid); 403 /* 404 * Everything after pid/tid is u64: the read values (variable 405 * set determined by attr.read_format, which we don't have 406 * here) optionally followed by sample_id_all fields. 407 * Since all are u64, swap the entire remaining tail at once. 408 */ 409 tail = event->header.size - offsetof(struct perf_record_read, value); 410 /* mem_bswap_64 rounds up to 8-byte chunks — unaligned tail overruns the buffer */ 411 if (tail % sizeof(u64)) 412 return -1; 413 mem_bswap_64(&event->read.value, tail); 414 return 0; 415 } 416 417 static int perf_event__aux_swap(union perf_event *event, bool sample_id_all) 418 { 419 event->aux.aux_offset = bswap_64(event->aux.aux_offset); 420 event->aux.aux_size = bswap_64(event->aux.aux_size); 421 event->aux.flags = bswap_64(event->aux.flags); 422 423 if (sample_id_all) 424 swap_sample_id_all(event, &event->aux + 1); 425 return 0; 426 } 427 428 static int perf_event__itrace_start_swap(union perf_event *event, 429 bool sample_id_all) 430 { 431 event->itrace_start.pid = bswap_32(event->itrace_start.pid); 432 event->itrace_start.tid = bswap_32(event->itrace_start.tid); 433 434 if (sample_id_all) 435 swap_sample_id_all(event, &event->itrace_start + 1); 436 return 0; 437 } 438 439 static int perf_event__switch_swap(union perf_event *event, bool sample_id_all) 440 { 441 if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) { 442 event->context_switch.next_prev_pid = 443 bswap_32(event->context_switch.next_prev_pid); 444 event->context_switch.next_prev_tid = 445 bswap_32(event->context_switch.next_prev_tid); 446 } 447 448 if (sample_id_all) { 449 /* 450 * PERF_RECORD_SWITCH has no fields beyond the header; 451 * SWITCH_CPU_WIDE adds pid/tid. Use the right offset 452 * so sample_id starts at the correct position. 453 */ 454 if (event->header.type == PERF_RECORD_SWITCH) 455 swap_sample_id_all(event, (void *)event + sizeof(event->header)); 456 else 457 swap_sample_id_all(event, &event->context_switch + 1); 458 } 459 return 0; 460 } 461 462 static int perf_event__text_poke_swap(union perf_event *event, bool sample_id_all) 463 { 464 event->text_poke.addr = bswap_64(event->text_poke.addr); 465 event->text_poke.old_len = bswap_16(event->text_poke.old_len); 466 event->text_poke.new_len = bswap_16(event->text_poke.new_len); 467 468 if (sample_id_all) { 469 void *data = &event->text_poke.old_len; 470 void *end = (void *)event + event->header.size; 471 size_t len = sizeof(event->text_poke.old_len) + 472 sizeof(event->text_poke.new_len) + 473 event->text_poke.old_len + 474 event->text_poke.new_len; 475 476 /* old_len + new_len exceeds event — can't find sample_id_all */ 477 if (data + len > end) 478 return -1; 479 data += PERF_ALIGN(len, sizeof(u64)); 480 swap_sample_id_all(event, data); 481 } 482 return 0; 483 } 484 485 static int perf_event__throttle_swap(union perf_event *event, 486 bool sample_id_all) 487 { 488 event->throttle.time = bswap_64(event->throttle.time); 489 event->throttle.id = bswap_64(event->throttle.id); 490 event->throttle.stream_id = bswap_64(event->throttle.stream_id); 491 492 if (sample_id_all) 493 swap_sample_id_all(event, &event->throttle + 1); 494 return 0; 495 } 496 497 static int perf_event__namespaces_swap(union perf_event *event, 498 bool sample_id_all) 499 { 500 u64 i, nr, max_nr; 501 502 event->namespaces.pid = bswap_32(event->namespaces.pid); 503 event->namespaces.tid = bswap_32(event->namespaces.tid); 504 event->namespaces.nr_namespaces = bswap_64(event->namespaces.nr_namespaces); 505 506 nr = event->namespaces.nr_namespaces; 507 /* 508 * Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof. 509 * When sample_id_all is present max_nr slightly overestimates the 510 * array space because header.size includes the trailing sample_id. 511 * Harmless: both the per-element bswap_64 loop and swap_sample_id_all() 512 * perform the same u64 byte swap, so the result is correct regardless 513 * of where the boundary between array and sample_id falls. 514 */ 515 max_nr = (event->header.size - sizeof(event->namespaces)) / 516 sizeof(event->namespaces.link_info[0]); 517 /* 518 * Safe to clamp: each namespace entry is indexed by type; 519 * missing entries just won't be resolved. 520 */ 521 if (nr > max_nr) { 522 pr_warning("WARNING: PERF_RECORD_NAMESPACES: nr_namespaces %" PRIu64 " exceeds payload (max %" PRIu64 "), clamping\n", 523 nr, max_nr); 524 nr = max_nr; 525 event->namespaces.nr_namespaces = nr; 526 } 527 528 for (i = 0; i < nr; i++) { 529 struct perf_ns_link_info *ns = &event->namespaces.link_info[i]; 530 531 ns->dev = bswap_64(ns->dev); 532 ns->ino = bswap_64(ns->ino); 533 } 534 535 if (sample_id_all) 536 swap_sample_id_all(event, &event->namespaces.link_info[i]); 537 return 0; 538 } 539 540 static int perf_event__cgroup_swap(union perf_event *event, bool sample_id_all) 541 { 542 event->cgroup.id = bswap_64(event->cgroup.id); 543 544 if (sample_id_all) { 545 void *data = &event->cgroup.path; 546 void *end = (void *)event + event->header.size; 547 size_t len = strnlen(data, end - data); 548 549 /* See comment in perf_event__comm_swap() */ 550 if (len == (size_t)(end - data)) 551 return -1; 552 data += PERF_ALIGN(len + 1, sizeof(u64)); 553 swap_sample_id_all(event, data); 554 } 555 return 0; 556 } 557 558 static u8 revbyte(u8 b) 559 { 560 int rev = (b >> 4) | ((b & 0xf) << 4); 561 rev = ((rev & 0xcc) >> 2) | ((rev & 0x33) << 2); 562 rev = ((rev & 0xaa) >> 1) | ((rev & 0x55) << 1); 563 return (u8) rev; 564 } 565 566 /* 567 * XXX this is hack in attempt to carry flags bitfield 568 * through endian village. ABI says: 569 * 570 * Bit-fields are allocated from right to left (least to most significant) 571 * on little-endian implementations and from left to right (most to least 572 * significant) on big-endian implementations. 573 * 574 * The above seems to be byte specific, so we need to reverse each 575 * byte of the bitfield. 'Internet' also says this might be implementation 576 * specific and we probably need proper fix and carry perf_event_attr 577 * bitfield flags in separate data file FEAT_ section. Thought this seems 578 * to work for now. 579 */ 580 static void swap_bitfield(u8 *p, unsigned len) 581 { 582 unsigned i; 583 584 for (i = 0; i < len; i++) { 585 *p = revbyte(*p); 586 p++; 587 } 588 } 589 590 /* exported for swapping attributes in file header */ 591 void perf_event__attr_swap(struct perf_event_attr *attr) 592 { 593 attr->type = bswap_32(attr->type); 594 attr->size = bswap_32(attr->size); 595 596 /* 597 * ABI0: size == 0 means the producer didn't set it. 598 * Assume PERF_ATTR_SIZE_VER0 so bswap_safe() below 599 * correctly swaps the VER0 fields instead of skipping 600 * everything. Same convention as read_attr(). 601 */ 602 if (!attr->size) 603 attr->size = PERF_ATTR_SIZE_VER0; 604 605 /* Verify the full field extent fits, not just its start offset */ 606 #define bswap_safe(f, n) \ 607 (attr->size >= (offsetof(struct perf_event_attr, f) + \ 608 sizeof(attr->f) * ((n) + 1))) 609 #define bswap_field(f, sz) \ 610 do { \ 611 if (bswap_safe(f, 0)) \ 612 attr->f = bswap_##sz(attr->f); \ 613 } while(0) 614 #define bswap_field_16(f) bswap_field(f, 16) 615 #define bswap_field_32(f) bswap_field(f, 32) 616 #define bswap_field_64(f) bswap_field(f, 64) 617 618 bswap_field_64(config); 619 bswap_field_64(sample_period); 620 bswap_field_64(sample_type); 621 bswap_field_64(read_format); 622 bswap_field_32(wakeup_events); 623 bswap_field_32(bp_type); 624 bswap_field_64(bp_addr); 625 bswap_field_64(bp_len); 626 bswap_field_64(branch_sample_type); 627 bswap_field_64(sample_regs_user); 628 bswap_field_32(sample_stack_user); 629 bswap_field_32(aux_watermark); 630 bswap_field_16(sample_max_stack); 631 bswap_field_32(aux_sample_size); 632 633 /* 634 * After read_format are bitfields. Check read_format because 635 * we are unable to use offsetof on bitfield. 636 */ 637 if (bswap_safe(read_format, 1)) 638 swap_bitfield((u8 *) (&attr->read_format + 1), 639 sizeof(u64)); 640 #undef bswap_field_64 641 #undef bswap_field_32 642 #undef bswap_field 643 #undef bswap_safe 644 } 645 646 static int perf_event__hdr_attr_swap(union perf_event *event, 647 bool sample_id_all __maybe_unused) 648 { 649 u32 attr_size, payload_size; 650 size_t size; 651 652 /* 653 * Validate attr.size (still foreign-endian) before calling 654 * perf_event__attr_swap(), which uses it via bswap_safe() 655 * to decide which fields to swap. A crafted attr.size 656 * larger than the event payload would swap past the event 657 * boundary and corrupt adjacent memory. 658 * 659 * header.size alignment is already validated by 660 * perf_session__process_event(). The min_size table 661 * guarantees header.size >= sizeof(header) + 662 * PERF_ATTR_SIZE_VER0, so attr.size is safe to access. 663 */ 664 attr_size = bswap_32(event->attr.attr.size); 665 /* 666 * ABI0: size field not set. This only happens in pipe/inject 667 * mode where HEADER_ATTR events carry their own attr. For 668 * regular perf.data files, read_attr() uses f_header.attr_size 669 * from the file header instead. Assume PERF_ATTR_SIZE_VER0. 670 */ 671 if (!attr_size) 672 attr_size = PERF_ATTR_SIZE_VER0; 673 payload_size = event->header.size - sizeof(event->header); 674 675 if (attr_size < PERF_ATTR_SIZE_VER0 || attr_size % sizeof(u64) || 676 attr_size > payload_size) { 677 pr_err("PERF_RECORD_HEADER_ATTR: invalid attr.size %u (min: %d, max: %u, 8-byte aligned)\n", 678 attr_size, PERF_ATTR_SIZE_VER0, payload_size); 679 return -1; 680 } 681 682 perf_event__attr_swap(&event->attr.attr); 683 684 size = event->header.size; 685 size -= perf_record_header_attr_id(event) - (void *)event; 686 mem_bswap_64(perf_record_header_attr_id(event), size); 687 return 0; 688 } 689 690 static int perf_event__build_id_swap(union perf_event *event, 691 bool sample_id_all) 692 { 693 event->build_id.pid = bswap_32(event->build_id.pid); 694 695 if (sample_id_all) { 696 void *data = &event->build_id.filename; 697 void *end = (void *)event + event->header.size; 698 size_t len = strnlen(data, end - data); 699 700 /* See comment in perf_event__comm_swap() */ 701 if (len == (size_t)(end - data)) 702 return -1; 703 data += PERF_ALIGN(len + 1, sizeof(u64)); 704 swap_sample_id_all(event, data); 705 } 706 return 0; 707 } 708 709 static int perf_event__event_update_swap(union perf_event *event, 710 bool sample_id_all __maybe_unused) 711 { 712 struct perf_record_event_update *ev = &event->event_update; 713 714 ev->type = bswap_64(ev->type); 715 ev->id = bswap_64(ev->id); 716 717 /* 718 * Swap variant-specific fields so the processing path 719 * sees native byte order. 720 */ 721 if (ev->type == PERF_EVENT_UPDATE__SCALE) { 722 if (event->header.size < offsetof(struct perf_record_event_update, scale) + 723 sizeof(ev->scale)) 724 return -1; 725 mem_bswap_64(&ev->scale.scale, sizeof(ev->scale.scale)); 726 } else if (ev->type == PERF_EVENT_UPDATE__CPUS) { 727 u32 cpus_payload; 728 struct perf_record_cpu_map_data *data = &ev->cpus.cpus; 729 730 /* CPUS fields start at the same offset as scale (union) */ 731 if (event->header.size < offsetof(struct perf_record_event_update, cpus) + 732 sizeof(__u16) + sizeof(struct perf_record_range_cpu_map)) 733 return -1; 734 cpus_payload = event->header.size - offsetof(struct perf_record_event_update, cpus); 735 data->type = bswap_16(data->type); 736 /* 737 * Full swap including array elements — same logic as 738 * perf_event__cpu_map_swap() but scoped to the 739 * embedded cpu_map_data within EVENT_UPDATE. 740 */ 741 switch (data->type) { 742 case PERF_CPU_MAP__CPUS: { 743 u16 nr, max_nr; 744 745 data->cpus_data.nr = bswap_16(data->cpus_data.nr); 746 nr = data->cpus_data.nr; 747 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 748 cpus_data.cpu)) / 749 sizeof(data->cpus_data.cpu[0]); 750 if (nr > max_nr) { 751 nr = max_nr; 752 data->cpus_data.nr = nr; 753 } 754 for (unsigned int i = 0; i < nr; i++) 755 data->cpus_data.cpu[i] = bswap_16(data->cpus_data.cpu[i]); 756 break; 757 } 758 case PERF_CPU_MAP__MASK: 759 data->mask32_data.long_size = bswap_16(data->mask32_data.long_size); 760 switch (data->mask32_data.long_size) { 761 case 4: { 762 u16 nr, max_nr; 763 764 data->mask32_data.nr = bswap_16(data->mask32_data.nr); 765 nr = data->mask32_data.nr; 766 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 767 mask32_data.mask)) / 768 sizeof(data->mask32_data.mask[0]); 769 if (nr > max_nr) { 770 nr = max_nr; 771 data->mask32_data.nr = nr; 772 } 773 for (unsigned int i = 0; i < nr; i++) 774 data->mask32_data.mask[i] = bswap_32(data->mask32_data.mask[i]); 775 break; 776 } 777 case 8: { 778 u16 nr, max_nr; 779 780 data->mask64_data.nr = bswap_16(data->mask64_data.nr); 781 nr = data->mask64_data.nr; 782 if (cpus_payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 783 data->mask64_data.nr = 0; 784 break; 785 } 786 max_nr = (cpus_payload - offsetof(struct perf_record_cpu_map_data, 787 mask64_data.mask)) / 788 sizeof(data->mask64_data.mask[0]); 789 if (nr > max_nr) { 790 nr = max_nr; 791 data->mask64_data.nr = nr; 792 } 793 for (unsigned int i = 0; i < nr; i++) 794 data->mask64_data.mask[i] = bswap_64(data->mask64_data.mask[i]); 795 break; 796 } 797 default: 798 break; 799 } 800 break; 801 case PERF_CPU_MAP__RANGE_CPUS: 802 data->range_cpu_data.start_cpu = bswap_16(data->range_cpu_data.start_cpu); 803 data->range_cpu_data.end_cpu = bswap_16(data->range_cpu_data.end_cpu); 804 break; 805 default: 806 break; 807 } 808 } 809 return 0; 810 } 811 812 static int perf_event__event_type_swap(union perf_event *event, 813 bool sample_id_all __maybe_unused) 814 { 815 event->event_type.event_type.event_id = 816 bswap_64(event->event_type.event_type.event_id); 817 return 0; 818 } 819 820 static int perf_event__tracing_data_swap(union perf_event *event, 821 bool sample_id_all __maybe_unused) 822 { 823 event->tracing_data.size = bswap_32(event->tracing_data.size); 824 return 0; 825 } 826 827 static int perf_event__auxtrace_info_swap(union perf_event *event, 828 bool sample_id_all __maybe_unused) 829 { 830 size_t size; 831 832 event->auxtrace_info.type = bswap_32(event->auxtrace_info.type); 833 834 size = event->header.size; 835 size -= (void *)&event->auxtrace_info.priv - (void *)event; 836 mem_bswap_64(event->auxtrace_info.priv, size); 837 return 0; 838 } 839 840 static int perf_event__auxtrace_swap(union perf_event *event, 841 bool sample_id_all __maybe_unused) 842 { 843 event->auxtrace.size = bswap_64(event->auxtrace.size); 844 event->auxtrace.offset = bswap_64(event->auxtrace.offset); 845 event->auxtrace.reference = bswap_64(event->auxtrace.reference); 846 event->auxtrace.idx = bswap_32(event->auxtrace.idx); 847 event->auxtrace.tid = bswap_32(event->auxtrace.tid); 848 event->auxtrace.cpu = bswap_32(event->auxtrace.cpu); 849 return 0; 850 } 851 852 static int perf_event__auxtrace_error_swap(union perf_event *event, 853 bool sample_id_all __maybe_unused) 854 { 855 event->auxtrace_error.type = bswap_32(event->auxtrace_error.type); 856 event->auxtrace_error.code = bswap_32(event->auxtrace_error.code); 857 event->auxtrace_error.cpu = bswap_32(event->auxtrace_error.cpu); 858 event->auxtrace_error.pid = bswap_32(event->auxtrace_error.pid); 859 event->auxtrace_error.tid = bswap_32(event->auxtrace_error.tid); 860 event->auxtrace_error.fmt = bswap_32(event->auxtrace_error.fmt); 861 event->auxtrace_error.ip = bswap_64(event->auxtrace_error.ip); 862 if (event->auxtrace_error.fmt) 863 event->auxtrace_error.time = bswap_64(event->auxtrace_error.time); 864 if (event->auxtrace_error.fmt >= 2) { 865 /* 866 * fmt >= 2 adds machine_pid and vcpu after msg[64]. 867 * Older files may have fmt >= 2 but an event size 868 * that doesn't include these fields — downgrade to 869 * avoid swapping out of bounds. 870 */ 871 if (event->header.size < offsetof(typeof(event->auxtrace_error), vcpu) + 872 sizeof(event->auxtrace_error.vcpu)) { 873 pr_warning("WARNING: PERF_RECORD_AUXTRACE_ERROR: fmt %u but event too small for machine_pid/vcpu (%u bytes), downgrading fmt\n", 874 event->auxtrace_error.fmt, 875 event->header.size); 876 event->auxtrace_error.fmt = 1; 877 } else { 878 event->auxtrace_error.machine_pid = bswap_32(event->auxtrace_error.machine_pid); 879 event->auxtrace_error.vcpu = bswap_32(event->auxtrace_error.vcpu); 880 } 881 } 882 return 0; 883 } 884 885 static int perf_event__thread_map_swap(union perf_event *event, 886 bool sample_id_all __maybe_unused) 887 { 888 unsigned int i; 889 u64 nr; 890 891 event->thread_map.nr = bswap_64(event->thread_map.nr); 892 893 /* 894 * Reject rather than clamp: unlike namespaces (indexed by type) 895 * or stat_config (self-describing tags), a truncated thread map 896 * is structurally broken — downstream would get a wrong map. 897 */ 898 /* Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof */ 899 nr = event->thread_map.nr; 900 if (nr > (event->header.size - sizeof(event->thread_map)) / 901 sizeof(event->thread_map.entries[0])) 902 return -1; 903 904 for (i = 0; i < nr; i++) 905 event->thread_map.entries[i].pid = bswap_64(event->thread_map.entries[i].pid); 906 return 0; 907 } 908 909 static int perf_event__cpu_map_swap(union perf_event *event, 910 bool sample_id_all __maybe_unused) 911 { 912 struct perf_record_cpu_map_data *data = &event->cpu_map.data; 913 u32 payload = event->header.size - sizeof(event->header); 914 915 data->type = bswap_16(data->type); 916 917 /* 918 * Safe to clamp: a shorter CPU map just means some CPUs 919 * are absent; tools process the CPUs that are present. 920 */ 921 switch (data->type) { 922 case PERF_CPU_MAP__CPUS: { 923 u16 nr, max_nr; 924 925 data->cpus_data.nr = bswap_16(data->cpus_data.nr); 926 nr = data->cpus_data.nr; 927 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 928 cpus_data.cpu)) / 929 sizeof(data->cpus_data.cpu[0]); 930 if (nr > max_nr) { 931 pr_warning("WARNING: PERF_RECORD_CPU_MAP: nr %u exceeds payload (max %u), clamping\n", 932 nr, max_nr); 933 nr = max_nr; 934 data->cpus_data.nr = nr; 935 } 936 for (unsigned int i = 0; i < nr; i++) 937 data->cpus_data.cpu[i] = bswap_16(data->cpus_data.cpu[i]); 938 break; 939 } 940 case PERF_CPU_MAP__MASK: 941 data->mask32_data.long_size = bswap_16(data->mask32_data.long_size); 942 943 switch (data->mask32_data.long_size) { 944 case 4: { 945 u16 nr, max_nr; 946 947 data->mask32_data.nr = bswap_16(data->mask32_data.nr); 948 nr = data->mask32_data.nr; 949 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 950 mask32_data.mask)) / 951 sizeof(data->mask32_data.mask[0]); 952 if (nr > max_nr) { 953 pr_warning("WARNING: PERF_RECORD_CPU_MAP mask32: nr %u exceeds payload (max %u), clamping\n", 954 nr, max_nr); 955 nr = max_nr; 956 data->mask32_data.nr = nr; 957 } 958 for (unsigned int i = 0; i < nr; i++) 959 data->mask32_data.mask[i] = bswap_32(data->mask32_data.mask[i]); 960 break; 961 } 962 case 8: { 963 u16 nr, max_nr; 964 965 data->mask64_data.nr = bswap_16(data->mask64_data.nr); 966 nr = data->mask64_data.nr; 967 if (payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 968 data->mask64_data.nr = 0; 969 break; 970 } 971 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 972 mask64_data.mask)) / 973 sizeof(data->mask64_data.mask[0]); 974 if (nr > max_nr) { 975 pr_warning("WARNING: PERF_RECORD_CPU_MAP mask64: nr %u exceeds payload (max %u), clamping\n", 976 nr, max_nr); 977 nr = max_nr; 978 data->mask64_data.nr = nr; 979 } 980 for (unsigned int i = 0; i < nr; i++) 981 data->mask64_data.mask[i] = bswap_64(data->mask64_data.mask[i]); 982 break; 983 } 984 default: 985 pr_err("cpu_map swap: unsupported long size %u\n", 986 data->mask32_data.long_size); 987 } 988 break; 989 case PERF_CPU_MAP__RANGE_CPUS: 990 data->range_cpu_data.start_cpu = bswap_16(data->range_cpu_data.start_cpu); 991 data->range_cpu_data.end_cpu = bswap_16(data->range_cpu_data.end_cpu); 992 break; 993 default: 994 break; 995 } 996 return 0; 997 } 998 999 static int perf_event__stat_config_swap(union perf_event *event, 1000 bool sample_id_all __maybe_unused) 1001 { 1002 u64 nr, max_nr, size; 1003 1004 nr = bswap_64(event->stat_config.nr); 1005 /* Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof */ 1006 max_nr = (event->header.size - sizeof(event->stat_config)) / 1007 sizeof(event->stat_config.data[0]); 1008 /* 1009 * Safe to clamp: each config entry is self-describing 1010 * via its tag; missing entries keep their defaults. 1011 */ 1012 if (nr > max_nr) { 1013 pr_warning("WARNING: PERF_RECORD_STAT_CONFIG: nr %" PRIu64 " exceeds payload (max %" PRIu64 "), clamping\n", 1014 nr, max_nr); 1015 nr = max_nr; 1016 } 1017 size = nr * sizeof(event->stat_config.data[0]); 1018 /* The swap starts at &nr, so add its size to cover the full range */ 1019 size += sizeof(event->stat_config.nr); 1020 mem_bswap_64(&event->stat_config.nr, size); 1021 /* Persist the clamped value in native byte order */ 1022 event->stat_config.nr = nr; 1023 return 0; 1024 } 1025 1026 static int perf_event__stat_swap(union perf_event *event, 1027 bool sample_id_all __maybe_unused) 1028 { 1029 event->stat.id = bswap_64(event->stat.id); 1030 event->stat.thread = bswap_32(event->stat.thread); 1031 event->stat.cpu = bswap_32(event->stat.cpu); 1032 event->stat.val = bswap_64(event->stat.val); 1033 event->stat.ena = bswap_64(event->stat.ena); 1034 event->stat.run = bswap_64(event->stat.run); 1035 return 0; 1036 } 1037 1038 static int perf_event__stat_round_swap(union perf_event *event, 1039 bool sample_id_all __maybe_unused) 1040 { 1041 event->stat_round.type = bswap_64(event->stat_round.type); 1042 event->stat_round.time = bswap_64(event->stat_round.time); 1043 return 0; 1044 } 1045 1046 static int perf_event__time_conv_swap(union perf_event *event, 1047 bool sample_id_all __maybe_unused) 1048 { 1049 event->time_conv.time_shift = bswap_64(event->time_conv.time_shift); 1050 event->time_conv.time_mult = bswap_64(event->time_conv.time_mult); 1051 event->time_conv.time_zero = bswap_64(event->time_conv.time_zero); 1052 1053 if (event_contains(event->time_conv, time_cycles)) 1054 event->time_conv.time_cycles = bswap_64(event->time_conv.time_cycles); 1055 if (event_contains(event->time_conv, time_mask)) 1056 event->time_conv.time_mask = bswap_64(event->time_conv.time_mask); 1057 return 0; 1058 } 1059 1060 static int perf_event__compressed2_swap(union perf_event *event, 1061 bool sample_id_all __maybe_unused) 1062 { 1063 /* Only data_size needs swapping — compressed payload is a raw byte stream */ 1064 event->pack2.data_size = bswap_64(event->pack2.data_size); 1065 return 0; 1066 } 1067 1068 static int perf_event__bpf_metadata_swap(union perf_event *event, 1069 bool sample_id_all __maybe_unused) 1070 { 1071 u64 i, nr, max_nr; 1072 1073 /* Fixed header must fit before accessing nr_entries or prog_name */ 1074 if (event->header.size < sizeof(event->bpf_metadata)) 1075 return -1; 1076 1077 event->bpf_metadata.nr_entries = bswap_64(event->bpf_metadata.nr_entries); 1078 1079 /* 1080 * Ensure NUL-termination on the cross-endian path where the 1081 * mapping is writable (MAP_PRIVATE + PROT_WRITE). Fixing 1082 * the string in place is preferred over rejecting because it 1083 * preserves the event for downstream processing — only the 1084 * last byte is lost. 1085 * 1086 * The native-endian path (MAP_SHARED + PROT_READ) cannot 1087 * write, so it validates and skips unterminated events in 1088 * perf_session__process_user_event() instead. The two 1089 * strategies produce different outcomes for the same 1090 * malformed input (fix vs skip), which is inherent in the 1091 * writable-vs-read-only mapping model. 1092 */ 1093 event->bpf_metadata.prog_name[BPF_PROG_NAME_LEN - 1] = '\0'; 1094 1095 nr = event->bpf_metadata.nr_entries; 1096 max_nr = (event->header.size - sizeof(event->bpf_metadata)) / 1097 sizeof(event->bpf_metadata.entries[0]); 1098 if (nr > max_nr) { 1099 /* Persist clamped value so the native path processes entries, not skips */ 1100 nr = max_nr; 1101 event->bpf_metadata.nr_entries = nr; 1102 } 1103 1104 for (i = 0; i < nr; i++) { 1105 event->bpf_metadata.entries[i].key[BPF_METADATA_KEY_LEN - 1] = '\0'; 1106 event->bpf_metadata.entries[i].value[BPF_METADATA_VALUE_LEN - 1] = '\0'; 1107 } 1108 return 0; 1109 } 1110 static int 1111 perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused, 1112 bool sample_id_all __maybe_unused) 1113 { 1114 /* FIXME */ 1115 return 0; 1116 } 1117 1118 static int 1119 perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused, 1120 bool sample_id_all __maybe_unused) 1121 { 1122 /* FIXME */ 1123 return 0; 1124 } 1125 1126 static int perf_event__ksymbol_swap(union perf_event *event, 1127 bool sample_id_all) 1128 { 1129 event->ksymbol.addr = bswap_64(event->ksymbol.addr); 1130 event->ksymbol.len = bswap_32(event->ksymbol.len); 1131 event->ksymbol.ksym_type = bswap_16(event->ksymbol.ksym_type); 1132 event->ksymbol.flags = bswap_16(event->ksymbol.flags); 1133 1134 if (sample_id_all) { 1135 void *data = &event->ksymbol.name; 1136 void *end = (void *)event + event->header.size; 1137 size_t len = strnlen(data, end - data); 1138 1139 /* See comment in perf_event__comm_swap() */ 1140 if (len == (size_t)(end - data)) 1141 return -1; 1142 data += PERF_ALIGN(len + 1, sizeof(u64)); 1143 swap_sample_id_all(event, data); 1144 } 1145 return 0; 1146 } 1147 1148 static int perf_event__bpf_event_swap(union perf_event *event, 1149 bool sample_id_all) 1150 { 1151 event->bpf.type = bswap_16(event->bpf.type); 1152 event->bpf.flags = bswap_16(event->bpf.flags); 1153 event->bpf.id = bswap_32(event->bpf.id); 1154 1155 if (sample_id_all) 1156 swap_sample_id_all(event, &event->bpf + 1); 1157 return 0; 1158 } 1159 1160 static int perf_event__header_feature_swap(union perf_event *event, 1161 bool sample_id_all __maybe_unused) 1162 { 1163 event->feat.feat_id = bswap_64(event->feat.feat_id); 1164 return 0; 1165 } 1166 1167 typedef int (*perf_event__swap_op)(union perf_event *event, 1168 bool sample_id_all); 1169 1170 static perf_event__swap_op perf_event__swap_ops[] = { 1171 [PERF_RECORD_MMAP] = perf_event__mmap_swap, 1172 [PERF_RECORD_MMAP2] = perf_event__mmap2_swap, 1173 [PERF_RECORD_COMM] = perf_event__comm_swap, 1174 [PERF_RECORD_FORK] = perf_event__task_swap, 1175 [PERF_RECORD_EXIT] = perf_event__task_swap, 1176 [PERF_RECORD_LOST] = perf_event__all64_swap, 1177 [PERF_RECORD_READ] = perf_event__read_swap, 1178 [PERF_RECORD_THROTTLE] = perf_event__throttle_swap, 1179 [PERF_RECORD_UNTHROTTLE] = perf_event__throttle_swap, 1180 [PERF_RECORD_SAMPLE] = perf_event__all64_swap, 1181 [PERF_RECORD_AUX] = perf_event__aux_swap, 1182 [PERF_RECORD_ITRACE_START] = perf_event__itrace_start_swap, 1183 [PERF_RECORD_LOST_SAMPLES] = perf_event__all64_swap, 1184 [PERF_RECORD_SWITCH] = perf_event__switch_swap, 1185 [PERF_RECORD_SWITCH_CPU_WIDE] = perf_event__switch_swap, 1186 [PERF_RECORD_NAMESPACES] = perf_event__namespaces_swap, 1187 [PERF_RECORD_CGROUP] = perf_event__cgroup_swap, 1188 [PERF_RECORD_KSYMBOL] = perf_event__ksymbol_swap, 1189 [PERF_RECORD_BPF_EVENT] = perf_event__bpf_event_swap, 1190 [PERF_RECORD_TEXT_POKE] = perf_event__text_poke_swap, 1191 [PERF_RECORD_AUX_OUTPUT_HW_ID] = perf_event__all64_swap, 1192 [PERF_RECORD_CALLCHAIN_DEFERRED] = perf_event__all64_swap, 1193 [PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap, 1194 [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap, 1195 [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap, 1196 [PERF_RECORD_HEADER_BUILD_ID] = perf_event__build_id_swap, 1197 [PERF_RECORD_HEADER_FEATURE] = perf_event__header_feature_swap, 1198 [PERF_RECORD_ID_INDEX] = perf_event__all64_swap, 1199 [PERF_RECORD_AUXTRACE_INFO] = perf_event__auxtrace_info_swap, 1200 [PERF_RECORD_AUXTRACE] = perf_event__auxtrace_swap, 1201 [PERF_RECORD_AUXTRACE_ERROR] = perf_event__auxtrace_error_swap, 1202 [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, 1203 [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, 1204 [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, 1205 [PERF_RECORD_STAT] = perf_event__stat_swap, 1206 [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, 1207 [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, 1208 [PERF_RECORD_TIME_CONV] = perf_event__time_conv_swap, 1209 [PERF_RECORD_COMPRESSED2] = perf_event__compressed2_swap, 1210 [PERF_RECORD_BPF_METADATA] = perf_event__bpf_metadata_swap, 1211 [PERF_RECORD_SCHEDSTAT_CPU] = perf_event__schedstat_cpu_swap, 1212 [PERF_RECORD_SCHEDSTAT_DOMAIN] = perf_event__schedstat_domain_swap, 1213 [PERF_RECORD_HEADER_MAX] = NULL, 1214 }; 1215 1216 /* 1217 * When perf record finishes a pass on every buffers, it records this pseudo 1218 * event. 1219 * We record the max timestamp t found in the pass n. 1220 * Assuming these timestamps are monotonic across cpus, we know that if 1221 * a buffer still has events with timestamps below t, they will be all 1222 * available and then read in the pass n + 1. 1223 * Hence when we start to read the pass n + 2, we can safely flush every 1224 * events with timestamps below t. 1225 * 1226 * ============ PASS n ================= 1227 * CPU 0 | CPU 1 1228 * | 1229 * cnt1 timestamps | cnt2 timestamps 1230 * 1 | 2 1231 * 2 | 3 1232 * - | 4 <--- max recorded 1233 * 1234 * ============ PASS n + 1 ============== 1235 * CPU 0 | CPU 1 1236 * | 1237 * cnt1 timestamps | cnt2 timestamps 1238 * 3 | 5 1239 * 4 | 6 1240 * 5 | 7 <---- max recorded 1241 * 1242 * Flush every events below timestamp 4 1243 * 1244 * ============ PASS n + 2 ============== 1245 * CPU 0 | CPU 1 1246 * | 1247 * cnt1 timestamps | cnt2 timestamps 1248 * 6 | 8 1249 * 7 | 9 1250 * - | 10 1251 * 1252 * Flush every events below timestamp 7 1253 * etc... 1254 */ 1255 int perf_event__process_finished_round(const struct perf_tool *tool __maybe_unused, 1256 union perf_event *event __maybe_unused, 1257 struct ordered_events *oe) 1258 { 1259 if (dump_trace) 1260 fprintf(stdout, "\n"); 1261 return ordered_events__flush(oe, OE_FLUSH__ROUND); 1262 } 1263 1264 int perf_session__queue_event(struct perf_session *s, union perf_event *event, 1265 u64 timestamp, u64 file_offset, const char *file_path) 1266 { 1267 return ordered_events__queue(&s->ordered_events, event, timestamp, file_offset, file_path); 1268 } 1269 1270 static void callchain__lbr_callstack_printf(struct perf_sample *sample) 1271 { 1272 struct ip_callchain *callchain = sample->callchain; 1273 struct branch_stack *lbr_stack = sample->branch_stack; 1274 struct branch_entry *entries = perf_sample__branch_entries(sample); 1275 u64 kernel_callchain_nr = callchain->nr; 1276 unsigned int i; 1277 1278 for (i = 0; i < kernel_callchain_nr; i++) { 1279 if (callchain->ips[i] == PERF_CONTEXT_USER) 1280 break; 1281 } 1282 1283 if ((i != kernel_callchain_nr) && lbr_stack->nr) { 1284 u64 total_nr; 1285 /* 1286 * LBR callstack can only get user call chain, 1287 * i is kernel call chain number, 1288 * 1 is PERF_CONTEXT_USER. 1289 * 1290 * The user call chain is stored in LBR registers. 1291 * LBR are pair registers. The caller is stored 1292 * in "from" register, while the callee is stored 1293 * in "to" register. 1294 * For example, there is a call stack 1295 * "A"->"B"->"C"->"D". 1296 * The LBR registers will be recorded like 1297 * "C"->"D", "B"->"C", "A"->"B". 1298 * So only the first "to" register and all "from" 1299 * registers are needed to construct the whole stack. 1300 */ 1301 total_nr = i + 1 + lbr_stack->nr + 1; 1302 kernel_callchain_nr = i + 1; 1303 1304 printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr); 1305 1306 for (i = 0; i < kernel_callchain_nr; i++) 1307 printf("..... %2d: %016" PRIx64 "\n", 1308 i, callchain->ips[i]); 1309 1310 printf("..... %2d: %016" PRIx64 "\n", 1311 (int)(kernel_callchain_nr), entries[0].to); 1312 for (i = 0; i < lbr_stack->nr; i++) 1313 printf("..... %2d: %016" PRIx64 "\n", 1314 (int)(i + kernel_callchain_nr + 1), entries[i].from); 1315 } 1316 } 1317 1318 static const char *callchain_context_str(u64 ip) 1319 { 1320 switch (ip) { 1321 case PERF_CONTEXT_HV: 1322 return " (PERF_CONTEXT_HV)"; 1323 case PERF_CONTEXT_KERNEL: 1324 return " (PERF_CONTEXT_KERNEL)"; 1325 case PERF_CONTEXT_USER: 1326 return " (PERF_CONTEXT_USER)"; 1327 case PERF_CONTEXT_GUEST: 1328 return " (PERF_CONTEXT_GUEST)"; 1329 case PERF_CONTEXT_GUEST_KERNEL: 1330 return " (PERF_CONTEXT_GUEST_KERNEL)"; 1331 case PERF_CONTEXT_GUEST_USER: 1332 return " (PERF_CONTEXT_GUEST_USER)"; 1333 case PERF_CONTEXT_USER_DEFERRED: 1334 return " (PERF_CONTEXT_USER_DEFERRED)"; 1335 default: 1336 return ""; 1337 } 1338 } 1339 1340 static void callchain__printf(struct evsel *evsel, 1341 struct perf_sample *sample) 1342 { 1343 unsigned int i; 1344 struct ip_callchain *callchain = sample->callchain; 1345 1346 if (evsel__has_branch_callstack(evsel)) 1347 callchain__lbr_callstack_printf(sample); 1348 1349 printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); 1350 1351 for (i = 0; i < callchain->nr; i++) 1352 printf("..... %2d: %016" PRIx64 "%s\n", 1353 i, callchain->ips[i], 1354 callchain_context_str(callchain->ips[i])); 1355 1356 if (sample->deferred_callchain) 1357 printf("...... (deferred)\n"); 1358 } 1359 1360 static void branch_stack__printf(struct perf_sample *sample, 1361 struct evsel *evsel) 1362 { 1363 struct branch_entry *entries = perf_sample__branch_entries(sample); 1364 bool callstack = evsel__has_branch_callstack(evsel); 1365 u64 *branch_stack_cntr = sample->branch_stack_cntr; 1366 uint64_t i; 1367 1368 if (!callstack) { 1369 printf("%s: nr:%" PRIu64 "\n", "... branch stack", sample->branch_stack->nr); 1370 } else { 1371 /* the reason of adding 1 to nr is because after expanding 1372 * branch stack it generates nr + 1 callstack records. e.g., 1373 * B()->C() 1374 * A()->B() 1375 * the final callstack should be: 1376 * C() 1377 * B() 1378 * A() 1379 */ 1380 printf("%s: nr:%" PRIu64 "\n", "... branch callstack", sample->branch_stack->nr+1); 1381 } 1382 1383 for (i = 0; i < sample->branch_stack->nr; i++) { 1384 struct branch_entry *e = &entries[i]; 1385 1386 if (!callstack) { 1387 printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x %s %s\n", 1388 i, e->from, e->to, 1389 (unsigned short)e->flags.cycles, 1390 e->flags.mispred ? "M" : " ", 1391 e->flags.predicted ? "P" : " ", 1392 e->flags.abort ? "A" : " ", 1393 e->flags.in_tx ? "T" : " ", 1394 (unsigned)e->flags.reserved, 1395 get_branch_type(e), 1396 e->flags.spec ? branch_spec_desc(e->flags.spec) : ""); 1397 } else { 1398 if (i == 0) { 1399 printf("..... %2"PRIu64": %016" PRIx64 "\n" 1400 "..... %2"PRIu64": %016" PRIx64 "\n", 1401 i, e->to, i+1, e->from); 1402 } else { 1403 printf("..... %2"PRIu64": %016" PRIx64 "\n", i+1, e->from); 1404 } 1405 } 1406 } 1407 1408 if (branch_stack_cntr) { 1409 unsigned int br_cntr_width, br_cntr_nr; 1410 1411 perf_env__find_br_cntr_info(evsel__env(evsel), &br_cntr_nr, &br_cntr_width); 1412 printf("... branch stack counters: nr:%" PRIu64 " (counter width: %u max counter nr:%u)\n", 1413 sample->branch_stack->nr, br_cntr_width, br_cntr_nr); 1414 for (i = 0; i < sample->branch_stack->nr; i++) 1415 printf("..... %2"PRIu64": %016" PRIx64 "\n", i, branch_stack_cntr[i]); 1416 } 1417 } 1418 1419 static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags) 1420 { 1421 unsigned rid, i = 0; 1422 1423 for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) { 1424 u64 val = regs[i++]; 1425 1426 printf(".... %-5s 0x%016" PRIx64 "\n", 1427 perf_reg_name(rid, e_machine, e_flags), val); 1428 } 1429 } 1430 1431 static const char *regs_abi[] = { 1432 [PERF_SAMPLE_REGS_ABI_NONE] = "none", 1433 [PERF_SAMPLE_REGS_ABI_32] = "32-bit", 1434 [PERF_SAMPLE_REGS_ABI_64] = "64-bit", 1435 }; 1436 1437 static inline const char *regs_dump_abi(struct regs_dump *d) 1438 { 1439 if (d->abi > PERF_SAMPLE_REGS_ABI_64) 1440 return "unknown"; 1441 1442 return regs_abi[d->abi]; 1443 } 1444 1445 static void regs__printf(const char *type, struct regs_dump *regs, 1446 uint16_t e_machine, uint32_t e_flags) 1447 { 1448 u64 mask = regs->mask; 1449 1450 printf("... %s regs: mask 0x%" PRIx64 " ABI %s\n", 1451 type, 1452 mask, 1453 regs_dump_abi(regs)); 1454 1455 regs_dump__printf(mask, regs->regs, e_machine, e_flags); 1456 } 1457 1458 static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) 1459 { 1460 struct regs_dump *user_regs; 1461 1462 if (!sample->user_regs) 1463 return; 1464 1465 user_regs = perf_sample__user_regs(sample); 1466 1467 if (user_regs->regs) 1468 regs__printf("user", user_regs, e_machine, e_flags); 1469 } 1470 1471 static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags) 1472 { 1473 struct regs_dump *intr_regs; 1474 1475 if (!sample->intr_regs) 1476 return; 1477 1478 intr_regs = perf_sample__intr_regs(sample); 1479 1480 if (intr_regs->regs) 1481 regs__printf("intr", intr_regs, e_machine, e_flags); 1482 } 1483 1484 static void stack_user__printf(struct stack_dump *dump) 1485 { 1486 printf("... ustack: size %" PRIu64 ", offset 0x%x\n", 1487 dump->size, dump->offset); 1488 } 1489 1490 static void evlist__print_tstamp(struct evlist *evlist, union perf_event *event, struct perf_sample *sample) 1491 { 1492 u64 sample_type = __evlist__combined_sample_type(evlist); 1493 1494 if (event->header.type != PERF_RECORD_SAMPLE && 1495 !evlist__sample_id_all(evlist)) { 1496 fputs("-1 -1 ", stdout); 1497 return; 1498 } 1499 1500 if ((sample_type & PERF_SAMPLE_CPU)) 1501 printf("%u ", sample->cpu); 1502 1503 if (sample_type & PERF_SAMPLE_TIME) 1504 printf("%" PRIu64 " ", sample->time); 1505 } 1506 1507 static void sample_read__printf(struct perf_sample *sample, u64 read_format) 1508 { 1509 printf("... sample_read:\n"); 1510 1511 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1512 printf("...... time enabled %016" PRIx64 "\n", 1513 sample->read.time_enabled); 1514 1515 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1516 printf("...... time running %016" PRIx64 "\n", 1517 sample->read.time_running); 1518 1519 if (read_format & PERF_FORMAT_GROUP) { 1520 struct sample_read_value *value = sample->read.group.values; 1521 1522 printf(".... group nr %" PRIu64 "\n", sample->read.group.nr); 1523 1524 sample_read_group__for_each(value, sample->read.group.nr, read_format) { 1525 printf("..... id %016" PRIx64 1526 ", value %016" PRIx64, 1527 value->id, value->value); 1528 if (read_format & PERF_FORMAT_LOST) 1529 printf(", lost %" PRIu64, value->lost); 1530 printf("\n"); 1531 } 1532 } else { 1533 printf("..... id %016" PRIx64 ", value %016" PRIx64, 1534 sample->read.one.id, sample->read.one.value); 1535 if (read_format & PERF_FORMAT_LOST) 1536 printf(", lost %" PRIu64, sample->read.one.lost); 1537 printf("\n"); 1538 } 1539 } 1540 1541 static void dump_event(struct evlist *evlist, union perf_event *event, 1542 u64 file_offset, struct perf_sample *sample, 1543 const char *file_path) 1544 { 1545 if (!dump_trace) 1546 return; 1547 1548 printf("\n%#" PRIx64 "@%s [%#x]: event: %d\n", 1549 file_offset, file_path, event->header.size, event->header.type); 1550 1551 trace_event(event); 1552 if (event->header.type == PERF_RECORD_SAMPLE && evlist->trace_event_sample_raw) 1553 evlist->trace_event_sample_raw(evlist, event, sample); 1554 1555 if (sample) 1556 evlist__print_tstamp(evlist, event, sample); 1557 1558 printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset, 1559 event->header.size, perf_event__name(event->header.type)); 1560 } 1561 1562 char *get_page_size_name(u64 size, char *str) 1563 { 1564 if (!size || !unit_number__scnprintf(str, PAGE_SIZE_NAME_LEN, size)) 1565 snprintf(str, PAGE_SIZE_NAME_LEN, "%s", "N/A"); 1566 1567 return str; 1568 } 1569 1570 static void dump_sample(struct machine *machine, union perf_event *event, 1571 struct perf_sample *sample) 1572 { 1573 struct evsel *evsel = sample->evsel; 1574 u64 sample_type; 1575 char str[PAGE_SIZE_NAME_LEN]; 1576 uint16_t e_machine = EM_NONE; 1577 uint32_t e_flags = 0; 1578 1579 if (!dump_trace) 1580 return; 1581 1582 sample_type = evsel->core.attr.sample_type; 1583 1584 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) { 1585 struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid); 1586 1587 e_machine = thread__e_machine(thread, machine, &e_flags); 1588 } 1589 1590 printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n", 1591 event->header.misc, sample->pid, sample->tid, sample->ip, 1592 sample->period, sample->addr); 1593 1594 if (evsel__has_callchain(evsel)) 1595 callchain__printf(evsel, sample); 1596 1597 if (evsel__has_br_stack(evsel)) 1598 branch_stack__printf(sample, evsel); 1599 1600 if (sample_type & PERF_SAMPLE_REGS_USER) 1601 regs_user__printf(sample, e_machine, e_flags); 1602 1603 if (sample_type & PERF_SAMPLE_REGS_INTR) 1604 regs_intr__printf(sample, e_machine, e_flags); 1605 1606 if (sample_type & PERF_SAMPLE_STACK_USER) 1607 stack_user__printf(&sample->user_stack); 1608 1609 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 1610 printf("... weight: %" PRIu64 "", sample->weight); 1611 if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 1612 printf(",0x%"PRIx16"", sample->ins_lat); 1613 printf(",0x%"PRIx16"", sample->weight3); 1614 } 1615 printf("\n"); 1616 } 1617 1618 if (sample_type & PERF_SAMPLE_DATA_SRC) 1619 printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); 1620 1621 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 1622 printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr); 1623 1624 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) 1625 printf(" .. data page size: %s\n", get_page_size_name(sample->data_page_size, str)); 1626 1627 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) 1628 printf(" .. code page size: %s\n", get_page_size_name(sample->code_page_size, str)); 1629 1630 if (sample_type & PERF_SAMPLE_TRANSACTION) 1631 printf("... transaction: %" PRIx64 "\n", sample->transaction); 1632 1633 if (sample_type & PERF_SAMPLE_READ) 1634 sample_read__printf(sample, evsel->core.attr.read_format); 1635 } 1636 1637 static void dump_deferred_callchain(union perf_event *event, struct perf_sample *sample) 1638 { 1639 struct evsel *evsel = sample->evsel; 1640 1641 if (!dump_trace) 1642 return; 1643 1644 printf("(IP, 0x%x): %d/%d: %#" PRIx64 "\n", 1645 event->header.misc, sample->pid, sample->tid, sample->deferred_cookie); 1646 1647 if (evsel__has_callchain(evsel)) 1648 callchain__printf(evsel, sample); 1649 } 1650 1651 static void dump_read(struct evsel *evsel, union perf_event *event) 1652 { 1653 u64 read_format; 1654 __u64 *array; 1655 void *end; 1656 1657 if (!dump_trace) 1658 return; 1659 1660 printf(": %d %d %s %" PRI_lu64 "\n", event->read.pid, event->read.tid, 1661 evsel__name(evsel), event->read.value); 1662 1663 if (!evsel) 1664 return; 1665 1666 read_format = evsel->core.attr.read_format; 1667 /* 1668 * The kernel packs only the enabled read_format fields 1669 * after value, with no gaps. Walk the packed array 1670 * instead of using fixed struct offsets. 1671 */ 1672 array = &event->read.value + 1; 1673 end = (void *)event + event->header.size; 1674 1675 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1676 if ((void *)(array + 1) > end) 1677 return; 1678 printf("... time enabled : %" PRI_lu64 "\n", *array++); 1679 } 1680 1681 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1682 if ((void *)(array + 1) > end) 1683 return; 1684 printf("... time running : %" PRI_lu64 "\n", *array++); 1685 } 1686 1687 if (read_format & PERF_FORMAT_ID) { 1688 if ((void *)(array + 1) > end) 1689 return; 1690 printf("... id : %" PRI_lu64 "\n", *array++); 1691 } 1692 1693 if (read_format & PERF_FORMAT_LOST) { 1694 if ((void *)(array + 1) > end) 1695 return; 1696 printf("... lost : %" PRI_lu64 "\n", *array++); 1697 } 1698 } 1699 1700 static struct machine *machines__find_for_cpumode(struct machines *machines, 1701 union perf_event *event, 1702 struct perf_sample *sample) 1703 { 1704 if (perf_guest && 1705 ((sample->cpumode == PERF_RECORD_MISC_GUEST_KERNEL) || 1706 (sample->cpumode == PERF_RECORD_MISC_GUEST_USER))) { 1707 u32 pid; 1708 1709 if (sample->machine_pid) 1710 pid = sample->machine_pid; 1711 else if (event->header.type == PERF_RECORD_MMAP 1712 || event->header.type == PERF_RECORD_MMAP2) 1713 pid = event->mmap.pid; 1714 else 1715 pid = sample->pid; 1716 1717 /* 1718 * Guest code machine is created as needed and does not use 1719 * DEFAULT_GUEST_KERNEL_ID. 1720 */ 1721 if (symbol_conf.guest_code) 1722 return machines__findnew(machines, pid); 1723 1724 return machines__find_guest(machines, pid); 1725 } 1726 1727 return &machines->host; 1728 } 1729 1730 static int deliver_sample_value(struct evlist *evlist, 1731 const struct perf_tool *tool, 1732 union perf_event *event, 1733 struct perf_sample *sample, 1734 struct sample_read_value *v, 1735 struct machine *machine, 1736 bool per_thread) 1737 { 1738 struct perf_sample_id *sid = evlist__id2sid(evlist, v->id); 1739 struct evsel *saved_evsel = sample->evsel; 1740 u64 *storage = NULL; 1741 int ret; 1742 1743 if (sid) { 1744 storage = perf_sample_id__get_period_storage(sid, sample->tid, per_thread); 1745 } 1746 1747 if (storage) { 1748 sample->id = v->id; 1749 sample->period = v->value - *storage; 1750 *storage = v->value; 1751 } 1752 1753 if (!storage || sid->evsel == NULL) { 1754 ++evlist->stats.nr_unknown_id; 1755 return 0; 1756 } 1757 1758 /* 1759 * There's no reason to deliver sample 1760 * for zero period, bail out. 1761 */ 1762 if (!sample->period) 1763 return 0; 1764 1765 sample->evsel = container_of(sid->evsel, struct evsel, core); 1766 ret = tool->sample(tool, event, sample, machine); 1767 sample->evsel = saved_evsel; 1768 return ret; 1769 } 1770 1771 static int deliver_sample_group(struct evlist *evlist, 1772 const struct perf_tool *tool, 1773 union perf_event *event, 1774 struct perf_sample *sample, 1775 struct machine *machine, 1776 u64 read_format, 1777 bool per_thread) 1778 { 1779 int ret = -EINVAL; 1780 struct sample_read_value *v = sample->read.group.values; 1781 1782 if (tool->dont_split_sample_group) 1783 return deliver_sample_value(evlist, tool, event, sample, v, machine, 1784 per_thread); 1785 1786 sample_read_group__for_each(v, sample->read.group.nr, read_format) { 1787 ret = deliver_sample_value(evlist, tool, event, sample, v, 1788 machine, per_thread); 1789 if (ret) 1790 break; 1791 } 1792 1793 return ret; 1794 } 1795 1796 static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool *tool, 1797 union perf_event *event, struct perf_sample *sample, 1798 struct machine *machine) 1799 { 1800 struct evsel *evsel = sample->evsel; 1801 /* We know evsel != NULL. */ 1802 u64 sample_type = evsel->core.attr.sample_type; 1803 u64 read_format = evsel->core.attr.read_format; 1804 bool per_thread = perf_evsel__attr_has_per_thread_sample_period(&evsel->core); 1805 1806 /* Standard sample delivery. */ 1807 if (!(sample_type & PERF_SAMPLE_READ)) 1808 return tool->sample(tool, event, sample, machine); 1809 1810 /* For PERF_SAMPLE_READ we have either single or group mode. */ 1811 if (read_format & PERF_FORMAT_GROUP) 1812 return deliver_sample_group(evlist, tool, event, sample, 1813 machine, read_format, per_thread); 1814 else 1815 return deliver_sample_value(evlist, tool, event, sample, 1816 &sample->read.one, machine, 1817 per_thread); 1818 } 1819 1820 /* 1821 * Samples with deferred callchains should wait for the next matching 1822 * PERF_RECORD_CALLCHAIN_RECORD entries. Keep the events in a list and 1823 * deliver them once it finds the callchains. 1824 */ 1825 struct deferred_event { 1826 struct list_head list; 1827 union perf_event *event; 1828 u64 file_offset; 1829 }; 1830 1831 /* 1832 * This is called when a deferred callchain record comes up. Find all matching 1833 * samples, merge the callchains and process them. 1834 */ 1835 static int evlist__deliver_deferred_callchain(struct evlist *evlist, 1836 const struct perf_tool *tool, 1837 union perf_event *event, 1838 struct perf_sample *sample, 1839 struct machine *machine) 1840 { 1841 struct deferred_event *de, *tmp; 1842 int ret = 0; 1843 1844 if (!tool->merge_deferred_callchains) { 1845 struct evsel *saved_evsel = sample->evsel; 1846 1847 sample->evsel = evlist__id2evsel(evlist, sample->id); 1848 ret = tool->callchain_deferred(tool, event, sample, machine); 1849 sample->evsel = saved_evsel; 1850 return ret; 1851 } 1852 1853 list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) { 1854 struct perf_sample orig_sample; 1855 1856 perf_sample__init(&orig_sample, /*all=*/false); 1857 ret = evlist__parse_sample(evlist, de->event, &orig_sample); 1858 if (ret < 0) { 1859 pr_err("failed to parse original sample\n"); 1860 perf_sample__exit(&orig_sample); 1861 break; 1862 } 1863 orig_sample.file_offset = de->file_offset; 1864 1865 if (sample->tid != orig_sample.tid) { 1866 perf_sample__exit(&orig_sample); 1867 continue; 1868 } 1869 1870 if (event->callchain_deferred.cookie == orig_sample.deferred_cookie) 1871 sample__merge_deferred_callchain(&orig_sample, sample); 1872 else 1873 orig_sample.deferred_callchain = false; 1874 1875 orig_sample.evsel = evlist__id2evsel(evlist, orig_sample.id); 1876 ret = evlist__deliver_sample(evlist, tool, de->event, 1877 &orig_sample, machine); 1878 1879 perf_sample__exit(&orig_sample); 1880 list_del(&de->list); 1881 free(de->event); 1882 free(de); 1883 1884 if (ret) 1885 break; 1886 } 1887 return ret; 1888 } 1889 1890 /* 1891 * This is called at the end of the data processing for the session. Flush the 1892 * remaining samples as there's no hope for matching deferred callchains. 1893 */ 1894 static int session__flush_deferred_samples(struct perf_session *session, 1895 const struct perf_tool *tool) 1896 { 1897 struct evlist *evlist = session->evlist; 1898 struct machine *machine = &session->machines.host; 1899 struct deferred_event *de, *tmp; 1900 int ret = 0; 1901 1902 list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) { 1903 struct perf_sample sample; 1904 1905 perf_sample__init(&sample, /*all=*/false); 1906 ret = evlist__parse_sample(evlist, de->event, &sample); 1907 if (ret < 0) { 1908 pr_err("failed to parse original sample\n"); 1909 perf_sample__exit(&sample); 1910 break; 1911 } 1912 sample.file_offset = de->file_offset; 1913 1914 sample.evsel = evlist__id2evsel(evlist, sample.id); 1915 ret = evlist__deliver_sample(evlist, tool, de->event, 1916 &sample, machine); 1917 1918 perf_sample__exit(&sample); 1919 list_del(&de->list); 1920 free(de->event); 1921 free(de); 1922 1923 if (ret) 1924 break; 1925 } 1926 return ret; 1927 } 1928 1929 /* 1930 * Return true if the string field is properly null-terminated 1931 * within the event boundary. Native-endian files are mapped 1932 * read-only (MAP_SHARED + PROT_READ) so we cannot write a 1933 * null byte in place; skip the event instead. 1934 */ 1935 static bool perf_event__check_nul(const char *str, const void *end, 1936 const char *event_name, u64 file_offset) 1937 { 1938 size_t max_len = (const char *)end - str; 1939 1940 if (max_len == 0 || strnlen(str, max_len) == max_len) { 1941 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_%s: string not null-terminated, skipping event\n", 1942 file_offset, event_name); 1943 return false; 1944 } 1945 1946 return true; 1947 } 1948 1949 static int machines__deliver_event(struct machines *machines, 1950 struct evlist *evlist, 1951 union perf_event *event, 1952 struct perf_sample *sample, 1953 const struct perf_tool *tool, u64 file_offset, 1954 const char *file_path) 1955 { 1956 struct machine *machine; 1957 1958 dump_event(evlist, event, file_offset, sample, file_path); 1959 1960 if (!sample->evsel) 1961 sample->evsel = evlist__id2evsel(evlist, sample->id); 1962 else 1963 assert(sample->evsel == evlist__id2evsel(evlist, sample->id)); 1964 1965 machine = machines__find_for_cpumode(machines, event, sample); 1966 1967 switch (event->header.type) { 1968 case PERF_RECORD_SAMPLE: 1969 if (sample->evsel == NULL) { 1970 ++evlist->stats.nr_unknown_id; 1971 return 0; 1972 } 1973 if (machine == NULL) { 1974 ++evlist->stats.nr_unprocessable_samples; 1975 dump_sample(machine, event, sample); 1976 return 0; 1977 } 1978 dump_sample(machine, event, sample); 1979 if (sample->deferred_callchain && tool->merge_deferred_callchains) { 1980 struct deferred_event *de = malloc(sizeof(*de)); 1981 size_t sz = event->header.size; 1982 1983 if (de == NULL) 1984 return -ENOMEM; 1985 1986 de->event = malloc(sz); 1987 if (de->event == NULL) { 1988 free(de); 1989 return -ENOMEM; 1990 } 1991 memcpy(de->event, event, sz); 1992 de->file_offset = sample->file_offset; 1993 list_add_tail(&de->list, &evlist->deferred_samples); 1994 return 0; 1995 } 1996 return evlist__deliver_sample(evlist, tool, event, sample, machine); 1997 case PERF_RECORD_MMAP: 1998 if (!perf_event__check_nul(event->mmap.filename, 1999 (void *)event + event->header.size, 2000 "MMAP", file_offset)) 2001 return 0; 2002 return tool->mmap(tool, event, sample, machine); 2003 case PERF_RECORD_MMAP2: 2004 if (event->header.misc & PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT) 2005 ++evlist->stats.nr_proc_map_timeout; 2006 if (!perf_event__check_nul(event->mmap2.filename, 2007 (void *)event + event->header.size, 2008 "MMAP2", file_offset)) 2009 return 0; 2010 return tool->mmap2(tool, event, sample, machine); 2011 case PERF_RECORD_COMM: 2012 if (!perf_event__check_nul(event->comm.comm, 2013 (void *)event + event->header.size, 2014 "COMM", file_offset)) 2015 return 0; 2016 return tool->comm(tool, event, sample, machine); 2017 case PERF_RECORD_NAMESPACES: { 2018 /* 2019 * Cannot underflow: perf_event__min_size[] guarantees header.size >= sizeof. 2020 * Includes trailing sample_id space when present, but prevents OOB. 2021 */ 2022 u64 max_nr = (event->header.size - sizeof(event->namespaces)) / 2023 sizeof(event->namespaces.link_info[0]); 2024 2025 /* 2026 * Native-endian events are mmap'd read-only, so we 2027 * cannot clamp nr in place. Skip the event instead. 2028 * The swap handler already clamps on the writable 2029 * cross-endian path. 2030 */ 2031 if (event->namespaces.nr_namespaces > max_nr) { 2032 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_NAMESPACES: nr_namespaces %" PRIu64 " exceeds payload (max %" PRIu64 "), skipping\n", 2033 file_offset, (u64)event->namespaces.nr_namespaces, max_nr); 2034 return 0; 2035 } 2036 return tool->namespaces(tool, event, sample, machine); 2037 } 2038 case PERF_RECORD_CGROUP: 2039 if (!perf_event__check_nul(event->cgroup.path, 2040 (void *)event + event->header.size, 2041 "CGROUP", file_offset)) 2042 return 0; 2043 return tool->cgroup(tool, event, sample, machine); 2044 case PERF_RECORD_FORK: 2045 return tool->fork(tool, event, sample, machine); 2046 case PERF_RECORD_EXIT: 2047 return tool->exit(tool, event, sample, machine); 2048 case PERF_RECORD_LOST: 2049 if (tool->lost == perf_event__process_lost) 2050 evlist->stats.total_lost += event->lost.lost; 2051 return tool->lost(tool, event, sample, machine); 2052 case PERF_RECORD_LOST_SAMPLES: 2053 if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) 2054 evlist->stats.total_dropped_samples += event->lost_samples.lost; 2055 else if (tool->lost_samples == perf_event__process_lost_samples) 2056 evlist->stats.total_lost_samples += event->lost_samples.lost; 2057 return tool->lost_samples(tool, event, sample, machine); 2058 case PERF_RECORD_READ: 2059 dump_read(sample->evsel, event); 2060 return tool->read(tool, event, sample, machine); 2061 case PERF_RECORD_THROTTLE: 2062 return tool->throttle(tool, event, sample, machine); 2063 case PERF_RECORD_UNTHROTTLE: 2064 return tool->unthrottle(tool, event, sample, machine); 2065 case PERF_RECORD_AUX: 2066 if (tool->aux == perf_event__process_aux) { 2067 if (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) 2068 evlist->stats.total_aux_lost += 1; 2069 if (event->aux.flags & PERF_AUX_FLAG_PARTIAL) 2070 evlist->stats.total_aux_partial += 1; 2071 if (event->aux.flags & PERF_AUX_FLAG_COLLISION) 2072 evlist->stats.total_aux_collision += 1; 2073 } 2074 return tool->aux(tool, event, sample, machine); 2075 case PERF_RECORD_ITRACE_START: 2076 return tool->itrace_start(tool, event, sample, machine); 2077 case PERF_RECORD_SWITCH: 2078 case PERF_RECORD_SWITCH_CPU_WIDE: 2079 return tool->context_switch(tool, event, sample, machine); 2080 case PERF_RECORD_KSYMBOL: 2081 if (!perf_event__check_nul(event->ksymbol.name, 2082 (void *)event + event->header.size, 2083 "KSYMBOL", file_offset)) 2084 return 0; 2085 return tool->ksymbol(tool, event, sample, machine); 2086 case PERF_RECORD_BPF_EVENT: 2087 return tool->bpf(tool, event, sample, machine); 2088 case PERF_RECORD_TEXT_POKE: { 2089 /* offsetof(bytes), not sizeof — sizeof includes padding past the flexible array */ 2090 size_t text_poke_len = offsetof(struct perf_record_text_poke_event, bytes) + 2091 event->text_poke.old_len + 2092 event->text_poke.new_len; 2093 2094 if (event->header.size < text_poke_len) { 2095 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_TEXT_POKE: old_len+new_len exceeds event, skipping\n", 2096 file_offset); 2097 return 0; 2098 } 2099 return tool->text_poke(tool, event, sample, machine); 2100 } 2101 case PERF_RECORD_AUX_OUTPUT_HW_ID: 2102 return tool->aux_output_hw_id(tool, event, sample, machine); 2103 case PERF_RECORD_CALLCHAIN_DEFERRED: 2104 dump_deferred_callchain(event, sample); 2105 return evlist__deliver_deferred_callchain(evlist, tool, event, 2106 sample, machine); 2107 default: 2108 ++evlist->stats.nr_unknown_events; 2109 return -1; 2110 } 2111 } 2112 2113 static int perf_session__deliver_event(struct perf_session *session, 2114 union perf_event *event, 2115 const struct perf_tool *tool, 2116 u64 file_offset, 2117 const char *file_path) 2118 { 2119 struct perf_sample sample; 2120 struct evsel *evsel; 2121 int ret; 2122 2123 perf_sample__init(&sample, /*all=*/false); 2124 evsel = evlist__event2evsel(session->evlist, event); 2125 if (!evsel) { 2126 pr_err("ERROR: at offset %#" PRIx64 ": no evsel found for %s (%u) event\n", 2127 file_offset, perf_event__name(event->header.type), 2128 event->header.type); 2129 ret = -EFAULT; 2130 goto out; 2131 } 2132 ret = evsel__parse_sample(evsel, event, &sample); 2133 if (ret) { 2134 pr_err("ERROR: at offset %#" PRIx64 ": can't parse %s (%u) sample, err = %d\n", 2135 file_offset, perf_event__name(event->header.type), 2136 event->header.type, ret); 2137 goto out; 2138 } 2139 sample.file_offset = file_offset; 2140 /* 2141 * evsel__parse_sample() doesn't populate machine_pid/vcpu, 2142 * which are needed by machines__find_for_cpumode() to 2143 * attribute samples to guest VMs. The SID table maps 2144 * sample IDs to the guest that owns the event. 2145 */ 2146 if (perf_guest && sample.id) { 2147 struct perf_sample_id *sid = evlist__id2sid(session->evlist, sample.id); 2148 2149 if (sid) { 2150 sample.machine_pid = sid->machine_pid; 2151 sample.vcpu = sid->vcpu.cpu; 2152 } 2153 } 2154 2155 /* 2156 * Validate sample.cpu before any callback can use it as an 2157 * array index (kwork cpus_runtime, timechart cpus_cstate_*, 2158 * sched cpu_last_switched). 2159 * 2160 * When PERF_SAMPLE_CPU is absent, evsel__parse_sample() leaves 2161 * sample.cpu as (u32)-1 — a sentinel that downstream tools 2162 * (script, inject) check to identify events without CPU info. 2163 * Only check when sample.cpu was actually populated from event 2164 * data: PERF_RECORD_SAMPLE always has it when PERF_SAMPLE_CPU 2165 * is set; non-sample events only have it when sample_id_all is 2166 * enabled. Otherwise sample.cpu is the (u32)-1 sentinel from 2167 * evsel__parse_sample() and must not be validated or clamped. 2168 */ 2169 if ((evsel->core.attr.sample_type & PERF_SAMPLE_CPU) && 2170 (event->header.type == PERF_RECORD_SAMPLE || 2171 evsel->core.attr.sample_id_all)) { 2172 int nr_cpus_avail = perf_session__env(session)->nr_cpus_avail; 2173 2174 /* 2175 * For perf.data files the MAX_NR_CPUS fallback in 2176 * perf_session__read_header() guarantees this is set. 2177 * For pipe mode, HEADER_NRCPUS may arrive late or not 2178 * at all (pre-2017 perf, third-party tools). Fall 2179 * back to MAX_NR_CPUS so the bounds check still works 2180 * against fixed-size downstream arrays. 2181 * 2182 * Do NOT write back to env: this function runs during 2183 * recording (synthesized events) when nr_cpus_avail is 2184 * legitimately 0. Writing MAX_NR_CPUS would cause 2185 * write_cpu_topology() to emit 4096 core_id/socket_id 2186 * pairs instead of the real CPU count, corrupting the 2187 * topology section in the generated perf.data. 2188 */ 2189 if (nr_cpus_avail <= 0) 2190 nr_cpus_avail = MAX_NR_CPUS; 2191 /* 2192 * Cap at MAX_NR_CPUS for the bounds check — downstream 2193 * consumers use fixed-size arrays of that size. Keep 2194 * the true nr_cpus_avail in env for header parsing 2195 * (e.g. process_cpu_topology) which needs the real count. 2196 */ 2197 if (nr_cpus_avail > MAX_NR_CPUS) 2198 nr_cpus_avail = MAX_NR_CPUS; 2199 if (sample.cpu >= (u32)nr_cpus_avail && 2200 sample.cpu != (u32)-1) { 2201 /* 2202 * Warn rather than abort: synthesized events 2203 * (MMAP, COMM) lack sample_id_all data, so 2204 * parse_id_sample reads garbage from the event 2205 * payload. Clamping to 0 protects downstream 2206 * array indexing while keeping the session alive. 2207 * 2208 * Preserve (u32)-1: perf script and perf inject 2209 * use it as a sentinel for "CPU not applicable." 2210 * Downstream array users (timechart, kwork) have 2211 * their own per-callback bounds checks. 2212 */ 2213 pr_warning_once("WARNING: at offset %#" PRIx64 ": sample CPU %u >= nr_cpus_avail %u, clamping to 0\n", 2214 file_offset, sample.cpu, nr_cpus_avail); 2215 sample.cpu = 0; 2216 } 2217 } 2218 2219 ret = auxtrace__process_event(session, event, &sample, tool); 2220 if (ret < 0) 2221 goto out; 2222 if (ret > 0) { 2223 ret = 0; 2224 goto out; 2225 } 2226 2227 ret = machines__deliver_event(&session->machines, session->evlist, 2228 event, &sample, tool, file_offset, file_path); 2229 2230 if (dump_trace && sample.aux_sample.size) 2231 auxtrace__dump_auxtrace_sample(session, &sample); 2232 out: 2233 perf_sample__exit(&sample); 2234 return ret; 2235 } 2236 2237 static s64 perf_session__process_user_event(struct perf_session *session, 2238 union perf_event *event, 2239 u64 file_offset, 2240 const char *file_path) 2241 { 2242 struct ordered_events *oe = &session->ordered_events; 2243 const struct perf_tool *tool = session->tool; 2244 const u32 event_size = READ_ONCE(event->header.size); 2245 struct perf_sample sample; 2246 int fd = perf_data__fd(session->data); 2247 s64 err; 2248 2249 perf_sample__init(&sample, /*all=*/true); 2250 if ((event->header.type != PERF_RECORD_COMPRESSED && 2251 event->header.type != PERF_RECORD_COMPRESSED2) || 2252 perf_tool__compressed_is_stub(tool)) 2253 dump_event(session->evlist, event, file_offset, &sample, file_path); 2254 2255 /* These events are processed right away */ 2256 switch (event->header.type) { 2257 case PERF_RECORD_HEADER_ATTR: 2258 err = tool->attr(tool, event, &session->evlist); 2259 if (err == 0) { 2260 perf_session__set_id_hdr_size(session); 2261 perf_session__set_comm_exec(session); 2262 } 2263 break; 2264 case PERF_RECORD_EVENT_UPDATE: 2265 err = tool->event_update(tool, event, &session->evlist); 2266 break; 2267 case PERF_RECORD_HEADER_EVENT_TYPE: 2268 /* 2269 * Deprecated, but we need to handle it for sake 2270 * of old data files create in pipe mode. 2271 */ 2272 err = 0; 2273 break; 2274 case PERF_RECORD_HEADER_TRACING_DATA: 2275 /* 2276 * Setup for reading amidst mmap, but only when we 2277 * are in 'file' mode. The 'pipe' fd is in proper 2278 * place already. 2279 */ 2280 if (!perf_data__is_pipe(session->data)) 2281 lseek(fd, file_offset, SEEK_SET); 2282 err = tool->tracing_data(tool, session, event); 2283 break; 2284 case PERF_RECORD_HEADER_BUILD_ID: 2285 if (!perf_event__check_nul(event->build_id.filename, 2286 (void *)event + event_size, 2287 "HEADER_BUILD_ID", file_offset)) { 2288 err = 0; 2289 break; 2290 } 2291 err = tool->build_id(tool, session, event); 2292 break; 2293 case PERF_RECORD_FINISHED_ROUND: 2294 err = tool->finished_round(tool, event, oe); 2295 break; 2296 case PERF_RECORD_ID_INDEX: 2297 err = tool->id_index(tool, session, event); 2298 break; 2299 case PERF_RECORD_AUXTRACE_INFO: 2300 err = tool->auxtrace_info(tool, session, event); 2301 break; 2302 case PERF_RECORD_AUXTRACE: 2303 /* 2304 * Setup for reading amidst mmap, but only when we 2305 * are in 'file' mode. The 'pipe' fd is in proper 2306 * place already. 2307 */ 2308 if (!perf_data__is_pipe(session->data)) 2309 lseek(fd, file_offset + event_size, SEEK_SET); 2310 err = tool->auxtrace(tool, session, event); 2311 break; 2312 case PERF_RECORD_AUXTRACE_ERROR: 2313 perf_session__auxtrace_error_inc(session, event); 2314 err = tool->auxtrace_error(tool, session, event); 2315 break; 2316 case PERF_RECORD_THREAD_MAP: { 2317 u64 max_nr; 2318 2319 if (event_size < sizeof(event->thread_map)) { 2320 pr_err("ERROR: at offset %#" PRIx64 ": PERF_RECORD_THREAD_MAP: header.size (%u) too small\n", 2321 file_offset, event_size); 2322 err = -EINVAL; 2323 break; 2324 } 2325 2326 max_nr = (event_size - sizeof(event->thread_map)) / 2327 sizeof(event->thread_map.entries[0]); 2328 if (event->thread_map.nr > max_nr) { 2329 pr_err("ERROR: at offset %#" PRIx64 ": PERF_RECORD_THREAD_MAP: nr %" PRIu64 " exceeds max %" PRIu64 "\n", 2330 file_offset, (u64)event->thread_map.nr, max_nr); 2331 err = -EINVAL; 2332 break; 2333 } 2334 2335 err = tool->thread_map(tool, session, event); 2336 break; 2337 } 2338 case PERF_RECORD_CPU_MAP: { 2339 struct perf_record_cpu_map_data *data = &event->cpu_map.data; 2340 u32 payload = event_size - sizeof(event->header); 2341 2342 /* 2343 * Native-endian events are mmap'd read-only, so we 2344 * cannot clamp nr fields in place. Skip the event 2345 * if any variant overflows. 2346 */ 2347 switch (data->type) { 2348 case PERF_CPU_MAP__CPUS: { 2349 u16 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2350 cpus_data.cpu)) / 2351 sizeof(data->cpus_data.cpu[0]); 2352 2353 if (data->cpus_data.nr > max_nr) { 2354 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP: nr %u exceeds payload (max %u), skipping\n", 2355 file_offset, data->cpus_data.nr, max_nr); 2356 err = 0; 2357 goto out; 2358 } 2359 break; 2360 } 2361 case PERF_CPU_MAP__MASK: 2362 if (data->mask32_data.long_size == 4) { 2363 u16 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2364 mask32_data.mask)) / 2365 sizeof(data->mask32_data.mask[0]); 2366 2367 if (data->mask32_data.nr > max_nr) { 2368 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP mask32: nr %u exceeds payload (max %u), skipping\n", 2369 file_offset, data->mask32_data.nr, max_nr); 2370 err = 0; 2371 goto out; 2372 } 2373 } else if (data->mask64_data.long_size == 8) { 2374 u16 max_nr; 2375 2376 if (payload < offsetof(struct perf_record_cpu_map_data, mask64_data.mask)) { 2377 err = 0; 2378 goto out; 2379 } 2380 max_nr = (payload - offsetof(struct perf_record_cpu_map_data, 2381 mask64_data.mask)) / 2382 sizeof(data->mask64_data.mask[0]); 2383 if (data->mask64_data.nr > max_nr) { 2384 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP mask64: nr %u exceeds payload (max %u), skipping\n", 2385 file_offset, data->mask64_data.nr, max_nr); 2386 err = 0; 2387 goto out; 2388 } 2389 } else { 2390 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_CPU_MAP: unsupported long_size %u, skipping\n", 2391 file_offset, data->mask32_data.long_size); 2392 err = 0; 2393 goto out; 2394 } 2395 break; 2396 default: 2397 break; 2398 } 2399 2400 err = tool->cpu_map(tool, session, event); 2401 break; 2402 } 2403 case PERF_RECORD_STAT_CONFIG: { 2404 /* Cannot underflow: perf_event__min_size[] guarantees event_size >= sizeof */ 2405 u64 max_nr = (event_size - sizeof(event->stat_config)) / 2406 sizeof(event->stat_config.data[0]); 2407 2408 /* 2409 * Native-endian events are mmap'd read-only, so we 2410 * cannot clamp nr in place. Skip the event instead. 2411 */ 2412 if (event->stat_config.nr > max_nr) { 2413 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_STAT_CONFIG: nr %" PRIu64 " exceeds payload (max %" PRIu64 "), skipping\n", 2414 file_offset, (u64)event->stat_config.nr, max_nr); 2415 err = 0; 2416 goto out; 2417 } 2418 2419 err = tool->stat_config(tool, session, event); 2420 break; 2421 } 2422 case PERF_RECORD_STAT: 2423 err = tool->stat(tool, session, event); 2424 break; 2425 case PERF_RECORD_STAT_ROUND: 2426 err = tool->stat_round(tool, session, event); 2427 break; 2428 case PERF_RECORD_TIME_CONV: 2429 /* 2430 * Bounded copy: older kernels emit a shorter struct 2431 * without time_cycles/time_mask/cap_user_time_*. 2432 * Zero the rest so extended fields default to off. 2433 */ 2434 memset(&session->time_conv, 0, sizeof(session->time_conv)); 2435 memcpy(&session->time_conv, &event->time_conv, 2436 min((size_t)event_size, sizeof(session->time_conv))); 2437 err = tool->time_conv(tool, session, event); 2438 break; 2439 case PERF_RECORD_HEADER_FEATURE: 2440 err = tool->feature(tool, session, event); 2441 break; 2442 case PERF_RECORD_COMPRESSED: 2443 case PERF_RECORD_COMPRESSED2: 2444 err = tool->compressed(tool, session, event, file_offset, file_path); 2445 if (err) 2446 dump_event(session->evlist, event, file_offset, &sample, file_path); 2447 break; 2448 case PERF_RECORD_FINISHED_INIT: 2449 err = tool->finished_init(tool, session, event); 2450 break; 2451 case PERF_RECORD_BPF_METADATA: { 2452 u64 nr_entries, max_entries; 2453 2454 if (event_size < sizeof(event->bpf_metadata)) { 2455 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: header.size (%u) too small, skipping\n", 2456 file_offset, event_size); 2457 err = 0; 2458 break; 2459 } 2460 2461 /* 2462 * Native-endian files are mmap'd read-only — validate 2463 * NUL-termination instead of writing. 2464 */ 2465 if (strnlen(event->bpf_metadata.prog_name, 2466 BPF_PROG_NAME_LEN) == BPF_PROG_NAME_LEN) { 2467 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: prog_name not null-terminated, skipping\n", 2468 file_offset); 2469 err = 0; 2470 break; 2471 } 2472 2473 nr_entries = READ_ONCE(event->bpf_metadata.nr_entries); 2474 max_entries = (event_size - sizeof(event->bpf_metadata)) / 2475 sizeof(event->bpf_metadata.entries[0]); 2476 if (nr_entries > max_entries) { 2477 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: nr_entries %" PRIu64 " exceeds max %" PRIu64 ", skipping\n", 2478 file_offset, nr_entries, max_entries); 2479 err = 0; 2480 break; 2481 } 2482 2483 for (u64 i = 0; i < nr_entries; i++) { 2484 if (strnlen(event->bpf_metadata.entries[i].key, 2485 BPF_METADATA_KEY_LEN) == BPF_METADATA_KEY_LEN || 2486 strnlen(event->bpf_metadata.entries[i].value, 2487 BPF_METADATA_VALUE_LEN) == BPF_METADATA_VALUE_LEN) { 2488 pr_warning("WARNING: at offset %#" PRIx64 ": PERF_RECORD_BPF_METADATA: entry %" PRIu64 " key/value not null-terminated, skipping\n", 2489 file_offset, i); 2490 err = 0; 2491 goto out; 2492 } 2493 } 2494 2495 err = tool->bpf_metadata(tool, session, event); 2496 break; 2497 } 2498 case PERF_RECORD_SCHEDSTAT_CPU: 2499 err = tool->schedstat_cpu(tool, session, event); 2500 break; 2501 case PERF_RECORD_SCHEDSTAT_DOMAIN: 2502 err = tool->schedstat_domain(tool, session, event); 2503 break; 2504 default: 2505 err = -EINVAL; 2506 break; 2507 } 2508 out: 2509 perf_sample__exit(&sample); 2510 return err; 2511 } 2512 2513 int perf_session__deliver_synth_event(struct perf_session *session, 2514 union perf_event *event, 2515 struct perf_sample *sample) 2516 { 2517 struct evlist *evlist = session->evlist; 2518 const struct perf_tool *tool = session->tool; 2519 2520 events_stats__inc(&evlist->stats, event->header.type); 2521 2522 if (event->header.type >= PERF_RECORD_USER_TYPE_START) 2523 return perf_session__process_user_event(session, event, 0, NULL); 2524 2525 return machines__deliver_event(&session->machines, evlist, event, sample, tool, 0, NULL); 2526 } 2527 2528 int perf_session__deliver_synth_attr_event(struct perf_session *session, 2529 const struct perf_event_attr *attr, 2530 u64 id) 2531 { 2532 union { 2533 struct { 2534 struct perf_record_header_attr attr; 2535 u64 ids[1]; 2536 } attr_id; 2537 union perf_event ev; 2538 } ev = { 2539 .attr_id.attr.header.type = PERF_RECORD_HEADER_ATTR, 2540 .attr_id.attr.header.size = sizeof(ev.attr_id), 2541 .attr_id.ids[0] = id, 2542 }; 2543 2544 if (attr->size != sizeof(ev.attr_id.attr.attr)) { 2545 pr_debug("Unexpected perf_event_attr size\n"); 2546 return -EINVAL; 2547 } 2548 ev.attr_id.attr.attr = *attr; 2549 return perf_session__deliver_synth_event(session, &ev.ev, NULL); 2550 } 2551 2552 /* Caller must ensure event->header.type < PERF_RECORD_HEADER_MAX */ 2553 static int event_swap(union perf_event *event, bool sample_id_all) 2554 { 2555 perf_event__swap_op swap = perf_event__swap_ops[event->header.type]; 2556 2557 if (swap) 2558 return swap(event, sample_id_all); 2559 return 0; 2560 } 2561 2562 /* 2563 * Minimum event sizes indexed by type. Checked before swap and 2564 * processing so that both cross-endian and native-endian paths 2565 * are protected from accessing fields past the event boundary. 2566 * Zero means no minimum beyond the 8-byte header (already 2567 * enforced by the reader). 2568 * 2569 * These values represent the smallest event the kernel has ever 2570 * emitted for each type, so they do not reject legitimate legacy 2571 * perf.data files from older kernels. Variable-length events 2572 * use offsetof() to the first variable field; the variable 2573 * content is validated separately (e.g., perf_event__check_nul). 2574 */ 2575 static const u32 perf_event__min_size[PERF_RECORD_HEADER_MAX] = { 2576 /* 2577 * offsetof() + 1 for types with a trailing variable-length 2578 * string (filename, comm, path, name, msg): the +1 ensures 2579 * room for at least a null terminator. Full null-termination 2580 * within the event boundary is checked separately. 2581 * 2582 * PERF_RECORD_SAMPLE is omitted: all64_swap is bounded by 2583 * header.size, and the internal layout varies by sample_type 2584 * so a fixed minimum is not meaningful. 2585 */ 2586 [PERF_RECORD_MMAP] = offsetof(struct perf_record_mmap, filename) + 1, 2587 [PERF_RECORD_LOST] = sizeof(struct perf_record_lost), 2588 [PERF_RECORD_COMM] = offsetof(struct perf_record_comm, comm) + 1, 2589 [PERF_RECORD_EXIT] = sizeof(struct perf_record_fork), 2590 [PERF_RECORD_THROTTLE] = sizeof(struct perf_record_throttle), 2591 [PERF_RECORD_UNTHROTTLE] = sizeof(struct perf_record_throttle), 2592 [PERF_RECORD_FORK] = sizeof(struct perf_record_fork), 2593 /* 2594 * The kernel dynamically sizes PERF_RECORD_READ based on 2595 * attr.read_format — only the enabled fields are emitted, 2596 * packed with no gaps. The minimum valid event has just 2597 * pid + tid + one u64 value (no optional fields). 2598 */ 2599 [PERF_RECORD_READ] = offsetof(struct perf_record_read, time_enabled), 2600 [PERF_RECORD_MMAP2] = offsetof(struct perf_record_mmap2, filename) + 1, 2601 [PERF_RECORD_LOST_SAMPLES] = sizeof(struct perf_record_lost_samples), 2602 [PERF_RECORD_AUX] = sizeof(struct perf_record_aux), 2603 [PERF_RECORD_ITRACE_START] = sizeof(struct perf_record_itrace_start), 2604 [PERF_RECORD_SWITCH] = sizeof(struct perf_event_header), 2605 [PERF_RECORD_SWITCH_CPU_WIDE] = sizeof(struct perf_record_switch), 2606 [PERF_RECORD_NAMESPACES] = sizeof(struct perf_record_namespaces), 2607 [PERF_RECORD_CGROUP] = offsetof(struct perf_record_cgroup, path) + 1, 2608 [PERF_RECORD_TEXT_POKE] = sizeof(struct perf_record_text_poke_event), 2609 [PERF_RECORD_KSYMBOL] = offsetof(struct perf_record_ksymbol, name) + 1, 2610 [PERF_RECORD_BPF_EVENT] = sizeof(struct perf_record_bpf_event), 2611 [PERF_RECORD_HEADER_ATTR] = sizeof(struct perf_event_header) + PERF_ATTR_SIZE_VER0, 2612 [PERF_RECORD_HEADER_EVENT_TYPE] = sizeof(struct perf_record_header_event_type), 2613 /* Legacy events predate the __u32 pad field, accept 12-byte records */ 2614 [PERF_RECORD_HEADER_TRACING_DATA] = offsetof(struct perf_record_header_tracing_data, pad), 2615 [PERF_RECORD_AUX_OUTPUT_HW_ID] = sizeof(struct perf_record_aux_output_hw_id), 2616 [PERF_RECORD_AUXTRACE_INFO] = sizeof(struct perf_record_auxtrace_info), 2617 [PERF_RECORD_AUXTRACE] = sizeof(struct perf_record_auxtrace), 2618 [PERF_RECORD_AUXTRACE_ERROR] = offsetof(struct perf_record_auxtrace_error, msg) + 1, 2619 [PERF_RECORD_THREAD_MAP] = sizeof(struct perf_record_thread_map), 2620 /* 2621 * sizeof(perf_record_cpu_map) is 20 because the outer struct 2622 * isn't packed and GCC adds 2 bytes of trailing padding. 2623 * The smallest valid variant (RANGE_CPUS) is only 16 bytes: 2624 * header(8) + type(2) + range_cpu_data(6). Per-variant 2625 * bounds are checked in the swap handler via payload. 2626 */ 2627 [PERF_RECORD_CPU_MAP] = sizeof(struct perf_event_header) + 2628 sizeof(__u16) + 2629 sizeof(struct perf_record_range_cpu_map), 2630 [PERF_RECORD_STAT_CONFIG] = sizeof(struct perf_record_stat_config), 2631 [PERF_RECORD_STAT] = sizeof(struct perf_record_stat), 2632 [PERF_RECORD_STAT_ROUND] = sizeof(struct perf_record_stat_round), 2633 /* 2634 * EVENT_UPDATE has a union whose largest member (cpus) 2635 * inflates sizeof to 40, but SCALE events are only 32 2636 * and UNIT/NAME events can be even smaller. Use the 2637 * fixed header fields (header + type + id) as minimum. 2638 */ 2639 [PERF_RECORD_EVENT_UPDATE] = offsetof(struct perf_record_event_update, scale), 2640 [PERF_RECORD_TIME_CONV] = offsetof(struct perf_record_time_conv, time_cycles), 2641 [PERF_RECORD_ID_INDEX] = sizeof(struct perf_record_id_index), 2642 [PERF_RECORD_HEADER_BUILD_ID] = sizeof(struct perf_record_header_build_id), 2643 [PERF_RECORD_HEADER_FEATURE] = sizeof(struct perf_record_header_feature), 2644 [PERF_RECORD_COMPRESSED2] = sizeof(struct perf_record_compressed2), 2645 [PERF_RECORD_BPF_METADATA] = sizeof(struct perf_record_bpf_metadata), 2646 [PERF_RECORD_CALLCHAIN_DEFERRED] = sizeof(struct perf_event_header) + sizeof(__u64), 2647 /* 2648 * SCHEDSTAT events have a version-dependent union after the 2649 * fixed header fields; the minimum is the base (pre-union) 2650 * portion so old and new versions both pass. 2651 */ 2652 [PERF_RECORD_SCHEDSTAT_CPU] = offsetof(struct perf_record_schedstat_cpu, v15), 2653 [PERF_RECORD_SCHEDSTAT_DOMAIN] = offsetof(struct perf_record_schedstat_domain, v15), 2654 }; 2655 2656 /* 2657 * Return true if the event is too small for its declared type. 2658 * Caller must ensure event->header.type < PERF_RECORD_HEADER_MAX. 2659 * If min is non-NULL, stores the required minimum on failure. 2660 */ 2661 static bool perf_event__too_small(const union perf_event *event, u32 *min) 2662 { 2663 u32 min_sz = perf_event__min_size[event->header.type]; 2664 2665 if (min_sz && event->header.size < min_sz) { 2666 if (min) 2667 *min = min_sz; 2668 return true; 2669 } 2670 2671 return false; 2672 } 2673 2674 /* 2675 * Read and validate the event at @file_offset. 2676 * 2677 * Returns: 2678 * 0 — success: *event_ptr is set and safe to access. 2679 * -1 — error; check *event_ptr to decide whether to advance or abort: 2680 * *event_ptr set — event header was read but the event is 2681 * malformed (too small for its type, or byte-swap 2682 * failed). header.size is still valid, so the 2683 * caller can advance past the event. 2684 * *event_ptr NULL — fatal: couldn't read the header at all 2685 * (I/O error, offset out of range, pipe mode). 2686 * Caller must abort. 2687 */ 2688 int perf_session__peek_event(struct perf_session *session, off_t file_offset, 2689 void *buf, size_t buf_sz, 2690 union perf_event **event_ptr, 2691 struct perf_sample *sample) 2692 { 2693 union perf_event *event; 2694 size_t hdr_sz, rest; 2695 u32 min_sz; 2696 int fd; 2697 2698 *event_ptr = NULL; 2699 2700 if (session->one_mmap && !session->header.needs_swap) { 2701 u64 offset_in_mmap; 2702 2703 /* Validate offset with integer arithmetic to avoid pointer UB */ 2704 if ((u64)file_offset < session->one_mmap_offset) 2705 return -1; 2706 2707 offset_in_mmap = (u64)file_offset - session->one_mmap_offset; 2708 2709 /* Use subtraction to avoid addition overflow */ 2710 if (offset_in_mmap >= session->one_mmap_size || 2711 session->one_mmap_size - offset_in_mmap < sizeof(struct perf_event_header)) 2712 return -1; 2713 2714 event = session->one_mmap_addr + offset_in_mmap; 2715 2716 if (event->header.size < sizeof(struct perf_event_header)) 2717 return -1; 2718 2719 /* Ensure full event is within the mmap region */ 2720 if (session->one_mmap_size - offset_in_mmap < event->header.size) 2721 return -1; 2722 } else { 2723 if (perf_data__is_pipe(session->data)) 2724 return -1; 2725 2726 fd = perf_data__fd(session->data); 2727 hdr_sz = sizeof(struct perf_event_header); 2728 2729 if (buf_sz < hdr_sz) 2730 return -1; 2731 2732 if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 || 2733 readn(fd, buf, hdr_sz) != (ssize_t)hdr_sz) 2734 return -1; 2735 2736 event = (union perf_event *)buf; 2737 2738 if (session->header.needs_swap) 2739 perf_event_header__bswap(&event->header); 2740 2741 if (event->header.size < hdr_sz || event->header.size > buf_sz) 2742 return -1; 2743 2744 buf += hdr_sz; 2745 rest = event->header.size - hdr_sz; 2746 2747 if (readn(fd, buf, rest) != (ssize_t)rest) 2748 return -1; 2749 } 2750 2751 /* Event data is fully loaded — expose so callers can advance */ 2752 *event_ptr = event; 2753 2754 /* 2755 * Check alignment before type: an unaligned size misaligns the 2756 * stream for all subsequent reads regardless of event type. 2757 * Three legacy user events predate the 8-byte rule — exempt them. 2758 */ 2759 if (event->header.size % sizeof(u64) && 2760 event->header.type != PERF_RECORD_HEADER_TRACING_DATA && 2761 event->header.type != PERF_RECORD_COMPRESSED && 2762 event->header.type != PERF_RECORD_HEADER_FEATURE) { 2763 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u not aligned to %zu\n", 2764 (u64)file_offset, perf_event__name(event->header.type), 2765 event->header.type, event->header.size, sizeof(u64)); 2766 return -1; 2767 } 2768 2769 if (event->header.type >= PERF_RECORD_HEADER_MAX) { 2770 pr_warning("WARNING: at offset %#" PRIx64 ": unsupported event type %u, skipping\n", 2771 (u64)file_offset, event->header.type); 2772 return 0; 2773 } 2774 2775 if (perf_event__too_small(event, &min_sz)) { 2776 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u too small (min %u)\n", 2777 (u64)file_offset, perf_event__name(event->header.type), 2778 event->header.type, event->header.size, min_sz); 2779 return -1; 2780 } 2781 2782 if (session->header.needs_swap && 2783 event_swap(event, evlist__sample_id_all(session->evlist))) { 2784 /* 2785 * The header was already swapped so header.size is 2786 * valid — expose the event so callers can advance 2787 * past this malformed entry instead of aborting. 2788 */ 2789 *event_ptr = event; 2790 return -1; 2791 } 2792 2793 if (sample && event->header.type < PERF_RECORD_USER_TYPE_START && 2794 evlist__parse_sample(session->evlist, event, sample)) 2795 return -1; 2796 2797 return 0; 2798 } 2799 2800 int perf_session__peek_events(struct perf_session *session, u64 offset, 2801 u64 size, peek_events_cb_t cb, void *data) 2802 { 2803 u64 max_offset = offset + size; 2804 char buf[PERF_SAMPLE_MAX_SIZE]; 2805 union perf_event *event; 2806 int err; 2807 2808 do { 2809 event = NULL; 2810 err = perf_session__peek_event(session, offset, buf, 2811 PERF_SAMPLE_MAX_SIZE, &event, 2812 NULL); 2813 if (err) { 2814 /* 2815 * Recoverable error: peek_event returns -1 but 2816 * sets event_ptr when the header was read 2817 * successfully but the event is malformed (too 2818 * small or swap failed). Skip past it using 2819 * header.size — don't invoke the callback since 2820 * type-specific fields may be truncated. 2821 * 2822 * Must abort if: event_ptr is NULL (I/O error), 2823 * size is 0 (can't advance), type is AUXTRACE 2824 * (payload extends beyond header.size), or size 2825 * is unaligned (would misalign all subsequent reads). 2826 * 2827 * Direct callers (auxtrace, cs-etm) treat any 2828 * non-zero return as fatal — only this loop skips. 2829 */ 2830 if (event && event->header.size && 2831 event->header.type != PERF_RECORD_AUXTRACE && 2832 event->header.size % sizeof(u64) == 0) { 2833 offset += event->header.size; 2834 err = 0; 2835 } else { 2836 return err; 2837 } 2838 continue; 2839 } 2840 2841 err = cb(session, event, offset, data); 2842 if (err) 2843 return err; 2844 2845 offset += event->header.size; 2846 if (event->header.type == PERF_RECORD_AUXTRACE) 2847 offset += event->auxtrace.size; 2848 2849 } while (offset < max_offset); 2850 2851 return err; 2852 } 2853 2854 static s64 perf_session__process_event(struct perf_session *session, 2855 union perf_event *event, u64 file_offset, 2856 const char *file_path) 2857 { 2858 struct evlist *evlist = session->evlist; 2859 const struct perf_tool *tool = session->tool; 2860 u32 min_sz; 2861 int ret; 2862 2863 /* 2864 * The kernel aligns all event sizes to sizeof(u64) — see 2865 * perf_event_comm_event() (ALIGN), perf_event_mmap_event(), 2866 * perf_event_cgroup(), perf_event_ksymbol() (IS_ALIGNED loops), 2867 * and perf_event_text_poke() (ALIGN) in kernel/events/core.c. 2868 * 2869 * An unaligned size means the file is corrupted or crafted. 2870 * Abort: there is no point continuing to read unaligned records 2871 * because the caller advances rd->head by event->header.size, 2872 * so every subsequent read would start at a misaligned offset, 2873 * producing garbage headers for the rest of the file. 2874 * 2875 * Exempt three legacy user events that predate the alignment rule: 2876 * 2877 * TRACING_DATA (66): struct tracing_data_event was 12 bytes before 2878 * b39c915a4f36 ("libperf event: Ensure tracing data is multiple 2879 * of 8 sized") added __u32 pad; old perf.data files still contain 2880 * 12-byte records. 2881 * TODO: introduce HEADER_TRACING_DATA2 with guaranteed alignment. 2882 * 2883 * COMPRESSED (81): raw ZSTD output, arbitrary length. Already 2884 * superseded by COMPRESSED2 (83) with PERF_ALIGN. 2885 * 2886 * HEADER_FEATURE (80): do_write_string() uses a 4-byte length 2887 * prefix with no padding to 8-byte total. 2888 * TODO: introduce HEADER_FEATURE2 with guaranteed alignment. 2889 */ 2890 if (event->header.size % sizeof(u64) && 2891 event->header.type != PERF_RECORD_HEADER_TRACING_DATA && 2892 event->header.type != PERF_RECORD_COMPRESSED && 2893 event->header.type != PERF_RECORD_HEADER_FEATURE) { 2894 pr_err("ERROR: at offset %#" PRIx64 ": %s (%u) event size %u is not 8-byte aligned, aborting\n", 2895 file_offset, perf_event__name(event->header.type), 2896 event->header.type, event->header.size); 2897 return -EINVAL; 2898 } 2899 2900 if (event->header.type >= PERF_RECORD_HEADER_MAX) { 2901 /* This perf is outdated and does not support the latest event type. */ 2902 ui__warning("Unsupported header type %u, please consider updating perf.\n", 2903 event->header.type); 2904 /* 2905 * Return 0 to skip: the caller (reader__read_event) 2906 * already advances by event->header.size. 2907 */ 2908 return 0; 2909 } 2910 2911 /* 2912 * Skip rather than abort: a too-small-but-aligned event 2913 * can be safely stepped over without misaligning the stream. 2914 */ 2915 if (perf_event__too_small(event, &min_sz)) { 2916 pr_warning("WARNING: at offset %#" PRIx64 ": %s (%u) event size %u too small (min %u), skipping\n", 2917 file_offset, perf_event__name(event->header.type), 2918 event->header.type, event->header.size, min_sz); 2919 return 0; 2920 } 2921 2922 if (session->header.needs_swap && 2923 event_swap(event, evlist__sample_id_all(evlist))) { 2924 pr_warning("WARNING: at offset %#" PRIx64 ": swap failed for %s (%u) event, skipping\n", 2925 file_offset, perf_event__name(event->header.type), 2926 event->header.type); 2927 return 0; 2928 } 2929 2930 events_stats__inc(&evlist->stats, event->header.type); 2931 2932 if (event->header.type >= PERF_RECORD_USER_TYPE_START) 2933 return perf_session__process_user_event(session, event, file_offset, file_path); 2934 2935 if (tool->ordered_events) { 2936 u64 timestamp = -1ULL; 2937 2938 ret = evlist__parse_sample_timestamp(evlist, event, ×tamp); 2939 if (ret && ret != -1) 2940 return ret; 2941 2942 ret = perf_session__queue_event(session, event, timestamp, file_offset, file_path); 2943 if (ret != -ETIME) 2944 return ret; 2945 } 2946 2947 return perf_session__deliver_event(session, event, tool, file_offset, file_path); 2948 } 2949 2950 void perf_event_header__bswap(struct perf_event_header *hdr) 2951 { 2952 hdr->type = bswap_32(hdr->type); 2953 hdr->misc = bswap_16(hdr->misc); 2954 hdr->size = bswap_16(hdr->size); 2955 } 2956 2957 struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) 2958 { 2959 return machine__findnew_thread(&session->machines.host, -1, pid); 2960 } 2961 2962 int perf_session__register_idle_thread(struct perf_session *session) 2963 { 2964 struct thread *thread = machine__idle_thread(&session->machines.host); 2965 2966 /* machine__idle_thread() got the thread, so put it */ 2967 thread__put(thread); 2968 return thread ? 0 : -1; 2969 } 2970 2971 static void 2972 perf_session__warn_order(const struct perf_session *session) 2973 { 2974 const struct ordered_events *oe = &session->ordered_events; 2975 struct evsel *evsel; 2976 bool should_warn = true; 2977 2978 evlist__for_each_entry(session->evlist, evsel) { 2979 if (evsel->core.attr.write_backward) 2980 should_warn = false; 2981 } 2982 2983 if (!should_warn) 2984 return; 2985 if (oe->nr_unordered_events != 0) 2986 ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events); 2987 } 2988 2989 static void perf_session__warn_about_errors(const struct perf_session *session) 2990 { 2991 const struct events_stats *stats = &session->evlist->stats; 2992 2993 if (session->tool->lost == perf_event__process_lost && 2994 stats->nr_events[PERF_RECORD_LOST] != 0) { 2995 ui__warning("Processed %d events and lost %d chunks!\n\n" 2996 "Check IO/CPU overload!\n\n", 2997 stats->nr_events[0], 2998 stats->nr_events[PERF_RECORD_LOST]); 2999 } 3000 3001 if (session->tool->lost_samples == perf_event__process_lost_samples) { 3002 double drop_rate; 3003 3004 drop_rate = (double)stats->total_lost_samples / 3005 (double) (stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples); 3006 if (drop_rate > 0.05) { 3007 ui__warning("Processed %" PRIu64 " samples and lost %3.2f%%!\n\n", 3008 stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples, 3009 drop_rate * 100.0); 3010 } 3011 } 3012 3013 if (session->tool->aux == perf_event__process_aux && 3014 stats->total_aux_lost != 0) { 3015 ui__warning("AUX data lost %" PRIu64 " times out of %u!\n\n", 3016 stats->total_aux_lost, 3017 stats->nr_events[PERF_RECORD_AUX]); 3018 } 3019 3020 if (session->tool->aux == perf_event__process_aux && 3021 stats->total_aux_partial != 0) { 3022 bool vmm_exclusive = false; 3023 3024 (void)sysfs__read_bool("module/kvm_intel/parameters/vmm_exclusive", 3025 &vmm_exclusive); 3026 3027 ui__warning("AUX data had gaps in it %" PRIu64 " times out of %u!\n\n" 3028 "Are you running a KVM guest in the background?%s\n\n", 3029 stats->total_aux_partial, 3030 stats->nr_events[PERF_RECORD_AUX], 3031 vmm_exclusive ? 3032 "\nReloading kvm_intel module with vmm_exclusive=0\n" 3033 "will reduce the gaps to only guest's timeslices." : 3034 ""); 3035 } 3036 3037 if (session->tool->aux == perf_event__process_aux && 3038 stats->total_aux_collision != 0) { 3039 ui__warning("AUX data detected collision %" PRIu64 " times out of %u!\n\n", 3040 stats->total_aux_collision, 3041 stats->nr_events[PERF_RECORD_AUX]); 3042 } 3043 3044 if (stats->nr_unknown_events != 0) { 3045 ui__warning("Found %u unknown events!\n\n" 3046 "Is this an older tool processing a perf.data " 3047 "file generated by a more recent tool?\n\n" 3048 "If that is not the case, consider " 3049 "reporting to linux-kernel@vger.kernel.org.\n\n", 3050 stats->nr_unknown_events); 3051 } 3052 3053 if (stats->nr_unknown_id != 0) { 3054 ui__warning("%u samples with id not present in the header\n", 3055 stats->nr_unknown_id); 3056 } 3057 3058 if (stats->nr_invalid_chains != 0) { 3059 ui__warning("Found invalid callchains!\n\n" 3060 "%u out of %u events were discarded for this reason.\n\n" 3061 "Consider reporting to linux-kernel@vger.kernel.org.\n\n", 3062 stats->nr_invalid_chains, 3063 stats->nr_events[PERF_RECORD_SAMPLE]); 3064 } 3065 3066 if (stats->nr_unprocessable_samples != 0) { 3067 ui__warning("%u unprocessable samples recorded.\n" 3068 "Do you have a KVM guest running and not using 'perf kvm'?\n", 3069 stats->nr_unprocessable_samples); 3070 } 3071 3072 perf_session__warn_order(session); 3073 3074 events_stats__auxtrace_error_warn(stats); 3075 3076 if (stats->nr_proc_map_timeout != 0) { 3077 ui__warning("%d map information files for pre-existing threads were\n" 3078 "not processed, if there are samples for addresses they\n" 3079 "will not be resolved, you may find out which are these\n" 3080 "threads by running with -v and redirecting the output\n" 3081 "to a file.\n" 3082 "The time limit to process proc map is too short?\n" 3083 "Increase it by --proc-map-timeout\n", 3084 stats->nr_proc_map_timeout); 3085 } 3086 } 3087 3088 static int perf_session__flush_thread_stack(struct thread *thread, 3089 void *p __maybe_unused) 3090 { 3091 return thread_stack__flush(thread); 3092 } 3093 3094 static int perf_session__flush_thread_stacks(struct perf_session *session) 3095 { 3096 return machines__for_each_thread(&session->machines, 3097 perf_session__flush_thread_stack, 3098 NULL); 3099 } 3100 3101 volatile sig_atomic_t session_done; 3102 3103 static int __perf_session__process_decomp_events(struct perf_session *session); 3104 3105 static int __perf_session__process_pipe_events(struct perf_session *session) 3106 { 3107 struct ordered_events *oe = &session->ordered_events; 3108 const struct perf_tool *tool = session->tool; 3109 struct ui_progress prog; 3110 union perf_event *event; 3111 uint32_t size, cur_size = 0; 3112 void *buf = NULL; 3113 s64 skip = 0; 3114 u64 head; 3115 ssize_t err; 3116 void *p; 3117 bool update_prog = false; 3118 3119 /* 3120 * If it's from a file saving pipe data (by redirection), it would have 3121 * a file name other than "-". Then we can get the total size and show 3122 * the progress. 3123 */ 3124 if (strcmp(session->data->path, "-") && session->data->file.size) { 3125 ui_progress__init_size(&prog, session->data->file.size, 3126 "Processing events..."); 3127 update_prog = true; 3128 } 3129 3130 head = 0; 3131 cur_size = sizeof(union perf_event); 3132 3133 buf = malloc(cur_size); 3134 if (!buf) 3135 return -errno; 3136 ordered_events__set_copy_on_queue(oe, true); 3137 more: 3138 event = buf; 3139 err = perf_data__read(session->data, event, 3140 sizeof(struct perf_event_header)); 3141 if (err <= 0) { 3142 if (err == 0) 3143 goto done; 3144 3145 pr_err("failed to read event header\n"); 3146 goto out_err; 3147 } 3148 3149 if (session->header.needs_swap) 3150 perf_event_header__bswap(&event->header); 3151 3152 size = event->header.size; 3153 if (size < sizeof(struct perf_event_header)) { 3154 pr_err("bad event header size\n"); 3155 goto out_err; 3156 } 3157 3158 if (size > cur_size) { 3159 void *new = realloc(buf, size); 3160 if (!new) { 3161 pr_err("failed to allocate memory to read event\n"); 3162 goto out_err; 3163 } 3164 buf = new; 3165 cur_size = size; 3166 event = buf; 3167 } 3168 p = event; 3169 p += sizeof(struct perf_event_header); 3170 3171 if (size - sizeof(struct perf_event_header)) { 3172 err = perf_data__read(session->data, p, 3173 size - sizeof(struct perf_event_header)); 3174 if (err <= 0) { 3175 if (err == 0) { 3176 pr_err("unexpected end of event stream\n"); 3177 goto done; 3178 } 3179 3180 pr_err("failed to read event data\n"); 3181 goto out_err; 3182 } 3183 } 3184 3185 if ((skip = perf_session__process_event(session, event, head, "pipe")) < 0) { 3186 pr_err("%#" PRIx64 " [%#x]: piped event processing failed for event of type: %s (%d)\n", 3187 head, event->header.size, 3188 perf_event__name(event->header.type), 3189 event->header.type); 3190 err = -EINVAL; 3191 goto out_err; 3192 } 3193 3194 head += size; 3195 3196 if (skip > 0) 3197 head += skip; 3198 3199 err = __perf_session__process_decomp_events(session); 3200 if (err) 3201 goto out_err; 3202 3203 if (update_prog) 3204 ui_progress__update(&prog, size); 3205 3206 if (!session_done()) 3207 goto more; 3208 done: 3209 /* do the final flush for ordered samples */ 3210 err = ordered_events__flush(oe, OE_FLUSH__FINAL); 3211 if (err) 3212 goto out_err; 3213 err = session__flush_deferred_samples(session, tool); 3214 if (err) 3215 goto out_err; 3216 err = auxtrace__flush_events(session, tool); 3217 if (err) 3218 goto out_err; 3219 err = perf_session__flush_thread_stacks(session); 3220 out_err: 3221 free(buf); 3222 if (update_prog) 3223 ui_progress__finish(); 3224 if (!tool->no_warn) 3225 perf_session__warn_about_errors(session); 3226 ordered_events__free(&session->ordered_events); 3227 auxtrace__free_events(session); 3228 return err; 3229 } 3230 3231 static union perf_event * 3232 prefetch_event(char *buf, u64 head, size_t mmap_size, 3233 bool needs_swap, union perf_event *error) 3234 { 3235 union perf_event *event; 3236 u16 event_size; 3237 3238 /* 3239 * Ensure we have enough space remaining to read 3240 * the size of the event in the headers. 3241 */ 3242 if (head + sizeof(event->header) > mmap_size) 3243 return NULL; 3244 3245 event = (union perf_event *)(buf + head); 3246 if (needs_swap) 3247 perf_event_header__bswap(&event->header); 3248 3249 event_size = event->header.size; 3250 if (head + event_size <= mmap_size) 3251 return event; 3252 3253 /* We're not fetching the event so swap back again */ 3254 if (needs_swap) 3255 perf_event_header__bswap(&event->header); 3256 3257 /* Check if the event fits into the next mmapped buf. */ 3258 if (event_size <= mmap_size - head % page_size) { 3259 /* Remap buf and fetch again. */ 3260 return NULL; 3261 } 3262 3263 /* Invalid input. Event size should never exceed mmap_size. */ 3264 pr_debug("%s: head=%#" PRIx64 " event->header.size=%#x, mmap_size=%#zx:" 3265 " fuzzed or compressed perf.data?\n", __func__, head, event_size, mmap_size); 3266 3267 return error; 3268 } 3269 3270 static union perf_event * 3271 fetch_mmaped_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) 3272 { 3273 return prefetch_event(buf, head, mmap_size, needs_swap, ERR_PTR(-EINVAL)); 3274 } 3275 3276 static union perf_event * 3277 fetch_decomp_event(u64 head, size_t mmap_size, char *buf, bool needs_swap) 3278 { 3279 return prefetch_event(buf, head, mmap_size, needs_swap, NULL); 3280 } 3281 3282 static int __perf_session__process_decomp_events(struct perf_session *session) 3283 { 3284 s64 skip; 3285 u64 size; 3286 struct decomp *decomp = session->active_decomp->decomp_last; 3287 3288 if (!decomp) 3289 return 0; 3290 3291 while (decomp->head < decomp->size && !session_done()) { 3292 union perf_event *event = fetch_decomp_event(decomp->head, decomp->size, decomp->data, 3293 session->header.needs_swap); 3294 3295 if (!event) 3296 break; 3297 3298 size = event->header.size; 3299 3300 if (size < sizeof(struct perf_event_header) || 3301 (skip = perf_session__process_event(session, event, decomp->file_pos, 3302 decomp->file_path)) < 0) { 3303 pr_err("%#" PRIx64 " [%#x]: decompress event processing failed for event of type: %s (%d)\n", 3304 decomp->file_pos + decomp->head, event->header.size, 3305 perf_event__name(event->header.type), 3306 event->header.type); 3307 return -EINVAL; 3308 } 3309 3310 if (skip) 3311 size += skip; 3312 3313 decomp->head += size; 3314 } 3315 3316 return 0; 3317 } 3318 3319 /* 3320 * On 64bit we can mmap the data file in one go. No need for tiny mmap 3321 * slices. On 32bit we use 32MB. 3322 */ 3323 #if BITS_PER_LONG == 64 3324 #define MMAP_SIZE ULLONG_MAX 3325 #define NUM_MMAPS 1 3326 #else 3327 #define MMAP_SIZE (32 * 1024 * 1024ULL) 3328 #define NUM_MMAPS 128 3329 #endif 3330 3331 struct reader; 3332 3333 typedef s64 (*reader_cb_t)(struct perf_session *session, 3334 union perf_event *event, 3335 u64 file_offset, 3336 const char *file_path); 3337 3338 struct reader { 3339 int fd; 3340 const char *path; 3341 u64 data_size; 3342 u64 data_offset; 3343 reader_cb_t process; 3344 bool in_place_update; 3345 char *mmaps[NUM_MMAPS]; 3346 size_t mmap_size; 3347 int mmap_idx; 3348 char *mmap_cur; 3349 u64 file_pos; 3350 u64 file_offset; 3351 u64 head; 3352 u64 size; 3353 bool done; 3354 struct zstd_data zstd_data; 3355 struct decomp_data decomp_data; 3356 }; 3357 3358 static int 3359 reader__init(struct reader *rd, bool *one_mmap) 3360 { 3361 u64 data_size = rd->data_size; 3362 char **mmaps = rd->mmaps; 3363 3364 rd->head = rd->data_offset; 3365 data_size += rd->data_offset; 3366 3367 rd->mmap_size = MMAP_SIZE; 3368 if (rd->mmap_size > data_size) { 3369 rd->mmap_size = data_size; 3370 if (one_mmap) 3371 *one_mmap = true; 3372 } 3373 3374 memset(mmaps, 0, sizeof(rd->mmaps)); 3375 3376 if (zstd_init(&rd->zstd_data, 0)) 3377 return -1; 3378 rd->decomp_data.zstd_decomp = &rd->zstd_data; 3379 3380 return 0; 3381 } 3382 3383 static void 3384 reader__release_decomp(struct reader *rd) 3385 { 3386 perf_decomp__release_events(rd->decomp_data.decomp); 3387 zstd_fini(&rd->zstd_data); 3388 } 3389 3390 static int 3391 reader__mmap(struct reader *rd, struct perf_session *session) 3392 { 3393 int mmap_prot, mmap_flags; 3394 char *buf, **mmaps = rd->mmaps; 3395 u64 page_offset; 3396 3397 /* 3398 * Native-endian: MAP_SHARED + PROT_READ — the kernel 3399 * guarantees page-level coherence but a concurrent writer 3400 * could modify the file between validation and use. This 3401 * is a theoretical TOCTOU that affects the entire perf.data 3402 * processing pipeline; fixing it would require copying each 3403 * event to a private buffer before processing. 3404 * 3405 * Cross-endian: MAP_PRIVATE + PROT_WRITE — swap handlers 3406 * get a copy-on-write snapshot immune to concurrent writes. 3407 */ 3408 mmap_prot = PROT_READ; 3409 mmap_flags = MAP_SHARED; 3410 3411 if (rd->in_place_update) { 3412 mmap_prot |= PROT_WRITE; 3413 } else if (session->header.needs_swap) { 3414 mmap_prot |= PROT_WRITE; 3415 mmap_flags = MAP_PRIVATE; 3416 } 3417 3418 if (mmaps[rd->mmap_idx]) { 3419 munmap(mmaps[rd->mmap_idx], rd->mmap_size); 3420 mmaps[rd->mmap_idx] = NULL; 3421 } 3422 3423 page_offset = page_size * (rd->head / page_size); 3424 rd->file_offset += page_offset; 3425 rd->head -= page_offset; 3426 3427 buf = mmap(NULL, rd->mmap_size, mmap_prot, mmap_flags, rd->fd, 3428 rd->file_offset); 3429 if (buf == MAP_FAILED) { 3430 pr_err("failed to mmap file\n"); 3431 return -errno; 3432 } 3433 mmaps[rd->mmap_idx] = rd->mmap_cur = buf; 3434 rd->mmap_idx = (rd->mmap_idx + 1) & (ARRAY_SIZE(rd->mmaps) - 1); 3435 rd->file_pos = rd->file_offset + rd->head; 3436 if (session->one_mmap) { 3437 session->one_mmap_addr = buf; 3438 session->one_mmap_offset = rd->file_offset; 3439 /* 3440 * mmap_size was set to the full file extent (data_offset + 3441 * data_size) but file_offset was shifted forward by 3442 * page_offset for page alignment. Reduce by page_offset 3443 * so the bounds check reflects the file-backed portion 3444 * of the mapping — pages beyond the file cause SIGBUS. 3445 */ 3446 session->one_mmap_size = rd->mmap_size - page_offset; 3447 } 3448 3449 return 0; 3450 } 3451 3452 enum { 3453 READER_OK, 3454 READER_NODATA, 3455 }; 3456 3457 static int 3458 reader__read_event(struct reader *rd, struct perf_session *session, 3459 struct ui_progress *prog) 3460 { 3461 u64 size; 3462 int err = READER_OK; 3463 union perf_event *event; 3464 s64 skip; 3465 3466 event = fetch_mmaped_event(rd->head, rd->mmap_size, rd->mmap_cur, 3467 session->header.needs_swap); 3468 if (IS_ERR(event)) 3469 return PTR_ERR(event); 3470 3471 if (!event) 3472 return READER_NODATA; 3473 3474 size = event->header.size; 3475 3476 skip = -EINVAL; 3477 3478 if (size < sizeof(struct perf_event_header) || 3479 (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) { 3480 errno = -skip; 3481 pr_err("%#" PRIx64 " [%#x]: processing failed for event of type: %s (%d) [%m]\n", 3482 rd->file_offset + rd->head, event->header.size, 3483 perf_event__name(event->header.type), 3484 event->header.type); 3485 err = skip; 3486 goto out; 3487 } 3488 3489 if (skip) 3490 size += skip; 3491 3492 rd->size += size; 3493 rd->head += size; 3494 rd->file_pos += size; 3495 3496 err = __perf_session__process_decomp_events(session); 3497 if (err) 3498 goto out; 3499 3500 ui_progress__update(prog, size); 3501 3502 out: 3503 return err; 3504 } 3505 3506 static inline bool 3507 reader__eof(struct reader *rd) 3508 { 3509 return (rd->file_pos >= rd->data_size + rd->data_offset); 3510 } 3511 3512 static int 3513 reader__process_events(struct reader *rd, struct perf_session *session, 3514 struct ui_progress *prog) 3515 { 3516 int err; 3517 3518 err = reader__init(rd, &session->one_mmap); 3519 if (err) 3520 goto out; 3521 3522 session->active_decomp = &rd->decomp_data; 3523 3524 remap: 3525 err = reader__mmap(rd, session); 3526 if (err) 3527 goto out; 3528 3529 more: 3530 err = reader__read_event(rd, session, prog); 3531 if (err < 0) 3532 goto out; 3533 else if (err == READER_NODATA) 3534 goto remap; 3535 3536 if (session_done()) 3537 goto out; 3538 3539 if (!reader__eof(rd)) 3540 goto more; 3541 3542 out: 3543 session->active_decomp = &session->decomp_data; 3544 return err; 3545 } 3546 3547 static s64 process_simple(struct perf_session *session, 3548 union perf_event *event, 3549 u64 file_offset, 3550 const char *file_path) 3551 { 3552 return perf_session__process_event(session, event, file_offset, file_path); 3553 } 3554 3555 static int __perf_session__process_events(struct perf_session *session) 3556 { 3557 struct reader rd = { 3558 .fd = perf_data__fd(session->data), 3559 .path = session->data->file.path, 3560 .data_size = session->header.data_size, 3561 .data_offset = session->header.data_offset, 3562 .process = process_simple, 3563 .in_place_update = session->data->in_place_update, 3564 }; 3565 struct ordered_events *oe = &session->ordered_events; 3566 const struct perf_tool *tool = session->tool; 3567 struct ui_progress prog; 3568 int err; 3569 3570 if (rd.data_size == 0) 3571 return -1; 3572 3573 ui_progress__init_size(&prog, rd.data_size, "Processing events..."); 3574 3575 err = reader__process_events(&rd, session, &prog); 3576 if (err) 3577 goto out_err; 3578 /* do the final flush for ordered samples */ 3579 err = ordered_events__flush(oe, OE_FLUSH__FINAL); 3580 if (err) 3581 goto out_err; 3582 err = auxtrace__flush_events(session, tool); 3583 if (err) 3584 goto out_err; 3585 err = session__flush_deferred_samples(session, tool); 3586 if (err) 3587 goto out_err; 3588 err = perf_session__flush_thread_stacks(session); 3589 out_err: 3590 ui_progress__finish(); 3591 if (!tool->no_warn) 3592 perf_session__warn_about_errors(session); 3593 /* 3594 * We may switching perf.data output, make ordered_events 3595 * reusable. 3596 */ 3597 ordered_events__reinit(&session->ordered_events); 3598 auxtrace__free_events(session); 3599 reader__release_decomp(&rd); 3600 session->one_mmap = false; 3601 return err; 3602 } 3603 3604 /* 3605 * Processing 2 MB of data from each reader in sequence, 3606 * because that's the way the ordered events sorting works 3607 * most efficiently. 3608 */ 3609 #define READER_MAX_SIZE (2 * 1024 * 1024) 3610 3611 /* 3612 * This function reads, merge and process directory data. 3613 * It assumens the version 1 of directory data, where each 3614 * data file holds per-cpu data, already sorted by kernel. 3615 */ 3616 static int __perf_session__process_dir_events(struct perf_session *session) 3617 { 3618 struct perf_data *data = session->data; 3619 const struct perf_tool *tool = session->tool; 3620 int i, ret, readers, nr_readers; 3621 struct ui_progress prog; 3622 u64 total_size = perf_data__size(session->data); 3623 struct reader *rd; 3624 3625 ui_progress__init_size(&prog, total_size, "Processing events..."); 3626 3627 nr_readers = 1; 3628 for (i = 0; i < data->dir.nr; i++) { 3629 if (data->dir.files[i].size) 3630 nr_readers++; 3631 } 3632 3633 rd = calloc(nr_readers, sizeof(struct reader)); 3634 if (!rd) 3635 return -ENOMEM; 3636 3637 rd[0] = (struct reader) { 3638 .fd = perf_data__fd(session->data), 3639 .path = session->data->file.path, 3640 .data_size = session->header.data_size, 3641 .data_offset = session->header.data_offset, 3642 .process = process_simple, 3643 .in_place_update = session->data->in_place_update, 3644 }; 3645 ret = reader__init(&rd[0], NULL); 3646 if (ret) 3647 goto out_err; 3648 ret = reader__mmap(&rd[0], session); 3649 if (ret) 3650 goto out_err; 3651 readers = 1; 3652 3653 for (i = 0; i < data->dir.nr; i++) { 3654 if (!data->dir.files[i].size) 3655 continue; 3656 rd[readers] = (struct reader) { 3657 .fd = perf_data_file__fd(&data->dir.files[i]), 3658 .path = data->dir.files[i].path, 3659 .data_size = data->dir.files[i].size, 3660 .data_offset = 0, 3661 .process = process_simple, 3662 .in_place_update = session->data->in_place_update, 3663 }; 3664 ret = reader__init(&rd[readers], NULL); 3665 if (ret) 3666 goto out_err; 3667 ret = reader__mmap(&rd[readers], session); 3668 if (ret) 3669 goto out_err; 3670 readers++; 3671 } 3672 3673 i = 0; 3674 while (readers) { 3675 if (session_done()) 3676 break; 3677 3678 if (rd[i].done) { 3679 i = (i + 1) % nr_readers; 3680 continue; 3681 } 3682 if (reader__eof(&rd[i])) { 3683 rd[i].done = true; 3684 readers--; 3685 continue; 3686 } 3687 3688 session->active_decomp = &rd[i].decomp_data; 3689 ret = reader__read_event(&rd[i], session, &prog); 3690 if (ret < 0) { 3691 goto out_err; 3692 } else if (ret == READER_NODATA) { 3693 ret = reader__mmap(&rd[i], session); 3694 if (ret) 3695 goto out_err; 3696 } 3697 3698 if (rd[i].size >= READER_MAX_SIZE) { 3699 rd[i].size = 0; 3700 i = (i + 1) % nr_readers; 3701 } 3702 } 3703 3704 ret = ordered_events__flush(&session->ordered_events, OE_FLUSH__FINAL); 3705 if (ret) 3706 goto out_err; 3707 3708 ret = session__flush_deferred_samples(session, tool); 3709 if (ret) 3710 goto out_err; 3711 3712 ret = perf_session__flush_thread_stacks(session); 3713 out_err: 3714 ui_progress__finish(); 3715 3716 if (!tool->no_warn) 3717 perf_session__warn_about_errors(session); 3718 3719 /* 3720 * We may switching perf.data output, make ordered_events 3721 * reusable. 3722 */ 3723 ordered_events__reinit(&session->ordered_events); 3724 3725 session->one_mmap = false; 3726 3727 session->active_decomp = &session->decomp_data; 3728 for (i = 0; i < nr_readers; i++) 3729 reader__release_decomp(&rd[i]); 3730 zfree(&rd); 3731 3732 return ret; 3733 } 3734 3735 int perf_session__process_events(struct perf_session *session) 3736 { 3737 if (perf_session__register_idle_thread(session) < 0) 3738 return -ENOMEM; 3739 3740 if (perf_data__is_pipe(session->data)) 3741 return __perf_session__process_pipe_events(session); 3742 3743 if (perf_data__is_dir(session->data) && session->data->dir.nr) 3744 return __perf_session__process_dir_events(session); 3745 3746 return __perf_session__process_events(session); 3747 } 3748 3749 bool perf_session__has_traces(struct perf_session *session, const char *msg) 3750 { 3751 struct evsel *evsel; 3752 3753 evlist__for_each_entry(session->evlist, evsel) { 3754 if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT) 3755 return true; 3756 } 3757 3758 pr_err("No trace sample to read. Did you call 'perf %s'?\n", msg); 3759 return false; 3760 } 3761 3762 bool perf_session__has_switch_events(struct perf_session *session) 3763 { 3764 struct evsel *evsel; 3765 3766 evlist__for_each_entry(session->evlist, evsel) { 3767 if (evsel->core.attr.context_switch) 3768 return true; 3769 } 3770 3771 return false; 3772 } 3773 3774 int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr) 3775 { 3776 char *bracket, *name; 3777 struct ref_reloc_sym *ref; 3778 struct kmap *kmap; 3779 3780 ref = zalloc(sizeof(struct ref_reloc_sym)); 3781 if (ref == NULL) 3782 return -ENOMEM; 3783 3784 ref->name = name = strdup(symbol_name); 3785 if (ref->name == NULL) { 3786 free(ref); 3787 return -ENOMEM; 3788 } 3789 3790 bracket = strchr(name, ']'); 3791 if (bracket) 3792 *bracket = '\0'; 3793 3794 ref->addr = addr; 3795 3796 kmap = map__kmap(map); 3797 if (kmap) 3798 kmap->ref_reloc_sym = ref; 3799 3800 return 0; 3801 } 3802 3803 size_t perf_session__fprintf_dsos(struct perf_session *session, FILE *fp) 3804 { 3805 return machines__fprintf_dsos(&session->machines, fp); 3806 } 3807 3808 size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp, 3809 bool (skip)(struct dso *dso, int parm), int parm) 3810 { 3811 return machines__fprintf_dsos_buildid(&session->machines, fp, skip, parm); 3812 } 3813 3814 size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) 3815 { 3816 size_t ret; 3817 const char *msg = ""; 3818 3819 if (perf_header__has_feat(&session->header, HEADER_AUXTRACE)) 3820 msg = " (excludes AUX area (e.g. instruction trace) decoded / synthesized events)"; 3821 3822 ret = fprintf(fp, "\nAggregated stats:%s\n", msg); 3823 3824 ret += events_stats__fprintf(&session->evlist->stats, fp); 3825 return ret; 3826 } 3827 3828 size_t perf_session__fprintf(struct perf_session *session, FILE *fp) 3829 { 3830 size_t ret = machine__fprintf(&session->machines.host, fp); 3831 3832 for (struct rb_node *nd = rb_first_cached(&session->machines.guests); nd; nd = rb_next(nd)) { 3833 struct machine *pos = rb_entry(nd, struct machine, rb_node); 3834 3835 ret += machine__fprintf(pos, fp); 3836 } 3837 return ret; 3838 } 3839 3840 void perf_session__dump_kmaps(struct perf_session *session) 3841 { 3842 int save_verbose = verbose; 3843 3844 fflush(stdout); 3845 fprintf(stderr, "Kernel and module maps:\n"); 3846 verbose = 0; /* Suppress verbose to print a summary only */ 3847 maps__fprintf(machine__kernel_maps(&session->machines.host), stderr); 3848 verbose = save_verbose; 3849 } 3850 3851 struct evsel *perf_session__find_first_evtype(struct perf_session *session, 3852 unsigned int type) 3853 { 3854 struct evsel *pos; 3855 3856 evlist__for_each_entry(session->evlist, pos) { 3857 if (pos->core.attr.type == type) 3858 return pos; 3859 } 3860 return NULL; 3861 } 3862 3863 int perf_session__cpu_bitmap(struct perf_session *session, 3864 const char *cpu_list, unsigned long *cpu_bitmap) 3865 { 3866 unsigned int i; 3867 int err = -1; 3868 struct perf_cpu_map *map; 3869 int nr_cpus = min(perf_session__env(session)->nr_cpus_avail, MAX_NR_CPUS); 3870 struct perf_cpu cpu; 3871 3872 for (i = 0; i < PERF_TYPE_MAX; ++i) { 3873 struct evsel *evsel; 3874 3875 evsel = perf_session__find_first_evtype(session, i); 3876 if (!evsel) 3877 continue; 3878 3879 if (!(evsel->core.attr.sample_type & PERF_SAMPLE_CPU)) { 3880 pr_err("File does not contain CPU events. " 3881 "Remove -C option to proceed.\n"); 3882 return -1; 3883 } 3884 } 3885 3886 map = perf_cpu_map__new(cpu_list); 3887 if (map == NULL) { 3888 pr_err("Invalid cpu_list\n"); 3889 return -1; 3890 } 3891 3892 perf_cpu_map__for_each_cpu(cpu, i, map) { 3893 if (cpu.cpu >= nr_cpus) { 3894 pr_err("Requested CPU %d too large. " 3895 "Consider raising MAX_NR_CPUS\n", cpu.cpu); 3896 goto out_delete_map; 3897 } 3898 3899 __set_bit(cpu.cpu, cpu_bitmap); 3900 } 3901 3902 err = 0; 3903 3904 out_delete_map: 3905 perf_cpu_map__put(map); 3906 return err; 3907 } 3908 3909 void perf_session__fprintf_info(struct perf_session *session, FILE *fp, 3910 bool full) 3911 { 3912 if (session == NULL || fp == NULL) 3913 return; 3914 3915 fprintf(fp, "# ========\n"); 3916 perf_header__fprintf_info(session, fp, full); 3917 fprintf(fp, "# ========\n#\n"); 3918 } 3919 3920 static int perf_session__register_guest(struct perf_session *session, pid_t machine_pid) 3921 { 3922 struct machine *machine = machines__findnew(&session->machines, machine_pid); 3923 struct thread *thread; 3924 3925 if (!machine) 3926 return -ENOMEM; 3927 3928 machine->single_address_space = session->machines.host.single_address_space; 3929 3930 thread = machine__idle_thread(machine); 3931 if (!thread) 3932 return -ENOMEM; 3933 thread__put(thread); 3934 3935 machine->kallsyms_filename = perf_data__guest_kallsyms_name(session->data, machine_pid); 3936 3937 return 0; 3938 } 3939 3940 static int perf_session__set_guest_cpu(struct perf_session *session, pid_t pid, 3941 pid_t tid, int guest_cpu) 3942 { 3943 struct machine *machine = &session->machines.host; 3944 struct thread *thread = machine__findnew_thread(machine, pid, tid); 3945 3946 if (!thread) 3947 return -ENOMEM; 3948 thread__set_guest_cpu(thread, guest_cpu); 3949 thread__put(thread); 3950 3951 return 0; 3952 } 3953 3954 int perf_event__process_id_index(const struct perf_tool *tool __maybe_unused, 3955 struct perf_session *session, 3956 union perf_event *event) 3957 { 3958 struct evlist *evlist = session->evlist; 3959 struct perf_record_id_index *ie = &event->id_index; 3960 size_t sz = ie->header.size - sizeof(*ie); 3961 size_t i, nr, max_nr; 3962 size_t e1_sz = sizeof(struct id_index_entry); 3963 size_t e2_sz = sizeof(struct id_index_entry_2); 3964 size_t etot_sz = e1_sz + e2_sz; 3965 struct id_index_entry_2 *e2; 3966 pid_t last_pid = 0; 3967 3968 max_nr = sz / e1_sz; 3969 nr = ie->nr; 3970 if (nr > max_nr) { 3971 printf("Too big: nr %zu max_nr %zu\n", nr, max_nr); 3972 return -EINVAL; 3973 } 3974 3975 if (sz >= nr * etot_sz) { 3976 max_nr = sz / etot_sz; 3977 if (nr > max_nr) { 3978 printf("Too big2: nr %zu max_nr %zu\n", nr, max_nr); 3979 return -EINVAL; 3980 } 3981 e2 = (void *)ie + sizeof(*ie) + nr * e1_sz; 3982 } else { 3983 e2 = NULL; 3984 } 3985 3986 if (dump_trace) 3987 fprintf(stdout, " nr: %zu\n", nr); 3988 3989 for (i = 0; i < nr; i++, (e2 ? e2++ : 0)) { 3990 struct id_index_entry *e = &ie->entries[i]; 3991 struct perf_sample_id *sid; 3992 int ret; 3993 3994 if (dump_trace) { 3995 fprintf(stdout, " ... id: %"PRI_lu64, e->id); 3996 fprintf(stdout, " idx: %"PRI_lu64, e->idx); 3997 fprintf(stdout, " cpu: %"PRI_ld64, e->cpu); 3998 fprintf(stdout, " tid: %"PRI_ld64, e->tid); 3999 if (e2) { 4000 fprintf(stdout, " machine_pid: %"PRI_ld64, e2->machine_pid); 4001 fprintf(stdout, " vcpu: %"PRI_lu64"\n", e2->vcpu); 4002 } else { 4003 fprintf(stdout, "\n"); 4004 } 4005 } 4006 4007 sid = evlist__id2sid(evlist, e->id); 4008 if (!sid) 4009 return -ENOENT; 4010 4011 sid->idx = e->idx; 4012 sid->cpu.cpu = e->cpu; 4013 sid->tid = e->tid; 4014 4015 if (!e2) 4016 continue; 4017 4018 sid->machine_pid = e2->machine_pid; 4019 sid->vcpu.cpu = e2->vcpu; 4020 4021 if (!sid->machine_pid) 4022 continue; 4023 4024 if (sid->machine_pid != last_pid) { 4025 ret = perf_session__register_guest(session, sid->machine_pid); 4026 if (ret) 4027 return ret; 4028 last_pid = sid->machine_pid; 4029 perf_guest = true; 4030 } 4031 4032 ret = perf_session__set_guest_cpu(session, sid->machine_pid, e->tid, e2->vcpu); 4033 if (ret) 4034 return ret; 4035 } 4036 return 0; 4037 } 4038 4039 int perf_session__dsos_hit_all(struct perf_session *session) 4040 { 4041 struct rb_node *nd; 4042 int err; 4043 4044 err = machine__hit_all_dsos(&session->machines.host); 4045 if (err) 4046 return err; 4047 4048 for (nd = rb_first_cached(&session->machines.guests); nd; 4049 nd = rb_next(nd)) { 4050 struct machine *pos = rb_entry(nd, struct machine, rb_node); 4051 4052 err = machine__hit_all_dsos(pos); 4053 if (err) 4054 return err; 4055 } 4056 4057 return 0; 4058 } 4059 4060 struct perf_env *perf_session__env(struct perf_session *session) 4061 { 4062 return &session->header.env; 4063 } 4064 4065 struct perf_session__e_machine_cb_args { 4066 uint32_t e_flags; 4067 uint16_t e_machine; 4068 }; 4069 4070 static int perf_session__e_machine_cb(struct thread *thread, void *_args) 4071 { 4072 struct perf_session__e_machine_cb_args *args = _args; 4073 4074 args->e_machine = thread__e_machine(thread, /*machine=*/NULL, &args->e_flags); 4075 return args->e_machine != EM_NONE ? 1 : 0; 4076 } 4077 4078 /* 4079 * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed 4080 * e_machines. Use thread__e_machine when this matters. 4081 */ 4082 uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags) 4083 { 4084 struct perf_session__e_machine_cb_args args = { 4085 .e_machine = EM_NONE, 4086 }; 4087 struct perf_env *env; 4088 4089 if (!session) { 4090 /* Default to assuming a host machine. */ 4091 if (e_flags) 4092 *e_flags = EF_HOST; 4093 4094 return EM_HOST; 4095 } 4096 4097 /* 4098 * Is the env caching an e_machine? If not we want to compute from the 4099 * more accurate threads. 4100 */ 4101 env = perf_session__env(session); 4102 if (env && env->e_machine != EM_NONE) 4103 return perf_env__e_machine(env, e_flags); 4104 4105 /* 4106 * Compute from threads, note this is more accurate than 4107 * perf_env__e_machine that falls back on EM_HOST and doesn't consider 4108 * mixed 32-bit and 64-bit threads. 4109 */ 4110 machines__for_each_thread(&session->machines, 4111 perf_session__e_machine_cb, 4112 &args); 4113 4114 if (args.e_machine != EM_NONE) { 4115 if (env) { 4116 env->e_machine = args.e_machine; 4117 env->e_flags = args.e_flags; 4118 } 4119 if (e_flags) 4120 *e_flags = args.e_flags; 4121 4122 return args.e_machine; 4123 } 4124 4125 /* 4126 * Couldn't determine from the perf_env or current set of 4127 * threads. Potentially use logic that uses the arch string otherwise 4128 * default to the host. Don't cache in the perf_env in case later 4129 * threads indicate a better ELF machine type. 4130 */ 4131 return perf_env__e_machine_nocache(env, e_flags); 4132 } 4133