1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Arm Statistical Profiling Extensions (SPE) support 4 * Copyright (c) 2017-2018, Arm Ltd. 5 */ 6 7 #include <byteswap.h> 8 #include <endian.h> 9 #include <errno.h> 10 #include <inttypes.h> 11 #include <linux/bitops.h> 12 #include <linux/kernel.h> 13 #include <linux/log2.h> 14 #include <linux/types.h> 15 #include <linux/zalloc.h> 16 #include <stdlib.h> 17 #include <unistd.h> 18 19 #include "auxtrace.h" 20 #include "color.h" 21 #include "debug.h" 22 #include "evlist.h" 23 #include "evsel.h" 24 #include "machine.h" 25 #include "session.h" 26 #include "symbol.h" 27 #include "thread.h" 28 #include "thread-stack.h" 29 #include "tsc.h" 30 #include "tool.h" 31 #include "util/synthetic-events.h" 32 33 #include "arm-spe.h" 34 #include "arm-spe-decoder/arm-spe-decoder.h" 35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h" 36 37 #include "../../arch/arm64/include/asm/cputype.h" 38 #define MAX_TIMESTAMP (~0ULL) 39 40 #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) 41 42 struct arm_spe { 43 struct auxtrace auxtrace; 44 struct auxtrace_queues queues; 45 struct auxtrace_heap heap; 46 struct itrace_synth_opts synth_opts; 47 u32 auxtrace_type; 48 struct perf_session *session; 49 struct machine *machine; 50 u32 pmu_type; 51 52 struct perf_tsc_conversion tc; 53 54 u8 timeless_decoding; 55 u8 data_queued; 56 57 u64 sample_type; 58 u8 sample_flc; 59 u8 sample_llc; 60 u8 sample_tlb; 61 u8 sample_branch; 62 u8 sample_remote_access; 63 u8 sample_memory; 64 u8 sample_instructions; 65 u64 instructions_sample_period; 66 67 u64 l1d_miss_id; 68 u64 l1d_access_id; 69 u64 llc_miss_id; 70 u64 llc_access_id; 71 u64 tlb_miss_id; 72 u64 tlb_access_id; 73 u64 branch_id; 74 u64 remote_access_id; 75 u64 memory_id; 76 u64 instructions_id; 77 78 u64 kernel_start; 79 80 unsigned long num_events; 81 u8 use_ctx_pkt_for_pid; 82 83 u64 **metadata; 84 u64 metadata_ver; 85 u64 metadata_nr_cpu; 86 bool is_homogeneous; 87 }; 88 89 struct arm_spe_queue { 90 struct arm_spe *spe; 91 unsigned int queue_nr; 92 struct auxtrace_buffer *buffer; 93 struct auxtrace_buffer *old_buffer; 94 union perf_event *event_buf; 95 bool on_heap; 96 bool done; 97 pid_t pid; 98 pid_t tid; 99 int cpu; 100 struct arm_spe_decoder *decoder; 101 u64 time; 102 u64 timestamp; 103 struct thread *thread; 104 u64 period_instructions; 105 u32 flags; 106 struct branch_stack *last_branch; 107 }; 108 109 struct data_source_handle { 110 const struct midr_range *midr_ranges; 111 void (*ds_synth)(const struct arm_spe_record *record, 112 union perf_mem_data_src *data_src); 113 }; 114 115 #define DS(range, func) \ 116 { \ 117 .midr_ranges = range, \ 118 .ds_synth = arm_spe__synth_##func, \ 119 } 120 121 static void arm_spe_dump(struct arm_spe *spe __maybe_unused, 122 unsigned char *buf, size_t len) 123 { 124 struct arm_spe_pkt packet; 125 size_t pos = 0; 126 int ret, pkt_len, i; 127 char desc[ARM_SPE_PKT_DESC_MAX]; 128 const char *color = PERF_COLOR_BLUE; 129 130 color_fprintf(stdout, color, 131 ". ... 
ARM SPE data: size %#zx bytes\n", 132 len); 133 134 while (len) { 135 ret = arm_spe_get_packet(buf, len, &packet); 136 if (ret > 0) 137 pkt_len = ret; 138 else 139 pkt_len = 1; 140 printf("."); 141 color_fprintf(stdout, color, " %08zx: ", pos); 142 for (i = 0; i < pkt_len; i++) 143 color_fprintf(stdout, color, " %02x", buf[i]); 144 for (; i < 16; i++) 145 color_fprintf(stdout, color, " "); 146 if (ret > 0) { 147 ret = arm_spe_pkt_desc(&packet, desc, 148 ARM_SPE_PKT_DESC_MAX); 149 if (!ret) 150 color_fprintf(stdout, color, " %s\n", desc); 151 } else { 152 color_fprintf(stdout, color, " Bad packet!\n"); 153 } 154 pos += pkt_len; 155 buf += pkt_len; 156 len -= pkt_len; 157 } 158 } 159 160 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf, 161 size_t len) 162 { 163 printf(".\n"); 164 arm_spe_dump(spe, buf, len); 165 } 166 167 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data) 168 { 169 struct arm_spe_queue *speq = data; 170 struct auxtrace_buffer *buffer = speq->buffer; 171 struct auxtrace_buffer *old_buffer = speq->old_buffer; 172 struct auxtrace_queue *queue; 173 174 queue = &speq->spe->queues.queue_array[speq->queue_nr]; 175 176 buffer = auxtrace_buffer__next(queue, buffer); 177 /* If no more data, drop the previous auxtrace_buffer and return */ 178 if (!buffer) { 179 if (old_buffer) 180 auxtrace_buffer__drop_data(old_buffer); 181 b->len = 0; 182 return 0; 183 } 184 185 speq->buffer = buffer; 186 187 /* If the aux_buffer doesn't have data associated, try to load it */ 188 if (!buffer->data) { 189 /* get the file desc associated with the perf data file */ 190 int fd = perf_data__fd(speq->spe->session->data); 191 192 buffer->data = auxtrace_buffer__get_data(buffer, fd); 193 if (!buffer->data) 194 return -ENOMEM; 195 } 196 197 b->len = buffer->size; 198 b->buf = buffer->data; 199 200 if (b->len) { 201 if (old_buffer) 202 auxtrace_buffer__drop_data(old_buffer); 203 speq->old_buffer = buffer; 204 } else { 205 auxtrace_buffer__drop_data(buffer); 206 return arm_spe_get_trace(b, data); 207 } 208 209 return 0; 210 } 211 212 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, 213 unsigned int queue_nr) 214 { 215 struct arm_spe_params params = { .get_trace = 0, }; 216 struct arm_spe_queue *speq; 217 218 speq = zalloc(sizeof(*speq)); 219 if (!speq) 220 return NULL; 221 222 speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); 223 if (!speq->event_buf) 224 goto out_free; 225 226 speq->spe = spe; 227 speq->queue_nr = queue_nr; 228 speq->pid = -1; 229 speq->tid = -1; 230 speq->cpu = -1; 231 speq->period_instructions = 0; 232 233 /* params set */ 234 params.get_trace = arm_spe_get_trace; 235 params.data = speq; 236 237 if (spe->synth_opts.last_branch) { 238 size_t sz = sizeof(struct branch_stack); 239 240 /* Allocate up to two entries for PBT + TGT */ 241 sz += sizeof(struct branch_entry) * 242 min(spe->synth_opts.last_branch_sz, 2U); 243 speq->last_branch = zalloc(sz); 244 if (!speq->last_branch) 245 goto out_free; 246 } 247 248 /* create new decoder */ 249 speq->decoder = arm_spe_decoder_new(¶ms); 250 if (!speq->decoder) 251 goto out_free; 252 253 return speq; 254 255 out_free: 256 zfree(&speq->event_buf); 257 zfree(&speq->last_branch); 258 free(speq); 259 260 return NULL; 261 } 262 263 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip) 264 { 265 return ip >= spe->kernel_start ? 
266 PERF_RECORD_MISC_KERNEL : 267 PERF_RECORD_MISC_USER; 268 } 269 270 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe, 271 struct auxtrace_queue *queue) 272 { 273 struct arm_spe_queue *speq = queue->priv; 274 pid_t tid; 275 276 tid = machine__get_current_tid(spe->machine, speq->cpu); 277 if (tid != -1) { 278 speq->tid = tid; 279 thread__zput(speq->thread); 280 } else 281 speq->tid = queue->tid; 282 283 if ((!speq->thread) && (speq->tid != -1)) { 284 speq->thread = machine__find_thread(spe->machine, -1, 285 speq->tid); 286 } 287 288 if (speq->thread) { 289 speq->pid = thread__pid(speq->thread); 290 if (queue->cpu == -1) 291 speq->cpu = thread__cpu(speq->thread); 292 } 293 } 294 295 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid) 296 { 297 struct arm_spe *spe = speq->spe; 298 int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid); 299 300 if (err) 301 return err; 302 303 arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]); 304 305 return 0; 306 } 307 308 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu) 309 { 310 u64 i; 311 312 if (!spe->metadata) 313 return NULL; 314 315 for (i = 0; i < spe->metadata_nr_cpu; i++) 316 if (spe->metadata[i][ARM_SPE_CPU] == cpu) 317 return spe->metadata[i]; 318 319 return NULL; 320 } 321 322 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record) 323 { 324 struct simd_flags simd_flags = {}; 325 326 if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST)) 327 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE; 328 329 if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER)) 330 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE; 331 332 if (record->type & ARM_SPE_SVE_PARTIAL_PRED) 333 simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL; 334 335 if (record->type & ARM_SPE_SVE_EMPTY_PRED) 336 simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY; 337 338 return simd_flags; 339 } 340 341 static void arm_spe_prep_sample(struct arm_spe *spe, 342 struct arm_spe_queue *speq, 343 union perf_event *event, 344 struct perf_sample *sample) 345 { 346 struct arm_spe_record *record = &speq->decoder->record; 347 348 if (!spe->timeless_decoding) 349 sample->time = tsc_to_perf_time(record->timestamp, &spe->tc); 350 351 sample->ip = record->from_ip; 352 sample->cpumode = arm_spe_cpumode(spe, sample->ip); 353 sample->pid = speq->pid; 354 sample->tid = speq->tid; 355 sample->period = 1; 356 sample->cpu = speq->cpu; 357 sample->simd_flags = arm_spe__synth_simd_flags(record); 358 359 event->sample.header.type = PERF_RECORD_SAMPLE; 360 event->sample.header.misc = sample->cpumode; 361 event->sample.header.size = sizeof(struct perf_event_header); 362 } 363 364 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) 365 { 366 struct arm_spe *spe = speq->spe; 367 struct arm_spe_record *record = &speq->decoder->record; 368 struct branch_stack *bstack = speq->last_branch; 369 struct branch_flags *bs_flags; 370 unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; 371 bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); 372 bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; 373 size_t sz = sizeof(struct branch_stack) + 374 sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; 375 int i = 0; 376 377 /* Clean up branch stack */ 378 memset(bstack, 0x0, sz); 379 380 if (!have_tgt && !have_pbt) 381 return; 382 383 if (have_tgt) { 384 bstack->entries[i].from = record->from_ip; 385 bstack->entries[i].to = record->to_ip; 386 
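		/*
		 * Classify the branch for this branch stack entry from the SPE
		 * operation sub-class bits below: a branch with link is a call,
		 * a return or other indirect branch is treated as a return, and
		 * conditionality is tracked separately via ARM_SPE_OP_BR_COND.
		 */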
387 bs_flags = &bstack->entries[i].flags; 388 bs_flags->value = 0; 389 390 if (record->op & ARM_SPE_OP_BR_CR_BL) { 391 if (record->op & ARM_SPE_OP_BR_COND) 392 bs_flags->type |= PERF_BR_COND_CALL; 393 else 394 bs_flags->type |= PERF_BR_CALL; 395 /* 396 * Indirect branch instruction without link (e.g. BR), 397 * take this case as function return. 398 */ 399 } else if (record->op & ARM_SPE_OP_BR_CR_RET || 400 record->op & ARM_SPE_OP_BR_INDIRECT) { 401 if (record->op & ARM_SPE_OP_BR_COND) 402 bs_flags->type |= PERF_BR_COND_RET; 403 else 404 bs_flags->type |= PERF_BR_RET; 405 } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { 406 if (record->op & ARM_SPE_OP_BR_COND) 407 bs_flags->type |= PERF_BR_COND; 408 else 409 bs_flags->type |= PERF_BR_UNCOND; 410 } else { 411 if (record->op & ARM_SPE_OP_BR_COND) 412 bs_flags->type |= PERF_BR_COND; 413 else 414 bs_flags->type |= PERF_BR_UNKNOWN; 415 } 416 417 if (record->type & ARM_SPE_BRANCH_MISS) { 418 bs_flags->mispred = 1; 419 bs_flags->predicted = 0; 420 } else { 421 bs_flags->mispred = 0; 422 bs_flags->predicted = 1; 423 } 424 425 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 426 bs_flags->not_taken = 1; 427 428 if (record->type & ARM_SPE_IN_TXN) 429 bs_flags->in_tx = 1; 430 431 bs_flags->cycles = min(record->latency, 0xFFFFU); 432 i++; 433 } 434 435 if (have_pbt) { 436 bs_flags = &bstack->entries[i].flags; 437 bs_flags->type |= PERF_BR_UNKNOWN; 438 bstack->entries[i].to = record->prev_br_tgt; 439 i++; 440 } 441 442 bstack->nr = i; 443 bstack->hw_idx = -1ULL; 444 } 445 446 static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type) 447 { 448 event->header.size = perf_event__sample_event_size(sample, type, 0); 449 return perf_event__synthesize_sample(event, type, 0, sample); 450 } 451 452 static inline int 453 arm_spe_deliver_synth_event(struct arm_spe *spe, 454 struct arm_spe_queue *speq __maybe_unused, 455 union perf_event *event, 456 struct perf_sample *sample) 457 { 458 int ret; 459 460 if (spe->synth_opts.inject) { 461 ret = arm_spe__inject_event(event, sample, spe->sample_type); 462 if (ret) 463 return ret; 464 } 465 466 ret = perf_session__deliver_synth_event(spe->session, event, sample); 467 if (ret) 468 pr_err("ARM SPE: failed to deliver event, error %d\n", ret); 469 470 return ret; 471 } 472 473 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, 474 u64 spe_events_id, u64 data_src) 475 { 476 struct arm_spe *spe = speq->spe; 477 struct arm_spe_record *record = &speq->decoder->record; 478 union perf_event *event = speq->event_buf; 479 struct perf_sample sample; 480 int ret; 481 482 perf_sample__init(&sample, /*all=*/true); 483 arm_spe_prep_sample(spe, speq, event, &sample); 484 485 sample.id = spe_events_id; 486 sample.stream_id = spe_events_id; 487 sample.addr = record->virt_addr; 488 sample.phys_addr = record->phys_addr; 489 sample.data_src = data_src; 490 sample.weight = record->latency; 491 492 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 493 perf_sample__exit(&sample); 494 return ret; 495 } 496 497 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, 498 u64 spe_events_id) 499 { 500 struct arm_spe *spe = speq->spe; 501 struct arm_spe_record *record = &speq->decoder->record; 502 union perf_event *event = speq->event_buf; 503 struct perf_sample sample; 504 int ret; 505 506 perf_sample__init(&sample, /*all=*/true); 507 arm_spe_prep_sample(spe, speq, event, &sample); 508 509 sample.id = spe_events_id; 510 sample.stream_id = spe_events_id; 511 sample.addr = 
record->to_ip; 512 sample.weight = record->latency; 513 sample.flags = speq->flags; 514 sample.branch_stack = speq->last_branch; 515 516 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 517 perf_sample__exit(&sample); 518 return ret; 519 } 520 521 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, 522 u64 spe_events_id, u64 data_src) 523 { 524 struct arm_spe *spe = speq->spe; 525 struct arm_spe_record *record = &speq->decoder->record; 526 union perf_event *event = speq->event_buf; 527 struct perf_sample sample; 528 int ret; 529 530 /* 531 * Handles perf instruction sampling period. 532 */ 533 speq->period_instructions++; 534 if (speq->period_instructions < spe->instructions_sample_period) 535 return 0; 536 speq->period_instructions = 0; 537 538 perf_sample__init(&sample, /*all=*/true); 539 arm_spe_prep_sample(spe, speq, event, &sample); 540 541 sample.id = spe_events_id; 542 sample.stream_id = spe_events_id; 543 sample.addr = record->to_ip; 544 sample.phys_addr = record->phys_addr; 545 sample.data_src = data_src; 546 sample.period = spe->instructions_sample_period; 547 sample.weight = record->latency; 548 sample.flags = speq->flags; 549 sample.branch_stack = speq->last_branch; 550 551 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 552 perf_sample__exit(&sample); 553 return ret; 554 } 555 556 static const struct midr_range common_ds_encoding_cpus[] = { 557 MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), 558 MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), 559 MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), 560 MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), 561 MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), 562 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), 563 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), 564 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), 565 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), 566 {}, 567 }; 568 569 static const struct midr_range ampereone_ds_encoding_cpus[] = { 570 MIDR_ALL_VERSIONS(MIDR_AMPERE1A), 571 {}, 572 }; 573 574 static const struct midr_range hisi_hip_ds_encoding_cpus[] = { 575 MIDR_ALL_VERSIONS(MIDR_HISI_HIP12), 576 {}, 577 }; 578 579 static void arm_spe__sample_flags(struct arm_spe_queue *speq) 580 { 581 const struct arm_spe_record *record = &speq->decoder->record; 582 583 speq->flags = 0; 584 if (record->op & ARM_SPE_OP_BRANCH_ERET) { 585 speq->flags = PERF_IP_FLAG_BRANCH; 586 587 if (record->type & ARM_SPE_BRANCH_MISS) 588 speq->flags |= PERF_IP_FLAG_BRANCH_MISS; 589 590 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 591 speq->flags |= PERF_IP_FLAG_NOT_TAKEN; 592 593 if (record->type & ARM_SPE_IN_TXN) 594 speq->flags |= PERF_IP_FLAG_IN_TX; 595 596 if (record->op & ARM_SPE_OP_BR_COND) 597 speq->flags |= PERF_IP_FLAG_CONDITIONAL; 598 599 if (record->op & ARM_SPE_OP_BR_CR_BL) 600 speq->flags |= PERF_IP_FLAG_CALL; 601 else if (record->op & ARM_SPE_OP_BR_CR_RET) 602 speq->flags |= PERF_IP_FLAG_RETURN; 603 /* 604 * Indirect branch instruction without link (e.g. BR), 605 * take it as a function return. 606 */ 607 else if (record->op & ARM_SPE_OP_BR_INDIRECT) 608 speq->flags |= PERF_IP_FLAG_RETURN; 609 } 610 } 611 612 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record, 613 union perf_mem_data_src *data_src) 614 { 615 /* 616 * Even though four levels of cache hierarchy are possible, no known 617 * production Neoverse systems currently include more than three levels 618 * so for the time being we assume three exist. If a production system 619 * is built with four the this function would have to be changed to 620 * detect the number of levels for reporting. 
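	 * For example, ARM_SPE_COMMON_DS_SYS_CACHE below is reported as L3
	 * rather than as a distinct system-cache level.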
621 */ 622 623 /* 624 * We have no data on the hit level or data source for stores in the 625 * Neoverse SPE records. 626 */ 627 if (record->op & ARM_SPE_OP_ST) { 628 data_src->mem_lvl = PERF_MEM_LVL_NA; 629 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; 630 data_src->mem_snoop = PERF_MEM_SNOOP_NA; 631 return; 632 } 633 634 switch (record->source) { 635 case ARM_SPE_COMMON_DS_L1D: 636 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 637 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 638 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 639 break; 640 case ARM_SPE_COMMON_DS_L2: 641 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 642 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 643 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 644 break; 645 case ARM_SPE_COMMON_DS_PEER_CORE: 646 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 647 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 648 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 649 break; 650 /* 651 * We don't know if this is L1, L2 but we do know it was a cache-2-cache 652 * transfer, so set SNOOPX_PEER 653 */ 654 case ARM_SPE_COMMON_DS_LOCAL_CLUSTER: 655 case ARM_SPE_COMMON_DS_PEER_CLUSTER: 656 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 657 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 658 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 659 break; 660 /* 661 * System cache is assumed to be L3 662 */ 663 case ARM_SPE_COMMON_DS_SYS_CACHE: 664 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 665 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 666 data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 667 break; 668 /* 669 * We don't know what level it hit in, except it came from the other 670 * socket 671 */ 672 case ARM_SPE_COMMON_DS_REMOTE: 673 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1; 674 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 675 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 676 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 677 break; 678 case ARM_SPE_COMMON_DS_DRAM: 679 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; 680 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 681 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 682 break; 683 default: 684 break; 685 } 686 } 687 688 /* 689 * Source is IMPDEF. Here we convert the source code used on AmpereOne cores 690 * to the common (Neoverse, Cortex) to avoid duplicating the decoding code. 
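 * Only the data source encoding is translated; the operation type is
 * carried over unchanged before reusing the common decode path.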
691 */ 692 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record, 693 union perf_mem_data_src *data_src) 694 { 695 struct arm_spe_record common_record; 696 697 switch (record->source) { 698 case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE: 699 common_record.source = ARM_SPE_COMMON_DS_PEER_CORE; 700 break; 701 case ARM_SPE_AMPEREONE_SLC: 702 common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE; 703 break; 704 case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE: 705 common_record.source = ARM_SPE_COMMON_DS_REMOTE; 706 break; 707 case ARM_SPE_AMPEREONE_DDR: 708 common_record.source = ARM_SPE_COMMON_DS_DRAM; 709 break; 710 case ARM_SPE_AMPEREONE_L1D: 711 common_record.source = ARM_SPE_COMMON_DS_L1D; 712 break; 713 case ARM_SPE_AMPEREONE_L2D: 714 common_record.source = ARM_SPE_COMMON_DS_L2; 715 break; 716 default: 717 pr_warning_once("AmpereOne: Unknown data source (0x%x)\n", 718 record->source); 719 return; 720 } 721 722 common_record.op = record->op; 723 arm_spe__synth_data_source_common(&common_record, data_src); 724 } 725 726 static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record, 727 union perf_mem_data_src *data_src) 728 { 729 /* Use common synthesis method to handle store operations */ 730 if (record->op & ARM_SPE_OP_ST) { 731 arm_spe__synth_data_source_common(record, data_src); 732 return; 733 } 734 735 switch (record->source) { 736 case ARM_SPE_HISI_HIP_PEER_CPU: 737 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 738 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 739 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 740 break; 741 case ARM_SPE_HISI_HIP_PEER_CPU_HITM: 742 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 743 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 744 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 745 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 746 break; 747 case ARM_SPE_HISI_HIP_L3: 748 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 749 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 750 data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 751 break; 752 case ARM_SPE_HISI_HIP_L3_HITM: 753 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 754 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 755 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 756 break; 757 case ARM_SPE_HISI_HIP_PEER_CLUSTER: 758 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; 759 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 760 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 761 break; 762 case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM: 763 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; 764 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 765 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 766 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 767 break; 768 case ARM_SPE_HISI_HIP_REMOTE_SOCKET: 769 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; 770 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 771 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 772 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 773 break; 774 case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM: 775 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; 776 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 777 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 778 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 779 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 780 break; 781 case ARM_SPE_HISI_HIP_LOCAL_MEM: 782 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; 783 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 784 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 785 break; 786 case ARM_SPE_HISI_HIP_REMOTE_MEM: 787 
data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; 788 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 789 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 790 break; 791 case ARM_SPE_HISI_HIP_NC_DEV: 792 data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT; 793 data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; 794 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 795 break; 796 case ARM_SPE_HISI_HIP_L2: 797 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 798 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 799 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 800 break; 801 case ARM_SPE_HISI_HIP_L2_HITM: 802 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 803 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 804 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 805 break; 806 case ARM_SPE_HISI_HIP_L1: 807 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 808 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 809 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 810 break; 811 default: 812 break; 813 } 814 } 815 816 static const struct data_source_handle data_source_handles[] = { 817 DS(common_ds_encoding_cpus, data_source_common), 818 DS(ampereone_ds_encoding_cpus, data_source_ampereone), 819 DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip), 820 }; 821 822 static void arm_spe__synth_memory_level(const struct arm_spe_record *record, 823 union perf_mem_data_src *data_src) 824 { 825 if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { 826 data_src->mem_lvl = PERF_MEM_LVL_L3; 827 828 if (record->type & ARM_SPE_LLC_MISS) 829 data_src->mem_lvl |= PERF_MEM_LVL_MISS; 830 else 831 data_src->mem_lvl |= PERF_MEM_LVL_HIT; 832 } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { 833 data_src->mem_lvl = PERF_MEM_LVL_L1; 834 835 if (record->type & ARM_SPE_L1D_MISS) 836 data_src->mem_lvl |= PERF_MEM_LVL_MISS; 837 else 838 data_src->mem_lvl |= PERF_MEM_LVL_HIT; 839 } 840 841 if (record->type & ARM_SPE_REMOTE_ACCESS) 842 data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; 843 } 844 845 static bool arm_spe__synth_ds(struct arm_spe_queue *speq, 846 const struct arm_spe_record *record, 847 union perf_mem_data_src *data_src) 848 { 849 struct arm_spe *spe = speq->spe; 850 u64 *metadata = NULL; 851 u64 midr; 852 unsigned int i; 853 854 /* Metadata version 1 assumes all CPUs are the same (old behavior) */ 855 if (spe->metadata_ver == 1) { 856 const char *cpuid; 857 858 pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n"); 859 cpuid = perf_env__cpuid(spe->session->evlist->env); 860 midr = strtol(cpuid, NULL, 16); 861 } else { 862 /* CPU ID is -1 for per-thread mode */ 863 if (speq->cpu < 0) { 864 /* 865 * On the heterogeneous system, due to CPU ID is -1, 866 * cannot confirm the data source packet is supported. 
867 */ 868 if (!spe->is_homogeneous) 869 return false; 870 871 /* In homogeneous system, simply use CPU0's metadata */ 872 if (spe->metadata) 873 metadata = spe->metadata[0]; 874 } else { 875 metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); 876 } 877 878 if (!metadata) 879 return false; 880 881 midr = metadata[ARM_SPE_CPU_MIDR]; 882 } 883 884 for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { 885 if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) { 886 data_source_handles[i].ds_synth(record, data_src); 887 return true; 888 } 889 } 890 891 return false; 892 } 893 894 static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, 895 const struct arm_spe_record *record) 896 { 897 union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA }; 898 899 /* Only synthesize data source for LDST operations */ 900 if (!is_ldst_op(record->op)) 901 return 0; 902 903 if (record->op & ARM_SPE_OP_LD) 904 data_src.mem_op = PERF_MEM_OP_LOAD; 905 else if (record->op & ARM_SPE_OP_ST) 906 data_src.mem_op = PERF_MEM_OP_STORE; 907 else 908 return 0; 909 910 if (!arm_spe__synth_ds(speq, record, &data_src)) 911 arm_spe__synth_memory_level(record, &data_src); 912 913 if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { 914 data_src.mem_dtlb = PERF_MEM_TLB_WK; 915 916 if (record->type & ARM_SPE_TLB_MISS) 917 data_src.mem_dtlb |= PERF_MEM_TLB_MISS; 918 else 919 data_src.mem_dtlb |= PERF_MEM_TLB_HIT; 920 } 921 922 return data_src.val; 923 } 924 925 static int arm_spe_sample(struct arm_spe_queue *speq) 926 { 927 const struct arm_spe_record *record = &speq->decoder->record; 928 struct arm_spe *spe = speq->spe; 929 u64 data_src; 930 int err; 931 932 arm_spe__sample_flags(speq); 933 data_src = arm_spe__synth_data_source(speq, record); 934 935 if (spe->sample_flc) { 936 if (record->type & ARM_SPE_L1D_MISS) { 937 err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id, 938 data_src); 939 if (err) 940 return err; 941 } 942 943 if (record->type & ARM_SPE_L1D_ACCESS) { 944 err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id, 945 data_src); 946 if (err) 947 return err; 948 } 949 } 950 951 if (spe->sample_llc) { 952 if (record->type & ARM_SPE_LLC_MISS) { 953 err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id, 954 data_src); 955 if (err) 956 return err; 957 } 958 959 if (record->type & ARM_SPE_LLC_ACCESS) { 960 err = arm_spe__synth_mem_sample(speq, spe->llc_access_id, 961 data_src); 962 if (err) 963 return err; 964 } 965 } 966 967 if (spe->sample_tlb) { 968 if (record->type & ARM_SPE_TLB_MISS) { 969 err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id, 970 data_src); 971 if (err) 972 return err; 973 } 974 975 if (record->type & ARM_SPE_TLB_ACCESS) { 976 err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id, 977 data_src); 978 if (err) 979 return err; 980 } 981 } 982 983 if (spe->synth_opts.last_branch && 984 (spe->sample_branch || spe->sample_instructions)) 985 arm_spe__prep_branch_stack(speq); 986 987 if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) { 988 err = arm_spe__synth_branch_sample(speq, spe->branch_id); 989 if (err) 990 return err; 991 } 992 993 if (spe->sample_remote_access && 994 (record->type & ARM_SPE_REMOTE_ACCESS)) { 995 err = arm_spe__synth_mem_sample(speq, spe->remote_access_id, 996 data_src); 997 if (err) 998 return err; 999 } 1000 1001 /* 1002 * When data_src is zero it means the record is not a memory operation, 1003 * skip to synthesize memory sample for this case. 
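	 * The check below uses is_ldst_op() directly because data_src is only
	 * synthesized for load/store operations in the first place.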
 */
	if (spe->sample_memory && is_ldst_op(record->op)) {
		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
		if (err)
			return err;
	}

	if (spe->sample_instructions) {
		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
		if (err)
			return err;
	}

	return 0;
}

static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record;
	int ret;

	if (!spe->kernel_start)
		spe->kernel_start = machine__kernel_start(spe->machine);

	while (1) {
		/*
		 * The usual flow is to decode the packets first and then use
		 * the resulting record to synthesize a sample; here the flow
		 * is reversed: arm_spe_sample() is called for the current
		 * record before arm_spe_decode() fetches the next one.
		 *
		 * There are two reasons for this:
		 * 1. When the queue is set up in arm_spe__setup_queue(), the
		 * trace data has already been decoded and a record generated,
		 * but no sample has been synthesized for it yet; that pending
		 * record is handled here.
		 * 2. After decoding, the record timestamp is compared with the
		 * timestamp of the incoming perf event. If the record is
		 * later, bail out and push the record onto the auxtrace heap
		 * so its sample is synthesized on the next call; this keeps
		 * Arm SPE samples correctly time-ordered with other perf
		 * events.
		 */

		/*
		 * Update pid/tid info.
		 */
		record = &speq->decoder->record;
		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
			ret = arm_spe_set_tid(speq, record->context_id);
			if (ret)
				return ret;

			spe->use_ctx_pkt_for_pid = true;
		}

		ret = arm_spe_sample(speq);
		if (ret)
			return ret;

		ret = arm_spe_decode(speq->decoder);
		if (!ret) {
			pr_debug("No data or all data has been processed.\n");
			return 1;
		}

		/*
		 * An error was detected while decoding the SPE trace data;
		 * continue with the following trace data to find more records.
		 */
		if (ret < 0)
			continue;

		record = &speq->decoder->record;

		/* Update timestamp for the last record */
		if (record->timestamp > speq->timestamp)
			speq->timestamp = record->timestamp;

		/*
		 * If the timestamp of the queue is later than the timestamp of
		 * the incoming perf event, bail out so that the perf event can
		 * be processed ahead of the trace data.
1089 */ 1090 if (!spe->timeless_decoding && speq->timestamp >= *timestamp) { 1091 *timestamp = speq->timestamp; 1092 return 0; 1093 } 1094 } 1095 1096 return 0; 1097 } 1098 1099 static int arm_spe__setup_queue(struct arm_spe *spe, 1100 struct auxtrace_queue *queue, 1101 unsigned int queue_nr) 1102 { 1103 struct arm_spe_queue *speq = queue->priv; 1104 struct arm_spe_record *record; 1105 1106 if (list_empty(&queue->head) || speq) 1107 return 0; 1108 1109 speq = arm_spe__alloc_queue(spe, queue_nr); 1110 1111 if (!speq) 1112 return -ENOMEM; 1113 1114 queue->priv = speq; 1115 1116 if (queue->cpu != -1) 1117 speq->cpu = queue->cpu; 1118 1119 if (!speq->on_heap) { 1120 int ret; 1121 1122 if (spe->timeless_decoding) 1123 return 0; 1124 1125 retry: 1126 ret = arm_spe_decode(speq->decoder); 1127 1128 if (!ret) 1129 return 0; 1130 1131 if (ret < 0) 1132 goto retry; 1133 1134 record = &speq->decoder->record; 1135 1136 speq->timestamp = record->timestamp; 1137 ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp); 1138 if (ret) 1139 return ret; 1140 speq->on_heap = true; 1141 } 1142 1143 return 0; 1144 } 1145 1146 static int arm_spe__setup_queues(struct arm_spe *spe) 1147 { 1148 unsigned int i; 1149 int ret; 1150 1151 for (i = 0; i < spe->queues.nr_queues; i++) { 1152 ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i); 1153 if (ret) 1154 return ret; 1155 } 1156 1157 return 0; 1158 } 1159 1160 static int arm_spe__update_queues(struct arm_spe *spe) 1161 { 1162 if (spe->queues.new_data) { 1163 spe->queues.new_data = false; 1164 return arm_spe__setup_queues(spe); 1165 } 1166 1167 return 0; 1168 } 1169 1170 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe) 1171 { 1172 struct evsel *evsel; 1173 struct evlist *evlist = spe->session->evlist; 1174 bool timeless_decoding = true; 1175 1176 /* 1177 * Circle through the list of event and complain if we find one 1178 * with the time bit set. 1179 */ 1180 evlist__for_each_entry(evlist, evsel) { 1181 if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME)) 1182 timeless_decoding = false; 1183 } 1184 1185 return timeless_decoding; 1186 } 1187 1188 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp) 1189 { 1190 unsigned int queue_nr; 1191 u64 ts; 1192 int ret; 1193 1194 while (1) { 1195 struct auxtrace_queue *queue; 1196 struct arm_spe_queue *speq; 1197 1198 if (!spe->heap.heap_cnt) 1199 return 0; 1200 1201 if (spe->heap.heap_array[0].ordinal >= timestamp) 1202 return 0; 1203 1204 queue_nr = spe->heap.heap_array[0].queue_nr; 1205 queue = &spe->queues.queue_array[queue_nr]; 1206 speq = queue->priv; 1207 1208 auxtrace_heap__pop(&spe->heap); 1209 1210 if (spe->heap.heap_cnt) { 1211 ts = spe->heap.heap_array[0].ordinal + 1; 1212 if (ts > timestamp) 1213 ts = timestamp; 1214 } else { 1215 ts = timestamp; 1216 } 1217 1218 /* 1219 * A previous context-switch event has set pid/tid in the machine's context, so 1220 * here we need to update the pid/tid in the thread and SPE queue. 
1221 */ 1222 if (!spe->use_ctx_pkt_for_pid) 1223 arm_spe_set_pid_tid_cpu(spe, queue); 1224 1225 ret = arm_spe_run_decoder(speq, &ts); 1226 if (ret < 0) { 1227 auxtrace_heap__add(&spe->heap, queue_nr, ts); 1228 return ret; 1229 } 1230 1231 if (!ret) { 1232 ret = auxtrace_heap__add(&spe->heap, queue_nr, ts); 1233 if (ret < 0) 1234 return ret; 1235 } else { 1236 speq->on_heap = false; 1237 } 1238 } 1239 1240 return 0; 1241 } 1242 1243 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid, 1244 u64 time_) 1245 { 1246 struct auxtrace_queues *queues = &spe->queues; 1247 unsigned int i; 1248 u64 ts = 0; 1249 1250 for (i = 0; i < queues->nr_queues; i++) { 1251 struct auxtrace_queue *queue = &spe->queues.queue_array[i]; 1252 struct arm_spe_queue *speq = queue->priv; 1253 1254 if (speq && (tid == -1 || speq->tid == tid)) { 1255 speq->time = time_; 1256 arm_spe_set_pid_tid_cpu(spe, queue); 1257 arm_spe_run_decoder(speq, &ts); 1258 } 1259 } 1260 return 0; 1261 } 1262 1263 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event, 1264 struct perf_sample *sample) 1265 { 1266 pid_t pid, tid; 1267 int cpu; 1268 1269 if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT)) 1270 return 0; 1271 1272 pid = event->context_switch.next_prev_pid; 1273 tid = event->context_switch.next_prev_tid; 1274 cpu = sample->cpu; 1275 1276 if (tid == -1) 1277 pr_warning("context_switch event has no tid\n"); 1278 1279 return machine__set_current_tid(spe->machine, cpu, pid, tid); 1280 } 1281 1282 static int arm_spe_process_event(struct perf_session *session, 1283 union perf_event *event, 1284 struct perf_sample *sample, 1285 const struct perf_tool *tool) 1286 { 1287 int err = 0; 1288 u64 timestamp; 1289 struct arm_spe *spe = container_of(session->auxtrace, 1290 struct arm_spe, auxtrace); 1291 1292 if (dump_trace) 1293 return 0; 1294 1295 if (!tool->ordered_events) { 1296 pr_err("SPE trace requires ordered events\n"); 1297 return -EINVAL; 1298 } 1299 1300 if (sample->time && (sample->time != (u64) -1)) 1301 timestamp = perf_time_to_tsc(sample->time, &spe->tc); 1302 else 1303 timestamp = 0; 1304 1305 if (timestamp || spe->timeless_decoding) { 1306 err = arm_spe__update_queues(spe); 1307 if (err) 1308 return err; 1309 } 1310 1311 if (spe->timeless_decoding) { 1312 if (event->header.type == PERF_RECORD_EXIT) { 1313 err = arm_spe_process_timeless_queues(spe, 1314 event->fork.tid, 1315 sample->time); 1316 } 1317 } else if (timestamp) { 1318 err = arm_spe_process_queues(spe, timestamp); 1319 if (err) 1320 return err; 1321 1322 if (!spe->use_ctx_pkt_for_pid && 1323 (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE || 1324 event->header.type == PERF_RECORD_SWITCH)) 1325 err = arm_spe_context_switch(spe, event, sample); 1326 } 1327 1328 return err; 1329 } 1330 1331 static int arm_spe_process_auxtrace_event(struct perf_session *session, 1332 union perf_event *event, 1333 const struct perf_tool *tool __maybe_unused) 1334 { 1335 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1336 auxtrace); 1337 1338 if (!spe->data_queued) { 1339 struct auxtrace_buffer *buffer; 1340 off_t data_offset; 1341 int fd = perf_data__fd(session->data); 1342 int err; 1343 1344 if (perf_data__is_pipe(session->data)) { 1345 data_offset = 0; 1346 } else { 1347 data_offset = lseek(fd, 0, SEEK_CUR); 1348 if (data_offset == -1) 1349 return -errno; 1350 } 1351 1352 err = auxtrace_queues__add_event(&spe->queues, session, event, 1353 data_offset, &buffer); 1354 if (err) 1355 return err; 1356 1357 /* Dump here 
now we have copied a piped trace out of the pipe */ 1358 if (dump_trace) { 1359 if (auxtrace_buffer__get_data(buffer, fd)) { 1360 arm_spe_dump_event(spe, buffer->data, 1361 buffer->size); 1362 auxtrace_buffer__put_data(buffer); 1363 } 1364 } 1365 } 1366 1367 return 0; 1368 } 1369 1370 static int arm_spe_flush(struct perf_session *session __maybe_unused, 1371 const struct perf_tool *tool __maybe_unused) 1372 { 1373 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1374 auxtrace); 1375 int ret; 1376 1377 if (dump_trace) 1378 return 0; 1379 1380 if (!tool->ordered_events) 1381 return -EINVAL; 1382 1383 ret = arm_spe__update_queues(spe); 1384 if (ret < 0) 1385 return ret; 1386 1387 if (spe->timeless_decoding) 1388 return arm_spe_process_timeless_queues(spe, -1, 1389 MAX_TIMESTAMP - 1); 1390 1391 ret = arm_spe_process_queues(spe, MAX_TIMESTAMP); 1392 if (ret) 1393 return ret; 1394 1395 if (!spe->use_ctx_pkt_for_pid) 1396 ui__warning("Arm SPE CONTEXT packets not found in the traces.\n" 1397 "Matching of TIDs to SPE events could be inaccurate.\n"); 1398 1399 return 0; 1400 } 1401 1402 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size) 1403 { 1404 u64 *metadata; 1405 1406 metadata = zalloc(per_cpu_size); 1407 if (!metadata) 1408 return NULL; 1409 1410 memcpy(metadata, buf, per_cpu_size); 1411 return metadata; 1412 } 1413 1414 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu) 1415 { 1416 int i; 1417 1418 for (i = 0; i < nr_cpu; i++) 1419 zfree(&metadata[i]); 1420 free(metadata); 1421 } 1422 1423 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info, 1424 u64 *ver, int *nr_cpu) 1425 { 1426 u64 *ptr = (u64 *)info->priv; 1427 u64 metadata_size; 1428 u64 **metadata = NULL; 1429 int hdr_sz, per_cpu_sz, i; 1430 1431 metadata_size = info->header.size - 1432 sizeof(struct perf_record_auxtrace_info); 1433 1434 /* Metadata version 1 */ 1435 if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) { 1436 *ver = 1; 1437 *nr_cpu = 0; 1438 /* No per CPU metadata */ 1439 return NULL; 1440 } 1441 1442 *ver = ptr[ARM_SPE_HEADER_VERSION]; 1443 hdr_sz = ptr[ARM_SPE_HEADER_SIZE]; 1444 *nr_cpu = ptr[ARM_SPE_CPUS_NUM]; 1445 1446 metadata = calloc(*nr_cpu, sizeof(*metadata)); 1447 if (!metadata) 1448 return NULL; 1449 1450 /* Locate the start address of per CPU metadata */ 1451 ptr += hdr_sz; 1452 per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu); 1453 1454 for (i = 0; i < *nr_cpu; i++) { 1455 metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz); 1456 if (!metadata[i]) 1457 goto err_per_cpu_metadata; 1458 1459 ptr += per_cpu_sz / sizeof(u64); 1460 } 1461 1462 return metadata; 1463 1464 err_per_cpu_metadata: 1465 arm_spe__free_metadata(metadata, *nr_cpu); 1466 return NULL; 1467 } 1468 1469 static void arm_spe_free_queue(void *priv) 1470 { 1471 struct arm_spe_queue *speq = priv; 1472 1473 if (!speq) 1474 return; 1475 thread__zput(speq->thread); 1476 arm_spe_decoder_free(speq->decoder); 1477 zfree(&speq->event_buf); 1478 zfree(&speq->last_branch); 1479 free(speq); 1480 } 1481 1482 static void arm_spe_free_events(struct perf_session *session) 1483 { 1484 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1485 auxtrace); 1486 struct auxtrace_queues *queues = &spe->queues; 1487 unsigned int i; 1488 1489 for (i = 0; i < queues->nr_queues; i++) { 1490 arm_spe_free_queue(queues->queue_array[i].priv); 1491 queues->queue_array[i].priv = NULL; 1492 } 1493 auxtrace_queues__free(queues); 1494 } 1495 1496 static void 
arm_spe_free(struct perf_session *session) 1497 { 1498 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1499 auxtrace); 1500 1501 auxtrace_heap__free(&spe->heap); 1502 arm_spe_free_events(session); 1503 session->auxtrace = NULL; 1504 arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu); 1505 free(spe); 1506 } 1507 1508 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session, 1509 struct evsel *evsel) 1510 { 1511 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); 1512 1513 return evsel->core.attr.type == spe->pmu_type; 1514 } 1515 1516 static const char * const metadata_hdr_v1_fmts[] = { 1517 [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n", 1518 [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n", 1519 }; 1520 1521 static const char * const metadata_hdr_fmts[] = { 1522 [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n", 1523 [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n", 1524 [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n", 1525 [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n", 1526 }; 1527 1528 static const char * const metadata_per_cpu_fmts[] = { 1529 [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n", 1530 [ARM_SPE_CPU] = " CPU # :%"PRId64"\n", 1531 [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n", 1532 [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n", 1533 [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n", 1534 [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n", 1535 }; 1536 1537 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr) 1538 { 1539 unsigned int i, cpu, hdr_size, cpu_num, cpu_size; 1540 const char * const *hdr_fmts; 1541 1542 if (!dump_trace) 1543 return; 1544 1545 if (spe->metadata_ver == 1) { 1546 cpu_num = 0; 1547 hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX; 1548 hdr_fmts = metadata_hdr_v1_fmts; 1549 } else { 1550 cpu_num = arr[ARM_SPE_CPUS_NUM]; 1551 hdr_size = arr[ARM_SPE_HEADER_SIZE]; 1552 hdr_fmts = metadata_hdr_fmts; 1553 } 1554 1555 for (i = 0; i < hdr_size; i++) 1556 fprintf(stdout, hdr_fmts[i], arr[i]); 1557 1558 arr += hdr_size; 1559 for (cpu = 0; cpu < cpu_num; cpu++) { 1560 /* 1561 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS 1562 * are fixed. The sequential parameter size is decided by the 1563 * field 'ARM_SPE_CPU_NR_PARAMS'. 
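		 * For example, with ARM_SPE_CPU_NR_PARAMS at index 2 and
		 * arr[ARM_SPE_CPU_NR_PARAMS] == 3, each per-CPU block spans
		 * (2 + 1) + 3 = 6 u64 entries.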
1564 */ 1565 cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS]; 1566 for (i = 0; i < cpu_size; i++) 1567 fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]); 1568 arr += cpu_size; 1569 } 1570 } 1571 1572 static void arm_spe_set_event_name(struct evlist *evlist, u64 id, 1573 const char *name) 1574 { 1575 struct evsel *evsel; 1576 1577 evlist__for_each_entry(evlist, evsel) { 1578 if (evsel->core.id && evsel->core.id[0] == id) { 1579 if (evsel->name) 1580 zfree(&evsel->name); 1581 evsel->name = strdup(name); 1582 break; 1583 } 1584 } 1585 } 1586 1587 static int 1588 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) 1589 { 1590 struct evlist *evlist = session->evlist; 1591 struct evsel *evsel; 1592 struct perf_event_attr attr; 1593 bool found = false; 1594 u64 id; 1595 int err; 1596 1597 evlist__for_each_entry(evlist, evsel) { 1598 if (evsel->core.attr.type == spe->pmu_type) { 1599 found = true; 1600 break; 1601 } 1602 } 1603 1604 if (!found) { 1605 pr_debug("No selected events with SPE trace data\n"); 1606 return 0; 1607 } 1608 1609 memset(&attr, 0, sizeof(struct perf_event_attr)); 1610 attr.size = sizeof(struct perf_event_attr); 1611 attr.type = PERF_TYPE_HARDWARE; 1612 attr.sample_type = evsel->core.attr.sample_type & 1613 (PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR); 1614 attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID | 1615 PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC | 1616 PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR; 1617 if (spe->timeless_decoding) 1618 attr.sample_type &= ~(u64)PERF_SAMPLE_TIME; 1619 else 1620 attr.sample_type |= PERF_SAMPLE_TIME; 1621 1622 spe->sample_type = attr.sample_type; 1623 1624 attr.exclude_user = evsel->core.attr.exclude_user; 1625 attr.exclude_kernel = evsel->core.attr.exclude_kernel; 1626 attr.exclude_hv = evsel->core.attr.exclude_hv; 1627 attr.exclude_host = evsel->core.attr.exclude_host; 1628 attr.exclude_guest = evsel->core.attr.exclude_guest; 1629 attr.sample_id_all = evsel->core.attr.sample_id_all; 1630 attr.read_format = evsel->core.attr.read_format; 1631 1632 /* create new id val to be a fixed offset from evsel id */ 1633 id = evsel->core.id[0] + 1000000000; 1634 1635 if (!id) 1636 id = 1; 1637 1638 if (spe->synth_opts.flc) { 1639 spe->sample_flc = true; 1640 1641 /* Level 1 data cache miss */ 1642 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1643 if (err) 1644 return err; 1645 spe->l1d_miss_id = id; 1646 arm_spe_set_event_name(evlist, id, "l1d-miss"); 1647 id += 1; 1648 1649 /* Level 1 data cache access */ 1650 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1651 if (err) 1652 return err; 1653 spe->l1d_access_id = id; 1654 arm_spe_set_event_name(evlist, id, "l1d-access"); 1655 id += 1; 1656 } 1657 1658 if (spe->synth_opts.llc) { 1659 spe->sample_llc = true; 1660 1661 /* Last level cache miss */ 1662 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1663 if (err) 1664 return err; 1665 spe->llc_miss_id = id; 1666 arm_spe_set_event_name(evlist, id, "llc-miss"); 1667 id += 1; 1668 1669 /* Last level cache access */ 1670 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1671 if (err) 1672 return err; 1673 spe->llc_access_id = id; 1674 arm_spe_set_event_name(evlist, id, "llc-access"); 1675 id += 1; 1676 } 1677 1678 if (spe->synth_opts.tlb) { 1679 spe->sample_tlb = true; 1680 1681 /* TLB miss */ 1682 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1683 if (err) 1684 return err; 1685 spe->tlb_miss_id = id; 1686 
arm_spe_set_event_name(evlist, id, "tlb-miss"); 1687 id += 1; 1688 1689 /* TLB access */ 1690 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1691 if (err) 1692 return err; 1693 spe->tlb_access_id = id; 1694 arm_spe_set_event_name(evlist, id, "tlb-access"); 1695 id += 1; 1696 } 1697 1698 if (spe->synth_opts.last_branch) { 1699 if (spe->synth_opts.last_branch_sz > 2) 1700 pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); 1701 1702 attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; 1703 /* 1704 * We don't use the hardware index, but the sample generation 1705 * code uses the new format branch_stack with this field, 1706 * so the event attributes must indicate that it's present. 1707 */ 1708 attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; 1709 } 1710 1711 if (spe->synth_opts.branches) { 1712 spe->sample_branch = true; 1713 1714 /* Branch */ 1715 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1716 if (err) 1717 return err; 1718 spe->branch_id = id; 1719 arm_spe_set_event_name(evlist, id, "branch"); 1720 id += 1; 1721 } 1722 1723 if (spe->synth_opts.remote_access) { 1724 spe->sample_remote_access = true; 1725 1726 /* Remote access */ 1727 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1728 if (err) 1729 return err; 1730 spe->remote_access_id = id; 1731 arm_spe_set_event_name(evlist, id, "remote-access"); 1732 id += 1; 1733 } 1734 1735 if (spe->synth_opts.mem) { 1736 spe->sample_memory = true; 1737 1738 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1739 if (err) 1740 return err; 1741 spe->memory_id = id; 1742 arm_spe_set_event_name(evlist, id, "memory"); 1743 id += 1; 1744 } 1745 1746 if (spe->synth_opts.instructions) { 1747 if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { 1748 pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n"); 1749 goto synth_instructions_out; 1750 } 1751 if (spe->synth_opts.period > 1) 1752 pr_warning("Arm SPE has a hardware-based sample period.\n" 1753 "Additional instruction events will be discarded by --itrace\n"); 1754 1755 spe->sample_instructions = true; 1756 attr.config = PERF_COUNT_HW_INSTRUCTIONS; 1757 attr.sample_period = spe->synth_opts.period; 1758 spe->instructions_sample_period = attr.sample_period; 1759 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1760 if (err) 1761 return err; 1762 spe->instructions_id = id; 1763 arm_spe_set_event_name(evlist, id, "instructions"); 1764 } 1765 synth_instructions_out: 1766 1767 return 0; 1768 } 1769 1770 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu) 1771 { 1772 u64 midr; 1773 int i; 1774 1775 if (!nr_cpu) 1776 return false; 1777 1778 for (i = 0; i < nr_cpu; i++) { 1779 if (!metadata[i]) 1780 return false; 1781 1782 if (i == 0) { 1783 midr = metadata[i][ARM_SPE_CPU_MIDR]; 1784 continue; 1785 } 1786 1787 if (midr != metadata[i][ARM_SPE_CPU_MIDR]) 1788 return false; 1789 } 1790 1791 return true; 1792 } 1793 1794 int arm_spe_process_auxtrace_info(union perf_event *event, 1795 struct perf_session *session) 1796 { 1797 struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; 1798 size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE; 1799 struct perf_record_time_conv *tc = &session->time_conv; 1800 struct arm_spe *spe; 1801 u64 **metadata = NULL; 1802 u64 metadata_ver; 1803 int nr_cpu, err; 1804 1805 if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) + 1806 min_sz) 1807 return -EINVAL; 1808 1809 metadata = 
arm_spe__alloc_metadata(auxtrace_info, &metadata_ver, 1810 &nr_cpu); 1811 if (!metadata && metadata_ver != 1) { 1812 pr_err("Failed to parse Arm SPE metadata.\n"); 1813 return -EINVAL; 1814 } 1815 1816 spe = zalloc(sizeof(struct arm_spe)); 1817 if (!spe) { 1818 err = -ENOMEM; 1819 goto err_free_metadata; 1820 } 1821 1822 err = auxtrace_queues__init(&spe->queues); 1823 if (err) 1824 goto err_free; 1825 1826 spe->session = session; 1827 spe->machine = &session->machines.host; /* No kvm support */ 1828 spe->auxtrace_type = auxtrace_info->type; 1829 if (metadata_ver == 1) 1830 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; 1831 else 1832 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2]; 1833 spe->metadata = metadata; 1834 spe->metadata_ver = metadata_ver; 1835 spe->metadata_nr_cpu = nr_cpu; 1836 spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu); 1837 1838 spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); 1839 1840 /* 1841 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead 1842 * and the parameters for hardware clock are stored in the session 1843 * context. Passes these parameters to the struct perf_tsc_conversion 1844 * in "spe->tc", which is used for later conversion between clock 1845 * counter and timestamp. 1846 * 1847 * For backward compatibility, copies the fields starting from 1848 * "time_cycles" only if they are contained in the event. 1849 */ 1850 spe->tc.time_shift = tc->time_shift; 1851 spe->tc.time_mult = tc->time_mult; 1852 spe->tc.time_zero = tc->time_zero; 1853 1854 if (event_contains(*tc, time_cycles)) { 1855 spe->tc.time_cycles = tc->time_cycles; 1856 spe->tc.time_mask = tc->time_mask; 1857 spe->tc.cap_user_time_zero = tc->cap_user_time_zero; 1858 spe->tc.cap_user_time_short = tc->cap_user_time_short; 1859 } 1860 1861 spe->auxtrace.process_event = arm_spe_process_event; 1862 spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event; 1863 spe->auxtrace.flush_events = arm_spe_flush; 1864 spe->auxtrace.free_events = arm_spe_free_events; 1865 spe->auxtrace.free = arm_spe_free; 1866 spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace; 1867 session->auxtrace = &spe->auxtrace; 1868 1869 arm_spe_print_info(spe, &auxtrace_info->priv[0]); 1870 1871 if (dump_trace) 1872 return 0; 1873 1874 if (session->itrace_synth_opts && session->itrace_synth_opts->set) 1875 spe->synth_opts = *session->itrace_synth_opts; 1876 else 1877 itrace_synth_opts__set_default(&spe->synth_opts, false); 1878 1879 err = arm_spe_synth_events(spe, session); 1880 if (err) 1881 goto err_free_queues; 1882 1883 err = auxtrace_queues__process_index(&spe->queues, session); 1884 if (err) 1885 goto err_free_queues; 1886 1887 if (spe->queues.populated) 1888 spe->data_queued = true; 1889 1890 return 0; 1891 1892 err_free_queues: 1893 auxtrace_queues__free(&spe->queues); 1894 session->auxtrace = NULL; 1895 err_free: 1896 free(spe); 1897 err_free_metadata: 1898 arm_spe__free_metadata(metadata, nr_cpu); 1899 return err; 1900 } 1901
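/*
 * Usage sketch (an illustration, not part of the decoder): the events
 * synthesized above are selected at report time with --itrace, e.g.
 *
 *   perf record -e arm_spe_0/ts_enable=1/ -a -- sleep 1
 *   perf report --itrace=bft
 *
 * where 'b', 'f' and 't' correspond to the branches, flc and tlb
 * synth_opts checked in arm_spe_synth_events(). The PMU name
 * ("arm_spe_0") and the available format terms vary between systems; see
 * perf-arm-spe(1) and the itrace documentation for details.
 */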