// SPDX-License-Identifier: GPL-2.0
/*
 * Arm Statistical Profiling Extensions (SPE) support
 * Copyright (c) 2017-2018, Arm Ltd.
 */

#include <byteswap.h>
#include <endian.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/types.h>
#include <linux/zalloc.h>
#include <stdlib.h>
#include <unistd.h>

#include "auxtrace.h"
#include "color.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "machine.h"
#include "session.h"
#include "symbol.h"
#include "thread.h"
#include "thread-stack.h"
#include "tsc.h"
#include "tool.h"
#include "util/synthetic-events.h"

#include "arm-spe.h"
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"

#include "../../arch/arm64/include/asm/cputype.h"
#define MAX_TIMESTAMP (~0ULL)

#define is_ldst_op(op)	(!!((op) & ARM_SPE_OP_LDST))

struct arm_spe {
	struct auxtrace		auxtrace;
	struct auxtrace_queues	queues;
	struct auxtrace_heap	heap;
	struct itrace_synth_opts synth_opts;
	u32			auxtrace_type;
	struct perf_session	*session;
	struct machine		*machine;
	u32			pmu_type;

	struct perf_tsc_conversion tc;

	u8			timeless_decoding;
	u8			data_queued;

	u64			sample_type;
	u8			sample_flc;
	u8			sample_llc;
	u8			sample_tlb;
	u8			sample_branch;
	u8			sample_remote_access;
	u8			sample_memory;
	u8			sample_instructions;

	u64			l1d_miss_id;
	u64			l1d_access_id;
	u64			llc_miss_id;
	u64			llc_access_id;
	u64			tlb_miss_id;
	u64			tlb_access_id;
	u64			branch_id;
	u64			remote_access_id;
	u64			memory_id;
	u64			instructions_id;

	u64			kernel_start;

	unsigned long		num_events;
	u8			use_ctx_pkt_for_pid;

	u64			**metadata;
	u64			metadata_ver;
	u64			metadata_nr_cpu;
	bool			is_homogeneous;
};

struct arm_spe_queue {
	struct arm_spe		*spe;
	unsigned int		queue_nr;
	struct auxtrace_buffer	*buffer;
	struct auxtrace_buffer	*old_buffer;
	union perf_event	*event_buf;
	bool			on_heap;
	bool			done;
	pid_t			pid;
	pid_t			tid;
	int			cpu;
	struct arm_spe_decoder	*decoder;
	u64			time;
	u64			timestamp;
	struct thread		*thread;
	u64			sample_count;
	u32			flags;
	struct branch_stack	*last_branch;
};

struct data_source_handle {
	const struct midr_range *midr_ranges;
	void (*ds_synth)(const struct arm_spe_record *record,
			 union perf_mem_data_src *data_src);
};

#define DS(range, func)					\
	{						\
		.midr_ranges = range,			\
		.ds_synth = arm_spe__synth_##func,	\
	}

static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
			 unsigned char *buf, size_t len)
{
	struct arm_spe_pkt packet;
	size_t pos = 0;
	int ret, pkt_len, i;
	char desc[ARM_SPE_PKT_DESC_MAX];
	const char *color = PERF_COLOR_BLUE;

	color_fprintf(stdout, color,
		      ". ... ARM SPE data: size %#zx bytes\n",
		      len);

	while (len) {
		ret = arm_spe_get_packet(buf, len, &packet);
		if (ret > 0)
			pkt_len = ret;
		else
			pkt_len = 1;
		printf(".");
		color_fprintf(stdout, color, "  %08zx: ", pos);
		for (i = 0; i < pkt_len; i++)
			color_fprintf(stdout, color, " %02x", buf[i]);
		for (; i < 16; i++)
			color_fprintf(stdout, color, "   ");
		if (ret > 0) {
			ret = arm_spe_pkt_desc(&packet, desc,
					       ARM_SPE_PKT_DESC_MAX);
			if (!ret)
				color_fprintf(stdout, color, " %s\n", desc);
		} else {
			color_fprintf(stdout, color, " Bad packet!\n");
		}
		pos += pkt_len;
		buf += pkt_len;
		len -= pkt_len;
	}
}

static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
			       size_t len)
{
	printf(".\n");
	arm_spe_dump(spe, buf, len);
}

static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
{
	struct arm_spe_queue *speq = data;
	struct auxtrace_buffer *buffer = speq->buffer;
	struct auxtrace_buffer *old_buffer = speq->old_buffer;
	struct auxtrace_queue *queue;

	queue = &speq->spe->queues.queue_array[speq->queue_nr];

	buffer = auxtrace_buffer__next(queue, buffer);
	/* If no more data, drop the previous auxtrace_buffer and return */
	if (!buffer) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		b->len = 0;
		return 0;
	}

	speq->buffer = buffer;

	/* If the aux_buffer doesn't have data associated, try to load it */
	if (!buffer->data) {
		/* get the file desc associated with the perf data file */
		int fd = perf_data__fd(speq->spe->session->data);

		buffer->data = auxtrace_buffer__get_data(buffer, fd);
		if (!buffer->data)
			return -ENOMEM;
	}

	b->len = buffer->size;
	b->buf = buffer->data;

	if (b->len) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		speq->old_buffer = buffer;
	} else {
		auxtrace_buffer__drop_data(buffer);
		return arm_spe_get_trace(b, data);
	}

	return 0;
}

static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
						  unsigned int queue_nr)
{
	struct arm_spe_params params = { .get_trace = 0, };
	struct arm_spe_queue *speq;

	speq = zalloc(sizeof(*speq));
	if (!speq)
		return NULL;

	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
	if (!speq->event_buf)
		goto out_free;

	speq->spe = spe;
	speq->queue_nr = queue_nr;
	speq->pid = -1;
	speq->tid = -1;
	speq->cpu = -1;

	/* params set */
	params.get_trace = arm_spe_get_trace;
	params.data = speq;

	if (spe->synth_opts.last_branch) {
		size_t sz = sizeof(struct branch_stack);

		/* Allocate up to two entries for PBT + TGT */
		sz += sizeof(struct branch_entry) *
		      min(spe->synth_opts.last_branch_sz, 2U);
		speq->last_branch = zalloc(sz);
		if (!speq->last_branch)
			goto out_free;
	}

	/* create new decoder */
	speq->decoder = arm_spe_decoder_new(&params);
	if (!speq->decoder)
		goto out_free;

	return speq;

out_free:
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);

	return NULL;
}

static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
{
	return ip >= spe->kernel_start ?
		PERF_RECORD_MISC_KERNEL :
		PERF_RECORD_MISC_USER;
}
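
/*
 * Refresh the queue's pid/tid/cpu from what the tool already knows: prefer
 * the per-CPU tid recorded by context-switch events (via
 * machine__get_current_tid()), fall back to the auxtrace queue's tid, and
 * resolve the owning thread to pick up the pid. This is the path used when
 * SPE CONTEXT packets are not available for pid/tid tracking.
 */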
static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
				    struct auxtrace_queue *queue)
{
	struct arm_spe_queue *speq = queue->priv;
	pid_t tid;

	tid = machine__get_current_tid(spe->machine, speq->cpu);
	if (tid != -1) {
		speq->tid = tid;
		thread__zput(speq->thread);
	} else
		speq->tid = queue->tid;

	if ((!speq->thread) && (speq->tid != -1)) {
		speq->thread = machine__find_thread(spe->machine, -1,
						    speq->tid);
	}

	if (speq->thread) {
		speq->pid = thread__pid(speq->thread);
		if (queue->cpu == -1)
			speq->cpu = thread__cpu(speq->thread);
	}
}

static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
{
	struct arm_spe *spe = speq->spe;
	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);

	if (err)
		return err;

	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);

	return 0;
}

static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
{
	u64 i;

	if (!spe->metadata)
		return NULL;

	for (i = 0; i < spe->metadata_nr_cpu; i++)
		if (spe->metadata[i][ARM_SPE_CPU] == cpu)
			return spe->metadata[i];

	return NULL;
}

static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
{
	struct simd_flags simd_flags = {};

	if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;

	if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;

	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;

	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;

	return simd_flags;
}

static void arm_spe_prep_sample(struct arm_spe *spe,
				struct arm_spe_queue *speq,
				union perf_event *event,
				struct perf_sample *sample)
{
	struct arm_spe_record *record = &speq->decoder->record;

	if (!spe->timeless_decoding)
		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);

	sample->ip = record->from_ip;
	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
	sample->pid = speq->pid;
	sample->tid = speq->tid;
	sample->period = spe->synth_opts.period;
	sample->cpu = speq->cpu;
	sample->simd_flags = arm_spe__synth_simd_flags(record);

	event->sample.header.type = PERF_RECORD_SAMPLE;
	event->sample.header.misc = sample->cpumode;
	event->sample.header.size = sizeof(struct perf_event_header);
}

static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	struct branch_stack *bstack = speq->last_branch;
	struct branch_flags *bs_flags;
	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
	size_t sz = sizeof(struct branch_stack) +
		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
	int i = 0;

	/* Clean up branch stack */
	memset(bstack, 0x0, sz);

	if (!have_tgt && !have_pbt)
		return;

	if (have_tgt) {
		bstack->entries[i].from = record->from_ip;
		bstack->entries[i].to = record->to_ip;

		bs_flags = &bstack->entries[i].flags;
		bs_flags->value = 0;

		if (record->op & ARM_SPE_OP_BR_CR_BL) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_CALL;
			else
				bs_flags->type |= PERF_BR_CALL;
		/*
		 * Indirect branch instruction without link (e.g. BR),
		 * take this case as function return.
		 */
		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
			   record->op & ARM_SPE_OP_BR_INDIRECT) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_RET;
			else
				bs_flags->type |= PERF_BR_RET;
		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNCOND;
		} else {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNKNOWN;
		}

		if (record->type & ARM_SPE_BRANCH_MISS) {
			bs_flags->mispred = 1;
			bs_flags->predicted = 0;
		} else {
			bs_flags->mispred = 0;
			bs_flags->predicted = 1;
		}

		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
			bs_flags->not_taken = 1;

		if (record->type & ARM_SPE_IN_TXN)
			bs_flags->in_tx = 1;

		bs_flags->cycles = min(record->latency, 0xFFFFU);
		i++;
	}

	if (have_pbt) {
		bs_flags = &bstack->entries[i].flags;
		bs_flags->type |= PERF_BR_UNKNOWN;
		bstack->entries[i].to = record->prev_br_tgt;
		i++;
	}

	bstack->nr = i;
	bstack->hw_idx = -1ULL;
}

static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
	event->header.size = perf_event__sample_event_size(sample, type, 0);
	return perf_event__synthesize_sample(event, type, 0, sample);
}

static inline int
arm_spe_deliver_synth_event(struct arm_spe *spe,
			    struct arm_spe_queue *speq __maybe_unused,
			    union perf_event *event,
			    struct perf_sample *sample)
{
	int ret;

	if (spe->synth_opts.inject) {
		ret = arm_spe__inject_event(event, sample, spe->sample_type);
		if (ret)
			return ret;
	}

	ret = perf_session__deliver_synth_event(spe->session, event, sample);
	if (ret)
		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);

	return ret;
}

static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
				     u64 spe_events_id, u64 data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->virt_addr;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src;
	sample.weight = record->latency;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
					u64 spe_events_id)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
					     u64 spe_events_id, u64 data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src;
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static const struct midr_range common_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
	{},
};

static const struct midr_range ampereone_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
	{},
};

static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
	{},
};

static void arm_spe__sample_flags(struct arm_spe_queue *speq)
{
	const struct arm_spe_record *record = &speq->decoder->record;

	speq->flags = 0;
	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
		speq->flags = PERF_IP_FLAG_BRANCH;

		if (record->type & ARM_SPE_BRANCH_MISS)
			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;

		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;

		if (record->type & ARM_SPE_IN_TXN)
			speq->flags |= PERF_IP_FLAG_IN_TX;

		if (record->op & ARM_SPE_OP_BR_COND)
			speq->flags |= PERF_IP_FLAG_CONDITIONAL;

		if (record->op & ARM_SPE_OP_BR_CR_BL)
			speq->flags |= PERF_IP_FLAG_CALL;
		else if (record->op & ARM_SPE_OP_BR_CR_RET)
			speq->flags |= PERF_IP_FLAG_RETURN;
		/*
		 * Indirect branch instruction without link (e.g. BR),
		 * take it as a function return.
		 */
		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
			speq->flags |= PERF_IP_FLAG_RETURN;
	}
}

static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
					      union perf_mem_data_src *data_src)
{
	/*
	 * Even though four levels of cache hierarchy are possible, no known
	 * production Neoverse systems currently include more than three
	 * levels, so for the time being we assume three exist. If a
	 * production system is built with four then this function would have
	 * to be changed to detect the number of levels for reporting.
	 */

	/*
	 * We have no data on the hit level or data source for stores in the
	 * Neoverse SPE records.
	 */
	if (record->op & ARM_SPE_OP_ST) {
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
		return;
	}

	switch (record->source) {
	case ARM_SPE_COMMON_DS_L1D:
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_L2:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_PEER_CORE:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * We don't know if this is L1 or L2, but we do know it was a
	 * cache-to-cache transfer, so set SNOOPX_PEER.
	 */
	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * System cache is assumed to be L3
	 */
	case ARM_SPE_COMMON_DS_SYS_CACHE:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		break;
	/*
	 * We don't know what level it hit in, except it came from the other
	 * socket
	 */
	case ARM_SPE_COMMON_DS_REMOTE:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_COMMON_DS_DRAM:
		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	default:
		break;
	}
}

/*
 * Source is IMPDEF. Here we convert the data source code used on AmpereOne
 * cores to the common (Neoverse, Cortex) encoding to avoid duplicating the
 * decoding code.
 */
static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
						 union perf_mem_data_src *data_src)
{
	struct arm_spe_record common_record;

	switch (record->source) {
	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
		break;
	case ARM_SPE_AMPEREONE_SLC:
		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
		break;
	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
		break;
	case ARM_SPE_AMPEREONE_DDR:
		common_record.source = ARM_SPE_COMMON_DS_DRAM;
		break;
	case ARM_SPE_AMPEREONE_L1D:
		common_record.source = ARM_SPE_COMMON_DS_L1D;
		break;
	case ARM_SPE_AMPEREONE_L2D:
		common_record.source = ARM_SPE_COMMON_DS_L2;
		break;
	default:
		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
				record->source);
		return;
	}

	common_record.op = record->op;
	arm_spe__synth_data_source_common(&common_record, data_src);
}

static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
						union perf_mem_data_src *data_src)
{
	/* Use common synthesis method to handle store operations */
	if (record->op & ARM_SPE_OP_ST) {
		arm_spe__synth_data_source_common(record, data_src);
		return;
	}

	switch (record->source) {
	case ARM_SPE_HISI_HIP_PEER_CPU:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_L3:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		break;
	case ARM_SPE_HISI_HIP_L3_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		break;
	case ARM_SPE_HISI_HIP_PEER_CLUSTER:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_HISI_HIP_LOCAL_MEM:
		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_REMOTE_MEM:
		data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		break;
	case ARM_SPE_HISI_HIP_NC_DEV:
		data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_L2:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_HISI_HIP_L2_HITM:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
		break;
	case ARM_SPE_HISI_HIP_L1:
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	default:
		break;
	}
}

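/*
 * Dispatch table for decoding the data source packet: the first entry whose
 * MIDR range list matches the sampled CPU provides the synthesis callback
 * for that implementation's encoding.
 */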
static const struct data_source_handle data_source_handles[] = {
	DS(common_ds_encoding_cpus, data_source_common),
	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
	DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
};

static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
					union perf_mem_data_src *data_src)
{
	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3;

		if (record->type & ARM_SPE_LLC_MISS)
			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
		else
			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1;

		if (record->type & ARM_SPE_L1D_MISS)
			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
		else
			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
	}

	if (record->type & ARM_SPE_REMOTE_ACCESS)
		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
}

static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
			      const struct arm_spe_record *record,
			      union perf_mem_data_src *data_src)
{
	struct arm_spe *spe = speq->spe;
	u64 *metadata = NULL;
	u64 midr;
	unsigned int i;

	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
	if (spe->metadata_ver == 1) {
		const char *cpuid;

		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
		cpuid = perf_env__cpuid(perf_session__env(spe->session));
		midr = strtol(cpuid, NULL, 16);
	} else {
		/* CPU ID is -1 for per-thread mode */
		if (speq->cpu < 0) {
			/*
			 * On a heterogeneous system the CPU ID is -1, so we
			 * cannot tell whether the data source packet encoding
			 * is supported.
			 */
			if (!spe->is_homogeneous)
				return false;

			/* In a homogeneous system, simply use CPU0's metadata */
			if (spe->metadata)
				metadata = spe->metadata[0];
		} else {
			metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
		}

		if (!metadata)
			return false;

		midr = metadata[ARM_SPE_CPU_MIDR];
	}

	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
			data_source_handles[i].ds_synth(record, data_src);
			return true;
		}
	}

	return false;
}

static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
				      const struct arm_spe_record *record)
{
	union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };

	/* Only synthesize data source for LDST operations */
	if (!is_ldst_op(record->op))
		return 0;

	if (record->op & ARM_SPE_OP_LD)
		data_src.mem_op = PERF_MEM_OP_LOAD;
	else if (record->op & ARM_SPE_OP_ST)
		data_src.mem_op = PERF_MEM_OP_STORE;
	else
		return 0;

	if (!arm_spe__synth_ds(speq, record, &data_src))
		arm_spe__synth_memory_level(record, &data_src);

	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
		data_src.mem_dtlb = PERF_MEM_TLB_WK;

		if (record->type & ARM_SPE_TLB_MISS)
			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
		else
			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
	}

	return data_src.val;
}

static int arm_spe_sample(struct arm_spe_queue *speq)
{
	const struct arm_spe_record *record = &speq->decoder->record;
	struct arm_spe *spe = speq->spe;
	u64 data_src;
	int err;

	/*
	 * Discard all samples until period is reached
	 */
	speq->sample_count++;
	if (speq->sample_count < spe->synth_opts.period)
		return 0;
	speq->sample_count = 0;

	arm_spe__sample_flags(speq);
	data_src = arm_spe__synth_data_source(speq, record);

	if (spe->sample_flc) {
		if (record->type & ARM_SPE_L1D_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_L1D_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->sample_llc) {
		if (record->type & ARM_SPE_LLC_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_LLC_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->sample_tlb) {
		if (record->type & ARM_SPE_TLB_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_TLB_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->synth_opts.last_branch &&
	    (spe->sample_branch || spe->sample_instructions))
		arm_spe__prep_branch_stack(speq);

	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
		if (err)
			return err;
	}

	if (spe->sample_remote_access &&
	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
						data_src);
		if (err)
			return err;
	}

	/*
	 * When data_src is zero the record is not a memory operation, so skip
	 * synthesizing a memory sample in that case.
	 */
	if (spe->sample_memory && is_ldst_op(record->op)) {
		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
		if (err)
			return err;
	}

	if (spe->sample_instructions) {
		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
		if (err)
			return err;
	}

	return 0;
}
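
/*
 * Run the decoder for one queue. Returns a positive value once all trace
 * data for the queue has been consumed, 0 when decoding stops because the
 * queue's timestamp has caught up with *timestamp (which is then updated),
 * or a negative error code.
 */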
static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record;
	int ret;

	if (!spe->kernel_start)
		spe->kernel_start = machine__kernel_start(spe->machine);

	while (1) {
		/*
		 * The usual logic is first to decode the packets and then
		 * synthesize a sample based on the record; but here the flow
		 * is reversed: arm_spe_sample() is called to synthesize
		 * samples prior to arm_spe_decode().
		 *
		 * Two reasons for this code logic:
		 * 1. When the queue is set up in arm_spe__setup_queue(), the
		 * trace data has already been decoded and a record generated,
		 * but no sample has been synthesized for it yet; that leftover
		 * record is turned into a sample here.
		 * 2. After decoding trace data, the record's timestamp must be
		 * compared against the incoming perf event; if the record's
		 * timestamp is later, we bail out and push the record onto the
		 * auxtrace heap, so synthesizing its sample is deferred until
		 * the next time this function runs. This keeps samples from
		 * the Arm SPE trace data and other perf events correlated in
		 * correct time order.
		 */

		/*
		 * Update pid/tid info.
		 */
		record = &speq->decoder->record;
		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
			ret = arm_spe_set_tid(speq, record->context_id);
			if (ret)
				return ret;

			spe->use_ctx_pkt_for_pid = true;
		}

		ret = arm_spe_sample(speq);
		if (ret)
			return ret;

		ret = arm_spe_decode(speq->decoder);
		if (!ret) {
			pr_debug("No data or all data has been processed.\n");
			return 1;
		}

		/*
		 * If an error is detected while decoding the SPE trace data,
		 * continue with the next chunk of trace data to find more
		 * records.
		 */
		if (ret < 0)
			continue;

		record = &speq->decoder->record;

		/* Update timestamp for the last record */
		if (record->timestamp > speq->timestamp)
			speq->timestamp = record->timestamp;

		/*
		 * If the timestamp of the queue is later than the timestamp of
		 * the coming perf event, bail out so the perf event can be
		 * processed ahead of the trace.
		 */
		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
			*timestamp = speq->timestamp;
			return 0;
		}
	}

	return 0;
}

static int arm_spe__setup_queue(struct arm_spe *spe,
				struct auxtrace_queue *queue,
				unsigned int queue_nr)
{
	struct arm_spe_queue *speq = queue->priv;
	struct arm_spe_record *record;

	if (list_empty(&queue->head) || speq)
		return 0;

	speq = arm_spe__alloc_queue(spe, queue_nr);

	if (!speq)
		return -ENOMEM;

	queue->priv = speq;

	if (queue->cpu != -1)
		speq->cpu = queue->cpu;

	if (!speq->on_heap) {
		int ret;

		if (spe->timeless_decoding)
			return 0;

retry:
		ret = arm_spe_decode(speq->decoder);

		if (!ret)
			return 0;

		if (ret < 0)
			goto retry;

		record = &speq->decoder->record;

		speq->timestamp = record->timestamp;
		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
		if (ret)
			return ret;
		speq->on_heap = true;
	}

	return 0;
}

static int arm_spe__setup_queues(struct arm_spe *spe)
{
	unsigned int i;
	int ret;

	for (i = 0; i < spe->queues.nr_queues; i++) {
		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
		if (ret)
			return ret;
	}

	return 0;
}

static int arm_spe__update_queues(struct arm_spe *spe)
{
	if (spe->queues.new_data) {
		spe->queues.new_data = false;
		return arm_spe__setup_queues(spe);
	}

	return 0;
}

static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
{
	struct evsel *evsel;
	struct evlist *evlist = spe->session->evlist;
	bool timeless_decoding = true;

	/*
	 * Loop through the events and assume non-timeless decoding if any of
	 * them has the time bit set.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
			timeless_decoding = false;
	}

	return timeless_decoding;
}

static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
{
	unsigned int queue_nr;
	u64 ts;
	int ret;

	while (1) {
		struct auxtrace_queue *queue;
		struct arm_spe_queue *speq;

		if (!spe->heap.heap_cnt)
			return 0;

		if (spe->heap.heap_array[0].ordinal >= timestamp)
			return 0;

		queue_nr = spe->heap.heap_array[0].queue_nr;
		queue = &spe->queues.queue_array[queue_nr];
		speq = queue->priv;

		auxtrace_heap__pop(&spe->heap);

		if (spe->heap.heap_cnt) {
			ts = spe->heap.heap_array[0].ordinal + 1;
			if (ts > timestamp)
				ts = timestamp;
		} else {
			ts = timestamp;
		}

		/*
		 * Without CONTEXT packets, a previous context-switch event has
		 * set the pid/tid in the machine's context, so update the
		 * pid/tid in the thread and SPE queue from there.
		 */
		if (!spe->use_ctx_pkt_for_pid)
			arm_spe_set_pid_tid_cpu(spe, queue);

		ret = arm_spe_run_decoder(speq, &ts);
		if (ret < 0) {
			auxtrace_heap__add(&spe->heap, queue_nr, ts);
			return ret;
		}

		if (!ret) {
			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
			if (ret < 0)
				return ret;
		} else {
			speq->on_heap = false;
		}
	}

	return 0;
}

static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
					   u64 time_)
{
	struct auxtrace_queues *queues = &spe->queues;
	unsigned int i;
	u64 ts = 0;

	for (i = 0; i < queues->nr_queues; i++) {
		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
		struct arm_spe_queue *speq = queue->priv;

		if (speq && (tid == -1 || speq->tid == tid)) {
			speq->time = time_;
			arm_spe_set_pid_tid_cpu(spe, queue);
			arm_spe_run_decoder(speq, &ts);
		}
	}
	return 0;
}

static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
				  struct perf_sample *sample)
{
	pid_t pid, tid;
	int cpu;

	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
		return 0;

	pid = event->context_switch.next_prev_pid;
	tid = event->context_switch.next_prev_tid;
	cpu = sample->cpu;

	if (tid == -1)
		pr_warning("context_switch event has no tid\n");

	return machine__set_current_tid(spe->machine, cpu, pid, tid);
}

static int arm_spe_process_event(struct perf_session *session,
				 union perf_event *event,
				 struct perf_sample *sample,
				 const struct perf_tool *tool)
{
	int err = 0;
	u64 timestamp;
	struct arm_spe *spe = container_of(session->auxtrace,
					   struct arm_spe, auxtrace);

	if (dump_trace)
		return 0;

	if (!tool->ordered_events) {
		pr_err("SPE trace requires ordered events\n");
		return -EINVAL;
	}

	if (sample->time && (sample->time != (u64) -1))
		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
	else
		timestamp = 0;

	if (timestamp || spe->timeless_decoding) {
		err = arm_spe__update_queues(spe);
		if (err)
			return err;
	}

	if (spe->timeless_decoding) {
		if (event->header.type == PERF_RECORD_EXIT) {
			err = arm_spe_process_timeless_queues(spe,
							      event->fork.tid,
							      sample->time);
		}
	} else if (timestamp) {
		err = arm_spe_process_queues(spe, timestamp);
		if (err)
			return err;

		if (!spe->use_ctx_pkt_for_pid &&
		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
		     event->header.type == PERF_RECORD_SWITCH))
			err = arm_spe_context_switch(spe, event, sample);
	}

	return err;
}

static int arm_spe_process_auxtrace_event(struct perf_session *session,
					  union perf_event *event,
					  const struct perf_tool *tool __maybe_unused)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);

	if (!spe->data_queued) {
		struct auxtrace_buffer *buffer;
		off_t data_offset;
		int fd = perf_data__fd(session->data);
		int err;

		if (perf_data__is_pipe(session->data)) {
			data_offset = 0;
		} else {
			data_offset = lseek(fd, 0, SEEK_CUR);
			if (data_offset == -1)
				return -errno;
		}

		err = auxtrace_queues__add_event(&spe->queues, session, event,
						 data_offset, &buffer);
		if (err)
			return err;

		/* Dump here now that we have copied a piped trace out of the pipe */
		if (dump_trace) {
			if (auxtrace_buffer__get_data(buffer, fd)) {
				arm_spe_dump_event(spe, buffer->data,
						   buffer->size);
				auxtrace_buffer__put_data(buffer);
			}
		}
	}

	return 0;
}

static int arm_spe_flush(struct perf_session *session __maybe_unused,
			 const struct perf_tool *tool __maybe_unused)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);
	int ret;

	if (dump_trace)
		return 0;

	if (!tool->ordered_events)
		return -EINVAL;

	ret = arm_spe__update_queues(spe);
	if (ret < 0)
		return ret;

	if (spe->timeless_decoding)
		return arm_spe_process_timeless_queues(spe, -1,
						       MAX_TIMESTAMP - 1);

	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
	if (ret)
		return ret;

	if (!spe->use_ctx_pkt_for_pid)
		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
			    "Matching of TIDs to SPE events could be inaccurate.\n");

	return 0;
}

static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
{
	u64 *metadata;

	metadata = zalloc(per_cpu_size);
	if (!metadata)
		return NULL;

	memcpy(metadata, buf, per_cpu_size);
	return metadata;
}

static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
{
	int i;

	for (i = 0; i < nr_cpu; i++)
		zfree(&metadata[i]);
	free(metadata);
}

static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
				     u64 *ver, int *nr_cpu)
{
	u64 *ptr = (u64 *)info->priv;
	u64 metadata_size;
	u64 **metadata = NULL;
	int hdr_sz, per_cpu_sz, i;

	metadata_size = info->header.size -
		sizeof(struct perf_record_auxtrace_info);

	/* Metadata version 1 */
	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
		*ver = 1;
		*nr_cpu = 0;
		/* No per CPU metadata */
		return NULL;
	}

	*ver = ptr[ARM_SPE_HEADER_VERSION];
	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];

	metadata = calloc(*nr_cpu, sizeof(*metadata));
	if (!metadata)
		return NULL;

	/* Locate the start address of per CPU metadata */
	ptr += hdr_sz;
	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);

	for (i = 0; i < *nr_cpu; i++) {
		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
		if (!metadata[i])
			goto err_per_cpu_metadata;

		ptr += per_cpu_sz / sizeof(u64);
	}

	return metadata;

err_per_cpu_metadata:
	arm_spe__free_metadata(metadata, *nr_cpu);
	return NULL;
}

static void arm_spe_free_queue(void *priv)
{
	struct arm_spe_queue *speq = priv;

	if (!speq)
		return;
	thread__zput(speq->thread);
	arm_spe_decoder_free(speq->decoder);
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);
}

static void arm_spe_free_events(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);
	struct auxtrace_queues *queues = &spe->queues;
	unsigned int i;

	for (i = 0; i < queues->nr_queues; i++) {
		arm_spe_free_queue(queues->queue_array[i].priv);
		queues->queue_array[i].priv = NULL;
	}
	auxtrace_queues__free(queues);
}

static void arm_spe_free(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);

	auxtrace_heap__free(&spe->heap);
	arm_spe_free_events(session);
	session->auxtrace = NULL;
	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
	free(spe);
}

static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
				      struct evsel *evsel)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);

	return evsel->core.attr.type == spe->pmu_type;
}

static const char * const metadata_hdr_v1_fmts[] = {
	[ARM_SPE_PMU_TYPE]		= " PMU Type :%"PRId64"\n",
	[ARM_SPE_PER_CPU_MMAPS]		= " Per CPU mmaps :%"PRId64"\n",
};

static const char * const metadata_hdr_fmts[] = {
	[ARM_SPE_HEADER_VERSION]	= " Header version :%"PRId64"\n",
	[ARM_SPE_HEADER_SIZE]		= " Header size :%"PRId64"\n",
	[ARM_SPE_PMU_TYPE_V2]		= " PMU type v2 :%"PRId64"\n",
	[ARM_SPE_CPUS_NUM]		= " CPU number :%"PRId64"\n",
};

static const char * const metadata_per_cpu_fmts[] = {
	[ARM_SPE_MAGIC]			= " Magic :0x%"PRIx64"\n",
	[ARM_SPE_CPU]			= " CPU # :%"PRId64"\n",
	[ARM_SPE_CPU_NR_PARAMS]		= " Num of params :%"PRId64"\n",
	[ARM_SPE_CPU_MIDR]		= " MIDR :0x%"PRIx64"\n",
	[ARM_SPE_CPU_PMU_TYPE]		= " PMU Type :%"PRId64"\n",
	[ARM_SPE_CAP_MIN_IVAL]		= " Min Interval :%"PRId64"\n",
};

static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
{
	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
	const char * const *hdr_fmts;

	if (!dump_trace)
		return;

	if (spe->metadata_ver == 1) {
		cpu_num = 0;
		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
		hdr_fmts = metadata_hdr_v1_fmts;
	} else {
		cpu_num = arr[ARM_SPE_CPUS_NUM];
		hdr_size = arr[ARM_SPE_HEADER_SIZE];
		hdr_fmts = metadata_hdr_fmts;
	}

	for (i = 0; i < hdr_size; i++)
		fprintf(stdout, hdr_fmts[i], arr[i]);

	arr += hdr_size;
	for (cpu = 0; cpu < cpu_num; cpu++) {
		/*
		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
		 * are fixed. The number of subsequent parameters is given by
		 * the field 'ARM_SPE_CPU_NR_PARAMS'.
		 */
		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
		for (i = 0; i < cpu_size; i++)
			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
		arr += cpu_size;
	}
}

static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
				   const char *name)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.id && evsel->core.id[0] == id) {
			if (evsel->name)
				zfree(&evsel->name);
			evsel->name = strdup(name);
			break;
		}
	}
}

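/*
 * Synthesize one event attribute per requested sample type (l1d/llc/tlb
 * miss and access, branch, remote access, memory, instructions). Each
 * synthetic event gets an id at a fixed offset from the SPE evsel's first
 * id so that samples can be routed back to the right synthetic event.
 */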
static int
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
{
	struct evlist *evlist = session->evlist;
	struct evsel *evsel;
	struct perf_event_attr attr;
	bool found = false;
	u64 id;
	int err;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.type == spe->pmu_type) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_debug("No selected events with SPE trace data\n");
		return 0;
	}

	memset(&attr, 0, sizeof(struct perf_event_attr));
	attr.size = sizeof(struct perf_event_attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.sample_type = evsel->core.attr.sample_type &
				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
	if (spe->timeless_decoding)
		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
	else
		attr.sample_type |= PERF_SAMPLE_TIME;

	spe->sample_type = attr.sample_type;

	attr.exclude_user = evsel->core.attr.exclude_user;
	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
	attr.exclude_hv = evsel->core.attr.exclude_hv;
	attr.exclude_host = evsel->core.attr.exclude_host;
	attr.exclude_guest = evsel->core.attr.exclude_guest;
	attr.sample_id_all = evsel->core.attr.sample_id_all;
	attr.read_format = evsel->core.attr.read_format;
	attr.sample_period = spe->synth_opts.period;

	/* create new id val to be a fixed offset from evsel id */
	id = evsel->core.id[0] + 1000000000;

	if (!id)
		id = 1;

	if (spe->synth_opts.flc) {
		spe->sample_flc = true;

		/* Level 1 data cache miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->l1d_miss_id = id;
		arm_spe_set_event_name(evlist, id, "l1d-miss");
		id += 1;

		/* Level 1 data cache access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->l1d_access_id = id;
		arm_spe_set_event_name(evlist, id, "l1d-access");
		id += 1;
	}

	if (spe->synth_opts.llc) {
		spe->sample_llc = true;

		/* Last level cache miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->llc_miss_id = id;
		arm_spe_set_event_name(evlist, id, "llc-miss");
		id += 1;

		/* Last level cache access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->llc_access_id = id;
		arm_spe_set_event_name(evlist, id, "llc-access");
		id += 1;
	}

	if (spe->synth_opts.tlb) {
		spe->sample_tlb = true;

		/* TLB miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->tlb_miss_id = id;
		arm_spe_set_event_name(evlist, id, "tlb-miss");
		id += 1;

		/* TLB access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->tlb_access_id = id;
		arm_spe_set_event_name(evlist, id, "tlb-access");
		id += 1;
	}

	if (spe->synth_opts.last_branch) {
		if (spe->synth_opts.last_branch_sz > 2)
			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");

		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
		/*
		 * We don't use the hardware index, but the sample generation
		 * code uses the new format branch_stack with this field,
		 * so the event attributes must indicate that it's present.
		 */
		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
	}

	if (spe->synth_opts.branches) {
		spe->sample_branch = true;

		/* Branch */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->branch_id = id;
		arm_spe_set_event_name(evlist, id, "branch");
		id += 1;
	}

	if (spe->synth_opts.remote_access) {
		spe->sample_remote_access = true;

		/* Remote access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->remote_access_id = id;
		arm_spe_set_event_name(evlist, id, "remote-access");
		id += 1;
	}

	if (spe->synth_opts.mem) {
		spe->sample_memory = true;

		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->memory_id = id;
		arm_spe_set_event_name(evlist, id, "memory");
		id += 1;
	}

	if (spe->synth_opts.instructions) {
		spe->sample_instructions = true;
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;

		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->instructions_id = id;
		arm_spe_set_event_name(evlist, id, "instructions");
	}

	return 0;
}

static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
{
	u64 midr;
	int i;

	if (!nr_cpu)
		return false;

	for (i = 0; i < nr_cpu; i++) {
		if (!metadata[i])
			return false;

		if (i == 0) {
			midr = metadata[i][ARM_SPE_CPU_MIDR];
			continue;
		}

		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
			return false;
	}

	return true;
}

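/*
 * Set up decoding for a session that contains SPE data: parse the metadata
 * (version 1 has a fixed-size header, version 2 and later carry per-CPU
 * blocks), initialize the TSC conversion parameters, register the auxtrace
 * callbacks and synthesize the requested events. As an illustration only
 * (the PMU name and options may differ per system), such data typically
 * comes from something like:
 *
 *   perf record -e arm_spe_0// -- <workload>
 *   perf report --itrace=i1i
 */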
int arm_spe_process_auxtrace_info(union perf_event *event,
				  struct perf_session *session)
{
	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
	struct perf_record_time_conv *tc = &session->time_conv;
	struct arm_spe *spe;
	u64 **metadata = NULL;
	u64 metadata_ver;
	int nr_cpu, err;

	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
					 min_sz)
		return -EINVAL;

	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
					   &nr_cpu);
	if (!metadata && metadata_ver != 1) {
		pr_err("Failed to parse Arm SPE metadata.\n");
		return -EINVAL;
	}

	spe = zalloc(sizeof(struct arm_spe));
	if (!spe) {
		err = -ENOMEM;
		goto err_free_metadata;
	}

	err = auxtrace_queues__init(&spe->queues);
	if (err)
		goto err_free;

	spe->session = session;
	spe->machine = &session->machines.host; /* No kvm support */
	spe->auxtrace_type = auxtrace_info->type;
	if (metadata_ver == 1)
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
	else
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
	spe->metadata = metadata;
	spe->metadata_ver = metadata_ver;
	spe->metadata_nr_cpu = nr_cpu;
	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);

	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);

	/*
	 * The synthesized event PERF_RECORD_TIME_CONV has already been handled
	 * and the parameters for the hardware clock are stored in the session
	 * context. Pass these parameters to the struct perf_tsc_conversion in
	 * "spe->tc", which is used later for conversion between the clock
	 * counter and timestamps.
	 *
	 * For backward compatibility, copy the fields starting from
	 * "time_cycles" only if they are contained in the event.
	 */
	spe->tc.time_shift = tc->time_shift;
	spe->tc.time_mult = tc->time_mult;
	spe->tc.time_zero = tc->time_zero;

	if (event_contains(*tc, time_cycles)) {
		spe->tc.time_cycles = tc->time_cycles;
		spe->tc.time_mask = tc->time_mask;
		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
		spe->tc.cap_user_time_short = tc->cap_user_time_short;
	}

	spe->auxtrace.process_event = arm_spe_process_event;
	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
	spe->auxtrace.flush_events = arm_spe_flush;
	spe->auxtrace.free_events = arm_spe_free_events;
	spe->auxtrace.free = arm_spe_free;
	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
	session->auxtrace = &spe->auxtrace;

	arm_spe_print_info(spe, &auxtrace_info->priv[0]);

	if (dump_trace)
		return 0;

	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
		spe->synth_opts = *session->itrace_synth_opts;
	} else {
		itrace_synth_opts__set_default(&spe->synth_opts, false);
		/* Default nanoseconds period not supported */
		spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
		spe->synth_opts.period = 1;
	}

	if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
		ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n");
		err = -EINVAL;
		goto err_free_queues;
	}
	if (spe->synth_opts.period > 1)
		ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
			    "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");

	err = arm_spe_synth_events(spe, session);
	if (err)
		goto err_free_queues;

	err = auxtrace_queues__process_index(&spe->queues, session);
	if (err)
		goto err_free_queues;

	if (spe->queues.populated)
		spe->data_queued = true;

	return 0;

err_free_queues:
	auxtrace_queues__free(&spe->queues);
	session->auxtrace = NULL;
err_free:
	free(spe);
err_free_metadata:
	arm_spe__free_metadata(metadata, nr_cpu);
	return err;
}