1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Arm Statistical Profiling Extensions (SPE) support 4 * Copyright (c) 2017-2018, Arm Ltd. 5 */ 6 7 #include <byteswap.h> 8 #include <endian.h> 9 #include <errno.h> 10 #include <inttypes.h> 11 #include <linux/bitops.h> 12 #include <linux/kernel.h> 13 #include <linux/log2.h> 14 #include <linux/types.h> 15 #include <linux/zalloc.h> 16 #include <stdlib.h> 17 #include <unistd.h> 18 19 #include "auxtrace.h" 20 #include "color.h" 21 #include "debug.h" 22 #include "evlist.h" 23 #include "evsel.h" 24 #include "machine.h" 25 #include "session.h" 26 #include "symbol.h" 27 #include "thread.h" 28 #include "thread-stack.h" 29 #include "tsc.h" 30 #include "tool.h" 31 #include "util/synthetic-events.h" 32 33 #include "arm-spe.h" 34 #include "arm-spe-decoder/arm-spe-decoder.h" 35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h" 36 37 #include "../../arch/arm64/include/asm/cputype.h" 38 #define MAX_TIMESTAMP (~0ULL) 39 40 #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) 41 42 #define is_simd_op(op) (!!((op) & (ARM_SPE_OP_SIMD_FP | ARM_SPE_OP_SVE | \ 43 ARM_SPE_OP_SME | ARM_SPE_OP_ASE))) 44 45 #define is_mem_op(op) (is_ldst_op(op) || is_simd_op(op)) 46 47 #define ARM_SPE_CACHE_EVENT(lvl) \ 48 (ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS) 49 50 #define arm_spe_is_cache_level(type, lvl) \ 51 ((type) & ARM_SPE_CACHE_EVENT(lvl)) 52 53 #define arm_spe_is_cache_hit(type, lvl) \ 54 (((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS) 55 56 #define arm_spe_is_cache_miss(type, lvl) \ 57 ((type) & ARM_SPE_##lvl##_MISS) 58 59 struct arm_spe { 60 struct auxtrace auxtrace; 61 struct auxtrace_queues queues; 62 struct auxtrace_heap heap; 63 struct itrace_synth_opts synth_opts; 64 u32 auxtrace_type; 65 struct perf_session *session; 66 struct machine *machine; 67 u32 pmu_type; 68 69 struct perf_tsc_conversion tc; 70 71 u8 timeless_decoding; 72 u8 data_queued; 73 74 u64 sample_type; 75 u8 sample_flc; 76 u8 sample_llc; 77 u8 sample_tlb; 78 u8 sample_branch; 79 u8 sample_remote_access; 80 u8 sample_memory; 81 u8 sample_instructions; 82 83 u64 l1d_miss_id; 84 u64 l1d_access_id; 85 u64 llc_miss_id; 86 u64 llc_access_id; 87 u64 tlb_miss_id; 88 u64 tlb_access_id; 89 u64 branch_id; 90 u64 remote_access_id; 91 u64 memory_id; 92 u64 instructions_id; 93 94 u64 kernel_start; 95 96 unsigned long num_events; 97 u8 use_ctx_pkt_for_pid; 98 99 u64 **metadata; 100 u64 metadata_ver; 101 u64 metadata_nr_cpu; 102 bool is_homogeneous; 103 }; 104 105 struct arm_spe_queue { 106 struct arm_spe *spe; 107 unsigned int queue_nr; 108 struct auxtrace_buffer *buffer; 109 struct auxtrace_buffer *old_buffer; 110 union perf_event *event_buf; 111 bool on_heap; 112 bool done; 113 pid_t pid; 114 pid_t tid; 115 int cpu; 116 struct arm_spe_decoder *decoder; 117 u64 time; 118 u64 timestamp; 119 struct thread *thread; 120 u64 sample_count; 121 u32 flags; 122 struct branch_stack *last_branch; 123 }; 124 125 struct data_source_handle { 126 const struct midr_range *midr_ranges; 127 void (*ds_synth)(const struct arm_spe_record *record, 128 union perf_mem_data_src *data_src); 129 }; 130 131 #define DS(range, func) \ 132 { \ 133 .midr_ranges = range, \ 134 .ds_synth = arm_spe__synth_##func, \ 135 } 136 137 static int arm_spe__get_midr(struct arm_spe *spe, int cpu, u64 *midr); 138 139 static void arm_spe_dump(struct arm_spe *spe __maybe_unused, 140 unsigned char *buf, size_t len, u64 midr) 141 { 142 struct arm_spe_pkt packet; 143 size_t pos = 0; 144 int ret, pkt_len, i; 145 char desc[ARM_SPE_PKT_DESC_MAX]; 146 const char *color = PERF_COLOR_BLUE; 147 148 color_fprintf(stdout, color, 149 ". ... ARM SPE data: size %#zx bytes\n", 150 len); 151 152 while (len) { 153 ret = arm_spe_get_packet(buf, len, &packet, midr); 154 155 if (ret > 0) 156 pkt_len = ret; 157 else 158 pkt_len = 1; 159 printf("."); 160 color_fprintf(stdout, color, " %08zx: ", pos); 161 for (i = 0; i < pkt_len; i++) 162 color_fprintf(stdout, color, " %02x", buf[i]); 163 for (; i < 16; i++) 164 color_fprintf(stdout, color, " "); 165 if (ret > 0) { 166 ret = arm_spe_pkt_desc(&packet, desc, 167 ARM_SPE_PKT_DESC_MAX); 168 if (!ret) 169 color_fprintf(stdout, color, " %s\n", desc); 170 } else { 171 color_fprintf(stdout, color, " Bad packet!\n"); 172 } 173 pos += pkt_len; 174 buf += pkt_len; 175 len -= pkt_len; 176 } 177 } 178 179 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf, 180 size_t len, u64 midr) 181 { 182 printf(".\n"); 183 arm_spe_dump(spe, buf, len, midr); 184 } 185 186 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data) 187 { 188 struct arm_spe_queue *speq = data; 189 struct auxtrace_buffer *buffer = speq->buffer; 190 struct auxtrace_buffer *old_buffer = speq->old_buffer; 191 struct auxtrace_queue *queue; 192 193 queue = &speq->spe->queues.queue_array[speq->queue_nr]; 194 195 buffer = auxtrace_buffer__next(queue, buffer); 196 /* If no more data, drop the previous auxtrace_buffer and return */ 197 if (!buffer) { 198 if (old_buffer) 199 auxtrace_buffer__drop_data(old_buffer); 200 b->len = 0; 201 return 0; 202 } 203 204 speq->buffer = buffer; 205 206 /* If the aux_buffer doesn't have data associated, try to load it */ 207 if (!buffer->data) { 208 /* get the file desc associated with the perf data file */ 209 int fd = perf_data__fd(speq->spe->session->data); 210 211 buffer->data = auxtrace_buffer__get_data(buffer, fd); 212 if (!buffer->data) 213 return -ENOMEM; 214 } 215 216 b->len = buffer->size; 217 b->buf = buffer->data; 218 219 if (b->len) { 220 if (old_buffer) 221 auxtrace_buffer__drop_data(old_buffer); 222 speq->old_buffer = buffer; 223 } else { 224 auxtrace_buffer__drop_data(buffer); 225 return arm_spe_get_trace(b, data); 226 } 227 228 return 0; 229 } 230 231 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, 232 unsigned int queue_nr) 233 { 234 struct arm_spe_params params = { .get_trace = 0, }; 235 struct arm_spe_queue *speq; 236 237 speq = zalloc(sizeof(*speq)); 238 if (!speq) 239 return NULL; 240 241 speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); 242 if (!speq->event_buf) 243 goto out_free; 244 245 speq->spe = spe; 246 speq->queue_nr = queue_nr; 247 speq->pid = -1; 248 speq->tid = -1; 249 speq->cpu = -1; 250 251 /* params set */ 252 params.get_trace = arm_spe_get_trace; 253 params.data = speq; 254 255 if (spe->synth_opts.last_branch) { 256 size_t sz = sizeof(struct branch_stack); 257 258 /* Allocate up to two entries for PBT + TGT */ 259 sz += sizeof(struct branch_entry) * 260 min(spe->synth_opts.last_branch_sz, 2U); 261 speq->last_branch = zalloc(sz); 262 if (!speq->last_branch) 263 goto out_free; 264 } 265 266 /* create new decoder */ 267 speq->decoder = arm_spe_decoder_new(¶ms); 268 if (!speq->decoder) 269 goto out_free; 270 271 return speq; 272 273 out_free: 274 zfree(&speq->event_buf); 275 zfree(&speq->last_branch); 276 free(speq); 277 278 return NULL; 279 } 280 281 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip) 282 { 283 return ip >= spe->kernel_start ? 284 PERF_RECORD_MISC_KERNEL : 285 PERF_RECORD_MISC_USER; 286 } 287 288 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe, 289 struct auxtrace_queue *queue) 290 { 291 struct arm_spe_queue *speq = queue->priv; 292 pid_t tid; 293 294 tid = machine__get_current_tid(spe->machine, speq->cpu); 295 if (tid != -1) { 296 speq->tid = tid; 297 thread__zput(speq->thread); 298 } else 299 speq->tid = queue->tid; 300 301 if ((!speq->thread) && (speq->tid != -1)) { 302 speq->thread = machine__find_thread(spe->machine, -1, 303 speq->tid); 304 } 305 306 if (speq->thread) { 307 speq->pid = thread__pid(speq->thread); 308 if (queue->cpu == -1) { 309 speq->cpu = thread__cpu(speq->thread); 310 arm_spe__get_midr(spe, speq->cpu, &speq->decoder->midr); 311 } 312 } 313 } 314 315 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid) 316 { 317 struct arm_spe *spe = speq->spe; 318 int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid); 319 320 if (err) 321 return err; 322 323 arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]); 324 325 return 0; 326 } 327 328 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu) 329 { 330 u64 i; 331 332 if (!spe->metadata) 333 return NULL; 334 335 /* CPU ID is -1 for per-thread mode */ 336 if (cpu < 0) { 337 /* 338 * On the heterogeneous system, due to CPU ID is -1, 339 * cannot confirm the data source packet is supported. 340 */ 341 if (!spe->is_homogeneous) 342 return NULL; 343 344 /* In homogeneous system, simply use CPU0's metadata */ 345 return spe->metadata[0]; 346 } 347 348 for (i = 0; i < spe->metadata_nr_cpu; i++) 349 if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu) 350 return spe->metadata[i]; 351 352 return NULL; 353 } 354 355 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record) 356 { 357 struct simd_flags simd_flags = {}; 358 359 if (record->op & ARM_SPE_OP_SVE) 360 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE; 361 else if (record->op & ARM_SPE_OP_SME) 362 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SME; 363 else if (record->op & (ARM_SPE_OP_ASE | ARM_SPE_OP_SIMD_FP)) 364 simd_flags.arch |= SIMD_OP_FLAGS_ARCH_ASE; 365 366 if (record->op & ARM_SPE_OP_SVE) { 367 if (!(record->op & ARM_SPE_OP_PRED)) 368 simd_flags.pred = SIMD_OP_FLAGS_PRED_DISABLED; 369 else if (record->type & ARM_SPE_SVE_PARTIAL_PRED) 370 simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL; 371 else if (record->type & ARM_SPE_SVE_EMPTY_PRED) 372 simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY; 373 else 374 simd_flags.pred = SIMD_OP_FLAGS_PRED_FULL; 375 } else { 376 if (record->type & ARM_SPE_SVE_PARTIAL_PRED) 377 simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL; 378 else if (record->type & ARM_SPE_SVE_EMPTY_PRED) 379 simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY; 380 } 381 382 return simd_flags; 383 } 384 385 static void arm_spe_prep_sample(struct arm_spe *spe, 386 struct arm_spe_queue *speq, 387 union perf_event *event, 388 struct perf_sample *sample) 389 { 390 struct arm_spe_record *record = &speq->decoder->record; 391 392 if (!spe->timeless_decoding) 393 sample->time = tsc_to_perf_time(record->timestamp, &spe->tc); 394 395 sample->ip = record->from_ip; 396 sample->cpumode = arm_spe_cpumode(spe, sample->ip); 397 sample->pid = speq->pid; 398 sample->tid = speq->tid; 399 sample->period = spe->synth_opts.period; 400 sample->cpu = speq->cpu; 401 sample->simd_flags = arm_spe__synth_simd_flags(record); 402 403 event->sample.header.type = PERF_RECORD_SAMPLE; 404 event->sample.header.misc = sample->cpumode; 405 event->sample.header.size = sizeof(struct perf_event_header); 406 } 407 408 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) 409 { 410 struct arm_spe *spe = speq->spe; 411 struct arm_spe_record *record = &speq->decoder->record; 412 struct branch_stack *bstack = speq->last_branch; 413 struct branch_flags *bs_flags; 414 unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; 415 bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); 416 bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; 417 size_t sz = sizeof(struct branch_stack) + 418 sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; 419 int i = 0; 420 421 /* Clean up branch stack */ 422 memset(bstack, 0x0, sz); 423 424 if (!have_tgt && !have_pbt) 425 return; 426 427 if (have_tgt) { 428 bstack->entries[i].from = record->from_ip; 429 bstack->entries[i].to = record->to_ip; 430 431 bs_flags = &bstack->entries[i].flags; 432 bs_flags->value = 0; 433 434 if (record->op & ARM_SPE_OP_BR_CR_BL) { 435 if (record->op & ARM_SPE_OP_BR_COND) 436 bs_flags->type |= PERF_BR_COND_CALL; 437 else 438 bs_flags->type |= PERF_BR_CALL; 439 /* 440 * Indirect branch instruction without link (e.g. BR), 441 * take this case as function return. 442 */ 443 } else if (record->op & ARM_SPE_OP_BR_CR_RET || 444 record->op & ARM_SPE_OP_BR_INDIRECT) { 445 if (record->op & ARM_SPE_OP_BR_COND) 446 bs_flags->type |= PERF_BR_COND_RET; 447 else 448 bs_flags->type |= PERF_BR_RET; 449 } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { 450 if (record->op & ARM_SPE_OP_BR_COND) 451 bs_flags->type |= PERF_BR_COND; 452 else 453 bs_flags->type |= PERF_BR_UNCOND; 454 } else { 455 if (record->op & ARM_SPE_OP_BR_COND) 456 bs_flags->type |= PERF_BR_COND; 457 else 458 bs_flags->type |= PERF_BR_UNKNOWN; 459 } 460 461 if (record->type & ARM_SPE_BRANCH_MISS) { 462 bs_flags->mispred = 1; 463 bs_flags->predicted = 0; 464 } else { 465 bs_flags->mispred = 0; 466 bs_flags->predicted = 1; 467 } 468 469 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 470 bs_flags->not_taken = 1; 471 472 if (record->type & ARM_SPE_IN_TXN) 473 bs_flags->in_tx = 1; 474 475 bs_flags->cycles = min(record->latency, 0xFFFFU); 476 i++; 477 } 478 479 if (have_pbt) { 480 bs_flags = &bstack->entries[i].flags; 481 bs_flags->type |= PERF_BR_UNKNOWN; 482 bstack->entries[i].to = record->prev_br_tgt; 483 i++; 484 } 485 486 bstack->nr = i; 487 bstack->hw_idx = -1ULL; 488 } 489 490 static int arm_spe__inject_event(struct arm_spe *spe, union perf_event *event, 491 struct perf_sample *sample, u64 type) 492 { 493 struct evsel *evsel = sample->evsel; 494 u64 branch_sample_type = 0; 495 size_t sz; 496 497 if (!evsel && spe->session && spe->session->evlist) 498 evsel = evlist__id2evsel(spe->session->evlist, sample->id); 499 500 if (evsel) 501 branch_sample_type = evsel->core.attr.branch_sample_type; 502 503 event->header.type = PERF_RECORD_SAMPLE; 504 sz = perf_event__sample_event_size(sample, type, /*read_format=*/0, 505 branch_sample_type); 506 if (sz >= PERF_SAMPLE_MAX_SIZE) { 507 pr_err("Sample size %zu exceeds max size %d\n", sz, PERF_SAMPLE_MAX_SIZE); 508 return -EFAULT; 509 } 510 event->header.size = sz; 511 512 return perf_event__synthesize_sample(event, type, /*read_format=*/0, 513 branch_sample_type, sample); 514 } 515 516 static inline int 517 arm_spe_deliver_synth_event(struct arm_spe *spe, 518 struct arm_spe_queue *speq __maybe_unused, 519 union perf_event *event, 520 struct perf_sample *sample) 521 { 522 int ret; 523 524 if (spe->synth_opts.inject) { 525 ret = arm_spe__inject_event(spe, event, sample, spe->sample_type); 526 if (ret) 527 return ret; 528 } 529 530 ret = perf_session__deliver_synth_event(spe->session, event, sample); 531 if (ret) 532 pr_err("ARM SPE: failed to deliver event, error %d\n", ret); 533 534 return ret; 535 } 536 537 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, 538 u64 spe_events_id, 539 union perf_mem_data_src data_src) 540 { 541 struct arm_spe *spe = speq->spe; 542 struct arm_spe_record *record = &speq->decoder->record; 543 union perf_event *event = speq->event_buf; 544 struct perf_sample sample; 545 int ret; 546 547 perf_sample__init(&sample, /*all=*/true); 548 arm_spe_prep_sample(spe, speq, event, &sample); 549 550 sample.id = spe_events_id; 551 sample.stream_id = spe_events_id; 552 sample.addr = record->virt_addr; 553 sample.phys_addr = record->phys_addr; 554 sample.data_src = data_src.val; 555 sample.weight = record->latency; 556 557 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 558 perf_sample__exit(&sample); 559 return ret; 560 } 561 562 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, 563 u64 spe_events_id) 564 { 565 struct arm_spe *spe = speq->spe; 566 struct arm_spe_record *record = &speq->decoder->record; 567 union perf_event *event = speq->event_buf; 568 struct perf_sample sample; 569 int ret; 570 571 perf_sample__init(&sample, /*all=*/true); 572 arm_spe_prep_sample(spe, speq, event, &sample); 573 574 sample.id = spe_events_id; 575 sample.stream_id = spe_events_id; 576 sample.addr = record->to_ip; 577 sample.weight = record->latency; 578 sample.flags = speq->flags; 579 sample.branch_stack = speq->last_branch; 580 581 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 582 perf_sample__exit(&sample); 583 return ret; 584 } 585 586 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, 587 u64 spe_events_id, 588 union perf_mem_data_src data_src) 589 { 590 struct arm_spe *spe = speq->spe; 591 struct arm_spe_record *record = &speq->decoder->record; 592 union perf_event *event = speq->event_buf; 593 struct perf_sample sample; 594 int ret; 595 596 perf_sample__init(&sample, /*all=*/true); 597 arm_spe_prep_sample(spe, speq, event, &sample); 598 599 sample.id = spe_events_id; 600 sample.stream_id = spe_events_id; 601 sample.addr = record->to_ip; 602 sample.phys_addr = record->phys_addr; 603 sample.data_src = data_src.val; 604 sample.weight = record->latency; 605 sample.flags = speq->flags; 606 sample.branch_stack = speq->last_branch; 607 608 ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); 609 perf_sample__exit(&sample); 610 return ret; 611 } 612 613 static const struct midr_range common_ds_encoding_cpus[] = { 614 MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), 615 MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), 616 MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), 617 MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), 618 MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), 619 MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), 620 MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), 621 MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), 622 MIDR_ALL_VERSIONS(MIDR_CORTEX_X4), 623 MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), 624 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), 625 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), 626 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), 627 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), 628 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), 629 MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), 630 {}, 631 }; 632 633 static const struct midr_range ampereone_ds_encoding_cpus[] = { 634 MIDR_ALL_VERSIONS(MIDR_AMPERE1A), 635 {}, 636 }; 637 638 static const struct midr_range hisi_hip_ds_encoding_cpus[] = { 639 MIDR_ALL_VERSIONS(MIDR_HISI_HIP12), 640 {}, 641 }; 642 643 static void arm_spe__sample_flags(struct arm_spe_queue *speq) 644 { 645 const struct arm_spe_record *record = &speq->decoder->record; 646 647 speq->flags = 0; 648 if (record->op & ARM_SPE_OP_BRANCH_ERET) { 649 speq->flags = PERF_IP_FLAG_BRANCH; 650 651 if (record->type & ARM_SPE_BRANCH_MISS) 652 speq->flags |= PERF_IP_FLAG_BRANCH_MISS; 653 654 if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 655 speq->flags |= PERF_IP_FLAG_NOT_TAKEN; 656 657 if (record->type & ARM_SPE_IN_TXN) 658 speq->flags |= PERF_IP_FLAG_IN_TX; 659 660 if (record->op & ARM_SPE_OP_BR_COND) 661 speq->flags |= PERF_IP_FLAG_CONDITIONAL; 662 663 if (record->op & ARM_SPE_OP_BR_CR_BL) 664 speq->flags |= PERF_IP_FLAG_CALL; 665 else if (record->op & ARM_SPE_OP_BR_CR_RET) 666 speq->flags |= PERF_IP_FLAG_RETURN; 667 /* 668 * Indirect branch instruction without link (e.g. BR), 669 * take it as a function return. 670 */ 671 else if (record->op & ARM_SPE_OP_BR_INDIRECT) 672 speq->flags |= PERF_IP_FLAG_RETURN; 673 } 674 } 675 676 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record, 677 union perf_mem_data_src *data_src) 678 { 679 /* 680 * Even though four levels of cache hierarchy are possible, no known 681 * production Neoverse systems currently include more than three levels 682 * so for the time being we assume three exist. If a production system 683 * is built with four the this function would have to be changed to 684 * detect the number of levels for reporting. 685 */ 686 687 /* 688 * We have no data on the hit level or data source for stores in the 689 * Neoverse SPE records. 690 */ 691 if (record->op & ARM_SPE_OP_ST) { 692 data_src->mem_lvl = PERF_MEM_LVL_NA; 693 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; 694 data_src->mem_snoop = PERF_MEM_SNOOP_NA; 695 return; 696 } 697 698 switch (record->source) { 699 case ARM_SPE_COMMON_DS_L1D: 700 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 701 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 702 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 703 break; 704 case ARM_SPE_COMMON_DS_L2: 705 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 706 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 707 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 708 break; 709 case ARM_SPE_COMMON_DS_PEER_CORE: 710 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 711 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 712 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 713 break; 714 /* 715 * We don't know if this is L1, L2 but we do know it was a cache-2-cache 716 * transfer, so set SNOOPX_PEER 717 */ 718 case ARM_SPE_COMMON_DS_LOCAL_CLUSTER: 719 case ARM_SPE_COMMON_DS_PEER_CLUSTER: 720 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 721 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 722 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 723 break; 724 /* 725 * System cache is assumed to be L3 726 */ 727 case ARM_SPE_COMMON_DS_SYS_CACHE: 728 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 729 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 730 data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 731 break; 732 /* 733 * We don't know what level it hit in, except it came from the other 734 * socket 735 */ 736 case ARM_SPE_COMMON_DS_REMOTE: 737 data_src->mem_lvl = PERF_MEM_LVL_NA; 738 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; 739 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 740 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 741 break; 742 case ARM_SPE_COMMON_DS_DRAM: 743 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; 744 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 745 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 746 break; 747 default: 748 break; 749 } 750 } 751 752 /* 753 * Source is IMPDEF. Here we convert the source code used on AmpereOne cores 754 * to the common (Neoverse, Cortex) to avoid duplicating the decoding code. 755 */ 756 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record, 757 union perf_mem_data_src *data_src) 758 { 759 struct arm_spe_record common_record; 760 761 switch (record->source) { 762 case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE: 763 common_record.source = ARM_SPE_COMMON_DS_PEER_CORE; 764 break; 765 case ARM_SPE_AMPEREONE_SLC: 766 common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE; 767 break; 768 case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE: 769 common_record.source = ARM_SPE_COMMON_DS_REMOTE; 770 break; 771 case ARM_SPE_AMPEREONE_DDR: 772 common_record.source = ARM_SPE_COMMON_DS_DRAM; 773 break; 774 case ARM_SPE_AMPEREONE_L1D: 775 common_record.source = ARM_SPE_COMMON_DS_L1D; 776 break; 777 case ARM_SPE_AMPEREONE_L2D: 778 common_record.source = ARM_SPE_COMMON_DS_L2; 779 break; 780 default: 781 pr_warning_once("AmpereOne: Unknown data source (0x%x)\n", 782 record->source); 783 return; 784 } 785 786 common_record.op = record->op; 787 arm_spe__synth_data_source_common(&common_record, data_src); 788 } 789 790 static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record, 791 union perf_mem_data_src *data_src) 792 { 793 /* Use common synthesis method to handle store operations */ 794 if (record->op & ARM_SPE_OP_ST) { 795 arm_spe__synth_data_source_common(record, data_src); 796 return; 797 } 798 799 switch (record->source) { 800 case ARM_SPE_HISI_HIP_PEER_CPU: 801 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 802 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 803 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 804 break; 805 case ARM_SPE_HISI_HIP_PEER_CPU_HITM: 806 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 807 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 808 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 809 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 810 break; 811 case ARM_SPE_HISI_HIP_L3: 812 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 813 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 814 data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 815 break; 816 case ARM_SPE_HISI_HIP_L3_HITM: 817 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 818 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 819 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 820 break; 821 case ARM_SPE_HISI_HIP_PEER_CLUSTER: 822 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; 823 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 824 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 825 break; 826 case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM: 827 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; 828 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 829 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 830 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 831 break; 832 case ARM_SPE_HISI_HIP_REMOTE_SOCKET: 833 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; 834 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 835 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 836 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 837 break; 838 case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM: 839 data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; 840 data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 841 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 842 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 843 data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 844 break; 845 case ARM_SPE_HISI_HIP_LOCAL_MEM: 846 data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; 847 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 848 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 849 break; 850 case ARM_SPE_HISI_HIP_REMOTE_MEM: 851 data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; 852 data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; 853 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 854 break; 855 case ARM_SPE_HISI_HIP_NC_DEV: 856 data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT; 857 data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; 858 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 859 break; 860 case ARM_SPE_HISI_HIP_L2: 861 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 862 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 863 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 864 break; 865 case ARM_SPE_HISI_HIP_L2_HITM: 866 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 867 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 868 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 869 break; 870 case ARM_SPE_HISI_HIP_L1: 871 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 872 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 873 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 874 break; 875 default: 876 break; 877 } 878 } 879 880 static const struct data_source_handle data_source_handles[] = { 881 DS(common_ds_encoding_cpus, data_source_common), 882 DS(ampereone_ds_encoding_cpus, data_source_ampereone), 883 DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip), 884 }; 885 886 static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record, 887 union perf_mem_data_src *data_src) 888 { 889 /* 890 * To find a cache hit, search in ascending order from the lower level 891 * caches to the higher level caches. This reflects the best scenario 892 * for a cache hit. 893 */ 894 if (arm_spe_is_cache_hit(record->type, L1D)) { 895 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 896 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 897 } else if (record->type & ARM_SPE_RECENTLY_FETCHED) { 898 data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; 899 data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB; 900 } else if (arm_spe_is_cache_hit(record->type, L2D)) { 901 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 902 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 903 } else if (arm_spe_is_cache_hit(record->type, LLC)) { 904 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 905 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 906 /* 907 * To find a cache miss, search in descending order from the higher 908 * level cache to the lower level cache. This represents the worst 909 * scenario for a cache miss. 910 */ 911 } else if (arm_spe_is_cache_miss(record->type, LLC)) { 912 data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS; 913 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 914 } else if (arm_spe_is_cache_miss(record->type, L2D)) { 915 data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS; 916 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 917 } else if (arm_spe_is_cache_miss(record->type, L1D)) { 918 data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; 919 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 920 } 921 } 922 923 static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record, 924 union perf_mem_data_src *data_src) 925 { 926 /* Record the greatest level info for a store operation. */ 927 if (arm_spe_is_cache_level(record->type, LLC)) { 928 data_src->mem_lvl = PERF_MEM_LVL_L3; 929 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ? 930 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; 931 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 932 } else if (arm_spe_is_cache_level(record->type, L2D)) { 933 data_src->mem_lvl = PERF_MEM_LVL_L2; 934 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ? 935 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; 936 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 937 } else if (arm_spe_is_cache_level(record->type, L1D)) { 938 data_src->mem_lvl = PERF_MEM_LVL_L1; 939 data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ? 940 PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; 941 data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 942 } 943 } 944 945 static void arm_spe__synth_memory_level(struct arm_spe_queue *speq, 946 const struct arm_spe_record *record, 947 union perf_mem_data_src *data_src) 948 { 949 struct arm_spe *spe = speq->spe; 950 951 /* 952 * The data source packet contains more info for cache levels for 953 * peer snooping. So respect the memory level if has been set by 954 * data source parsing. 955 */ 956 if (!data_src->mem_lvl) { 957 if (data_src->mem_op == PERF_MEM_OP_LOAD) 958 arm_spe__synth_ld_memory_level(record, data_src); 959 if (data_src->mem_op == PERF_MEM_OP_STORE) 960 arm_spe__synth_st_memory_level(record, data_src); 961 } 962 963 if (!data_src->mem_lvl) { 964 data_src->mem_lvl = PERF_MEM_LVL_NA; 965 data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; 966 } 967 968 /* 969 * If 'mem_snoop' has been set by data source packet, skip to set 970 * it at here. 971 */ 972 if (!data_src->mem_snoop) { 973 if (record->type & ARM_SPE_DATA_SNOOPED) { 974 if (record->type & ARM_SPE_HITM) 975 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 976 else 977 data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 978 } else { 979 u64 *metadata = 980 arm_spe__get_metadata_by_cpu(spe, speq->cpu); 981 982 /* 983 * Set NA ("Not available") mode if no meta data or the 984 * SNOOPED event is not supported. 985 */ 986 if (!metadata || 987 !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED)) 988 data_src->mem_snoop = PERF_MEM_SNOOP_NA; 989 else 990 data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 991 } 992 } 993 994 if (!data_src->mem_remote) { 995 if (record->type & ARM_SPE_REMOTE_ACCESS) 996 data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 997 } 998 } 999 1000 static int arm_spe__get_midr(struct arm_spe *spe, int cpu, u64 *midr) 1001 { 1002 u64 *metadata; 1003 1004 /* Metadata version 1 assumes all CPUs are the same (old behavior) */ 1005 if (spe->metadata_ver == 1) { 1006 const char *cpuid; 1007 1008 pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n"); 1009 cpuid = perf_env__cpuid(perf_session__env(spe->session)); 1010 if (!cpuid) 1011 goto err; 1012 1013 *midr = strtol(cpuid, NULL, 16); 1014 return 0; 1015 } 1016 1017 metadata = arm_spe__get_metadata_by_cpu(spe, cpu); 1018 if (!metadata) 1019 goto err; 1020 1021 *midr = metadata[ARM_SPE_CPU_MIDR]; 1022 return 0; 1023 1024 err: 1025 pr_warning_once("Failed to get MIDR for CPU %d\n", cpu); 1026 return -EINVAL; 1027 } 1028 1029 static void arm_spe__synth_ds(struct arm_spe_queue *speq, 1030 const struct arm_spe_record *record, 1031 union perf_mem_data_src *data_src) 1032 { 1033 u64 midr; 1034 unsigned int i; 1035 1036 if (arm_spe__get_midr(speq->spe, speq->cpu, &midr)) 1037 return; 1038 1039 for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { 1040 if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) { 1041 return data_source_handles[i].ds_synth(record, data_src); 1042 } 1043 } 1044 1045 return; 1046 } 1047 1048 static union perf_mem_data_src 1049 arm_spe__synth_data_source(struct arm_spe_queue *speq, 1050 const struct arm_spe_record *record) 1051 { 1052 union perf_mem_data_src data_src = {}; 1053 1054 if (!is_mem_op(record->op)) 1055 return data_src; 1056 1057 if (record->op & ARM_SPE_OP_LD) 1058 data_src.mem_op = PERF_MEM_OP_LOAD; 1059 else if (record->op & ARM_SPE_OP_ST) 1060 data_src.mem_op = PERF_MEM_OP_STORE; 1061 else 1062 data_src.mem_op = PERF_MEM_OP_NA; 1063 1064 arm_spe__synth_ds(speq, record, &data_src); 1065 arm_spe__synth_memory_level(speq, record, &data_src); 1066 1067 if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { 1068 data_src.mem_dtlb = PERF_MEM_TLB_WK; 1069 1070 if (record->type & ARM_SPE_TLB_MISS) 1071 data_src.mem_dtlb |= PERF_MEM_TLB_MISS; 1072 else 1073 data_src.mem_dtlb |= PERF_MEM_TLB_HIT; 1074 } 1075 1076 return data_src; 1077 } 1078 1079 static int arm_spe_sample(struct arm_spe_queue *speq) 1080 { 1081 const struct arm_spe_record *record = &speq->decoder->record; 1082 struct arm_spe *spe = speq->spe; 1083 union perf_mem_data_src data_src; 1084 int err; 1085 1086 /* 1087 * Discard all samples until period is reached 1088 */ 1089 speq->sample_count++; 1090 if (speq->sample_count < spe->synth_opts.period) 1091 return 0; 1092 speq->sample_count = 0; 1093 1094 arm_spe__sample_flags(speq); 1095 data_src = arm_spe__synth_data_source(speq, record); 1096 1097 if (spe->sample_flc) { 1098 if (record->type & ARM_SPE_L1D_MISS) { 1099 err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id, 1100 data_src); 1101 if (err) 1102 return err; 1103 } 1104 1105 if (record->type & ARM_SPE_L1D_ACCESS) { 1106 err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id, 1107 data_src); 1108 if (err) 1109 return err; 1110 } 1111 } 1112 1113 if (spe->sample_llc) { 1114 if (record->type & ARM_SPE_LLC_MISS) { 1115 err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id, 1116 data_src); 1117 if (err) 1118 return err; 1119 } 1120 1121 if (record->type & ARM_SPE_LLC_ACCESS) { 1122 err = arm_spe__synth_mem_sample(speq, spe->llc_access_id, 1123 data_src); 1124 if (err) 1125 return err; 1126 } 1127 } 1128 1129 if (spe->sample_tlb) { 1130 if (record->type & ARM_SPE_TLB_MISS) { 1131 err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id, 1132 data_src); 1133 if (err) 1134 return err; 1135 } 1136 1137 if (record->type & ARM_SPE_TLB_ACCESS) { 1138 err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id, 1139 data_src); 1140 if (err) 1141 return err; 1142 } 1143 } 1144 1145 if (spe->synth_opts.last_branch && 1146 (spe->sample_branch || spe->sample_instructions)) 1147 arm_spe__prep_branch_stack(speq); 1148 1149 if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) { 1150 err = arm_spe__synth_branch_sample(speq, spe->branch_id); 1151 if (err) 1152 return err; 1153 } 1154 1155 if (spe->sample_remote_access && 1156 (record->type & ARM_SPE_REMOTE_ACCESS)) { 1157 err = arm_spe__synth_mem_sample(speq, spe->remote_access_id, 1158 data_src); 1159 if (err) 1160 return err; 1161 } 1162 1163 if (spe->sample_memory && is_mem_op(record->op)) { 1164 err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src); 1165 if (err) 1166 return err; 1167 } 1168 1169 if (spe->sample_instructions) { 1170 err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src); 1171 if (err) 1172 return err; 1173 } 1174 1175 return 0; 1176 } 1177 1178 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp) 1179 { 1180 struct arm_spe *spe = speq->spe; 1181 struct arm_spe_record *record; 1182 int ret; 1183 1184 if (!spe->kernel_start) 1185 spe->kernel_start = machine__kernel_start(spe->machine); 1186 1187 while (1) { 1188 /* 1189 * The usual logic is firstly to decode the packets, and then 1190 * based the record to synthesize sample; but here the flow is 1191 * reversed: it calls arm_spe_sample() for synthesizing samples 1192 * prior to arm_spe_decode(). 1193 * 1194 * Two reasons for this code logic: 1195 * 1. Firstly, when setup queue in arm_spe__setup_queue(), it 1196 * has decoded trace data and generated a record, but the record 1197 * is left to generate sample until run to here, so it's correct 1198 * to synthesize sample for the left record. 1199 * 2. After decoding trace data, it needs to compare the record 1200 * timestamp with the coming perf event, if the record timestamp 1201 * is later than the perf event, it needs bail out and pushs the 1202 * record into auxtrace heap, thus the record can be deferred to 1203 * synthesize sample until run to here at the next time; so this 1204 * can correlate samples between Arm SPE trace data and other 1205 * perf events with correct time ordering. 1206 */ 1207 1208 /* 1209 * Update pid/tid info. 1210 */ 1211 record = &speq->decoder->record; 1212 if (!spe->timeless_decoding && record->context_id != (u64)-1) { 1213 ret = arm_spe_set_tid(speq, record->context_id); 1214 if (ret) 1215 return ret; 1216 1217 spe->use_ctx_pkt_for_pid = true; 1218 } 1219 1220 ret = arm_spe_sample(speq); 1221 if (ret) 1222 return ret; 1223 1224 ret = arm_spe_decode(speq->decoder); 1225 if (!ret) { 1226 pr_debug("No data or all data has been processed.\n"); 1227 return 1; 1228 } 1229 1230 /* 1231 * Error is detected when decode SPE trace data, continue to 1232 * the next trace data and find out more records. 1233 */ 1234 if (ret < 0) 1235 continue; 1236 1237 record = &speq->decoder->record; 1238 1239 /* Update timestamp for the last record */ 1240 if (record->timestamp > speq->timestamp) 1241 speq->timestamp = record->timestamp; 1242 1243 /* 1244 * If the timestamp of the queue is later than timestamp of the 1245 * coming perf event, bail out so can allow the perf event to 1246 * be processed ahead. 1247 */ 1248 if (!spe->timeless_decoding && speq->timestamp >= *timestamp) { 1249 *timestamp = speq->timestamp; 1250 return 0; 1251 } 1252 } 1253 1254 return 0; 1255 } 1256 1257 static int arm_spe__setup_queue(struct arm_spe *spe, 1258 struct auxtrace_queue *queue, 1259 unsigned int queue_nr) 1260 { 1261 struct arm_spe_queue *speq = queue->priv; 1262 struct arm_spe_record *record; 1263 1264 if (list_empty(&queue->head) || speq) 1265 return 0; 1266 1267 speq = arm_spe__alloc_queue(spe, queue_nr); 1268 1269 if (!speq) 1270 return -ENOMEM; 1271 1272 queue->priv = speq; 1273 1274 if (queue->cpu != -1) 1275 speq->cpu = queue->cpu; 1276 arm_spe__get_midr(spe, queue->cpu, &speq->decoder->midr); 1277 1278 if (!speq->on_heap) { 1279 int ret; 1280 1281 if (spe->timeless_decoding) 1282 return 0; 1283 1284 retry: 1285 ret = arm_spe_decode(speq->decoder); 1286 1287 if (!ret) 1288 return 0; 1289 1290 if (ret < 0) 1291 goto retry; 1292 1293 record = &speq->decoder->record; 1294 1295 speq->timestamp = record->timestamp; 1296 ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp); 1297 if (ret) 1298 return ret; 1299 speq->on_heap = true; 1300 } 1301 1302 return 0; 1303 } 1304 1305 static int arm_spe__setup_queues(struct arm_spe *spe) 1306 { 1307 unsigned int i; 1308 int ret; 1309 1310 for (i = 0; i < spe->queues.nr_queues; i++) { 1311 ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i); 1312 if (ret) 1313 return ret; 1314 } 1315 1316 return 0; 1317 } 1318 1319 static int arm_spe__update_queues(struct arm_spe *spe) 1320 { 1321 if (spe->queues.new_data) { 1322 spe->queues.new_data = false; 1323 return arm_spe__setup_queues(spe); 1324 } 1325 1326 return 0; 1327 } 1328 1329 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe) 1330 { 1331 struct evsel *evsel; 1332 struct evlist *evlist = spe->session->evlist; 1333 bool timeless_decoding = true; 1334 1335 /* 1336 * Circle through the list of event and complain if we find one 1337 * with the time bit set. 1338 */ 1339 evlist__for_each_entry(evlist, evsel) { 1340 if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME)) 1341 timeless_decoding = false; 1342 } 1343 1344 return timeless_decoding; 1345 } 1346 1347 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp) 1348 { 1349 unsigned int queue_nr; 1350 u64 ts; 1351 int ret; 1352 1353 while (1) { 1354 struct auxtrace_queue *queue; 1355 struct arm_spe_queue *speq; 1356 1357 if (!spe->heap.heap_cnt) 1358 return 0; 1359 1360 if (spe->heap.heap_array[0].ordinal >= timestamp) 1361 return 0; 1362 1363 queue_nr = spe->heap.heap_array[0].queue_nr; 1364 queue = &spe->queues.queue_array[queue_nr]; 1365 speq = queue->priv; 1366 1367 auxtrace_heap__pop(&spe->heap); 1368 1369 if (spe->heap.heap_cnt) { 1370 ts = spe->heap.heap_array[0].ordinal + 1; 1371 if (ts > timestamp) 1372 ts = timestamp; 1373 } else { 1374 ts = timestamp; 1375 } 1376 1377 /* 1378 * A previous context-switch event has set pid/tid in the machine's context, so 1379 * here we need to update the pid/tid in the thread and SPE queue. 1380 */ 1381 if (!spe->use_ctx_pkt_for_pid) 1382 arm_spe_set_pid_tid_cpu(spe, queue); 1383 1384 ret = arm_spe_run_decoder(speq, &ts); 1385 if (ret < 0) { 1386 auxtrace_heap__add(&spe->heap, queue_nr, ts); 1387 return ret; 1388 } 1389 1390 if (!ret) { 1391 ret = auxtrace_heap__add(&spe->heap, queue_nr, ts); 1392 if (ret < 0) 1393 return ret; 1394 } else { 1395 speq->on_heap = false; 1396 } 1397 } 1398 1399 return 0; 1400 } 1401 1402 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid, 1403 u64 time_) 1404 { 1405 struct auxtrace_queues *queues = &spe->queues; 1406 unsigned int i; 1407 u64 ts = 0; 1408 1409 for (i = 0; i < queues->nr_queues; i++) { 1410 struct auxtrace_queue *queue = &spe->queues.queue_array[i]; 1411 struct arm_spe_queue *speq = queue->priv; 1412 1413 if (speq && (tid == -1 || speq->tid == tid)) { 1414 speq->time = time_; 1415 arm_spe_set_pid_tid_cpu(spe, queue); 1416 arm_spe_run_decoder(speq, &ts); 1417 } 1418 } 1419 return 0; 1420 } 1421 1422 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event, 1423 struct perf_sample *sample) 1424 { 1425 pid_t pid, tid; 1426 int cpu; 1427 1428 if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT)) 1429 return 0; 1430 1431 pid = event->context_switch.next_prev_pid; 1432 tid = event->context_switch.next_prev_tid; 1433 cpu = sample->cpu; 1434 1435 if (tid == -1) 1436 pr_warning("context_switch event has no tid\n"); 1437 1438 return machine__set_current_tid(spe->machine, cpu, pid, tid); 1439 } 1440 1441 static int arm_spe_process_event(struct perf_session *session, 1442 union perf_event *event, 1443 struct perf_sample *sample, 1444 const struct perf_tool *tool) 1445 { 1446 int err = 0; 1447 u64 timestamp; 1448 struct arm_spe *spe = container_of(session->auxtrace, 1449 struct arm_spe, auxtrace); 1450 1451 if (dump_trace) 1452 return 0; 1453 1454 if (!tool->ordered_events) { 1455 pr_err("SPE trace requires ordered events\n"); 1456 return -EINVAL; 1457 } 1458 1459 if (sample->time && (sample->time != (u64) -1)) 1460 timestamp = perf_time_to_tsc(sample->time, &spe->tc); 1461 else 1462 timestamp = 0; 1463 1464 if (timestamp || spe->timeless_decoding) { 1465 err = arm_spe__update_queues(spe); 1466 if (err) 1467 return err; 1468 } 1469 1470 if (spe->timeless_decoding) { 1471 if (event->header.type == PERF_RECORD_EXIT) { 1472 err = arm_spe_process_timeless_queues(spe, 1473 event->fork.tid, 1474 sample->time); 1475 } 1476 } else if (timestamp) { 1477 err = arm_spe_process_queues(spe, timestamp); 1478 if (err) 1479 return err; 1480 1481 if (!spe->use_ctx_pkt_for_pid && 1482 (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE || 1483 event->header.type == PERF_RECORD_SWITCH)) 1484 err = arm_spe_context_switch(spe, event, sample); 1485 } 1486 1487 return err; 1488 } 1489 1490 static int arm_spe_process_auxtrace_event(struct perf_session *session, 1491 union perf_event *event, 1492 const struct perf_tool *tool __maybe_unused) 1493 { 1494 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1495 auxtrace); 1496 1497 if (!spe->data_queued) { 1498 struct auxtrace_buffer *buffer; 1499 off_t data_offset; 1500 int fd = perf_data__fd(session->data); 1501 int err; 1502 1503 if (perf_data__is_pipe(session->data)) { 1504 data_offset = 0; 1505 } else { 1506 data_offset = lseek(fd, 0, SEEK_CUR); 1507 if (data_offset == -1) 1508 return -errno; 1509 } 1510 1511 err = auxtrace_queues__add_event(&spe->queues, session, event, 1512 data_offset, &buffer); 1513 if (err) 1514 return err; 1515 1516 /* Dump here now we have copied a piped trace out of the pipe */ 1517 if (dump_trace) { 1518 if (auxtrace_buffer__get_data(buffer, fd)) { 1519 u64 midr = 0; 1520 1521 arm_spe__get_midr(spe, buffer->cpu.cpu, &midr); 1522 arm_spe_dump_event(spe, buffer->data, 1523 buffer->size, midr); 1524 auxtrace_buffer__put_data(buffer); 1525 } 1526 } 1527 } 1528 1529 return 0; 1530 } 1531 1532 static int arm_spe_flush(struct perf_session *session __maybe_unused, 1533 const struct perf_tool *tool __maybe_unused) 1534 { 1535 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1536 auxtrace); 1537 int ret; 1538 1539 if (dump_trace) 1540 return 0; 1541 1542 if (!tool->ordered_events) 1543 return -EINVAL; 1544 1545 ret = arm_spe__update_queues(spe); 1546 if (ret < 0) 1547 return ret; 1548 1549 if (spe->timeless_decoding) 1550 return arm_spe_process_timeless_queues(spe, -1, 1551 MAX_TIMESTAMP - 1); 1552 1553 ret = arm_spe_process_queues(spe, MAX_TIMESTAMP); 1554 if (ret) 1555 return ret; 1556 1557 if (!spe->use_ctx_pkt_for_pid) 1558 ui__warning("Arm SPE CONTEXT packets not found in the traces.\n" 1559 "Matching of TIDs to SPE events could be inaccurate.\n"); 1560 1561 return 0; 1562 } 1563 1564 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size) 1565 { 1566 u64 *metadata; 1567 1568 metadata = zalloc(per_cpu_size); 1569 if (!metadata) 1570 return NULL; 1571 1572 memcpy(metadata, buf, per_cpu_size); 1573 return metadata; 1574 } 1575 1576 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu) 1577 { 1578 int i; 1579 1580 for (i = 0; i < nr_cpu; i++) 1581 zfree(&metadata[i]); 1582 free(metadata); 1583 } 1584 1585 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info, 1586 u64 *ver, int *nr_cpu) 1587 { 1588 u64 *ptr = (u64 *)info->priv; 1589 u64 metadata_size; 1590 u64 **metadata = NULL; 1591 int hdr_sz, per_cpu_sz, i; 1592 1593 metadata_size = info->header.size - 1594 sizeof(struct perf_record_auxtrace_info); 1595 1596 /* Metadata version 1 */ 1597 if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) { 1598 *ver = 1; 1599 *nr_cpu = 0; 1600 /* No per CPU metadata */ 1601 return NULL; 1602 } 1603 1604 *ver = ptr[ARM_SPE_HEADER_VERSION]; 1605 hdr_sz = ptr[ARM_SPE_HEADER_SIZE]; 1606 *nr_cpu = ptr[ARM_SPE_CPUS_NUM]; 1607 1608 metadata = calloc(*nr_cpu, sizeof(*metadata)); 1609 if (!metadata) 1610 return NULL; 1611 1612 /* Locate the start address of per CPU metadata */ 1613 ptr += hdr_sz; 1614 per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu); 1615 1616 for (i = 0; i < *nr_cpu; i++) { 1617 metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz); 1618 if (!metadata[i]) 1619 goto err_per_cpu_metadata; 1620 1621 ptr += per_cpu_sz / sizeof(u64); 1622 } 1623 1624 return metadata; 1625 1626 err_per_cpu_metadata: 1627 arm_spe__free_metadata(metadata, *nr_cpu); 1628 return NULL; 1629 } 1630 1631 static void arm_spe_free_queue(void *priv) 1632 { 1633 struct arm_spe_queue *speq = priv; 1634 1635 if (!speq) 1636 return; 1637 thread__zput(speq->thread); 1638 arm_spe_decoder_free(speq->decoder); 1639 zfree(&speq->event_buf); 1640 zfree(&speq->last_branch); 1641 free(speq); 1642 } 1643 1644 static void arm_spe_free_events(struct perf_session *session) 1645 { 1646 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1647 auxtrace); 1648 struct auxtrace_queues *queues = &spe->queues; 1649 unsigned int i; 1650 1651 for (i = 0; i < queues->nr_queues; i++) { 1652 arm_spe_free_queue(queues->queue_array[i].priv); 1653 queues->queue_array[i].priv = NULL; 1654 } 1655 auxtrace_queues__free(queues); 1656 } 1657 1658 static void arm_spe_free(struct perf_session *session) 1659 { 1660 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, 1661 auxtrace); 1662 1663 auxtrace_heap__free(&spe->heap); 1664 arm_spe_free_events(session); 1665 session->auxtrace = NULL; 1666 arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu); 1667 free(spe); 1668 } 1669 1670 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session, 1671 struct evsel *evsel) 1672 { 1673 struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); 1674 1675 return evsel->core.attr.type == spe->pmu_type; 1676 } 1677 1678 static const char * const metadata_hdr_v1_fmts[] = { 1679 [ARM_SPE_PMU_TYPE] = " PMU Type :%"PRId64"\n", 1680 [ARM_SPE_PER_CPU_MMAPS] = " Per CPU mmaps :%"PRId64"\n", 1681 }; 1682 1683 static const char * const metadata_hdr_fmts[] = { 1684 [ARM_SPE_HEADER_VERSION] = " Header version :%"PRId64"\n", 1685 [ARM_SPE_HEADER_SIZE] = " Header size :%"PRId64"\n", 1686 [ARM_SPE_PMU_TYPE_V2] = " PMU type v2 :%"PRId64"\n", 1687 [ARM_SPE_CPUS_NUM] = " CPU number :%"PRId64"\n", 1688 }; 1689 1690 static const char * const metadata_per_cpu_fmts[] = { 1691 [ARM_SPE_MAGIC] = " Magic :0x%"PRIx64"\n", 1692 [ARM_SPE_CPU] = " CPU # :%"PRId64"\n", 1693 [ARM_SPE_CPU_NR_PARAMS] = " Num of params :%"PRId64"\n", 1694 [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n", 1695 [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n", 1696 [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n", 1697 [ARM_SPE_CAP_EVENT_FILTER] = " Event Filter :0x%"PRIx64"\n", 1698 }; 1699 1700 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr) 1701 { 1702 unsigned int i, cpu, hdr_size, cpu_num, cpu_size; 1703 const char * const *hdr_fmts; 1704 1705 if (!dump_trace) 1706 return; 1707 1708 if (spe->metadata_ver == 1) { 1709 cpu_num = 0; 1710 hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX; 1711 hdr_fmts = metadata_hdr_v1_fmts; 1712 } else { 1713 cpu_num = arr[ARM_SPE_CPUS_NUM]; 1714 hdr_size = arr[ARM_SPE_HEADER_SIZE]; 1715 hdr_fmts = metadata_hdr_fmts; 1716 } 1717 1718 for (i = 0; i < hdr_size; i++) 1719 fprintf(stdout, hdr_fmts[i], arr[i]); 1720 1721 arr += hdr_size; 1722 for (cpu = 0; cpu < cpu_num; cpu++) { 1723 /* 1724 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS 1725 * are fixed. The sequential parameter size is decided by the 1726 * field 'ARM_SPE_CPU_NR_PARAMS'. 1727 */ 1728 cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS]; 1729 for (i = 0; i < cpu_size; i++) 1730 fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]); 1731 arr += cpu_size; 1732 } 1733 } 1734 1735 static void arm_spe_set_event_name(struct evlist *evlist, u64 id, 1736 const char *name) 1737 { 1738 struct evsel *evsel; 1739 1740 evlist__for_each_entry(evlist, evsel) { 1741 if (evsel->core.id && evsel->core.id[0] == id) { 1742 if (evsel->name) 1743 zfree(&evsel->name); 1744 evsel->name = strdup(name); 1745 break; 1746 } 1747 } 1748 } 1749 1750 static int 1751 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) 1752 { 1753 struct evlist *evlist = session->evlist; 1754 struct evsel *evsel; 1755 struct perf_event_attr attr; 1756 bool found = false; 1757 u64 id; 1758 int err; 1759 1760 evlist__for_each_entry(evlist, evsel) { 1761 if (evsel->core.attr.type == spe->pmu_type) { 1762 found = true; 1763 break; 1764 } 1765 } 1766 1767 if (!found) { 1768 pr_debug("No selected events with SPE trace data\n"); 1769 return 0; 1770 } 1771 1772 memset(&attr, 0, sizeof(struct perf_event_attr)); 1773 attr.size = sizeof(struct perf_event_attr); 1774 attr.type = PERF_TYPE_HARDWARE; 1775 attr.sample_type = evsel->core.attr.sample_type & 1776 (PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR); 1777 attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID | 1778 PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC | 1779 PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR; 1780 if (spe->timeless_decoding) 1781 attr.sample_type &= ~(u64)PERF_SAMPLE_TIME; 1782 else 1783 attr.sample_type |= PERF_SAMPLE_TIME; 1784 1785 spe->sample_type = attr.sample_type; 1786 1787 attr.exclude_user = evsel->core.attr.exclude_user; 1788 attr.exclude_kernel = evsel->core.attr.exclude_kernel; 1789 attr.exclude_hv = evsel->core.attr.exclude_hv; 1790 attr.exclude_host = evsel->core.attr.exclude_host; 1791 attr.exclude_guest = evsel->core.attr.exclude_guest; 1792 attr.sample_id_all = evsel->core.attr.sample_id_all; 1793 attr.read_format = evsel->core.attr.read_format; 1794 attr.sample_period = spe->synth_opts.period; 1795 1796 /* create new id val to be a fixed offset from evsel id */ 1797 id = auxtrace_synth_id_range_start(evsel); 1798 1799 if (spe->synth_opts.flc) { 1800 spe->sample_flc = true; 1801 1802 /* Level 1 data cache miss */ 1803 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1804 if (err) 1805 return err; 1806 spe->l1d_miss_id = id; 1807 arm_spe_set_event_name(evlist, id, "l1d-miss"); 1808 id += 1; 1809 1810 /* Level 1 data cache access */ 1811 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1812 if (err) 1813 return err; 1814 spe->l1d_access_id = id; 1815 arm_spe_set_event_name(evlist, id, "l1d-access"); 1816 id += 1; 1817 } 1818 1819 if (spe->synth_opts.llc) { 1820 spe->sample_llc = true; 1821 1822 /* Last level cache miss */ 1823 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1824 if (err) 1825 return err; 1826 spe->llc_miss_id = id; 1827 arm_spe_set_event_name(evlist, id, "llc-miss"); 1828 id += 1; 1829 1830 /* Last level cache access */ 1831 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1832 if (err) 1833 return err; 1834 spe->llc_access_id = id; 1835 arm_spe_set_event_name(evlist, id, "llc-access"); 1836 id += 1; 1837 } 1838 1839 if (spe->synth_opts.tlb) { 1840 spe->sample_tlb = true; 1841 1842 /* TLB miss */ 1843 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1844 if (err) 1845 return err; 1846 spe->tlb_miss_id = id; 1847 arm_spe_set_event_name(evlist, id, "tlb-miss"); 1848 id += 1; 1849 1850 /* TLB access */ 1851 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1852 if (err) 1853 return err; 1854 spe->tlb_access_id = id; 1855 arm_spe_set_event_name(evlist, id, "tlb-access"); 1856 id += 1; 1857 } 1858 1859 if (spe->synth_opts.last_branch) { 1860 if (spe->synth_opts.last_branch_sz > 2) 1861 pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); 1862 1863 attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; 1864 /* 1865 * We don't use the hardware index, but the sample generation 1866 * code uses the new format branch_stack with this field, 1867 * so the event attributes must indicate that it's present. 1868 */ 1869 attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; 1870 } 1871 1872 if (spe->synth_opts.branches) { 1873 spe->sample_branch = true; 1874 1875 /* Branch */ 1876 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1877 if (err) 1878 return err; 1879 spe->branch_id = id; 1880 arm_spe_set_event_name(evlist, id, "branch"); 1881 id += 1; 1882 } 1883 1884 if (spe->synth_opts.remote_access) { 1885 spe->sample_remote_access = true; 1886 1887 /* Remote access */ 1888 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1889 if (err) 1890 return err; 1891 spe->remote_access_id = id; 1892 arm_spe_set_event_name(evlist, id, "remote-access"); 1893 id += 1; 1894 } 1895 1896 if (spe->synth_opts.mem) { 1897 spe->sample_memory = true; 1898 1899 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1900 if (err) 1901 return err; 1902 spe->memory_id = id; 1903 arm_spe_set_event_name(evlist, id, "memory"); 1904 id += 1; 1905 } 1906 1907 if (spe->synth_opts.instructions) { 1908 spe->sample_instructions = true; 1909 attr.config = PERF_COUNT_HW_INSTRUCTIONS; 1910 1911 err = perf_session__deliver_synth_attr_event(session, &attr, id); 1912 if (err) 1913 return err; 1914 spe->instructions_id = id; 1915 arm_spe_set_event_name(evlist, id, "instructions"); 1916 } 1917 1918 return 0; 1919 } 1920 1921 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu) 1922 { 1923 u64 midr; 1924 int i; 1925 1926 if (!nr_cpu) 1927 return false; 1928 1929 for (i = 0; i < nr_cpu; i++) { 1930 if (!metadata[i]) 1931 return false; 1932 1933 if (i == 0) { 1934 midr = metadata[i][ARM_SPE_CPU_MIDR]; 1935 continue; 1936 } 1937 1938 if (midr != metadata[i][ARM_SPE_CPU_MIDR]) 1939 return false; 1940 } 1941 1942 return true; 1943 } 1944 1945 int arm_spe_process_auxtrace_info(union perf_event *event, 1946 struct perf_session *session) 1947 { 1948 struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; 1949 size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE; 1950 struct perf_record_time_conv *tc = &session->time_conv; 1951 struct arm_spe *spe; 1952 u64 **metadata = NULL; 1953 u64 metadata_ver; 1954 int nr_cpu, err; 1955 1956 if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) + 1957 min_sz) 1958 return -EINVAL; 1959 1960 metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver, 1961 &nr_cpu); 1962 if (!metadata && metadata_ver != 1) { 1963 pr_err("Failed to parse Arm SPE metadata.\n"); 1964 return -EINVAL; 1965 } 1966 1967 spe = zalloc(sizeof(struct arm_spe)); 1968 if (!spe) { 1969 err = -ENOMEM; 1970 goto err_free_metadata; 1971 } 1972 1973 err = auxtrace_queues__init(&spe->queues); 1974 if (err) 1975 goto err_free; 1976 1977 spe->session = session; 1978 spe->machine = &session->machines.host; /* No kvm support */ 1979 spe->auxtrace_type = auxtrace_info->type; 1980 if (metadata_ver == 1) 1981 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; 1982 else 1983 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2]; 1984 spe->metadata = metadata; 1985 spe->metadata_ver = metadata_ver; 1986 spe->metadata_nr_cpu = nr_cpu; 1987 spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu); 1988 1989 spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); 1990 1991 /* 1992 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead 1993 * and the parameters for hardware clock are stored in the session 1994 * context. Passes these parameters to the struct perf_tsc_conversion 1995 * in "spe->tc", which is used for later conversion between clock 1996 * counter and timestamp. 1997 * 1998 * For backward compatibility, copies the fields starting from 1999 * "time_cycles" only if they are contained in the event. 2000 */ 2001 spe->tc.time_shift = tc->time_shift; 2002 spe->tc.time_mult = tc->time_mult; 2003 spe->tc.time_zero = tc->time_zero; 2004 2005 if (event_contains(*tc, cap_user_time_short)) { 2006 spe->tc.time_cycles = tc->time_cycles; 2007 spe->tc.time_mask = tc->time_mask; 2008 spe->tc.cap_user_time_zero = tc->cap_user_time_zero; 2009 spe->tc.cap_user_time_short = tc->cap_user_time_short; 2010 } 2011 2012 spe->auxtrace.process_event = arm_spe_process_event; 2013 spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event; 2014 spe->auxtrace.flush_events = arm_spe_flush; 2015 spe->auxtrace.free_events = arm_spe_free_events; 2016 spe->auxtrace.free = arm_spe_free; 2017 spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace; 2018 session->auxtrace = &spe->auxtrace; 2019 2020 arm_spe_print_info(spe, &auxtrace_info->priv[0]); 2021 2022 if (dump_trace) 2023 return 0; 2024 2025 if (session->itrace_synth_opts && session->itrace_synth_opts->set) { 2026 spe->synth_opts = *session->itrace_synth_opts; 2027 } else { 2028 itrace_synth_opts__set_default(&spe->synth_opts, false); 2029 /* Default nanoseconds period not supported */ 2030 spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; 2031 spe->synth_opts.period = 1; 2032 } 2033 2034 if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { 2035 ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n"); 2036 err = -EINVAL; 2037 goto err_free_queues; 2038 } 2039 if (spe->synth_opts.period > 1) 2040 ui__warning("Arm SPE has a hardware-based sampling period.\n\n" 2041 "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n"); 2042 2043 err = arm_spe_synth_events(spe, session); 2044 if (err) 2045 goto err_free_queues; 2046 2047 err = auxtrace_queues__process_index(&spe->queues, session); 2048 if (err) 2049 goto err_free_queues; 2050 2051 if (spe->queues.populated) 2052 spe->data_queued = true; 2053 2054 return 0; 2055 2056 err_free_queues: 2057 auxtrace_queues__free(&spe->queues); 2058 session->auxtrace = NULL; 2059 err_free: 2060 free(spe); 2061 err_free_metadata: 2062 arm_spe__free_metadata(metadata, nr_cpu); 2063 return err; 2064 } 2065