// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Perf interface to expose Dispatch Trace Log counters.
 *
 * Copyright (C) 2024 Kajol Jain, IBM Corporation
 */

#ifdef CONFIG_PPC_SPLPAR
#define pr_fmt(fmt) "vpa_dtl: " fmt

#include <asm/dtl.h>
#include <linux/perf_event.h>
#include <asm/plpar_wrappers.h>
#include <linux/vmalloc.h>

#define EVENT(_name, _code)	enum { _name = _code }

/*
 * Based on the Power Architecture Platform Reference (PAPR) documentation,
 * Table 14.14. Per Virtual Processor Area, the Dispatch Trace Log (DTL)
 * Enable Mask below is used to select which virtual processor dispatch
 * and preempt traces are recorded:
 * DTL_CEDE(0x1):    Trace voluntary (OS initiated) virtual
 *                   processor waits
 * DTL_PREEMPT(0x2): Trace time slice preempts
 * DTL_FAULT(0x4):   Trace virtual partition memory page
 *                   faults
 * DTL_ALL(0x7):     Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
 *
 * Event codes are based on the Dispatch Trace Log Enable Mask.
 */
EVENT(DTL_CEDE, 0x1);
EVENT(DTL_PREEMPT, 0x2);
EVENT(DTL_FAULT, 0x4);
EVENT(DTL_ALL, 0x7);

GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);

PMU_FORMAT_ATTR(event, "config:0-7");

static struct attribute *events_attr[] = {
	GENERIC_EVENT_PTR(DTL_CEDE),
	GENERIC_EVENT_PTR(DTL_PREEMPT),
	GENERIC_EVENT_PTR(DTL_FAULT),
	GENERIC_EVENT_PTR(DTL_ALL),
	NULL
};

static struct attribute_group event_group = {
	.name = "events",
	.attrs = events_attr,
};

static struct attribute *format_attrs[] = {
	&format_attr_event.attr,
	NULL,
};

static const struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

static const struct attribute_group *attr_groups[] = {
	&format_group,
	&event_group,
	NULL,
};

struct vpa_dtl {
	struct dtl_entry	*buf;
	u64			last_idx;
};

struct vpa_pmu_ctx {
	struct perf_output_handle handle;
};

struct vpa_pmu_buf {
	int	nr_pages;
	bool	snapshot;
	u64	*base;
	u64	size;
	u64	head;
	u64	head_size;
	/* boot timebase and frequency need to be saved only once */
	int	boottb_freq_saved;
	u64	threshold;
	bool	full;
};

/*
 * To correlate each DTL entry with other events across CPUs,
 * we need to map the timebase from "struct dtl_entry", which phyp
 * provides, to the boot timebase. This also needs the timebase frequency.
 * The formula is: (timebase from DTL entry - boot timebase) / frequency.
 *
 * To match the size of "struct dtl_entry" and ease post processing,
 * the structure is padded by 24 bytes.
 */
struct boottb_freq {
	u64	boot_tb;
	u64	tb_freq;
	u64	timebase;
	u64	padded[3];
};
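/*
 * Illustrative sketch (not part of the driver): a hypothetical user-space
 * post-processing step could convert a raw DTL timebase into seconds
 * relative to system boot using the boottb_freq header written as the
 * first entry of the AUX buffer; names here are made up for illustration:
 *
 *	struct boottb_freq *hdr = aux_base;	// first 48-byte entry
 *	u64 tb = be64_to_cpu(entry->timebase);	// raw DTL entry timebase
 *	double secs = (double)(tb - hdr->boot_tb) / hdr->tb_freq;
 */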
static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);

/* variable to capture the reference count of active dtl threads */
static int dtl_global_refc;
static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);

/*
 * Capture DTL data in the AUX buffer
 */
static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
				struct vpa_dtl *dtl, int index)
{
	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;

	/*
	 * Check if there is enough space to contain the
	 * DTL data. If not, copy only what fits in the
	 * available memory and mark the buffer full.
	 */
	if (buf->head + *n_entries >= buf->threshold) {
		*n_entries = buf->threshold - buf->head;
		buf->full = true;
	}

	/*
	 * Copy to the AUX buffer from the per-thread address
	 */
	memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));

	if (buf->full) {
		/*
		 * Reset the head of the private aux buffer when it is
		 * full, so that the next data is copied to the
		 * beginning of the buffer
		 */
		buf->head = 0;
		return;
	}

	buf->head += *n_entries;
}
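/*
 * Worked example (illustrative): with threshold = 100 entries, head = 90
 * and *n_entries = 25, vpa_dtl_capture_aux() copies only the 10 entries
 * that fit, sets buf->full, and resets head to 0 so that the next
 * capture starts at the beginning of the AUX buffer.
 */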
/*
 * Function to dump the dispatch trace log buffer data to the
 * perf data.
 *
 * perf_aux_output_begin: This function is called before writing
 * to the AUX area. It returns the pointer to the aux area private
 * structure, i.e. "struct vpa_pmu_buf" here, which is set up in the
 * setup_aux() function. The function obtains the output handle (used
 * in perf_aux_output_end). When the capture completes in
 * vpa_dtl_capture_aux(), perf_aux_output_end() is called to commit
 * the recorded data.
 *
 * perf_aux_output_end: This function commits data by adjusting the
 * aux_head of "struct perf_buffer". The aux_tail is moved on the perf
 * tools side when writing the data from the aux buffer to the
 * perf.data file on disk.
 *
 * Here, in the private aux structure, we maintain a head to know where
 * to copy data next time in the PMU driver. vpa_pmu_buf->head is moved
 * to maintain the aux head for the PMU driver. It is the responsibility
 * of the PMU driver to make sure data is copied between
 * perf_aux_output_begin and perf_aux_output_end.
 *
 * After the data is copied in vpa_dtl_capture_aux(), perf_aux_output_end()
 * is called to move the aux->head of "struct perf_buffer" to indicate the
 * size of the data in the aux buffer. This posts a PERF_RECORD_AUX into
 * the perf buffer. Data is written to disk only when the allocated buffer
 * is full.
 *
 * With this approach, all the DTL data is present as-is in the
 * perf.data file. The data is post-processed on the perf tools side when
 * running perf report/perf script, which avoids the time taken to create
 * samples in kernel space.
 */
static void vpa_dtl_dump_sample_data(struct perf_event *event)
{
	u64 cur_idx, last_idx, i;
	u64 boot_tb;
	struct boottb_freq boottb_freq;

	/* actual number of entries read */
	long n_read = 0, read_size = 0;

	/* number of entries added to the dtl buffer */
	long n_req;

	struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx);

	struct vpa_pmu_buf *aux_buf;

	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
	u64 size;

	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
	last_idx = dtl->last_idx;

	if (last_idx + N_DISPATCH_LOG <= cur_idx)
		last_idx = cur_idx - N_DISPATCH_LOG + 1;

	n_req = cur_idx - last_idx;

	/* no new entries added to the buffer, return */
	if (n_req <= 0)
		return;

	dtl->last_idx = last_idx + n_req;
	boot_tb = get_boot_tb();

	i = last_idx % N_DISPATCH_LOG;

	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
	if (!aux_buf) {
		pr_debug("returning. no aux\n");
		return;
	}

	if (!aux_buf->boottb_freq_saved) {
		pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb);
		/* Save boot_tb to convert a raw timebase to its time relative to system boot */
		boottb_freq.boot_tb = boot_tb;
		/* Save tb_ticks_per_sec to convert a timebase to seconds */
		boottb_freq.tb_freq = tb_ticks_per_sec;
		boottb_freq.timebase = 0;
		memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq));
		aux_buf->head += 1;
		aux_buf->boottb_freq_saved = 1;
		n_read += 1;
	}

	/* read the tail of the buffer if we've wrapped */
	if (i + n_req > N_DISPATCH_LOG) {
		read_size = N_DISPATCH_LOG - i;
		vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i);
		n_req -= read_size;
		n_read += read_size;
		i = 0;
		if (aux_buf->full) {
			size = (n_read * sizeof(struct dtl_entry));
			if ((size + aux_buf->head_size) > aux_buf->size) {
				size = aux_buf->size - aux_buf->head_size;
				perf_aux_output_end(&vpa_ctx->handle, size);
				aux_buf->head = 0;
				aux_buf->head_size = 0;
			} else {
				aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
				perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
			}
			goto out;
		}
	}

	/* .. and now the head */
	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);

	size = ((n_req + n_read) * sizeof(struct dtl_entry));
	if ((size + aux_buf->head_size) > aux_buf->size) {
		size = aux_buf->size - aux_buf->head_size;
		perf_aux_output_end(&vpa_ctx->handle, size);
		aux_buf->head = 0;
		aux_buf->head_size = 0;
	} else {
		aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
		/* Move the aux->head to indicate the size of data in the aux buffer */
		perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
	}
out:
	aux_buf->full = false;
}

/*
 * The VPA Dispatch Trace Log counters do not interrupt on overflow.
 * Therefore, the kernel needs to poll the counters using an hrtimer to
 * avoid missing an overflow. The timer interval is based on the
 * sample_period count provided by the user; the minimum interval is
 * 1 millisecond.
 */
static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct perf_event *event;
	u64 period;

	event = container_of(hrtimer, struct perf_event, hw.hrtimer);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return HRTIMER_NORESTART;

	vpa_dtl_dump_sample_data(event);
	period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return HRTIMER_RESTART;
}

static void vpa_dtl_start_hrtimer(struct perf_event *event)
{
	u64 period;
	struct hw_perf_event *hwc = &event->hw;

	period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
}

static void vpa_dtl_stop_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	hrtimer_cancel(&hwc->hrtimer);
}
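/*
 * Worked example (illustrative): "perf record -e vpa_dtl/dtl_all/ -F 100"
 * is mapped in vpa_dtl_event_init() below to
 * sample_period = NSEC_PER_SEC / 100 = 10,000,000 ns, so the DTL buffer
 * is drained every 10 ms. Any period below NSEC_PER_MSEC is clamped to
 * the 1 ms minimum by the hrtimer helpers above.
 */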
static void vpa_dtl_reset_global_refc(struct perf_event *event)
{
	spin_lock(&dtl_global_lock);
	dtl_global_refc--;
	if (dtl_global_refc <= 0) {
		dtl_global_refc = 0;
		up_write(&dtl_access_lock);
	}
	spin_unlock(&dtl_global_lock);
}

static int vpa_dtl_mem_alloc(int cpu)
{
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
	struct dtl_entry *buf = NULL;

	/* Check for the dispatch trace log buffer cache */
	if (!dtl_cache)
		return -ENOMEM;

	buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(cpu));
	if (!buf) {
		pr_warn("buffer allocation failed for cpu %d\n", cpu);
		return -ENOMEM;
	}
	dtl->buf = buf;
	return 0;
}

static int vpa_dtl_event_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	/* test the event attr type for PMU enumeration */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	if (!perfmon_capable())
		return -EACCES;

	/* Return if this is a counting event */
	if (!is_sampling_event(event))
		return -EOPNOTSUPP;

	/* no branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* Invalid event code */
	switch (event->attr.config) {
	case DTL_LOG_CEDE:
	case DTL_LOG_PREEMPT:
	case DTL_LOG_FAULT:
	case DTL_LOG_ALL:
		break;
	default:
		return -EINVAL;
	}

	spin_lock(&dtl_global_lock);

	/*
	 * To ensure there are no other conflicting dtl users
	 * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
	 * the code below tries to take the dtl_access_lock.
	 * The dtl_access_lock is a rw_semaphore defined in dtl.h, used
	 * to ensure there are no conflicting dtl users.
	 * The vpa_dtl pmu tries to take the write access lock and also
	 * checks dtl_global_refc, to make sure that the dtl_access_lock
	 * is held by the vpa_dtl pmu interface.
	 */
	if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
		spin_unlock(&dtl_global_lock);
		return -EBUSY;
	}

	/* Allocate dtl buffer memory */
	if (vpa_dtl_mem_alloc(event->cpu)) {
		spin_unlock(&dtl_global_lock);
		return -ENOMEM;
	}

	/*
	 * Increment the number of active vpa_dtl pmu threads.
	 * dtl_global_refc keeps count of the cpu threads that are
	 * currently capturing dtl data via the vpa_dtl pmu interface.
	 */
	dtl_global_refc++;

	spin_unlock(&dtl_global_lock);

	hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
	if (event->attr.freq) {
		long freq = event->attr.sample_freq;

		event->attr.sample_period = NSEC_PER_SEC / freq;
		hwc->sample_period = event->attr.sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
		hwc->last_period = hwc->sample_period;
		event->attr.freq = 0;
	}

	event->destroy = vpa_dtl_reset_global_refc;
	return 0;
}
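/*
 * Note (illustrative): the locking above makes DTL consumers mutually
 * exclusive. For example, while "perf record -e vpa_dtl/dtl_all/ -a" is
 * active, other DTL users such as /proc/powerpc/vcpudispatch_stats
 * cannot take dtl_access_lock, and conversely vpa_dtl_event_init()
 * returns -EBUSY if another DTL consumer already holds the lock.
 */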
static int vpa_dtl_event_add(struct perf_event *event, int flags)
{
	int ret, hwcpu;
	unsigned long addr;
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	/*
	 * Register our dtl buffer with the hypervisor. The
	 * HV expects the buffer size to be passed in the second
	 * word of the buffer. Refer to section '14.11.3.2. H_REGISTER_VPA'
	 * of PAPR for more information.
	 */
	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
	dtl->last_idx = 0;

	hwcpu = get_hard_smp_processor_id(event->cpu);
	addr = __pa(dtl->buf);

	ret = register_dtl(hwcpu, addr);
	if (ret) {
		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
			event->cpu, hwcpu, ret);
		return ret;
	}

	/* set our initial buffer indices */
	lppaca_of(event->cpu).dtl_idx = 0;

	/*
	 * Ensure that our updates to the lppaca fields have
	 * occurred before we actually enable the logging
	 */
	smp_wmb();

	/* enable event logging */
	lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;

	vpa_dtl_start_hrtimer(event);

	return 0;
}

static void vpa_dtl_event_del(struct perf_event *event, int flags)
{
	int hwcpu = get_hard_smp_processor_id(event->cpu);
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	vpa_dtl_stop_hrtimer(event);
	unregister_dtl(hwcpu);
	kmem_cache_free(dtl_cache, dtl->buf);
	dtl->buf = NULL;
	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
}

/*
 * This function definition is intentionally empty, as
 * vpa_dtl_dump_sample_data() is used to parse and dump the dispatch
 * trace log data to the perf data file.
 */
static void vpa_dtl_event_read(struct perf_event *event)
{
}

/*
 * Set up pmu-private data structures for an AUX area.
 * **pages contains the aux buffer allocated for this event
 * for the corresponding cpu. rb_alloc_aux uses "alloc_pages_node"
 * and returns a pointer to each page's address. Map these pages to
 * contiguous space using vmap and use that as the base address.
 *
 * The aux private data structure, i.e. "struct vpa_pmu_buf", mainly
 * saves:
 * - buf->base: aux buffer base address
 * - buf->head: offset from the base address where data will be written
 * - buf->size: size of the allocated memory
 */
static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
			       int nr_pages, bool snapshot)
{
	int i, cpu = event->cpu;
	struct vpa_pmu_buf *buf __free(kfree) = NULL;
	struct page **pglist __free(kfree) = NULL;

	/* We need at least one page for this to work. */
	if (!nr_pages)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();

	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
	if (!buf)
		return NULL;

	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
	if (!pglist)
		return NULL;

	for (i = 0; i < nr_pages; ++i)
		pglist[i] = virt_to_page(pages[i]);

	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!buf->base)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->snapshot = false;

	buf->size = nr_pages << PAGE_SHIFT;
	buf->head = 0;
	buf->head_size = 0;
	buf->boottb_freq_saved = 0;
	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
	return no_free_ptr(buf);
}

/*
 * free pmu-private AUX data structures
 */
static void vpa_dtl_free_aux(void *aux)
{
	struct vpa_pmu_buf *buf = aux;

	vunmap(buf->base);
	kfree(buf);
}
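/*
 * Example usage (illustrative): once this PMU is registered, DTL data
 * can be captured system-wide into the AUX area with, e.g.:
 *
 *	perf record -e vpa_dtl/dtl_all/ -a -- sleep 5
 *
 * and post-processed with perf report/perf script (tool-side support
 * for decoding the AUX data is assumed).
 */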
static struct pmu vpa_dtl_pmu = {
	.task_ctx_nr = perf_invalid_context,

	.name = "vpa_dtl",
	.attr_groups = attr_groups,
	.event_init = vpa_dtl_event_init,
	.add = vpa_dtl_event_add,
	.del = vpa_dtl_event_del,
	.read = vpa_dtl_event_read,
	.setup_aux = vpa_dtl_setup_aux,
	.free_aux = vpa_dtl_free_aux,
	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
};

static int vpa_dtl_init(void)
{
	int r;

	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
		pr_debug("not a shared virtualized system, not enabling\n");
		return -ENODEV;
	}

	/* This driver is intended only for the L1 host. */
	if (is_kvm_guest()) {
		pr_debug("Only supported for L1 host system\n");
		return -ENODEV;
	}

	r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
	if (r)
		return r;

	return 0;
}

device_initcall(vpa_dtl_init);
#endif /* CONFIG_PPC_SPLPAR */