// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <linux/sizes.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../perf_event.h"

/*
 * Per-CPU BTS context.
 *
 * @handle:  AUX output handle for the ongoing transaction
 * @ds_back: copy of the debug store fields (buffer base, absolute maximum,
 *           interrupt threshold) saved in bts_event_start() and written
 *           back in bts_event_stop()
 * @state:   one of the BTS_STATE_* values below
 */
struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

/* Allocated in bts_init(); stays NULL when the driver is not initialized. */
static struct bts_ctx __percpu *bts_ctx;

/* Size of one BTS record in bytes; buffer sizes are kept a multiple of it. */
#define BTS_RECORD_SIZE		24
/*
 * Distance (in bytes) kept between the interrupt threshold and the end of
 * the usable area, see bts_config_buffer() and bts_buffer_reset().
 */
#define BTS_SAFETY_MARGIN	4080

/*
 * One physically contiguous chunk of the AUX buffer.
 *
 * @page:         first page of the chunk
 * @size:         usable bytes in the chunk, a multiple of BTS_RECORD_SIZE
 * @offset:       byte offset of the chunk's start within the AUX buffer
 * @displacement: bytes skipped at the start of the chunk, see
 *                bts_buffer_setup_aux()
 */
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};

/*
 * Driver-private AUX buffer descriptor, created by bts_buffer_setup_aux()
 * and released by bts_buffer_free_aux().
 */
struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;	/* total pages in the AUX buffer */
	unsigned int	nr_bufs;	/* number of entries in buf[] */
	unsigned int	cur_buf;	/* index of the chunk currently in use */
	bool		snapshot;	/* set from the 'overwrite' setup argument */
	local_t		data_size;	/* bytes accounted since the last output_end */
	local_t		head;		/* write position within the AUX buffer */
	unsigned long	end;		/* end of the usable area, set by bts_buffer_reset() */
	void		**data_pages;	/* page array handed to setup_aux */
	struct bts_phys	buf[] __counted_by(nr_bufs);
};

static struct pmu bts_pmu;

/*
 * Number of pages in the physical chunk starting at @page: one if the page
 * is not marked PagePrivate, otherwise 2^page_private(page).
 * NOTE(review): page_private() is presumably the allocation order recorded
 * by the AUX buffer allocator -- confirm against the ring buffer code.
 */
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

/* Size in bytes of the physical chunk starting at @page. */
static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}

/*
 * pmu::setup_aux callback: build a bts_buffer describing the AUX pages.
 *
 * @event:     event the buffer is set up for; event->cpu selects the NUMA
 *             node for the allocation (-1 is passed through unchanged)
 * @pages:     array of page addresses making up the AUX buffer
 * @nr_pages:  number of entries in @pages
 * @overwrite: true for overwrite (snapshot) mode
 *
 * Returns the new bts_buffer, or NULL if the allocation fails or if
 * overwrite mode is requested with more than one physical chunk.
 */
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *bb;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nr_buf, pad;

	/* count all the high order buffers */
	for (pg = 0, nr_buf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nr_buf++;
	}

	/*
	 * to avoid interrupts in overwrite mode, only allow one physical
	 * buffer
	 */
	if (overwrite && nr_buf > 1)
		return NULL;

	bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node);
	if (!bb)
		return NULL;

	bb->nr_pages = nr_pages;
	bb->nr_bufs = nr_buf;
	bb->snapshot = overwrite;
	bb->data_pages = pages;
	bb->real_size = size - size % BTS_RECORD_SIZE;

	/*
	 * Describe each physical chunk. 'pad' carries the remainder (modulo
	 * BTS_RECORD_SIZE) trimmed off the previous chunk; the next chunk is
	 * displaced by BTS_RECORD_SIZE - pad bytes and its own size is again
	 * trimmed down to a multiple of BTS_RECORD_SIZE.
	 */
	for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		bb->buf[nr_buf].page = page;
		bb->buf[nr_buf].offset = offset;
		bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement;
		pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE;
		bb->buf[nr_buf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return bb;
}

/* pmu::free_aux callback: release what bts_buffer_setup_aux() allocated. */
static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

/*
 * Offset, within the AUX buffer, of the first usable byte of chunk @idx
 * (chunk offset plus its displacement).
 */
static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx)
{
	return bb->buf[idx].offset + bb->buf[idx].displacement;
}

/*
 * Program this CPU's debug store (base, index, absolute maximum and
 * interrupt threshold) from the current chunk of @bb and from bb->head.
 */
static void
bts_config_buffer(struct bts_buffer *bb)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &bb->buf[bb->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&bb->head);

	if (!bb->snapshot) {
		/* Clamp 'end' to bb->end when that falls inside this chunk. */
		if (bb->end < phys->offset + buf_size(page))
			end = bb->end - phys->offset - phys->displacement;

		/* Make 'index' relative to the chunk's first usable byte. */
		index -= phys->offset + phys->displacement;

		/*
		 * Put the threshold BTS_SAFETY_MARGIN before 'end' when there
		 * is more room than that left, else one record before 'end',
		 * else at 'end' itself.
		 */
		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	/*
	 * In snapshot mode the threshold is placed one record past the
	 * absolute maximum (presumably so the index never reaches it).
	 */
	ds->bts_interrupt_threshold = !bb->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

/*
 * Zero the part of @phys that starts at AUX offset @head; the length is
 * phys->size minus the distance of @head from phys->offset.
 */
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}

/*
 * Bring bb->head and bb->data_size up to date with the hardware write
 * pointer in this CPU's debug store. Does nothing if the handle has no AUX
 * buffer attached.
 */
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!bb)
		return;

	/* Translate the chunk-relative index into an AUX buffer offset. */
	head = index + bts_buffer_offset(bb, bb->cur_buf);
	old = local_xchg(&bb->head, head);

	if (!bb->snapshot) {
		if (old == head)
			return;

		/* The write pointer reached the end of the configured area. */
		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &bb->data_size);
	} else {
		/* Snapshot mode: data_size simply tracks the head position. */
		local_set(&bb->data_size, head);
	}

	/*
	 * Since BTS is coherent, just add compiler barrier to ensure
	 * BTS updating is ordered against bts::handle::event.
	 */
	barrier();
}

/* Defined further down; needed by bts_event_start(). */
static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

/*
 * Configure the debug store from the handle's AUX buffer, mark the context
 * ACTIVE and enable BTS with a config built from @event's attributes. The
 * interrupt bit is requested only when the buffer is not in snapshot mode.
 */
static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!bb->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(bb);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS and bts::state goes ACTIVE
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);

}

/*
 * pmu::start callback: begin an AUX transaction, size the usable area,
 * back up the debug store fields that will be overwritten and start
 * tracing. On failure the event is left with PERF_HES_STOPPED set.
 */
static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb;

	bb = perf_aux_output_begin(&bts->handle, event);
	if (!bb)
		goto fail_stop;

	if (bts_buffer_reset(bb, &bts->handle))
		goto fail_end_stop;

	/* Saved here, written back by bts_event_stop(). */
	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	/* The transaction was opened but cannot be used: close it empty. */
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

/*
 * Record @state in the per-CPU context, then disable BTS. @state is
 * BTS_STATE_INACTIVE or BTS_STATE_STOPPED depending on the caller.
 */
static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

/*
 * pmu::stop callback: stop tracing if it is running and mark the event
 * stopped. With PERF_EF_UPDATE in @flags, also account the collected data,
 * close the AUX transaction (if one is open) and write the saved debug
 * store fields back.
 */
static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	/* A transaction is open unless the state sampled above was STOPPED. */
	if (state != BTS_STATE_STOPPED)
		bb = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (bb) {
			/*
			 * Snapshot mode: the previous data_size becomes the
			 * handle's head and the full buffer size is what gets
			 * reported to perf_aux_output_end() below.
			 */
			if (bb->snapshot)
				bts->handle.head =
					local_xchg(&bb->data_size,
						   bb->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&bb->data_size, 0));
		}

		/* The index is reset to the restored buffer base. */
		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

/*
 * Resume tracing on this CPU if the context is INACTIVE. No-op when the
 * driver has not allocated bts_ctx, when the context is STOPPED, or when
 * there is no event attached to the handle.
 */
void intel_bts_enable_local(void)
{
	struct bts_ctx *bts;
	int state;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);
	state = READ_ONCE(bts->state);
	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

/*
 * Pause tracing on this CPU if the context is ACTIVE. No-op when the driver
 * has not allocated bts_ctx or when there is no event attached to the
 * handle.
 */
void intel_bts_disable_local(void)
{
	struct bts_ctx *bts;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

/*
 * Work out how much of the AUX buffer may be written next and store the
 * resulting end position in bb->end. May advance bb->cur_buf / bb->head to
 * the next physical chunk when that offers more room.
 *
 * Returns 0 on success (always 0 in snapshot mode, where nothing is done),
 * the error from perf_aux_output_skip() if skipping fails, or -ENOSPC when
 * no space is available.
 */
static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (bb->snapshot)
		return 0;

	/* Wrap the handle's head into the AUX buffer. */
	head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1);

	phys = &bb->buf[bb->cur_buf];
	/* Bytes left in the current chunk; 'pad' remembers the unclamped value. */
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = bb->cur_buf + 1;
		if (next_buf >= bb->nr_bufs)
			next_buf = 0;
		next_phys = &bb->buf[next_buf];
		/*
		 * 'gap' is the unusable tail of the current chunk plus the
		 * displacement of the next one; 'skip' is everything between
		 * head and the next chunk's first usable byte.
		 */
		gap = buf_size(phys->page) - phys->displacement - phys->size +
		      next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				bb->cur_buf = next_buf;
				local_set(&bb->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	bb->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}

/*
 * BTS part of the PMI handling: flush collected data to the AUX buffer and
 * open a new transaction for further tracing.
 *
 * Returns 0 when bts_ctx is not allocated or the buffer is in snapshot
 * mode; 'handled' (1 if the write pointer reached the interrupt threshold,
 * else 0) when the context is STOPPED, has no buffer, or has no new data;
 * 1 once data has been flushed.
 */
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts;
	struct perf_event *event;
	struct bts_buffer *bb;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	if (!bts_ctx)
		return 0;

	bts = this_cpu_ptr(bts_ctx);
	event = bts->handle.event;
	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	bb = perf_get_aux(&bts->handle);
	if (!bb)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving
	 */
	if (bb->snapshot)
		return 0;

	old_head = local_read(&bb->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&bb->head))
		return handled;

	/* Close the current transaction with the data accounted so far... */
	perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0));

	/* ...and try to open the next one; err stays -ENOSPC if that fails. */
	bb = perf_aux_output_begin(&bts->handle, event);
	if (bb)
		err = bts_buffer_reset(bb, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (bb) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * cleared handle::event
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}

/* pmu::del callback: stop the event and flush its data. */
static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pmu::add callback.
 *
 * Returns -EBUSY if the INTEL_PMC_IDX_FIXED_BTS bit is set in this CPU's
 * active_mask or if the per-CPU handle already has an event; -EINVAL if
 * PERF_EF_START was requested and the event is still stopped after
 * bts_event_start(); 0 otherwise.
 */
static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

/* event::destroy callback: undo what bts_event_init() acquired. */
static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

/*
 * pmu::event_init callback.
 *
 * Returns -ENOENT for events of another PMU type, the perf_allow_kernel()
 * error for an exclude_kernel event that is not permitted, -EBUSY if the
 * BTS exclusive slot cannot be taken, the x86_reserve_hardware() error if
 * that fails, and 0 on success.
 */
static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel();
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		/* Drop the exclusive slot taken just above. */
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

/* pmu::read callback: intentionally empty. */
static void bts_event_read(struct perf_event *event)
{
}

/*
 * Driver init: bail out with -ENODEV unless the CPU has both DTES64 and
 * BTS and PTI is not in use, allocate the per-CPU context and register the
 * "intel_bts" PMU.
 */
static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64))
		return -ENODEV;

	x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
	if (!x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map, so we
		 * must either use the kernel physical map, or the user
		 * mapping of the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task inherit
		 * we cannot use the user mapping since it will not be available
		 * if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's not
		 * there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

	bts_ctx = alloc_percpu(struct bts_ctx);
	if (!bts_ctx)
		return -ENOMEM;

	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
				  PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
early_initcall(bts_init);