1 /* 2 * Performance events - AMD IBS 3 * 4 * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter 5 * 6 * For licencing details see kernel-base/COPYING 7 */ 8 9 #include <linux/perf_event.h> 10 #include <linux/init.h> 11 #include <linux/export.h> 12 #include <linux/pci.h> 13 #include <linux/ptrace.h> 14 #include <linux/syscore_ops.h> 15 #include <linux/sched/clock.h> 16 17 #include <asm/apic.h> 18 19 #include "../perf_event.h" 20 21 static u32 ibs_caps; 22 23 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 24 25 #include <linux/kprobes.h> 26 #include <linux/hardirq.h> 27 28 #include <asm/nmi.h> 29 #include <asm/amd-ibs.h> 30 31 #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) 32 #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT 33 34 /* attr.config2 */ 35 #define IBS_SW_FILTER_MASK 1 36 37 /* 38 * IBS states: 39 * 40 * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken 41 * and any further add()s must fail. 42 * 43 * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are 44 * complicated by the fact that the IBS hardware can send late NMIs (ie. after 45 * we've cleared the EN bit). 46 * 47 * In order to consume these late NMIs we have the STOPPED state, any NMI that 48 * happens after we've cleared the EN state will clear this bit and report the 49 * NMI handled (this is fundamentally racy in the face or multiple NMI sources, 50 * someone else can consume our BIT and our NMI will go unhandled). 51 * 52 * And since we cannot set/clear this separate bit together with the EN bit, 53 * there are races; if we cleared STARTED early, an NMI could land in 54 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs 55 * could happen if the period is small enough), and consume our STOPPED bit 56 * and trigger streams of unhandled NMIs. 57 * 58 * If, however, we clear STARTED late, an NMI can hit between clearing the 59 * EN bit and clearing STARTED, still see STARTED set and process the event. 60 * If this event will have the VALID bit clear, we bail properly, but this 61 * is not a given. With VALID set we can end up calling pmu::stop() again 62 * (the throttle logic) and trigger the WARNs in there. 63 * 64 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop() 65 * nesting, and clear STARTED late, so that we have a well defined state over 66 * the clearing of the EN bit. 67 * 68 * XXX: we could probably be using !atomic bitops for all this. 69 */ 70 71 enum ibs_states { 72 IBS_ENABLED = 0, 73 IBS_STARTED = 1, 74 IBS_STOPPING = 2, 75 IBS_STOPPED = 3, 76 77 IBS_MAX_STATES, 78 }; 79 80 struct cpu_perf_ibs { 81 struct perf_event *event; 82 unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; 83 }; 84 85 struct perf_ibs { 86 struct pmu pmu; 87 unsigned int msr; 88 u64 config_mask; 89 u64 cnt_mask; 90 u64 enable_mask; 91 u64 valid_mask; 92 u64 max_period; 93 unsigned long offset_mask[1]; 94 int offset_max; 95 unsigned int fetch_count_reset_broken : 1; 96 unsigned int fetch_ignore_if_zero_rip : 1; 97 struct cpu_perf_ibs __percpu *pcpu; 98 99 u64 (*get_count)(u64 config); 100 }; 101 102 static int 103 perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) 104 { 105 s64 left = local64_read(&hwc->period_left); 106 s64 period = hwc->sample_period; 107 int overflow = 0; 108 109 /* 110 * If we are way outside a reasonable range then just skip forward: 111 */ 112 if (unlikely(left <= -period)) { 113 left = period; 114 local64_set(&hwc->period_left, left); 115 hwc->last_period = period; 116 overflow = 1; 117 } 118 119 if (unlikely(left < (s64)min)) { 120 left += period; 121 local64_set(&hwc->period_left, left); 122 hwc->last_period = period; 123 overflow = 1; 124 } 125 126 /* 127 * If the hw period that triggers the sw overflow is too short 128 * we might hit the irq handler. This biases the results. 129 * Thus we shorten the next-to-last period and set the last 130 * period to the max period. 131 */ 132 if (left > max) { 133 left -= max; 134 if (left > max) 135 left = max; 136 else if (left < min) 137 left = min; 138 } 139 140 *hw_period = (u64)left; 141 142 return overflow; 143 } 144 145 static int 146 perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) 147 { 148 struct hw_perf_event *hwc = &event->hw; 149 int shift = 64 - width; 150 u64 prev_raw_count; 151 u64 delta; 152 153 /* 154 * Careful: an NMI might modify the previous event value. 155 * 156 * Our tactic to handle this is to first atomically read and 157 * exchange a new raw count - then add that new-prev delta 158 * count to the generic event atomically: 159 */ 160 prev_raw_count = local64_read(&hwc->prev_count); 161 if (!local64_try_cmpxchg(&hwc->prev_count, 162 &prev_raw_count, new_raw_count)) 163 return 0; 164 165 /* 166 * Now we have the new raw value and have updated the prev 167 * timestamp already. We can now calculate the elapsed delta 168 * (event-)time and add that to the generic event. 169 * 170 * Careful, not all hw sign-extends above the physical width 171 * of the count. 172 */ 173 delta = (new_raw_count << shift) - (prev_raw_count << shift); 174 delta >>= shift; 175 176 local64_add(delta, &event->count); 177 local64_sub(delta, &hwc->period_left); 178 179 return 1; 180 } 181 182 static struct perf_ibs perf_ibs_fetch; 183 static struct perf_ibs perf_ibs_op; 184 185 static struct perf_ibs *get_ibs_pmu(int type) 186 { 187 if (perf_ibs_fetch.pmu.type == type) 188 return &perf_ibs_fetch; 189 if (perf_ibs_op.pmu.type == type) 190 return &perf_ibs_op; 191 return NULL; 192 } 193 194 /* 195 * core pmu config -> IBS config 196 * 197 * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count 198 * perf record -a -e r076:p ... # same as -e cpu-cycles:p 199 * perf record -a -e r0C1:p ... # use ibs op counting micro-ops 200 * 201 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, 202 * MSRC001_1033) is used to select either cycle or micro-ops counting 203 * mode. 204 */ 205 static int core_pmu_ibs_config(struct perf_event *event, u64 *config) 206 { 207 switch (event->attr.type) { 208 case PERF_TYPE_HARDWARE: 209 switch (event->attr.config) { 210 case PERF_COUNT_HW_CPU_CYCLES: 211 *config = 0; 212 return 0; 213 } 214 break; 215 case PERF_TYPE_RAW: 216 switch (event->attr.config) { 217 case 0x0076: 218 *config = 0; 219 return 0; 220 case 0x00C1: 221 *config = IBS_OP_CNT_CTL; 222 return 0; 223 } 224 break; 225 default: 226 return -ENOENT; 227 } 228 229 return -EOPNOTSUPP; 230 } 231 232 /* 233 * The rip of IBS samples has skid 0. Thus, IBS supports precise 234 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the 235 * rip is invalid when IBS was not able to record the rip correctly. 236 * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. 237 */ 238 int forward_event_to_ibs(struct perf_event *event) 239 { 240 u64 config = 0; 241 242 if (!event->attr.precise_ip || event->attr.precise_ip > 2) 243 return -EOPNOTSUPP; 244 245 if (!core_pmu_ibs_config(event, &config)) { 246 event->attr.type = perf_ibs_op.pmu.type; 247 event->attr.config = config; 248 } 249 return -ENOENT; 250 } 251 252 /* 253 * Grouping of IBS events is not possible since IBS can have only 254 * one event active at any point in time. 255 */ 256 static int validate_group(struct perf_event *event) 257 { 258 struct perf_event *sibling; 259 260 if (event->group_leader == event) 261 return 0; 262 263 if (event->group_leader->pmu == event->pmu) 264 return -EINVAL; 265 266 for_each_sibling_event(sibling, event->group_leader) { 267 if (sibling->pmu == event->pmu) 268 return -EINVAL; 269 } 270 return 0; 271 } 272 273 static int perf_ibs_init(struct perf_event *event) 274 { 275 struct hw_perf_event *hwc = &event->hw; 276 struct perf_ibs *perf_ibs; 277 u64 max_cnt, config; 278 int ret; 279 280 perf_ibs = get_ibs_pmu(event->attr.type); 281 if (!perf_ibs) 282 return -ENOENT; 283 284 config = event->attr.config; 285 286 if (event->pmu != &perf_ibs->pmu) 287 return -ENOENT; 288 289 if (config & ~perf_ibs->config_mask) 290 return -EINVAL; 291 292 if (has_branch_stack(event)) 293 return -EOPNOTSUPP; 294 295 /* handle exclude_{user,kernel} in the IRQ handler */ 296 if (event->attr.exclude_host || event->attr.exclude_guest || 297 event->attr.exclude_idle) 298 return -EINVAL; 299 300 if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && 301 (event->attr.exclude_kernel || event->attr.exclude_user || 302 event->attr.exclude_hv)) 303 return -EINVAL; 304 305 ret = validate_group(event); 306 if (ret) 307 return ret; 308 309 if (hwc->sample_period) { 310 if (config & perf_ibs->cnt_mask) 311 /* raw max_cnt may not be set */ 312 return -EINVAL; 313 if (!event->attr.sample_freq && hwc->sample_period & 0x0f) 314 /* 315 * lower 4 bits can not be set in ibs max cnt, 316 * but allowing it in case we adjust the 317 * sample period to set a frequency. 318 */ 319 return -EINVAL; 320 hwc->sample_period &= ~0x0FULL; 321 if (!hwc->sample_period) 322 hwc->sample_period = 0x10; 323 } else { 324 max_cnt = config & perf_ibs->cnt_mask; 325 config &= ~perf_ibs->cnt_mask; 326 event->attr.sample_period = max_cnt << 4; 327 hwc->sample_period = event->attr.sample_period; 328 } 329 330 if (!hwc->sample_period) 331 return -EINVAL; 332 333 /* 334 * If we modify hwc->sample_period, we also need to update 335 * hwc->last_period and hwc->period_left. 336 */ 337 hwc->last_period = hwc->sample_period; 338 local64_set(&hwc->period_left, hwc->sample_period); 339 340 hwc->config_base = perf_ibs->msr; 341 hwc->config = config; 342 343 return 0; 344 } 345 346 static int perf_ibs_set_period(struct perf_ibs *perf_ibs, 347 struct hw_perf_event *hwc, u64 *period) 348 { 349 int overflow; 350 351 /* ignore lower 4 bits in min count: */ 352 overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); 353 local64_set(&hwc->prev_count, 0); 354 355 return overflow; 356 } 357 358 static u64 get_ibs_fetch_count(u64 config) 359 { 360 union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; 361 362 return fetch_ctl.fetch_cnt << 4; 363 } 364 365 static u64 get_ibs_op_count(u64 config) 366 { 367 union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; 368 u64 count = 0; 369 370 /* 371 * If the internal 27-bit counter rolled over, the count is MaxCnt 372 * and the lower 7 bits of CurCnt are randomized. 373 * Otherwise CurCnt has the full 27-bit current counter value. 374 */ 375 if (op_ctl.op_val) { 376 count = op_ctl.opmaxcnt << 4; 377 if (ibs_caps & IBS_CAPS_OPCNTEXT) 378 count += op_ctl.opmaxcnt_ext << 20; 379 } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { 380 count = op_ctl.opcurcnt; 381 } 382 383 return count; 384 } 385 386 static void 387 perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, 388 u64 *config) 389 { 390 u64 count = perf_ibs->get_count(*config); 391 392 /* 393 * Set width to 64 since we do not overflow on max width but 394 * instead on max count. In perf_ibs_set_period() we clear 395 * prev count manually on overflow. 396 */ 397 while (!perf_event_try_update(event, count, 64)) { 398 rdmsrl(event->hw.config_base, *config); 399 count = perf_ibs->get_count(*config); 400 } 401 } 402 403 static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, 404 struct hw_perf_event *hwc, u64 config) 405 { 406 u64 tmp = hwc->config | config; 407 408 if (perf_ibs->fetch_count_reset_broken) 409 wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); 410 411 wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); 412 } 413 414 /* 415 * Erratum #420 Instruction-Based Sampling Engine May Generate 416 * Interrupt that Cannot Be Cleared: 417 * 418 * Must clear counter mask first, then clear the enable bit. See 419 * Revision Guide for AMD Family 10h Processors, Publication #41322. 420 */ 421 static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, 422 struct hw_perf_event *hwc, u64 config) 423 { 424 config &= ~perf_ibs->cnt_mask; 425 if (boot_cpu_data.x86 == 0x10) 426 wrmsrl(hwc->config_base, config); 427 config &= ~perf_ibs->enable_mask; 428 wrmsrl(hwc->config_base, config); 429 } 430 431 /* 432 * We cannot restore the ibs pmu state, so we always needs to update 433 * the event while stopping it and then reset the state when starting 434 * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in 435 * perf_ibs_start()/perf_ibs_stop() and instead always do it. 436 */ 437 static void perf_ibs_start(struct perf_event *event, int flags) 438 { 439 struct hw_perf_event *hwc = &event->hw; 440 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); 441 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); 442 u64 period, config = 0; 443 444 if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) 445 return; 446 447 WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); 448 hwc->state = 0; 449 450 perf_ibs_set_period(perf_ibs, hwc, &period); 451 if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { 452 config |= period & IBS_OP_MAX_CNT_EXT_MASK; 453 period &= ~IBS_OP_MAX_CNT_EXT_MASK; 454 } 455 config |= period >> 4; 456 457 /* 458 * Set STARTED before enabling the hardware, such that a subsequent NMI 459 * must observe it. 460 */ 461 set_bit(IBS_STARTED, pcpu->state); 462 clear_bit(IBS_STOPPING, pcpu->state); 463 perf_ibs_enable_event(perf_ibs, hwc, config); 464 465 perf_event_update_userpage(event); 466 } 467 468 static void perf_ibs_stop(struct perf_event *event, int flags) 469 { 470 struct hw_perf_event *hwc = &event->hw; 471 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); 472 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); 473 u64 config; 474 int stopping; 475 476 if (test_and_set_bit(IBS_STOPPING, pcpu->state)) 477 return; 478 479 stopping = test_bit(IBS_STARTED, pcpu->state); 480 481 if (!stopping && (hwc->state & PERF_HES_UPTODATE)) 482 return; 483 484 rdmsrl(hwc->config_base, config); 485 486 if (stopping) { 487 /* 488 * Set STOPPED before disabling the hardware, such that it 489 * must be visible to NMIs the moment we clear the EN bit, 490 * at which point we can generate an !VALID sample which 491 * we need to consume. 492 */ 493 set_bit(IBS_STOPPED, pcpu->state); 494 perf_ibs_disable_event(perf_ibs, hwc, config); 495 /* 496 * Clear STARTED after disabling the hardware; if it were 497 * cleared before an NMI hitting after the clear but before 498 * clearing the EN bit might think it a spurious NMI and not 499 * handle it. 500 * 501 * Clearing it after, however, creates the problem of the NMI 502 * handler seeing STARTED but not having a valid sample. 503 */ 504 clear_bit(IBS_STARTED, pcpu->state); 505 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 506 hwc->state |= PERF_HES_STOPPED; 507 } 508 509 if (hwc->state & PERF_HES_UPTODATE) 510 return; 511 512 /* 513 * Clear valid bit to not count rollovers on update, rollovers 514 * are only updated in the irq handler. 515 */ 516 config &= ~perf_ibs->valid_mask; 517 518 perf_ibs_event_update(perf_ibs, event, &config); 519 hwc->state |= PERF_HES_UPTODATE; 520 } 521 522 static int perf_ibs_add(struct perf_event *event, int flags) 523 { 524 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); 525 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); 526 527 if (test_and_set_bit(IBS_ENABLED, pcpu->state)) 528 return -ENOSPC; 529 530 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 531 532 pcpu->event = event; 533 534 if (flags & PERF_EF_START) 535 perf_ibs_start(event, PERF_EF_RELOAD); 536 537 return 0; 538 } 539 540 static void perf_ibs_del(struct perf_event *event, int flags) 541 { 542 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); 543 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); 544 545 if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) 546 return; 547 548 perf_ibs_stop(event, PERF_EF_UPDATE); 549 550 pcpu->event = NULL; 551 552 perf_event_update_userpage(event); 553 } 554 555 static void perf_ibs_read(struct perf_event *event) { } 556 557 /* 558 * We need to initialize with empty group if all attributes in the 559 * group are dynamic. 560 */ 561 static struct attribute *attrs_empty[] = { 562 NULL, 563 }; 564 565 static struct attribute_group empty_caps_group = { 566 .name = "caps", 567 .attrs = attrs_empty, 568 }; 569 570 PMU_FORMAT_ATTR(rand_en, "config:57"); 571 PMU_FORMAT_ATTR(cnt_ctl, "config:19"); 572 PMU_FORMAT_ATTR(swfilt, "config2:0"); 573 PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); 574 PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); 575 PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); 576 577 static umode_t 578 zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) 579 { 580 return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; 581 } 582 583 static struct attribute *fetch_attrs[] = { 584 &format_attr_rand_en.attr, 585 &format_attr_swfilt.attr, 586 NULL, 587 }; 588 589 static struct attribute *fetch_l3missonly_attrs[] = { 590 &fetch_l3missonly.attr.attr, 591 NULL, 592 }; 593 594 static struct attribute *zen4_ibs_extensions_attrs[] = { 595 &zen4_ibs_extensions.attr.attr, 596 NULL, 597 }; 598 599 static struct attribute_group group_fetch_formats = { 600 .name = "format", 601 .attrs = fetch_attrs, 602 }; 603 604 static struct attribute_group group_fetch_l3missonly = { 605 .name = "format", 606 .attrs = fetch_l3missonly_attrs, 607 .is_visible = zen4_ibs_extensions_is_visible, 608 }; 609 610 static struct attribute_group group_zen4_ibs_extensions = { 611 .name = "caps", 612 .attrs = zen4_ibs_extensions_attrs, 613 .is_visible = zen4_ibs_extensions_is_visible, 614 }; 615 616 static const struct attribute_group *fetch_attr_groups[] = { 617 &group_fetch_formats, 618 &empty_caps_group, 619 NULL, 620 }; 621 622 static const struct attribute_group *fetch_attr_update[] = { 623 &group_fetch_l3missonly, 624 &group_zen4_ibs_extensions, 625 NULL, 626 }; 627 628 static umode_t 629 cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) 630 { 631 return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; 632 } 633 634 static struct attribute *op_attrs[] = { 635 &format_attr_swfilt.attr, 636 NULL, 637 }; 638 639 static struct attribute *cnt_ctl_attrs[] = { 640 &format_attr_cnt_ctl.attr, 641 NULL, 642 }; 643 644 static struct attribute *op_l3missonly_attrs[] = { 645 &op_l3missonly.attr.attr, 646 NULL, 647 }; 648 649 static struct attribute_group group_op_formats = { 650 .name = "format", 651 .attrs = op_attrs, 652 }; 653 654 static struct attribute_group group_cnt_ctl = { 655 .name = "format", 656 .attrs = cnt_ctl_attrs, 657 .is_visible = cnt_ctl_is_visible, 658 }; 659 660 static struct attribute_group group_op_l3missonly = { 661 .name = "format", 662 .attrs = op_l3missonly_attrs, 663 .is_visible = zen4_ibs_extensions_is_visible, 664 }; 665 666 static const struct attribute_group *op_attr_groups[] = { 667 &group_op_formats, 668 &empty_caps_group, 669 NULL, 670 }; 671 672 static const struct attribute_group *op_attr_update[] = { 673 &group_cnt_ctl, 674 &group_op_l3missonly, 675 &group_zen4_ibs_extensions, 676 NULL, 677 }; 678 679 static struct perf_ibs perf_ibs_fetch = { 680 .pmu = { 681 .task_ctx_nr = perf_hw_context, 682 683 .event_init = perf_ibs_init, 684 .add = perf_ibs_add, 685 .del = perf_ibs_del, 686 .start = perf_ibs_start, 687 .stop = perf_ibs_stop, 688 .read = perf_ibs_read, 689 }, 690 .msr = MSR_AMD64_IBSFETCHCTL, 691 .config_mask = IBS_FETCH_CONFIG_MASK, 692 .cnt_mask = IBS_FETCH_MAX_CNT, 693 .enable_mask = IBS_FETCH_ENABLE, 694 .valid_mask = IBS_FETCH_VAL, 695 .max_period = IBS_FETCH_MAX_CNT << 4, 696 .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, 697 .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, 698 699 .get_count = get_ibs_fetch_count, 700 }; 701 702 static struct perf_ibs perf_ibs_op = { 703 .pmu = { 704 .task_ctx_nr = perf_hw_context, 705 706 .event_init = perf_ibs_init, 707 .add = perf_ibs_add, 708 .del = perf_ibs_del, 709 .start = perf_ibs_start, 710 .stop = perf_ibs_stop, 711 .read = perf_ibs_read, 712 }, 713 .msr = MSR_AMD64_IBSOPCTL, 714 .config_mask = IBS_OP_CONFIG_MASK, 715 .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | 716 IBS_OP_CUR_CNT_RAND, 717 .enable_mask = IBS_OP_ENABLE, 718 .valid_mask = IBS_OP_VAL, 719 .max_period = IBS_OP_MAX_CNT << 4, 720 .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, 721 .offset_max = MSR_AMD64_IBSOP_REG_COUNT, 722 723 .get_count = get_ibs_op_count, 724 }; 725 726 static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, 727 struct perf_sample_data *data) 728 { 729 union perf_mem_data_src *data_src = &data->data_src; 730 731 data_src->mem_op = PERF_MEM_OP_NA; 732 733 if (op_data3->ld_op) 734 data_src->mem_op = PERF_MEM_OP_LOAD; 735 else if (op_data3->st_op) 736 data_src->mem_op = PERF_MEM_OP_STORE; 737 } 738 739 /* 740 * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has 741 * more fine granular DataSrc encodings. Others have coarse. 742 */ 743 static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) 744 { 745 if (ibs_caps & IBS_CAPS_ZEN4) 746 return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; 747 748 return op_data2->data_src_lo; 749 } 750 751 #define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT)) 752 #define LN(x) PERF_MEM_S(LVLNUM, x) 753 #define REM PERF_MEM_S(REMOTE, REMOTE) 754 #define HOPS(x) PERF_MEM_S(HOPS, x) 755 756 static u64 g_data_src[8] = { 757 [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0), 758 [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM), 759 [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1), 760 [IBS_DATA_SRC_IO] = L(IO) | LN(IO), 761 }; 762 763 #define RMT_NODE_BITS (1 << IBS_DATA_SRC_DRAM) 764 #define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x)) 765 766 static u64 g_zen4_data_src[32] = { 767 [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3), 768 [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0), 769 [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM), 770 [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1), 771 [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM), 772 [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO), 773 [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL), 774 }; 775 776 #define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \ 777 (1 << IBS_DATA_SRC_EXT_PMEM) | \ 778 (1 << IBS_DATA_SRC_EXT_EXT_MEM)) 779 #define ZEN4_RMT_NODE_APPLICABLE(x) (ZEN4_RMT_NODE_BITS & (1 << x)) 780 781 static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, 782 union ibs_op_data3 *op_data3, 783 struct perf_sample_data *data) 784 { 785 union perf_mem_data_src *data_src = &data->data_src; 786 u8 ibs_data_src = perf_ibs_data_src(op_data2); 787 788 data_src->mem_lvl = 0; 789 data_src->mem_lvl_num = 0; 790 791 /* 792 * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached 793 * memory accesses. So, check DcUcMemAcc bit early. 794 */ 795 if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) 796 return L(UNC) | LN(UNC); 797 798 /* L1 Hit */ 799 if (op_data3->dc_miss == 0) 800 return L(L1) | LN(L1); 801 802 /* L2 Hit */ 803 if (op_data3->l2_miss == 0) { 804 /* Erratum #1293 */ 805 if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || 806 !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) 807 return L(L2) | LN(L2); 808 } 809 810 /* 811 * OP_DATA2 is valid only for load ops. Skip all checks which 812 * uses OP_DATA2[DataSrc]. 813 */ 814 if (data_src->mem_op != PERF_MEM_OP_LOAD) 815 goto check_mab; 816 817 if (ibs_caps & IBS_CAPS_ZEN4) { 818 u64 val = g_zen4_data_src[ibs_data_src]; 819 820 if (!val) 821 goto check_mab; 822 823 /* HOPS_1 because IBS doesn't provide remote socket detail */ 824 if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) { 825 if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) 826 val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); 827 else 828 val |= REM | HOPS(1); 829 } 830 831 return val; 832 } else { 833 u64 val = g_data_src[ibs_data_src]; 834 835 if (!val) 836 goto check_mab; 837 838 /* HOPS_1 because IBS doesn't provide remote socket detail */ 839 if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) { 840 if (ibs_data_src == IBS_DATA_SRC_DRAM) 841 val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); 842 else 843 val |= REM | HOPS(1); 844 } 845 846 return val; 847 } 848 849 check_mab: 850 /* 851 * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding 852 * DC misses. However, such data may come from any level in mem 853 * hierarchy. IBS provides detail about both MAB as well as actual 854 * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set 855 * MAB only when IBS fails to provide DataSrc. 856 */ 857 if (op_data3->dc_miss_no_mab_alloc) 858 return L(LFB) | LN(LFB); 859 860 /* Don't set HIT with NA */ 861 return PERF_MEM_S(LVL, NA) | LN(NA); 862 } 863 864 static bool perf_ibs_cache_hit_st_valid(void) 865 { 866 /* 0: Uninitialized, 1: Valid, -1: Invalid */ 867 static int cache_hit_st_valid; 868 869 if (unlikely(!cache_hit_st_valid)) { 870 if (boot_cpu_data.x86 == 0x19 && 871 (boot_cpu_data.x86_model <= 0xF || 872 (boot_cpu_data.x86_model >= 0x20 && 873 boot_cpu_data.x86_model <= 0x5F))) { 874 cache_hit_st_valid = -1; 875 } else { 876 cache_hit_st_valid = 1; 877 } 878 } 879 880 return cache_hit_st_valid == 1; 881 } 882 883 static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, 884 struct perf_sample_data *data) 885 { 886 union perf_mem_data_src *data_src = &data->data_src; 887 u8 ibs_data_src; 888 889 data_src->mem_snoop = PERF_MEM_SNOOP_NA; 890 891 if (!perf_ibs_cache_hit_st_valid() || 892 data_src->mem_op != PERF_MEM_OP_LOAD || 893 data_src->mem_lvl & PERF_MEM_LVL_L1 || 894 data_src->mem_lvl & PERF_MEM_LVL_L2 || 895 op_data2->cache_hit_st) 896 return; 897 898 ibs_data_src = perf_ibs_data_src(op_data2); 899 900 if (ibs_caps & IBS_CAPS_ZEN4) { 901 if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || 902 ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || 903 ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) 904 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 905 } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { 906 data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 907 } 908 } 909 910 static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, 911 struct perf_sample_data *data) 912 { 913 union perf_mem_data_src *data_src = &data->data_src; 914 915 data_src->mem_dtlb = PERF_MEM_TLB_NA; 916 917 if (!op_data3->dc_lin_addr_valid) 918 return; 919 920 if (!op_data3->dc_l1tlb_miss) { 921 data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; 922 return; 923 } 924 925 if (!op_data3->dc_l2tlb_miss) { 926 data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; 927 return; 928 } 929 930 data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; 931 } 932 933 static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, 934 struct perf_sample_data *data) 935 { 936 union perf_mem_data_src *data_src = &data->data_src; 937 938 data_src->mem_lock = PERF_MEM_LOCK_NA; 939 940 if (op_data3->dc_locked_op) 941 data_src->mem_lock = PERF_MEM_LOCK_LOCKED; 942 } 943 944 /* Be careful. Works only for contiguous MSRs. */ 945 #define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL) 946 #define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) 947 948 static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, 949 struct perf_sample_data *data, 950 union ibs_op_data2 *op_data2, 951 union ibs_op_data3 *op_data3) 952 { 953 union perf_mem_data_src *data_src = &data->data_src; 954 955 data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data); 956 perf_ibs_get_mem_snoop(op_data2, data); 957 perf_ibs_get_tlb_lvl(op_data3, data); 958 perf_ibs_get_mem_lock(op_data3, data); 959 } 960 961 static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, 962 union ibs_op_data3 *op_data3) 963 { 964 __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; 965 966 /* Erratum #1293 */ 967 if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && 968 (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { 969 /* 970 * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. 971 * DataSrc=0 is 'No valid status' and RmtNode is invalid when 972 * DataSrc=0. 973 */ 974 val = 0; 975 } 976 return val; 977 } 978 979 static void perf_ibs_parse_ld_st_data(__u64 sample_type, 980 struct perf_ibs_data *ibs_data, 981 struct perf_sample_data *data) 982 { 983 union ibs_op_data3 op_data3; 984 union ibs_op_data2 op_data2; 985 union ibs_op_data op_data; 986 987 data->data_src.val = PERF_MEM_NA; 988 op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; 989 990 perf_ibs_get_mem_op(&op_data3, data); 991 if (data->data_src.mem_op != PERF_MEM_OP_LOAD && 992 data->data_src.mem_op != PERF_MEM_OP_STORE) 993 return; 994 995 op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); 996 997 if (sample_type & PERF_SAMPLE_DATA_SRC) { 998 perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); 999 data->sample_flags |= PERF_SAMPLE_DATA_SRC; 1000 } 1001 1002 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && 1003 data->data_src.mem_op == PERF_MEM_OP_LOAD) { 1004 op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; 1005 1006 if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 1007 data->weight.var1_dw = op_data3.dc_miss_lat; 1008 data->weight.var2_w = op_data.tag_to_ret_ctr; 1009 } else if (sample_type & PERF_SAMPLE_WEIGHT) { 1010 data->weight.full = op_data3.dc_miss_lat; 1011 } 1012 data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 1013 } 1014 1015 if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { 1016 data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; 1017 data->sample_flags |= PERF_SAMPLE_ADDR; 1018 } 1019 1020 if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { 1021 data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; 1022 data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; 1023 } 1024 } 1025 1026 static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, 1027 int check_rip) 1028 { 1029 if (sample_type & PERF_SAMPLE_RAW || 1030 (perf_ibs == &perf_ibs_op && 1031 (sample_type & PERF_SAMPLE_DATA_SRC || 1032 sample_type & PERF_SAMPLE_WEIGHT_TYPE || 1033 sample_type & PERF_SAMPLE_ADDR || 1034 sample_type & PERF_SAMPLE_PHYS_ADDR))) 1035 return perf_ibs->offset_max; 1036 else if (check_rip) 1037 return 3; 1038 return 1; 1039 } 1040 1041 static bool perf_ibs_is_kernel_data_addr(struct perf_event *event, 1042 struct perf_ibs_data *ibs_data) 1043 { 1044 u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW; 1045 union ibs_op_data3 op_data3; 1046 u64 dc_lin_addr; 1047 1048 op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; 1049 dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; 1050 1051 return unlikely((event->attr.sample_type & sample_type_mask) && 1052 op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr)); 1053 } 1054 1055 static bool perf_ibs_is_kernel_br_target(struct perf_event *event, 1056 struct perf_ibs_data *ibs_data, 1057 int br_target_idx) 1058 { 1059 union ibs_op_data op_data; 1060 u64 br_target; 1061 1062 op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; 1063 br_target = ibs_data->regs[br_target_idx]; 1064 1065 return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) && 1066 op_data.op_brn_ret && kernel_ip(br_target)); 1067 } 1068 1069 static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event, 1070 struct pt_regs *regs, struct perf_ibs_data *ibs_data, 1071 int br_target_idx) 1072 { 1073 if (perf_exclude_event(event, regs)) 1074 return true; 1075 1076 if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel) 1077 return false; 1078 1079 if (perf_ibs_is_kernel_data_addr(event, ibs_data)) 1080 return true; 1081 1082 if (br_target_idx != -1 && 1083 perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx)) 1084 return true; 1085 1086 return false; 1087 } 1088 1089 static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs, 1090 struct perf_ibs_data *ibs_data) 1091 { 1092 if (perf_ibs == &perf_ibs_op) { 1093 ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18); 1094 ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0; 1095 return; 1096 } 1097 1098 ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52); 1099 ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0; 1100 } 1101 1102 static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) 1103 { 1104 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); 1105 struct perf_event *event = pcpu->event; 1106 struct hw_perf_event *hwc; 1107 struct perf_sample_data data; 1108 struct perf_raw_record raw; 1109 struct pt_regs regs; 1110 struct perf_ibs_data ibs_data; 1111 int offset, size, check_rip, offset_max, throttle = 0; 1112 unsigned int msr; 1113 u64 *buf, *config, period, new_config = 0; 1114 int br_target_idx = -1; 1115 1116 if (!test_bit(IBS_STARTED, pcpu->state)) { 1117 fail: 1118 /* 1119 * Catch spurious interrupts after stopping IBS: After 1120 * disabling IBS there could be still incoming NMIs 1121 * with samples that even have the valid bit cleared. 1122 * Mark all this NMIs as handled. 1123 */ 1124 if (test_and_clear_bit(IBS_STOPPED, pcpu->state)) 1125 return 1; 1126 1127 return 0; 1128 } 1129 1130 if (WARN_ON_ONCE(!event)) 1131 goto fail; 1132 1133 hwc = &event->hw; 1134 msr = hwc->config_base; 1135 buf = ibs_data.regs; 1136 rdmsrl(msr, *buf); 1137 if (!(*buf++ & perf_ibs->valid_mask)) 1138 goto fail; 1139 1140 config = &ibs_data.regs[0]; 1141 perf_ibs_event_update(perf_ibs, event, config); 1142 perf_sample_data_init(&data, 0, hwc->last_period); 1143 if (!perf_ibs_set_period(perf_ibs, hwc, &period)) 1144 goto out; /* no sw counter overflow */ 1145 1146 ibs_data.caps = ibs_caps; 1147 size = 1; 1148 offset = 1; 1149 check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); 1150 1151 offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); 1152 1153 do { 1154 rdmsrl(msr + offset, *buf++); 1155 size++; 1156 offset = find_next_bit(perf_ibs->offset_mask, 1157 perf_ibs->offset_max, 1158 offset + 1); 1159 } while (offset < offset_max); 1160 /* 1161 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately 1162 * depending on their availability. 1163 * Can't add to offset_max as they are staggered 1164 */ 1165 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 1166 if (perf_ibs == &perf_ibs_op) { 1167 if (ibs_caps & IBS_CAPS_BRNTRGT) { 1168 rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); 1169 br_target_idx = size; 1170 size++; 1171 } 1172 if (ibs_caps & IBS_CAPS_OPDATA4) { 1173 rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); 1174 size++; 1175 } 1176 } 1177 if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { 1178 rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); 1179 size++; 1180 } 1181 } 1182 ibs_data.size = sizeof(u64) * size; 1183 1184 regs = *iregs; 1185 if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { 1186 regs.flags &= ~PERF_EFLAGS_EXACT; 1187 } else { 1188 /* Workaround for erratum #1197 */ 1189 if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) 1190 goto out; 1191 1192 set_linear_ip(®s, ibs_data.regs[1]); 1193 regs.flags |= PERF_EFLAGS_EXACT; 1194 } 1195 1196 if ((event->attr.config2 & IBS_SW_FILTER_MASK) && 1197 perf_ibs_swfilt_discard(perf_ibs, event, ®s, &ibs_data, br_target_idx)) { 1198 throttle = perf_event_account_interrupt(event); 1199 goto out; 1200 } 1201 /* 1202 * Prevent leaking physical addresses to unprivileged users. Skip 1203 * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for 1204 * unprivileged users. 1205 */ 1206 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 1207 perf_allow_kernel(&event->attr)) { 1208 perf_ibs_phyaddr_clear(perf_ibs, &ibs_data); 1209 } 1210 1211 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 1212 raw = (struct perf_raw_record){ 1213 .frag = { 1214 .size = sizeof(u32) + ibs_data.size, 1215 .data = ibs_data.data, 1216 }, 1217 }; 1218 perf_sample_save_raw_data(&data, event, &raw); 1219 } 1220 1221 if (perf_ibs == &perf_ibs_op) 1222 perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); 1223 1224 /* 1225 * rip recorded by IbsOpRip will not be consistent with rsp and rbp 1226 * recorded as part of interrupt regs. Thus we need to use rip from 1227 * interrupt regs while unwinding call stack. 1228 */ 1229 perf_sample_save_callchain(&data, event, iregs); 1230 1231 throttle = perf_event_overflow(event, &data, ®s); 1232 out: 1233 if (throttle) { 1234 perf_ibs_stop(event, 0); 1235 } else { 1236 if (perf_ibs == &perf_ibs_op) { 1237 if (ibs_caps & IBS_CAPS_OPCNTEXT) { 1238 new_config = period & IBS_OP_MAX_CNT_EXT_MASK; 1239 period &= ~IBS_OP_MAX_CNT_EXT_MASK; 1240 } 1241 if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) 1242 new_config |= *config & IBS_OP_CUR_CNT_RAND; 1243 } 1244 new_config |= period >> 4; 1245 1246 perf_ibs_enable_event(perf_ibs, hwc, new_config); 1247 } 1248 1249 perf_event_update_userpage(event); 1250 1251 return 1; 1252 } 1253 1254 static int 1255 perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) 1256 { 1257 u64 stamp = sched_clock(); 1258 int handled = 0; 1259 1260 handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); 1261 handled += perf_ibs_handle_irq(&perf_ibs_op, regs); 1262 1263 if (handled) 1264 inc_irq_stat(apic_perf_irqs); 1265 1266 perf_sample_event_took(sched_clock() - stamp); 1267 1268 return handled; 1269 } 1270 NOKPROBE_SYMBOL(perf_ibs_nmi_handler); 1271 1272 static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) 1273 { 1274 struct cpu_perf_ibs __percpu *pcpu; 1275 int ret; 1276 1277 pcpu = alloc_percpu(struct cpu_perf_ibs); 1278 if (!pcpu) 1279 return -ENOMEM; 1280 1281 perf_ibs->pcpu = pcpu; 1282 1283 ret = perf_pmu_register(&perf_ibs->pmu, name, -1); 1284 if (ret) { 1285 perf_ibs->pcpu = NULL; 1286 free_percpu(pcpu); 1287 } 1288 1289 return ret; 1290 } 1291 1292 static __init int perf_ibs_fetch_init(void) 1293 { 1294 /* 1295 * Some chips fail to reset the fetch count when it is written; instead 1296 * they need a 0-1 transition of IbsFetchEn. 1297 */ 1298 if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) 1299 perf_ibs_fetch.fetch_count_reset_broken = 1; 1300 1301 if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) 1302 perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; 1303 1304 if (ibs_caps & IBS_CAPS_ZEN4) 1305 perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; 1306 1307 perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; 1308 perf_ibs_fetch.pmu.attr_update = fetch_attr_update; 1309 1310 return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); 1311 } 1312 1313 static __init int perf_ibs_op_init(void) 1314 { 1315 if (ibs_caps & IBS_CAPS_OPCNT) 1316 perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; 1317 1318 if (ibs_caps & IBS_CAPS_OPCNTEXT) { 1319 perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; 1320 perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; 1321 perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; 1322 } 1323 1324 if (ibs_caps & IBS_CAPS_ZEN4) 1325 perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; 1326 1327 perf_ibs_op.pmu.attr_groups = op_attr_groups; 1328 perf_ibs_op.pmu.attr_update = op_attr_update; 1329 1330 return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); 1331 } 1332 1333 static __init int perf_event_ibs_init(void) 1334 { 1335 int ret; 1336 1337 ret = perf_ibs_fetch_init(); 1338 if (ret) 1339 return ret; 1340 1341 ret = perf_ibs_op_init(); 1342 if (ret) 1343 goto err_op; 1344 1345 ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); 1346 if (ret) 1347 goto err_nmi; 1348 1349 pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); 1350 return 0; 1351 1352 err_nmi: 1353 perf_pmu_unregister(&perf_ibs_op.pmu); 1354 free_percpu(perf_ibs_op.pcpu); 1355 perf_ibs_op.pcpu = NULL; 1356 err_op: 1357 perf_pmu_unregister(&perf_ibs_fetch.pmu); 1358 free_percpu(perf_ibs_fetch.pcpu); 1359 perf_ibs_fetch.pcpu = NULL; 1360 1361 return ret; 1362 } 1363 1364 #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ 1365 1366 static __init int perf_event_ibs_init(void) 1367 { 1368 return 0; 1369 } 1370 1371 #endif 1372 1373 /* IBS - apic initialization, for perf and oprofile */ 1374 1375 static __init u32 __get_ibs_caps(void) 1376 { 1377 u32 caps; 1378 unsigned int max_level; 1379 1380 if (!boot_cpu_has(X86_FEATURE_IBS)) 1381 return 0; 1382 1383 /* check IBS cpuid feature flags */ 1384 max_level = cpuid_eax(0x80000000); 1385 if (max_level < IBS_CPUID_FEATURES) 1386 return IBS_CAPS_DEFAULT; 1387 1388 caps = cpuid_eax(IBS_CPUID_FEATURES); 1389 if (!(caps & IBS_CAPS_AVAIL)) 1390 /* cpuid flags not valid */ 1391 return IBS_CAPS_DEFAULT; 1392 1393 return caps; 1394 } 1395 1396 u32 get_ibs_caps(void) 1397 { 1398 return ibs_caps; 1399 } 1400 1401 EXPORT_SYMBOL(get_ibs_caps); 1402 1403 static inline int get_eilvt(int offset) 1404 { 1405 return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); 1406 } 1407 1408 static inline int put_eilvt(int offset) 1409 { 1410 return !setup_APIC_eilvt(offset, 0, 0, 1); 1411 } 1412 1413 /* 1414 * Check and reserve APIC extended interrupt LVT offset for IBS if available. 1415 */ 1416 static inline int ibs_eilvt_valid(void) 1417 { 1418 int offset; 1419 u64 val; 1420 int valid = 0; 1421 1422 preempt_disable(); 1423 1424 rdmsrl(MSR_AMD64_IBSCTL, val); 1425 offset = val & IBSCTL_LVT_OFFSET_MASK; 1426 1427 if (!(val & IBSCTL_LVT_OFFSET_VALID)) { 1428 pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", 1429 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); 1430 goto out; 1431 } 1432 1433 if (!get_eilvt(offset)) { 1434 pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", 1435 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); 1436 goto out; 1437 } 1438 1439 valid = 1; 1440 out: 1441 preempt_enable(); 1442 1443 return valid; 1444 } 1445 1446 static int setup_ibs_ctl(int ibs_eilvt_off) 1447 { 1448 struct pci_dev *cpu_cfg; 1449 int nodes; 1450 u32 value = 0; 1451 1452 nodes = 0; 1453 cpu_cfg = NULL; 1454 do { 1455 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, 1456 PCI_DEVICE_ID_AMD_10H_NB_MISC, 1457 cpu_cfg); 1458 if (!cpu_cfg) 1459 break; 1460 ++nodes; 1461 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off 1462 | IBSCTL_LVT_OFFSET_VALID); 1463 pci_read_config_dword(cpu_cfg, IBSCTL, &value); 1464 if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { 1465 pci_dev_put(cpu_cfg); 1466 pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", 1467 value); 1468 return -EINVAL; 1469 } 1470 } while (1); 1471 1472 if (!nodes) { 1473 pr_debug("No CPU node configured for IBS\n"); 1474 return -ENODEV; 1475 } 1476 1477 return 0; 1478 } 1479 1480 /* 1481 * This runs only on the current cpu. We try to find an LVT offset and 1482 * setup the local APIC. For this we must disable preemption. On 1483 * success we initialize all nodes with this offset. This updates then 1484 * the offset in the IBS_CTL per-node msr. The per-core APIC setup of 1485 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that 1486 * is using the new offset. 1487 */ 1488 static void force_ibs_eilvt_setup(void) 1489 { 1490 int offset; 1491 int ret; 1492 1493 preempt_disable(); 1494 /* find the next free available EILVT entry, skip offset 0 */ 1495 for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { 1496 if (get_eilvt(offset)) 1497 break; 1498 } 1499 preempt_enable(); 1500 1501 if (offset == APIC_EILVT_NR_MAX) { 1502 pr_debug("No EILVT entry available\n"); 1503 return; 1504 } 1505 1506 ret = setup_ibs_ctl(offset); 1507 if (ret) 1508 goto out; 1509 1510 if (!ibs_eilvt_valid()) 1511 goto out; 1512 1513 pr_info("LVT offset %d assigned\n", offset); 1514 1515 return; 1516 out: 1517 preempt_disable(); 1518 put_eilvt(offset); 1519 preempt_enable(); 1520 return; 1521 } 1522 1523 static void ibs_eilvt_setup(void) 1524 { 1525 /* 1526 * Force LVT offset assignment for family 10h: The offsets are 1527 * not assigned by the BIOS for this family, so the OS is 1528 * responsible for doing it. If the OS assignment fails, fall 1529 * back to BIOS settings and try to setup this. 1530 */ 1531 if (boot_cpu_data.x86 == 0x10) 1532 force_ibs_eilvt_setup(); 1533 } 1534 1535 static inline int get_ibs_lvt_offset(void) 1536 { 1537 u64 val; 1538 1539 rdmsrl(MSR_AMD64_IBSCTL, val); 1540 if (!(val & IBSCTL_LVT_OFFSET_VALID)) 1541 return -EINVAL; 1542 1543 return val & IBSCTL_LVT_OFFSET_MASK; 1544 } 1545 1546 static void setup_APIC_ibs(void) 1547 { 1548 int offset; 1549 1550 offset = get_ibs_lvt_offset(); 1551 if (offset < 0) 1552 goto failed; 1553 1554 if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) 1555 return; 1556 failed: 1557 pr_warn("perf: IBS APIC setup failed on cpu #%d\n", 1558 smp_processor_id()); 1559 } 1560 1561 static void clear_APIC_ibs(void) 1562 { 1563 int offset; 1564 1565 offset = get_ibs_lvt_offset(); 1566 if (offset >= 0) 1567 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); 1568 } 1569 1570 static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu) 1571 { 1572 setup_APIC_ibs(); 1573 return 0; 1574 } 1575 1576 #ifdef CONFIG_PM 1577 1578 static int perf_ibs_suspend(void) 1579 { 1580 clear_APIC_ibs(); 1581 return 0; 1582 } 1583 1584 static void perf_ibs_resume(void) 1585 { 1586 ibs_eilvt_setup(); 1587 setup_APIC_ibs(); 1588 } 1589 1590 static struct syscore_ops perf_ibs_syscore_ops = { 1591 .resume = perf_ibs_resume, 1592 .suspend = perf_ibs_suspend, 1593 }; 1594 1595 static void perf_ibs_pm_init(void) 1596 { 1597 register_syscore_ops(&perf_ibs_syscore_ops); 1598 } 1599 1600 #else 1601 1602 static inline void perf_ibs_pm_init(void) { } 1603 1604 #endif 1605 1606 static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu) 1607 { 1608 clear_APIC_ibs(); 1609 return 0; 1610 } 1611 1612 static __init int amd_ibs_init(void) 1613 { 1614 u32 caps; 1615 1616 caps = __get_ibs_caps(); 1617 if (!caps) 1618 return -ENODEV; /* ibs not supported by the cpu */ 1619 1620 ibs_eilvt_setup(); 1621 1622 if (!ibs_eilvt_valid()) 1623 return -EINVAL; 1624 1625 perf_ibs_pm_init(); 1626 1627 ibs_caps = caps; 1628 /* make ibs_caps visible to other cpus: */ 1629 smp_mb(); 1630 /* 1631 * x86_pmu_amd_ibs_starting_cpu will be called from core on 1632 * all online cpus. 1633 */ 1634 cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, 1635 "perf/x86/amd/ibs:starting", 1636 x86_pmu_amd_ibs_starting_cpu, 1637 x86_pmu_amd_ibs_dying_cpu); 1638 1639 return perf_event_ibs_init(); 1640 } 1641 1642 /* Since we need the pci subsystem to init ibs we can't do this earlier: */ 1643 device_initcall(amd_ibs_init); 1644