/*
 * Performance events - AMD IBS
 *
 * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
 *
 * For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <linux/sched/clock.h>

#include <asm/apic.h>
#include <asm/msr.h>

#include "../perf_event.h"

static u32 ibs_caps;

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)

#include <linux/kprobes.h>
#include <linux/hardirq.h>

#include <asm/nmi.h>
#include <asm/amd/ibs.h>

/* attr.config2 */
#define IBS_SW_FILTER_MASK	1

/*
 * IBS states:
 *
 * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
 * and any further add()s must fail.
 *
 * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
 * complicated by the fact that the IBS hardware can send late NMIs (ie. after
 * we've cleared the EN bit).
 *
 * In order to consume these late NMIs we have the STOPPED state, any NMI that
 * happens after we've cleared the EN state will clear this bit and report the
 * NMI handled (this is fundamentally racy in the face of multiple NMI sources,
 * someone else can consume our BIT and our NMI will go unhandled).
 *
 * And since we cannot set/clear this separate bit together with the EN bit,
 * there are races; if we cleared STARTED early, an NMI could land in
 * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
 * could happen if the period is small enough), and consume our STOPPED bit
 * and trigger streams of unhandled NMIs.
 *
 * If, however, we clear STARTED late, an NMI can hit between clearing the
 * EN bit and clearing STARTED, still see STARTED set and process the event.
 * If this event will have the VALID bit clear, we bail properly, but this
 * is not a given. With VALID set we can end up calling pmu::stop() again
 * (the throttle logic) and trigger the WARNs in there.
 *
 * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
 * nesting, and clear STARTED late, so that we have a well defined state over
 * the clearing of the EN bit.
 *
 * XXX: we could probably be using !atomic bitops for all this.
 */

enum ibs_states {
	IBS_ENABLED	= 0,
	IBS_STARTED	= 1,
	IBS_STOPPING	= 2,
	IBS_STOPPED	= 3,

	IBS_MAX_STATES,
};

struct cpu_perf_ibs {
	struct perf_event	*event;
	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
};

struct perf_ibs {
	struct pmu			pmu;
	unsigned int			msr;
	u64				config_mask;
	u64				cnt_mask;
	u64				enable_mask;
	u64				valid_mask;
	u16				min_period;
	u64				max_period;
	unsigned long			offset_mask[1];
	int				offset_max;
	unsigned int			fetch_count_reset_broken : 1;
	unsigned int			fetch_ignore_if_zero_rip : 1;
	struct cpu_perf_ibs __percpu	*pcpu;

	u64				(*get_count)(u64 config);
};

static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int overflow = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		overflow = 1;
	}

	if (unlikely(left < (s64)min)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		overflow = 1;
	}

	/*
	 * If the hw period that triggers the sw overflow is too short
	 * we might hit the irq handler. This biases the results.
	 * Thus we shorten the next-to-last period and set the last
	 * period to the max period.
	 */
	if (left > max) {
		left -= max;
		if (left > max)
			left = max;
		else if (left < min)
			left = min;
	}

	*hw_period = (u64)left;

	return overflow;
}

static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - width;
	u64 prev_raw_count;
	u64 delta;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
	prev_raw_count = local64_read(&hwc->prev_count);
	if (!local64_try_cmpxchg(&hwc->prev_count,
				 &prev_raw_count, new_raw_count))
		return 0;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return 1;
}

static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;

static struct perf_ibs *get_ibs_pmu(int type)
{
	if (perf_ibs_fetch.pmu.type == type)
		return &perf_ibs_fetch;
	if (perf_ibs_op.pmu.type == type)
		return &perf_ibs_op;
	return NULL;
}

/*
 * core pmu config -> IBS config
 *
 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
 *
 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
 * MSRC001_1033) is used to select either cycle or micro-ops counting
 * mode.
 */
static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
{
	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		switch (event->attr.config) {
		case PERF_COUNT_HW_CPU_CYCLES:
			*config = 0;
			return 0;
		}
		break;
	case PERF_TYPE_RAW:
		switch (event->attr.config) {
		case 0x0076:
			*config = 0;
			return 0;
		case 0x00C1:
			*config = IBS_OP_CNT_CTL;
			return 0;
		}
		break;
	default:
		return -ENOENT;
	}

	return -EOPNOTSUPP;
}

/*
 * The rip of IBS samples has skid 0. Thus, IBS supports precise
 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
 * rip is invalid when IBS was not able to record the rip correctly.
 * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
 */
int forward_event_to_ibs(struct perf_event *event)
{
	u64 config = 0;

	if (!event->attr.precise_ip || event->attr.precise_ip > 2)
		return -EOPNOTSUPP;

	if (!core_pmu_ibs_config(event, &config)) {
		event->attr.type = perf_ibs_op.pmu.type;
		event->attr.config = config;
	}
	return -ENOENT;
}

/*
 * Grouping of IBS events is not possible since IBS can have only
 * one event active at any point in time.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *sibling;

	if (event->group_leader == event)
		return 0;

	if (event->group_leader->pmu == event->pmu)
		return -EINVAL;

	for_each_sibling_event(sibling, event->group_leader) {
		if (sibling->pmu == event->pmu)
			return -EINVAL;
	}
	return 0;
}

static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs,
				 struct perf_event *event)
{
	return perf_ibs == &perf_ibs_op &&
	       (ibs_caps & IBS_CAPS_OPLDLAT) &&
	       (event->attr.config1 & 0xFFF);
}

static int perf_ibs_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs;
	u64 config;
	int ret;

	perf_ibs = get_ibs_pmu(event->attr.type);
	if (!perf_ibs)
		return -ENOENT;

	config = event->attr.config;

	if (event->pmu != &perf_ibs->pmu)
		return -ENOENT;

	if (config & ~perf_ibs->config_mask)
		return -EINVAL;

	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* handle exclude_{user,kernel} in the IRQ handler */
	if (event->attr.exclude_host || event->attr.exclude_guest ||
	    event->attr.exclude_idle)
		return -EINVAL;

	if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
	    (event->attr.exclude_kernel || event->attr.exclude_user ||
	     event->attr.exclude_hv))
		return -EINVAL;

	ret = validate_group(event);
	if (ret)
		return ret;

	if (hwc->sample_period) {
		if (config & perf_ibs->cnt_mask)
			/* raw max_cnt may not be set */
			return -EINVAL;

		if (event->attr.freq) {
			hwc->sample_period = perf_ibs->min_period;
		} else {
			/* Silently mask off lower nibble. IBS hw mandates it. */
			hwc->sample_period &= ~0x0FULL;
			if (hwc->sample_period < perf_ibs->min_period)
				return -EINVAL;
		}
	} else {
		u64 period = 0;

		if (event->attr.freq)
			return -EINVAL;

		if (perf_ibs == &perf_ibs_op) {
			period = (config & IBS_OP_MAX_CNT) << 4;
			if (ibs_caps & IBS_CAPS_OPCNTEXT)
				period |= config & IBS_OP_MAX_CNT_EXT_MASK;
		} else {
			period = (config & IBS_FETCH_MAX_CNT) << 4;
		}

		config &= ~perf_ibs->cnt_mask;
		event->attr.sample_period = period;
		hwc->sample_period = period;

		if (hwc->sample_period < perf_ibs->min_period)
			return -EINVAL;
	}

	if (perf_ibs_ldlat_event(perf_ibs, event)) {
		u64 ldlat = event->attr.config1 & 0xFFF;

		if (ldlat < 128 || ldlat > 2048)
			return -EINVAL;
		ldlat >>= 7;

		config |= (ldlat - 1) << 59;
		config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN;
	}

	/*
	 * If we modify hwc->sample_period, we also need to update
	 * hwc->last_period and hwc->period_left.
	 */
	hwc->last_period = hwc->sample_period;
	local64_set(&hwc->period_left, hwc->sample_period);

	hwc->config_base = perf_ibs->msr;
	hwc->config = config;

	return 0;
}
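
/*
 * Example of the load latency threshold encoding above (numbers are for
 * illustration only): with attr.config1 = 256 the threshold is in range
 * (128 <= 256 <= 2048), 256 >> 7 = 2, and (2 - 1) << 59 lands in the
 * IbsOpCtl threshold field at bits [62:59]. A perf invocation along the
 * lines of
 *
 *  perf record -a -e ibs_op/ldlat=256,swfilt=1/ -d ...
 *
 * would request such a threshold, assuming the ldlat and swfilt format
 * attributes exposed further down in this file.
 */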

static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
			       struct hw_perf_event *hwc, u64 *period)
{
	int overflow;

	/* ignore lower 4 bits in min count: */
	overflow = perf_event_set_period(hwc, perf_ibs->min_period,
					 perf_ibs->max_period, period);
	local64_set(&hwc->prev_count, 0);

	return overflow;
}

static u64 get_ibs_fetch_count(u64 config)
{
	union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;

	return fetch_ctl.fetch_cnt << 4;
}

static u64 get_ibs_op_count(u64 config)
{
	union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
	u64 count = 0;

	/*
	 * If the internal 27-bit counter rolled over, the count is MaxCnt
	 * and the lower 7 bits of CurCnt are randomized.
	 * Otherwise CurCnt has the full 27-bit current counter value.
	 */
	if (op_ctl.op_val) {
		count = op_ctl.opmaxcnt << 4;
		if (ibs_caps & IBS_CAPS_OPCNTEXT)
			count += op_ctl.opmaxcnt_ext << 20;
	} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
		count = op_ctl.opcurcnt;
	}

	return count;
}

static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
		      u64 *config)
{
	u64 count = perf_ibs->get_count(*config);

	/*
	 * Set width to 64 since we do not overflow on max width but
	 * instead on max count. In perf_ibs_set_period() we clear
	 * prev count manually on overflow.
	 */
	while (!perf_event_try_update(event, count, 64)) {
		rdmsrq(event->hw.config_base, *config);
		count = perf_ibs->get_count(*config);
	}
}

static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
					 struct hw_perf_event *hwc, u64 config)
{
	u64 tmp = hwc->config | config;

	if (perf_ibs->fetch_count_reset_broken)
		wrmsrq(hwc->config_base, tmp & ~perf_ibs->enable_mask);

	wrmsrq(hwc->config_base, tmp | perf_ibs->enable_mask);
}

/*
 * Erratum #420 Instruction-Based Sampling Engine May Generate
 * Interrupt that Cannot Be Cleared:
 *
 * Must clear counter mask first, then clear the enable bit. See
 * Revision Guide for AMD Family 10h Processors, Publication #41322.
 */
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
					  struct hw_perf_event *hwc, u64 config)
{
	config &= ~perf_ibs->cnt_mask;
	if (boot_cpu_data.x86 == 0x10)
		wrmsrq(hwc->config_base, config);
	config &= ~perf_ibs->enable_mask;
	wrmsrq(hwc->config_base, config);
}

/*
 * We cannot restore the ibs pmu state, so we always need to update
 * the event while stopping it and then reset the state when starting
 * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
 * perf_ibs_start()/perf_ibs_stop() and instead always do it.
 */
static void perf_ibs_start(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	u64 period, config = 0;

	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
		return;

	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
	hwc->state = 0;

	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
		hwc->sample_period = perf_ibs->min_period;

	perf_ibs_set_period(perf_ibs, hwc, &period);
	if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
		config |= period & IBS_OP_MAX_CNT_EXT_MASK;
		period &= ~IBS_OP_MAX_CNT_EXT_MASK;
	}
	config |= period >> 4;

	/*
	 * Set STARTED before enabling the hardware, such that a subsequent NMI
	 * must observe it.
	 */
	set_bit(IBS_STARTED, pcpu->state);
	clear_bit(IBS_STOPPING, pcpu->state);
	perf_ibs_enable_event(perf_ibs, hwc, config);

	perf_event_update_userpage(event);
}

static void perf_ibs_stop(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	u64 config;
	int stopping;

	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
		return;

	stopping = test_bit(IBS_STARTED, pcpu->state);

	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
		return;

	rdmsrq(hwc->config_base, config);

	if (stopping) {
		/*
		 * Set STOPPED before disabling the hardware, such that it
		 * must be visible to NMIs the moment we clear the EN bit,
		 * at which point we can generate an !VALID sample which
		 * we need to consume.
		 */
		set_bit(IBS_STOPPED, pcpu->state);
		perf_ibs_disable_event(perf_ibs, hwc, config);
		/*
		 * Clear STARTED after disabling the hardware; if it were
		 * cleared before, an NMI hitting after the clear but before
		 * clearing the EN bit might think it a spurious NMI and not
		 * handle it.
		 *
		 * Clearing it after, however, creates the problem of the NMI
		 * handler seeing STARTED but not having a valid sample.
		 */
		clear_bit(IBS_STARTED, pcpu->state);
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if (hwc->state & PERF_HES_UPTODATE)
		return;

	/*
	 * Clear valid bit to not count rollovers on update, rollovers
	 * are only updated in the irq handler.
	 */
	config &= ~perf_ibs->valid_mask;

	perf_ibs_event_update(perf_ibs, event, &config);
	hwc->state |= PERF_HES_UPTODATE;
}
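
/*
 * Worked example of the period handling in perf_ibs_start() above
 * (illustrative numbers): an ibs_op sample_period of 0x10000 has the low
 * nibble clear, so the value programmed into the MaxCnt field is
 * 0x10000 >> 4 = 0x1000. With IBS_CAPS_OPCNTEXT, the bits covered by
 * IBS_OP_MAX_CNT_EXT_MASK stay in place and only the remaining low bits
 * are shifted, which is why the period is split before building the
 * config value.
 */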

static int perf_ibs_add(struct perf_event *event, int flags)
{
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
		return -ENOSPC;

	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	pcpu->event = event;

	if (flags & PERF_EF_START)
		perf_ibs_start(event, PERF_EF_RELOAD);

	return 0;
}

static void perf_ibs_del(struct perf_event *event, int flags)
{
	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);

	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
		return;

	perf_ibs_stop(event, PERF_EF_UPDATE);

	pcpu->event = NULL;

	perf_event_update_userpage(event);
}

static void perf_ibs_read(struct perf_event *event) { }

static int perf_ibs_check_period(struct perf_event *event, u64 value)
{
	struct perf_ibs *perf_ibs;
	u64 low_nibble;

	if (event->attr.freq)
		return 0;

	perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
	low_nibble = value & 0xFULL;

	/*
	 * This contradicts perf_ibs_init(), which allows a sample period
	 * with lower nibble bits set but silently masks them off, whereas
	 * this returns an error.
	 */
	if (low_nibble || value < perf_ibs->min_period)
		return -EINVAL;

	return 0;
}

/*
 * We need to initialize with empty group if all attributes in the
 * group are dynamic.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group empty_caps_group = {
	.name = "caps",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(rand_en,	"config:57");
PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
PMU_FORMAT_ATTR(swfilt,		"config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1");
PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1");

static umode_t
zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}

static umode_t
ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0;
}

static umode_t
ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0;
}
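
/*
 * The format and caps attributes below are what the perf tool parses from
 * sysfs, e.g. (assuming the usual event_source layout):
 *
 *  /sys/bus/event_source/devices/ibs_fetch/format/rand_en -> "config:57"
 *  /sys/bus/event_source/devices/ibs_op/format/swfilt     -> "config2:0"
 *  /sys/bus/event_source/devices/ibs_op/caps/zen4_ibs_extensions
 *
 * so something like "perf record -e ibs_op/swfilt=1/u" requests the
 * software exclude_{user,kernel} filtering handled in the NMI path.
 */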

static struct attribute *fetch_attrs[] = {
	&format_attr_rand_en.attr,
	&format_attr_swfilt.attr,
	NULL,
};

static struct attribute *fetch_l3missonly_attrs[] = {
	&fetch_l3missonly.attr.attr,
	NULL,
};

static struct attribute *zen4_ibs_extensions_attrs[] = {
	&zen4_ibs_extensions.attr.attr,
	NULL,
};

static struct attribute *ibs_op_ldlat_cap_attrs[] = {
	&ibs_op_ldlat_cap.attr.attr,
	NULL,
};

static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = {
	&ibs_op_dtlb_pgsize_cap.attr.attr,
	NULL,
};

static struct attribute_group group_fetch_formats = {
	.name = "format",
	.attrs = fetch_attrs,
};

static struct attribute_group group_fetch_l3missonly = {
	.name = "format",
	.attrs = fetch_l3missonly_attrs,
	.is_visible = zen4_ibs_extensions_is_visible,
};

static struct attribute_group group_zen4_ibs_extensions = {
	.name = "caps",
	.attrs = zen4_ibs_extensions_attrs,
	.is_visible = zen4_ibs_extensions_is_visible,
};

static struct attribute_group group_ibs_op_ldlat_cap = {
	.name = "caps",
	.attrs = ibs_op_ldlat_cap_attrs,
	.is_visible = ibs_op_ldlat_is_visible,
};

static struct attribute_group group_ibs_op_dtlb_pgsize_cap = {
	.name = "caps",
	.attrs = ibs_op_dtlb_pgsize_cap_attrs,
	.is_visible = ibs_op_dtlb_pgsize_is_visible,
};

static const struct attribute_group *fetch_attr_groups[] = {
	&group_fetch_formats,
	&empty_caps_group,
	NULL,
};

static const struct attribute_group *fetch_attr_update[] = {
	&group_fetch_l3missonly,
	&group_zen4_ibs_extensions,
	NULL,
};

static umode_t
cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
	return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
}

static struct attribute *op_attrs[] = {
	&format_attr_swfilt.attr,
	NULL,
};

static struct attribute *cnt_ctl_attrs[] = {
	&format_attr_cnt_ctl.attr,
	NULL,
};

static struct attribute *op_l3missonly_attrs[] = {
	&op_l3missonly.attr.attr,
	NULL,
};

static struct attribute_group group_op_formats = {
	.name = "format",
	.attrs = op_attrs,
};

static struct attribute *ibs_op_ldlat_format_attrs[] = {
	&ibs_op_ldlat_format.attr.attr,
	NULL,
};

static struct attribute_group group_cnt_ctl = {
	.name = "format",
	.attrs = cnt_ctl_attrs,
	.is_visible = cnt_ctl_is_visible,
};

static struct attribute_group group_op_l3missonly = {
	.name = "format",
	.attrs = op_l3missonly_attrs,
	.is_visible = zen4_ibs_extensions_is_visible,
};

static const struct attribute_group *op_attr_groups[] = {
	&group_op_formats,
	&empty_caps_group,
	NULL,
};

static struct attribute_group group_ibs_op_ldlat_format = {
	.name = "format",
	.attrs = ibs_op_ldlat_format_attrs,
	.is_visible = ibs_op_ldlat_is_visible,
};

static const struct attribute_group *op_attr_update[] = {
	&group_cnt_ctl,
	&group_op_l3missonly,
	&group_zen4_ibs_extensions,
	&group_ibs_op_ldlat_cap,
	&group_ibs_op_ldlat_format,
	&group_ibs_op_dtlb_pgsize_cap,
	NULL,
};

static struct perf_ibs perf_ibs_fetch = {
	.pmu = {
		.task_ctx_nr	= perf_hw_context,

		.event_init	= perf_ibs_init,
		.add		= perf_ibs_add,
		.del		= perf_ibs_del,
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.check_period	= perf_ibs_check_period,
	},
	.msr			= MSR_AMD64_IBSFETCHCTL,
	.config_mask		= IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN,
	.cnt_mask		= IBS_FETCH_MAX_CNT,
	.enable_mask		= IBS_FETCH_ENABLE,
	.valid_mask		= IBS_FETCH_VAL,
	.min_period		= 0x10,
	.max_period		= IBS_FETCH_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,

	.get_count		= get_ibs_fetch_count,
};

static struct perf_ibs perf_ibs_op = {
	.pmu = {
		.task_ctx_nr	= perf_hw_context,

		.event_init	= perf_ibs_init,
		.add		= perf_ibs_add,
		.del		= perf_ibs_del,
		.start		= perf_ibs_start,
		.stop		= perf_ibs_stop,
		.read		= perf_ibs_read,
		.check_period	= perf_ibs_check_period,
	},
	.msr			= MSR_AMD64_IBSOPCTL,
	.config_mask		= IBS_OP_MAX_CNT,
	.cnt_mask		= IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
				  IBS_OP_CUR_CNT_RAND,
	.enable_mask		= IBS_OP_ENABLE,
	.valid_mask		= IBS_OP_VAL,
	.min_period		= 0x90,
	.max_period		= IBS_OP_MAX_CNT << 4,
	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,

	.get_count		= get_ibs_op_count,
};

static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3,
				struct perf_sample_data *data)
{
	union perf_mem_data_src *data_src = &data->data_src;

	data_src->mem_op = PERF_MEM_OP_NA;

	if (op_data3->ld_op)
		data_src->mem_op = PERF_MEM_OP_LOAD;
	else if (op_data3->st_op)
		data_src->mem_op = PERF_MEM_OP_STORE;
}

/*
 * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 have
 * more fine-grained DataSrc encodings. Others have coarse ones.
 */
static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
{
	if (ibs_caps & IBS_CAPS_ZEN4)
		return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;

	return op_data2->data_src_lo;
}

#define	L(x)		(PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
#define	LN(x)		PERF_MEM_S(LVLNUM, x)
#define	REM		PERF_MEM_S(REMOTE, REMOTE)
#define	HOPS(x)		PERF_MEM_S(HOPS, x)

static u64 g_data_src[8] = {
	[IBS_DATA_SRC_LOC_CACHE]	= L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
	[IBS_DATA_SRC_DRAM]		= L(LOC_RAM) | LN(RAM),
	[IBS_DATA_SRC_REM_CACHE]	= L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
	[IBS_DATA_SRC_IO]		= L(IO) | LN(IO),
};

#define RMT_NODE_BITS			(1 << IBS_DATA_SRC_DRAM)
#define RMT_NODE_APPLICABLE(x)		(RMT_NODE_BITS & (1 << x))

static u64 g_zen4_data_src[32] = {
	[IBS_DATA_SRC_EXT_LOC_CACHE]		= L(L3) | LN(L3),
	[IBS_DATA_SRC_EXT_NEAR_CCX_CACHE]	= L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
	[IBS_DATA_SRC_EXT_DRAM]			= L(LOC_RAM) | LN(RAM),
	[IBS_DATA_SRC_EXT_FAR_CCX_CACHE]	= L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
	[IBS_DATA_SRC_EXT_PMEM]			= LN(PMEM),
	[IBS_DATA_SRC_EXT_IO]			= L(IO) | LN(IO),
	[IBS_DATA_SRC_EXT_EXT_MEM]		= LN(CXL),
};

#define ZEN4_RMT_NODE_BITS		((1 << IBS_DATA_SRC_EXT_DRAM) | \
					 (1 << IBS_DATA_SRC_EXT_PMEM) | \
					 (1 << IBS_DATA_SRC_EXT_EXT_MEM))
#define ZEN4_RMT_NODE_APPLICABLE(x)	(ZEN4_RMT_NODE_BITS & (1 << x))

static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
				  union ibs_op_data3 *op_data3,
				  struct perf_sample_data *data)
{
	union perf_mem_data_src *data_src = &data->data_src;
	u8 ibs_data_src = perf_ibs_data_src(op_data2);

	data_src->mem_lvl = 0;
	data_src->mem_lvl_num = 0;

	/*
	 * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
	 * memory accesses. So, check DcUcMemAcc bit early.
	 */
	if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
		return L(UNC) | LN(UNC);

	/* L1 Hit */
	if (op_data3->dc_miss == 0)
		return L(L1) | LN(L1);

	/* L2 Hit */
	if (op_data3->l2_miss == 0) {
		/* Erratum #1293 */
		if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
		    !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
			return L(L2) | LN(L2);
	}

	/*
	 * OP_DATA2 is valid only for load ops. Skip all checks which
	 * use OP_DATA2[DataSrc].
	 */
	if (data_src->mem_op != PERF_MEM_OP_LOAD)
		goto check_mab;

	if (ibs_caps & IBS_CAPS_ZEN4) {
		u64 val = g_zen4_data_src[ibs_data_src];

		if (!val)
			goto check_mab;

		/* HOPS_1 because IBS doesn't provide remote socket detail */
		if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
			if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
			else
				val |= REM | HOPS(1);
		}

		return val;
	} else {
		u64 val = g_data_src[ibs_data_src];

		if (!val)
			goto check_mab;

		/* HOPS_1 because IBS doesn't provide remote socket detail */
		if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
			if (ibs_data_src == IBS_DATA_SRC_DRAM)
				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
			else
				val |= REM | HOPS(1);
		}

		return val;
	}

check_mab:
	/*
	 * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding
	 * DC misses. However, such data may come from any level in mem
	 * hierarchy. IBS provides detail about both MAB as well as actual
	 * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
	 * MAB only when IBS fails to provide DataSrc.
	 */
	if (op_data3->dc_miss_no_mab_alloc)
		return L(LFB) | LN(LFB);

	/* Don't set HIT with NA */
	return PERF_MEM_S(LVL, NA) | LN(NA);
}
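
/*
 * Example of the mapping above (illustrative): a Zen4 load that was
 * satisfied from DRAM on a remote node reports IBS_DATA_SRC_EXT_DRAM
 * with op_data2->rmt_node set, which the code above turns into
 * L(REM_RAM1) | LN(RAM) | REM | HOPS(1), i.e. perf reports remote RAM
 * one hop away rather than local RAM.
 */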

static bool perf_ibs_cache_hit_st_valid(void)
{
	/* 0: Uninitialized, 1: Valid, -1: Invalid */
	static int cache_hit_st_valid;

	if (unlikely(!cache_hit_st_valid)) {
		if (boot_cpu_data.x86 == 0x19 &&
		    (boot_cpu_data.x86_model <= 0xF ||
		     (boot_cpu_data.x86_model >= 0x20 &&
		      boot_cpu_data.x86_model <= 0x5F))) {
			cache_hit_st_valid = -1;
		} else {
			cache_hit_st_valid = 1;
		}
	}

	return cache_hit_st_valid == 1;
}

static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2,
				   struct perf_sample_data *data)
{
	union perf_mem_data_src *data_src = &data->data_src;
	u8 ibs_data_src;

	data_src->mem_snoop = PERF_MEM_SNOOP_NA;

	if (!perf_ibs_cache_hit_st_valid() ||
	    data_src->mem_op != PERF_MEM_OP_LOAD ||
	    data_src->mem_lvl & PERF_MEM_LVL_L1 ||
	    data_src->mem_lvl & PERF_MEM_LVL_L2 ||
	    op_data2->cache_hit_st)
		return;

	ibs_data_src = perf_ibs_data_src(op_data2);

	if (ibs_caps & IBS_CAPS_ZEN4) {
		if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE ||
		    ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE ||
		    ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE)
			data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
	} else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
	}
}

static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
				 struct perf_sample_data *data)
{
	union perf_mem_data_src *data_src = &data->data_src;

	data_src->mem_dtlb = PERF_MEM_TLB_NA;

	if (!op_data3->dc_lin_addr_valid)
		return;

	if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) &&
	    !op_data3->dc_phy_addr_valid)
		return;

	if (!op_data3->dc_l1tlb_miss) {
		data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
		return;
	}

	if (!op_data3->dc_l2tlb_miss) {
		data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT;
		return;
	}

	data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS;
}

static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
				  struct perf_sample_data *data)
{
	union perf_mem_data_src *data_src = &data->data_src;

	data_src->mem_lock = PERF_MEM_LOCK_NA;

	if (op_data3->dc_locked_op)
		data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
}

/* Be careful. Works only for contiguous MSRs. */
#define ibs_fetch_msr_idx(msr)	(msr - MSR_AMD64_IBSFETCHCTL)
#define ibs_op_msr_idx(msr)	(msr - MSR_AMD64_IBSOPCTL)
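
/*
 * Illustration of the index math (the exact values depend on the MSR
 * layout in msr-index.h): the op MSRs are laid out back to back starting
 * at MSR_AMD64_IBSOPCTL, so ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3) indexes
 * the IbsOpData3 slot of ibs_data->regs[] once the read loop in
 * perf_ibs_handle_irq() has filled that far.
 */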

static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
				  struct perf_sample_data *data,
				  union ibs_op_data2 *op_data2,
				  union ibs_op_data3 *op_data3)
{
	union perf_mem_data_src *data_src = &data->data_src;

	data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
	perf_ibs_get_mem_snoop(op_data2, data);
	perf_ibs_get_tlb_lvl(op_data3, data);
	perf_ibs_get_mem_lock(op_data3, data);
}

static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data,
				   union ibs_op_data3 *op_data3)
{
	__u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];

	/* Erratum #1293 */
	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
	    (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
		/*
		 * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode.
		 * DataSrc=0 is 'No valid status' and RmtNode is invalid when
		 * DataSrc=0.
		 */
		val = 0;
	}
	return val;
}

static void perf_ibs_parse_ld_st_data(__u64 sample_type,
				      struct perf_ibs_data *ibs_data,
				      struct perf_sample_data *data)
{
	union ibs_op_data3 op_data3;
	union ibs_op_data2 op_data2;
	union ibs_op_data op_data;

	data->data_src.val = PERF_MEM_NA;
	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];

	perf_ibs_get_mem_op(&op_data3, data);
	if (data->data_src.mem_op != PERF_MEM_OP_LOAD &&
	    data->data_src.mem_op != PERF_MEM_OP_STORE)
		return;

	op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3);

	if (sample_type & PERF_SAMPLE_DATA_SRC) {
		perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3);
		data->sample_flags |= PERF_SAMPLE_DATA_SRC;
	}

	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
	    data->data_src.mem_op == PERF_MEM_OP_LOAD) {
		op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];

		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
			data->weight.var1_dw = op_data3.dc_miss_lat;
			data->weight.var2_w = op_data.tag_to_ret_ctr;
		} else if (sample_type & PERF_SAMPLE_WEIGHT) {
			data->weight.full = op_data3.dc_miss_lat;
		}
		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
	}

	if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
		data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
		data->sample_flags |= PERF_SAMPLE_ADDR;
	}

	if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) {
		data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)];
		data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
	}
}

static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
					struct perf_event *event)
{
	u64 sample_type = event->attr.sample_type;

	return perf_ibs == &perf_ibs_op &&
	       sample_type & (PERF_SAMPLE_DATA_SRC |
			      PERF_SAMPLE_WEIGHT_TYPE |
			      PERF_SAMPLE_ADDR |
			      PERF_SAMPLE_PHYS_ADDR);
}

static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
				   struct perf_event *event,
				   int check_rip)
{
	if (event->attr.sample_type & PERF_SAMPLE_RAW ||
	    perf_ibs_is_mem_sample_type(perf_ibs, event) ||
	    perf_ibs_ldlat_event(perf_ibs, event))
		return perf_ibs->offset_max;
	else if (check_rip)
		return 3;
	return 1;
}
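
/*
 * The value returned here bounds the do/while MSR read loop in
 * perf_ibs_handle_irq(): the control register and the first staggered
 * register are always read, a return of 3 additionally pulls in IbsOpData
 * so the RIP-invalid check can run, and perf_ibs->offset_max captures the
 * whole register window when raw, memory or load-latency sample data is
 * requested.
 */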

static bool perf_ibs_is_kernel_data_addr(struct perf_event *event,
					 struct perf_ibs_data *ibs_data)
{
	u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW;
	union ibs_op_data3 op_data3;
	u64 dc_lin_addr;

	op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
	dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];

	return unlikely((event->attr.sample_type & sample_type_mask) &&
			op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr));
}

static bool perf_ibs_is_kernel_br_target(struct perf_event *event,
					 struct perf_ibs_data *ibs_data,
					 int br_target_idx)
{
	union ibs_op_data op_data;
	u64 br_target;

	op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
	br_target = ibs_data->regs[br_target_idx];

	return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) &&
			op_data.op_brn_ret && kernel_ip(br_target));
}

static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event,
				    struct pt_regs *regs, struct perf_ibs_data *ibs_data,
				    int br_target_idx)
{
	if (perf_exclude_event(event, regs))
		return true;

	if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel)
		return false;

	if (perf_ibs_is_kernel_data_addr(event, ibs_data))
		return true;

	if (br_target_idx != -1 &&
	    perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx))
		return true;

	return false;
}

static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs,
				   struct perf_ibs_data *ibs_data)
{
	if (perf_ibs == &perf_ibs_op) {
		ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18);
		ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0;
		return;
	}

	ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52);
	ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0;
}

static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
	struct perf_event *event = pcpu->event;
	struct hw_perf_event *hwc;
	struct perf_sample_data data;
	struct perf_raw_record raw;
	struct pt_regs regs;
	struct perf_ibs_data ibs_data;
	int offset, size, check_rip, offset_max, throttle = 0;
	unsigned int msr;
	u64 *buf, *config, period, new_config = 0;
	int br_target_idx = -1;

	if (!test_bit(IBS_STARTED, pcpu->state)) {
fail:
		/*
		 * Catch spurious interrupts after stopping IBS: After
		 * disabling IBS there could still be incoming NMIs
		 * with samples that even have the valid bit cleared.
		 * Mark all these NMIs as handled.
		 */
		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
			return 1;

		return 0;
	}

	if (WARN_ON_ONCE(!event))
		goto fail;

	hwc = &event->hw;
	msr = hwc->config_base;
	buf = ibs_data.regs;
	rdmsrq(msr, *buf);
	if (!(*buf++ & perf_ibs->valid_mask))
		goto fail;

	config = &ibs_data.regs[0];
	perf_ibs_event_update(perf_ibs, event, config);
	perf_sample_data_init(&data, 0, hwc->last_period);
	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
		goto out;	/* no sw counter overflow */

	ibs_data.caps = ibs_caps;
	size = 1;
	offset = 1;
	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));

	offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip);

	do {
		rdmsrq(msr + offset, *buf++);
		size++;
		offset = find_next_bit(perf_ibs->offset_mask,
				       perf_ibs->offset_max,
				       offset + 1);
	} while (offset < offset_max);

	if (perf_ibs_ldlat_event(perf_ibs, event)) {
		union ibs_op_data3 op_data3;

		op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
		/*
		 * Opening the event fails if the load latency threshold is
		 * outside the [128, 2048] range. Since the event has reached
		 * the interrupt handler, we can safely assume the threshold
		 * is within the [128, 2048] range.
		 */
		if (!op_data3.ld_op || !op_data3.dc_miss ||
		    op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF))
			goto out;
	}

	/*
	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
	 * depending on their availability.
	 * Can't add to offset_max as they are staggered.
	 */
	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_BRNTRGT) {
				rdmsrq(MSR_AMD64_IBSBRTARGET, *buf++);
				br_target_idx = size;
				size++;
			}
			if (ibs_caps & IBS_CAPS_OPDATA4) {
				rdmsrq(MSR_AMD64_IBSOPDATA4, *buf++);
				size++;
			}
		}
		if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
			rdmsrq(MSR_AMD64_ICIBSEXTDCTL, *buf++);
			size++;
		}
	}
	ibs_data.size = sizeof(u64) * size;

	regs = *iregs;
	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
		regs.flags &= ~PERF_EFLAGS_EXACT;
	} else {
		/* Workaround for erratum #1197 */
		if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
			goto out;

		set_linear_ip(&regs, ibs_data.regs[1]);
		regs.flags |= PERF_EFLAGS_EXACT;
	}

	if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
	    perf_ibs_swfilt_discard(perf_ibs, event, &regs, &ibs_data, br_target_idx)) {
		throttle = perf_event_account_interrupt(event);
		goto out;
	}
	/*
	 * Prevent leaking physical addresses to unprivileged users. Skip
	 * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for
	 * unprivileged users.
	 */
	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
	    perf_allow_kernel()) {
		perf_ibs_phyaddr_clear(perf_ibs, &ibs_data);
	}

	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
		raw = (struct perf_raw_record){
			.frag = {
				.size = sizeof(u32) + ibs_data.size,
				.data = ibs_data.data,
			},
		};
		perf_sample_save_raw_data(&data, event, &raw);
	}

	if (perf_ibs == &perf_ibs_op)
		perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);

	/*
	 * rip recorded by IbsOpRip will not be consistent with rsp and rbp
	 * recorded as part of interrupt regs. Thus we need to use rip from
	 * interrupt regs while unwinding call stack.
	 */
	perf_sample_save_callchain(&data, event, iregs);

	throttle = perf_event_overflow(event, &data, &regs);

	if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
		hwc->sample_period = perf_ibs->min_period;

out:
	if (!throttle) {
		if (perf_ibs == &perf_ibs_op) {
			if (ibs_caps & IBS_CAPS_OPCNTEXT) {
				new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
				period &= ~IBS_OP_MAX_CNT_EXT_MASK;
			}
			if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
				new_config |= *config & IBS_OP_CUR_CNT_RAND;
		}
		new_config |= period >> 4;

		perf_ibs_enable_event(perf_ibs, hwc, new_config);
	}

	perf_event_update_userpage(event);

	return 1;
}

static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 stamp = sched_clock();
	int handled = 0;

	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	perf_sample_event_took(sched_clock() - stamp);

	return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);

static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
	struct cpu_perf_ibs __percpu *pcpu;
	int ret;

	pcpu = alloc_percpu(struct cpu_perf_ibs);
	if (!pcpu)
		return -ENOMEM;

	perf_ibs->pcpu = pcpu;

	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
	if (ret) {
		perf_ibs->pcpu = NULL;
		free_percpu(pcpu);
	}

	return ret;
}
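
/*
 * Once registered, the two PMUs show up as "ibs_fetch" and "ibs_op" under
 * /sys/bus/event_source/devices/ (standard perf_pmu_register() behaviour),
 * so both can be sampled directly, for example:
 *
 *  perf record -a -e ibs_op// -e ibs_fetch// ...
 */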

static __init int perf_ibs_fetch_init(void)
{
	/*
	 * Some chips fail to reset the fetch count when it is written; instead
	 * they need a 0-1 transition of IbsFetchEn.
	 */
	if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
		perf_ibs_fetch.fetch_count_reset_broken = 1;

	if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
		perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;

	if (ibs_caps & IBS_CAPS_ZEN4)
		perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;

	perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
	perf_ibs_fetch.pmu.attr_update = fetch_attr_update;

	return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
}

static __init int perf_ibs_op_init(void)
{
	if (ibs_caps & IBS_CAPS_OPCNT)
		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;

	if (ibs_caps & IBS_CAPS_OPCNTEXT) {
		perf_ibs_op.max_period	|= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.config_mask	|= IBS_OP_MAX_CNT_EXT_MASK;
		perf_ibs_op.cnt_mask	|= (IBS_OP_MAX_CNT_EXT_MASK |
					    IBS_OP_CUR_CNT_EXT_MASK);
	}

	if (ibs_caps & IBS_CAPS_ZEN4)
		perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;

	perf_ibs_op.pmu.attr_groups = op_attr_groups;
	perf_ibs_op.pmu.attr_update = op_attr_update;

	return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
}

static __init int perf_event_ibs_init(void)
{
	int ret;

	ret = perf_ibs_fetch_init();
	if (ret)
		return ret;

	ret = perf_ibs_op_init();
	if (ret)
		goto err_op;

	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
	if (ret)
		goto err_nmi;

	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
	return 0;

err_nmi:
	perf_pmu_unregister(&perf_ibs_op.pmu);
	free_percpu(perf_ibs_op.pcpu);
	perf_ibs_op.pcpu = NULL;
err_op:
	perf_pmu_unregister(&perf_ibs_fetch.pmu);
	free_percpu(perf_ibs_fetch.pcpu);
	perf_ibs_fetch.pcpu = NULL;

	return ret;
}

#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */

static __init int perf_event_ibs_init(void)
{
	return 0;
}

#endif

/* IBS - apic initialization, for perf and oprofile */

static __init u32 __get_ibs_caps(void)
{
	u32 caps;
	unsigned int max_level;

	if (!boot_cpu_has(X86_FEATURE_IBS))
		return 0;

	/* check IBS cpuid feature flags */
	max_level = cpuid_eax(0x80000000);
	if (max_level < IBS_CPUID_FEATURES)
		return IBS_CAPS_DEFAULT;

	caps = cpuid_eax(IBS_CPUID_FEATURES);
	if (!(caps & IBS_CAPS_AVAIL))
		/* cpuid flags not valid */
		return IBS_CAPS_DEFAULT;

	return caps;
}

u32 get_ibs_caps(void)
{
	return ibs_caps;
}

EXPORT_SYMBOL(get_ibs_caps);

static inline int get_eilvt(int offset)
{
	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}

static inline int put_eilvt(int offset)
{
	return !setup_APIC_eilvt(offset, 0, 0, 1);
}

/*
 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
 */
static inline int ibs_eilvt_valid(void)
{
	int offset;
	u64 val;
	int valid = 0;

	preempt_disable();

	rdmsrq(MSR_AMD64_IBSCTL, val);
	offset = val & IBSCTL_LVT_OFFSET_MASK;

	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
		goto out;
	}

	if (!get_eilvt(offset)) {
		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
		goto out;
	}

	valid = 1;
out:
	preempt_enable();

	return valid;
}

static int setup_ibs_ctl(int ibs_eilvt_off)
{
	struct pci_dev *cpu_cfg;
	int nodes;
	u32 value = 0;

	nodes = 0;
	cpu_cfg = NULL;
	do {
		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
					 cpu_cfg);
		if (!cpu_cfg)
			break;
		++nodes;
		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
				       | IBSCTL_LVT_OFFSET_VALID);
		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
			pci_dev_put(cpu_cfg);
			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
				 value);
			return -EINVAL;
		}
	} while (1);

	if (!nodes) {
		pr_debug("No CPU node configured for IBS\n");
		return -ENODEV;
	}

	return 0;
}

/*
 * This runs only on the current cpu. We try to find an LVT offset and
 * setup the local APIC. For this we must disable preemption. On
 * success we initialize all nodes with this offset. This then updates
 * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
 * is using the new offset.
 */
static void force_ibs_eilvt_setup(void)
{
	int offset;
	int ret;

	preempt_disable();
	/* find the next free available EILVT entry, skip offset 0 */
	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
		if (get_eilvt(offset))
			break;
	}
	preempt_enable();

	if (offset == APIC_EILVT_NR_MAX) {
		pr_debug("No EILVT entry available\n");
		return;
	}

	ret = setup_ibs_ctl(offset);
	if (ret)
		goto out;

	if (!ibs_eilvt_valid())
		goto out;

	pr_info("LVT offset %d assigned\n", offset);

	return;
out:
	preempt_disable();
	put_eilvt(offset);
	preempt_enable();
	return;
}

static void ibs_eilvt_setup(void)
{
	/*
	 * Force LVT offset assignment for family 10h: The offsets are
	 * not assigned by the BIOS for this family, so the OS is
	 * responsible for doing it. If the OS assignment fails, fall
	 * back to BIOS settings and try to set this up.
	 */
	if (boot_cpu_data.x86 == 0x10)
		force_ibs_eilvt_setup();
}

static inline int get_ibs_lvt_offset(void)
{
	u64 val;

	rdmsrq(MSR_AMD64_IBSCTL, val);
	if (!(val & IBSCTL_LVT_OFFSET_VALID))
		return -EINVAL;

	return val & IBSCTL_LVT_OFFSET_MASK;
}

static void setup_APIC_ibs(void)
{
	int offset;

	offset = get_ibs_lvt_offset();
	if (offset < 0)
		goto failed;

	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
		return;
failed:
	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
		smp_processor_id());
}

static void clear_APIC_ibs(void)
{
	int offset;

	offset = get_ibs_lvt_offset();
	if (offset >= 0)
		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}

static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
{
	setup_APIC_ibs();
	return 0;
}

#ifdef CONFIG_PM

static int perf_ibs_suspend(void)
{
	clear_APIC_ibs();
	return 0;
}

static void perf_ibs_resume(void)
{
	ibs_eilvt_setup();
	setup_APIC_ibs();
}

static struct syscore_ops perf_ibs_syscore_ops = {
	.resume		= perf_ibs_resume,
	.suspend	= perf_ibs_suspend,
};

static void perf_ibs_pm_init(void)
{
	register_syscore_ops(&perf_ibs_syscore_ops);
}

#else

static inline void perf_ibs_pm_init(void) { }

#endif

static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
{
	clear_APIC_ibs();
	return 0;
}

static __init int amd_ibs_init(void)
{
	u32 caps;

	caps = __get_ibs_caps();
	if (!caps)
		return -ENODEV;	/* ibs not supported by the cpu */

	ibs_eilvt_setup();

	if (!ibs_eilvt_valid())
		return -EINVAL;

	perf_ibs_pm_init();

	ibs_caps = caps;
	/* make ibs_caps visible to other cpus: */
	smp_mb();
	/*
	 * x86_pmu_amd_ibs_starting_cpu will be called from core on
	 * all online cpus.
	 */
	cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
			  "perf/x86/amd/ibs:starting",
			  x86_pmu_amd_ibs_starting_cpu,
			  x86_pmu_amd_ibs_dying_cpu);

	return perf_event_ibs_init();
}

/* Since we need the pci subsystem to init ibs we can't do this earlier: */
device_initcall(amd_ibs_init);