1 #include <linux/perf_event.h> 2 #include <linux/types.h> 3 4 #include <asm/perf_event.h> 5 #include <asm/msr.h> 6 #include <asm/insn.h> 7 8 #include "../perf_event.h" 9 10 enum { 11 LBR_FORMAT_32 = 0x00, 12 LBR_FORMAT_LIP = 0x01, 13 LBR_FORMAT_EIP = 0x02, 14 LBR_FORMAT_EIP_FLAGS = 0x03, 15 LBR_FORMAT_EIP_FLAGS2 = 0x04, 16 LBR_FORMAT_INFO = 0x05, 17 LBR_FORMAT_TIME = 0x06, 18 LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, 19 }; 20 21 static enum { 22 LBR_EIP_FLAGS = 1, 23 LBR_TSX = 2, 24 } lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { 25 [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, 26 [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, 27 }; 28 29 /* 30 * Intel LBR_SELECT bits 31 * Intel Vol3a, April 2011, Section 16.7 Table 16-10 32 * 33 * Hardware branch filter (not available on all CPUs) 34 */ 35 #define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ 36 #define LBR_USER_BIT 1 /* do not capture at ring > 0 */ 37 #define LBR_JCC_BIT 2 /* do not capture conditional branches */ 38 #define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ 39 #define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ 40 #define LBR_RETURN_BIT 5 /* do not capture near returns */ 41 #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ 42 #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ 43 #define LBR_FAR_BIT 8 /* do not capture far branches */ 44 #define LBR_CALL_STACK_BIT 9 /* enable call stack */ 45 46 /* 47 * Following bit only exists in Linux; we mask it out before writing it to 48 * the actual MSR. But it helps the constraint perf code to understand 49 * that this is a separate configuration. 50 */ 51 #define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ 52 53 #define LBR_KERNEL (1 << LBR_KERNEL_BIT) 54 #define LBR_USER (1 << LBR_USER_BIT) 55 #define LBR_JCC (1 << LBR_JCC_BIT) 56 #define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) 57 #define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) 58 #define LBR_RETURN (1 << LBR_RETURN_BIT) 59 #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) 60 #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) 61 #define LBR_FAR (1 << LBR_FAR_BIT) 62 #define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) 63 #define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) 64 65 #define LBR_PLM (LBR_KERNEL | LBR_USER) 66 67 #define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */ 68 #define LBR_NOT_SUPP -1 /* LBR filter not supported */ 69 #define LBR_IGN 0 /* ignored */ 70 71 #define LBR_ANY \ 72 (LBR_JCC |\ 73 LBR_REL_CALL |\ 74 LBR_IND_CALL |\ 75 LBR_RETURN |\ 76 LBR_REL_JMP |\ 77 LBR_IND_JMP |\ 78 LBR_FAR) 79 80 #define LBR_FROM_FLAG_MISPRED (1ULL << 63) 81 #define LBR_FROM_FLAG_IN_TX (1ULL << 62) 82 #define LBR_FROM_FLAG_ABORT (1ULL << 61) 83 84 /* 85 * x86control flow change classification 86 * x86control flow changes include branches, interrupts, traps, faults 87 */ 88 enum { 89 X86_BR_NONE = 0, /* unknown */ 90 91 X86_BR_USER = 1 << 0, /* branch target is user */ 92 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ 93 94 X86_BR_CALL = 1 << 2, /* call */ 95 X86_BR_RET = 1 << 3, /* return */ 96 X86_BR_SYSCALL = 1 << 4, /* syscall */ 97 X86_BR_SYSRET = 1 << 5, /* syscall return */ 98 X86_BR_INT = 1 << 6, /* sw interrupt */ 99 X86_BR_IRET = 1 << 7, /* return from interrupt */ 100 X86_BR_JCC = 1 << 8, /* conditional */ 101 X86_BR_JMP = 1 << 9, /* jump */ 102 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ 103 X86_BR_IND_CALL = 1 << 11,/* indirect calls */ 104 X86_BR_ABORT = 1 << 12,/* transaction abort */ 105 X86_BR_IN_TX = 1 << 13,/* in transaction */ 106 X86_BR_NO_TX = 1 << 14,/* not in transaction */ 107 X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ 108 X86_BR_CALL_STACK = 1 << 16,/* call stack */ 109 X86_BR_IND_JMP = 1 << 17,/* indirect jump */ 110 }; 111 112 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) 113 #define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) 114 115 #define X86_BR_ANY \ 116 (X86_BR_CALL |\ 117 X86_BR_RET |\ 118 X86_BR_SYSCALL |\ 119 X86_BR_SYSRET |\ 120 X86_BR_INT |\ 121 X86_BR_IRET |\ 122 X86_BR_JCC |\ 123 X86_BR_JMP |\ 124 X86_BR_IRQ |\ 125 X86_BR_ABORT |\ 126 X86_BR_IND_CALL |\ 127 X86_BR_IND_JMP |\ 128 X86_BR_ZERO_CALL) 129 130 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) 131 132 #define X86_BR_ANY_CALL \ 133 (X86_BR_CALL |\ 134 X86_BR_IND_CALL |\ 135 X86_BR_ZERO_CALL |\ 136 X86_BR_SYSCALL |\ 137 X86_BR_IRQ |\ 138 X86_BR_INT) 139 140 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); 141 142 /* 143 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI 144 * otherwise it becomes near impossible to get a reliable stack. 145 */ 146 147 static void __intel_pmu_lbr_enable(bool pmi) 148 { 149 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 150 u64 debugctl, lbr_select = 0, orig_debugctl; 151 152 /* 153 * No need to unfreeze manually, as v4 can do that as part 154 * of the GLOBAL_STATUS ack. 155 */ 156 if (pmi && x86_pmu.version >= 4) 157 return; 158 159 /* 160 * No need to reprogram LBR_SELECT in a PMI, as it 161 * did not change. 162 */ 163 if (cpuc->lbr_sel) 164 lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; 165 if (!pmi && cpuc->lbr_sel) 166 wrmsrl(MSR_LBR_SELECT, lbr_select); 167 168 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 169 orig_debugctl = debugctl; 170 debugctl |= DEBUGCTLMSR_LBR; 171 /* 172 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. 173 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions 174 * may cause superfluous increase/decrease of LBR_TOS. 175 */ 176 if (!(lbr_select & LBR_CALL_STACK)) 177 debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 178 if (orig_debugctl != debugctl) 179 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 180 } 181 182 static void __intel_pmu_lbr_disable(void) 183 { 184 u64 debugctl; 185 186 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 187 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 188 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 189 } 190 191 static void intel_pmu_lbr_reset_32(void) 192 { 193 int i; 194 195 for (i = 0; i < x86_pmu.lbr_nr; i++) 196 wrmsrl(x86_pmu.lbr_from + i, 0); 197 } 198 199 static void intel_pmu_lbr_reset_64(void) 200 { 201 int i; 202 203 for (i = 0; i < x86_pmu.lbr_nr; i++) { 204 wrmsrl(x86_pmu.lbr_from + i, 0); 205 wrmsrl(x86_pmu.lbr_to + i, 0); 206 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 207 wrmsrl(MSR_LBR_INFO_0 + i, 0); 208 } 209 } 210 211 void intel_pmu_lbr_reset(void) 212 { 213 if (!x86_pmu.lbr_nr) 214 return; 215 216 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) 217 intel_pmu_lbr_reset_32(); 218 else 219 intel_pmu_lbr_reset_64(); 220 } 221 222 /* 223 * TOS = most recently recorded branch 224 */ 225 static inline u64 intel_pmu_lbr_tos(void) 226 { 227 u64 tos; 228 229 rdmsrl(x86_pmu.lbr_tos, tos); 230 return tos; 231 } 232 233 enum { 234 LBR_NONE, 235 LBR_VALID, 236 }; 237 238 static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) 239 { 240 int i; 241 unsigned lbr_idx, mask; 242 u64 tos; 243 244 if (task_ctx->lbr_callstack_users == 0 || 245 task_ctx->lbr_stack_state == LBR_NONE) { 246 intel_pmu_lbr_reset(); 247 return; 248 } 249 250 mask = x86_pmu.lbr_nr - 1; 251 tos = task_ctx->tos; 252 for (i = 0; i < tos; i++) { 253 lbr_idx = (tos - i) & mask; 254 wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); 255 wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); 256 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 257 wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 258 } 259 wrmsrl(x86_pmu.lbr_tos, tos); 260 task_ctx->lbr_stack_state = LBR_NONE; 261 } 262 263 static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) 264 { 265 int i; 266 unsigned lbr_idx, mask; 267 u64 tos; 268 269 if (task_ctx->lbr_callstack_users == 0) { 270 task_ctx->lbr_stack_state = LBR_NONE; 271 return; 272 } 273 274 mask = x86_pmu.lbr_nr - 1; 275 tos = intel_pmu_lbr_tos(); 276 for (i = 0; i < tos; i++) { 277 lbr_idx = (tos - i) & mask; 278 rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); 279 rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); 280 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 281 rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 282 } 283 task_ctx->tos = tos; 284 task_ctx->lbr_stack_state = LBR_VALID; 285 } 286 287 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) 288 { 289 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 290 struct x86_perf_task_context *task_ctx; 291 292 /* 293 * If LBR callstack feature is enabled and the stack was saved when 294 * the task was scheduled out, restore the stack. Otherwise flush 295 * the LBR stack. 296 */ 297 task_ctx = ctx ? ctx->task_ctx_data : NULL; 298 if (task_ctx) { 299 if (sched_in) { 300 __intel_pmu_lbr_restore(task_ctx); 301 cpuc->lbr_context = ctx; 302 } else { 303 __intel_pmu_lbr_save(task_ctx); 304 } 305 return; 306 } 307 308 /* 309 * When sampling the branck stack in system-wide, it may be 310 * necessary to flush the stack on context switch. This happens 311 * when the branch stack does not tag its entries with the pid 312 * of the current task. Otherwise it becomes impossible to 313 * associate a branch entry with a task. This ambiguity is more 314 * likely to appear when the branch stack supports priv level 315 * filtering and the user sets it to monitor only at the user 316 * level (which could be a useful measurement in system-wide 317 * mode). In that case, the risk is high of having a branch 318 * stack with branch from multiple tasks. 319 */ 320 if (sched_in) { 321 intel_pmu_lbr_reset(); 322 cpuc->lbr_context = ctx; 323 } 324 } 325 326 static inline bool branch_user_callstack(unsigned br_sel) 327 { 328 return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); 329 } 330 331 void intel_pmu_lbr_enable(struct perf_event *event) 332 { 333 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 334 struct x86_perf_task_context *task_ctx; 335 336 if (!x86_pmu.lbr_nr) 337 return; 338 339 /* 340 * Reset the LBR stack if we changed task context to 341 * avoid data leaks. 342 */ 343 if (event->ctx->task && cpuc->lbr_context != event->ctx) { 344 intel_pmu_lbr_reset(); 345 cpuc->lbr_context = event->ctx; 346 } 347 cpuc->br_sel = event->hw.branch_reg.reg; 348 349 if (branch_user_callstack(cpuc->br_sel) && event->ctx && 350 event->ctx->task_ctx_data) { 351 task_ctx = event->ctx->task_ctx_data; 352 task_ctx->lbr_callstack_users++; 353 } 354 355 cpuc->lbr_users++; 356 perf_sched_cb_inc(event->ctx->pmu); 357 } 358 359 void intel_pmu_lbr_disable(struct perf_event *event) 360 { 361 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 362 struct x86_perf_task_context *task_ctx; 363 364 if (!x86_pmu.lbr_nr) 365 return; 366 367 if (branch_user_callstack(cpuc->br_sel) && event->ctx && 368 event->ctx->task_ctx_data) { 369 task_ctx = event->ctx->task_ctx_data; 370 task_ctx->lbr_callstack_users--; 371 } 372 373 cpuc->lbr_users--; 374 WARN_ON_ONCE(cpuc->lbr_users < 0); 375 perf_sched_cb_dec(event->ctx->pmu); 376 377 if (cpuc->enabled && !cpuc->lbr_users) { 378 __intel_pmu_lbr_disable(); 379 /* avoid stale pointer */ 380 cpuc->lbr_context = NULL; 381 } 382 } 383 384 void intel_pmu_lbr_enable_all(bool pmi) 385 { 386 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 387 388 if (cpuc->lbr_users) 389 __intel_pmu_lbr_enable(pmi); 390 } 391 392 void intel_pmu_lbr_disable_all(void) 393 { 394 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 395 396 if (cpuc->lbr_users) 397 __intel_pmu_lbr_disable(); 398 } 399 400 static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) 401 { 402 unsigned long mask = x86_pmu.lbr_nr - 1; 403 u64 tos = intel_pmu_lbr_tos(); 404 int i; 405 406 for (i = 0; i < x86_pmu.lbr_nr; i++) { 407 unsigned long lbr_idx = (tos - i) & mask; 408 union { 409 struct { 410 u32 from; 411 u32 to; 412 }; 413 u64 lbr; 414 } msr_lastbranch; 415 416 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); 417 418 cpuc->lbr_entries[i].from = msr_lastbranch.from; 419 cpuc->lbr_entries[i].to = msr_lastbranch.to; 420 cpuc->lbr_entries[i].mispred = 0; 421 cpuc->lbr_entries[i].predicted = 0; 422 cpuc->lbr_entries[i].reserved = 0; 423 } 424 cpuc->lbr_stack.nr = i; 425 } 426 427 /* 428 * Due to lack of segmentation in Linux the effective address (offset) 429 * is the same as the linear address, allowing us to merge the LIP and EIP 430 * LBR formats. 431 */ 432 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) 433 { 434 bool need_info = false; 435 unsigned long mask = x86_pmu.lbr_nr - 1; 436 int lbr_format = x86_pmu.intel_cap.lbr_format; 437 u64 tos = intel_pmu_lbr_tos(); 438 int i; 439 int out = 0; 440 int num = x86_pmu.lbr_nr; 441 442 if (cpuc->lbr_sel) { 443 need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); 444 if (cpuc->lbr_sel->config & LBR_CALL_STACK) 445 num = tos; 446 } 447 448 for (i = 0; i < num; i++) { 449 unsigned long lbr_idx = (tos - i) & mask; 450 u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; 451 int skip = 0; 452 u16 cycles = 0; 453 int lbr_flags = lbr_desc[lbr_format]; 454 455 rdmsrl(x86_pmu.lbr_from + lbr_idx, from); 456 rdmsrl(x86_pmu.lbr_to + lbr_idx, to); 457 458 if (lbr_format == LBR_FORMAT_INFO && need_info) { 459 u64 info; 460 461 rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); 462 mis = !!(info & LBR_INFO_MISPRED); 463 pred = !mis; 464 in_tx = !!(info & LBR_INFO_IN_TX); 465 abort = !!(info & LBR_INFO_ABORT); 466 cycles = (info & LBR_INFO_CYCLES); 467 } 468 469 if (lbr_format == LBR_FORMAT_TIME) { 470 mis = !!(from & LBR_FROM_FLAG_MISPRED); 471 pred = !mis; 472 skip = 1; 473 cycles = ((to >> 48) & LBR_INFO_CYCLES); 474 475 to = (u64)((((s64)to) << 16) >> 16); 476 } 477 478 if (lbr_flags & LBR_EIP_FLAGS) { 479 mis = !!(from & LBR_FROM_FLAG_MISPRED); 480 pred = !mis; 481 skip = 1; 482 } 483 if (lbr_flags & LBR_TSX) { 484 in_tx = !!(from & LBR_FROM_FLAG_IN_TX); 485 abort = !!(from & LBR_FROM_FLAG_ABORT); 486 skip = 3; 487 } 488 from = (u64)((((s64)from) << skip) >> skip); 489 490 /* 491 * Some CPUs report duplicated abort records, 492 * with the second entry not having an abort bit set. 493 * Skip them here. This loop runs backwards, 494 * so we need to undo the previous record. 495 * If the abort just happened outside the window 496 * the extra entry cannot be removed. 497 */ 498 if (abort && x86_pmu.lbr_double_abort && out > 0) 499 out--; 500 501 cpuc->lbr_entries[out].from = from; 502 cpuc->lbr_entries[out].to = to; 503 cpuc->lbr_entries[out].mispred = mis; 504 cpuc->lbr_entries[out].predicted = pred; 505 cpuc->lbr_entries[out].in_tx = in_tx; 506 cpuc->lbr_entries[out].abort = abort; 507 cpuc->lbr_entries[out].cycles = cycles; 508 cpuc->lbr_entries[out].reserved = 0; 509 out++; 510 } 511 cpuc->lbr_stack.nr = out; 512 } 513 514 void intel_pmu_lbr_read(void) 515 { 516 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 517 518 if (!cpuc->lbr_users) 519 return; 520 521 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) 522 intel_pmu_lbr_read_32(cpuc); 523 else 524 intel_pmu_lbr_read_64(cpuc); 525 526 intel_pmu_lbr_filter(cpuc); 527 } 528 529 /* 530 * SW filter is used: 531 * - in case there is no HW filter 532 * - in case the HW filter has errata or limitations 533 */ 534 static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) 535 { 536 u64 br_type = event->attr.branch_sample_type; 537 int mask = 0; 538 539 if (br_type & PERF_SAMPLE_BRANCH_USER) 540 mask |= X86_BR_USER; 541 542 if (br_type & PERF_SAMPLE_BRANCH_KERNEL) 543 mask |= X86_BR_KERNEL; 544 545 /* we ignore BRANCH_HV here */ 546 547 if (br_type & PERF_SAMPLE_BRANCH_ANY) 548 mask |= X86_BR_ANY; 549 550 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) 551 mask |= X86_BR_ANY_CALL; 552 553 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) 554 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; 555 556 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) 557 mask |= X86_BR_IND_CALL; 558 559 if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) 560 mask |= X86_BR_ABORT; 561 562 if (br_type & PERF_SAMPLE_BRANCH_IN_TX) 563 mask |= X86_BR_IN_TX; 564 565 if (br_type & PERF_SAMPLE_BRANCH_NO_TX) 566 mask |= X86_BR_NO_TX; 567 568 if (br_type & PERF_SAMPLE_BRANCH_COND) 569 mask |= X86_BR_JCC; 570 571 if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { 572 if (!x86_pmu_has_lbr_callstack()) 573 return -EOPNOTSUPP; 574 if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) 575 return -EINVAL; 576 mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | 577 X86_BR_CALL_STACK; 578 } 579 580 if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) 581 mask |= X86_BR_IND_JMP; 582 583 if (br_type & PERF_SAMPLE_BRANCH_CALL) 584 mask |= X86_BR_CALL | X86_BR_ZERO_CALL; 585 /* 586 * stash actual user request into reg, it may 587 * be used by fixup code for some CPU 588 */ 589 event->hw.branch_reg.reg = mask; 590 return 0; 591 } 592 593 /* 594 * setup the HW LBR filter 595 * Used only when available, may not be enough to disambiguate 596 * all branches, may need the help of the SW filter 597 */ 598 static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) 599 { 600 struct hw_perf_event_extra *reg; 601 u64 br_type = event->attr.branch_sample_type; 602 u64 mask = 0, v; 603 int i; 604 605 for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { 606 if (!(br_type & (1ULL << i))) 607 continue; 608 609 v = x86_pmu.lbr_sel_map[i]; 610 if (v == LBR_NOT_SUPP) 611 return -EOPNOTSUPP; 612 613 if (v != LBR_IGN) 614 mask |= v; 615 } 616 617 reg = &event->hw.branch_reg; 618 reg->idx = EXTRA_REG_LBR; 619 620 /* 621 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate 622 * in suppress mode. So LBR_SELECT should be set to 623 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) 624 * But the 10th bit LBR_CALL_STACK does not operate 625 * in suppress mode. 626 */ 627 reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK); 628 629 if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && 630 (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && 631 (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) 632 reg->config |= LBR_NO_INFO; 633 634 return 0; 635 } 636 637 int intel_pmu_setup_lbr_filter(struct perf_event *event) 638 { 639 int ret = 0; 640 641 /* 642 * no LBR on this PMU 643 */ 644 if (!x86_pmu.lbr_nr) 645 return -EOPNOTSUPP; 646 647 /* 648 * setup SW LBR filter 649 */ 650 ret = intel_pmu_setup_sw_lbr_filter(event); 651 if (ret) 652 return ret; 653 654 /* 655 * setup HW LBR filter, if any 656 */ 657 if (x86_pmu.lbr_sel_map) 658 ret = intel_pmu_setup_hw_lbr_filter(event); 659 660 return ret; 661 } 662 663 /* 664 * return the type of control flow change at address "from" 665 * instruction is not necessarily a branch (in case of interrupt). 666 * 667 * The branch type returned also includes the priv level of the 668 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). 669 * 670 * If a branch type is unknown OR the instruction cannot be 671 * decoded (e.g., text page not present), then X86_BR_NONE is 672 * returned. 673 */ 674 static int branch_type(unsigned long from, unsigned long to, int abort) 675 { 676 struct insn insn; 677 void *addr; 678 int bytes_read, bytes_left; 679 int ret = X86_BR_NONE; 680 int ext, to_plm, from_plm; 681 u8 buf[MAX_INSN_SIZE]; 682 int is64 = 0; 683 684 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; 685 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; 686 687 /* 688 * maybe zero if lbr did not fill up after a reset by the time 689 * we get a PMU interrupt 690 */ 691 if (from == 0 || to == 0) 692 return X86_BR_NONE; 693 694 if (abort) 695 return X86_BR_ABORT | to_plm; 696 697 if (from_plm == X86_BR_USER) { 698 /* 699 * can happen if measuring at the user level only 700 * and we interrupt in a kernel thread, e.g., idle. 701 */ 702 if (!current->mm) 703 return X86_BR_NONE; 704 705 /* may fail if text not present */ 706 bytes_left = copy_from_user_nmi(buf, (void __user *)from, 707 MAX_INSN_SIZE); 708 bytes_read = MAX_INSN_SIZE - bytes_left; 709 if (!bytes_read) 710 return X86_BR_NONE; 711 712 addr = buf; 713 } else { 714 /* 715 * The LBR logs any address in the IP, even if the IP just 716 * faulted. This means userspace can control the from address. 717 * Ensure we don't blindy read any address by validating it is 718 * a known text address. 719 */ 720 if (kernel_text_address(from)) { 721 addr = (void *)from; 722 /* 723 * Assume we can get the maximum possible size 724 * when grabbing kernel data. This is not 725 * _strictly_ true since we could possibly be 726 * executing up next to a memory hole, but 727 * it is very unlikely to be a problem. 728 */ 729 bytes_read = MAX_INSN_SIZE; 730 } else { 731 return X86_BR_NONE; 732 } 733 } 734 735 /* 736 * decoder needs to know the ABI especially 737 * on 64-bit systems running 32-bit apps 738 */ 739 #ifdef CONFIG_X86_64 740 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); 741 #endif 742 insn_init(&insn, addr, bytes_read, is64); 743 insn_get_opcode(&insn); 744 if (!insn.opcode.got) 745 return X86_BR_ABORT; 746 747 switch (insn.opcode.bytes[0]) { 748 case 0xf: 749 switch (insn.opcode.bytes[1]) { 750 case 0x05: /* syscall */ 751 case 0x34: /* sysenter */ 752 ret = X86_BR_SYSCALL; 753 break; 754 case 0x07: /* sysret */ 755 case 0x35: /* sysexit */ 756 ret = X86_BR_SYSRET; 757 break; 758 case 0x80 ... 0x8f: /* conditional */ 759 ret = X86_BR_JCC; 760 break; 761 default: 762 ret = X86_BR_NONE; 763 } 764 break; 765 case 0x70 ... 0x7f: /* conditional */ 766 ret = X86_BR_JCC; 767 break; 768 case 0xc2: /* near ret */ 769 case 0xc3: /* near ret */ 770 case 0xca: /* far ret */ 771 case 0xcb: /* far ret */ 772 ret = X86_BR_RET; 773 break; 774 case 0xcf: /* iret */ 775 ret = X86_BR_IRET; 776 break; 777 case 0xcc ... 0xce: /* int */ 778 ret = X86_BR_INT; 779 break; 780 case 0xe8: /* call near rel */ 781 insn_get_immediate(&insn); 782 if (insn.immediate1.value == 0) { 783 /* zero length call */ 784 ret = X86_BR_ZERO_CALL; 785 break; 786 } 787 case 0x9a: /* call far absolute */ 788 ret = X86_BR_CALL; 789 break; 790 case 0xe0 ... 0xe3: /* loop jmp */ 791 ret = X86_BR_JCC; 792 break; 793 case 0xe9 ... 0xeb: /* jmp */ 794 ret = X86_BR_JMP; 795 break; 796 case 0xff: /* call near absolute, call far absolute ind */ 797 insn_get_modrm(&insn); 798 ext = (insn.modrm.bytes[0] >> 3) & 0x7; 799 switch (ext) { 800 case 2: /* near ind call */ 801 case 3: /* far ind call */ 802 ret = X86_BR_IND_CALL; 803 break; 804 case 4: 805 case 5: 806 ret = X86_BR_IND_JMP; 807 break; 808 } 809 break; 810 default: 811 ret = X86_BR_NONE; 812 } 813 /* 814 * interrupts, traps, faults (and thus ring transition) may 815 * occur on any instructions. Thus, to classify them correctly, 816 * we need to first look at the from and to priv levels. If they 817 * are different and to is in the kernel, then it indicates 818 * a ring transition. If the from instruction is not a ring 819 * transition instr (syscall, systenter, int), then it means 820 * it was a irq, trap or fault. 821 * 822 * we have no way of detecting kernel to kernel faults. 823 */ 824 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL 825 && ret != X86_BR_SYSCALL && ret != X86_BR_INT) 826 ret = X86_BR_IRQ; 827 828 /* 829 * branch priv level determined by target as 830 * is done by HW when LBR_SELECT is implemented 831 */ 832 if (ret != X86_BR_NONE) 833 ret |= to_plm; 834 835 return ret; 836 } 837 838 /* 839 * implement actual branch filter based on user demand. 840 * Hardware may not exactly satisfy that request, thus 841 * we need to inspect opcodes. Mismatched branches are 842 * discarded. Therefore, the number of branches returned 843 * in PERF_SAMPLE_BRANCH_STACK sample may vary. 844 */ 845 static void 846 intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) 847 { 848 u64 from, to; 849 int br_sel = cpuc->br_sel; 850 int i, j, type; 851 bool compress = false; 852 853 /* if sampling all branches, then nothing to filter */ 854 if ((br_sel & X86_BR_ALL) == X86_BR_ALL) 855 return; 856 857 for (i = 0; i < cpuc->lbr_stack.nr; i++) { 858 859 from = cpuc->lbr_entries[i].from; 860 to = cpuc->lbr_entries[i].to; 861 862 type = branch_type(from, to, cpuc->lbr_entries[i].abort); 863 if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { 864 if (cpuc->lbr_entries[i].in_tx) 865 type |= X86_BR_IN_TX; 866 else 867 type |= X86_BR_NO_TX; 868 } 869 870 /* if type does not correspond, then discard */ 871 if (type == X86_BR_NONE || (br_sel & type) != type) { 872 cpuc->lbr_entries[i].from = 0; 873 compress = true; 874 } 875 } 876 877 if (!compress) 878 return; 879 880 /* remove all entries with from=0 */ 881 for (i = 0; i < cpuc->lbr_stack.nr; ) { 882 if (!cpuc->lbr_entries[i].from) { 883 j = i; 884 while (++j < cpuc->lbr_stack.nr) 885 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; 886 cpuc->lbr_stack.nr--; 887 if (!cpuc->lbr_entries[i].from) 888 continue; 889 } 890 i++; 891 } 892 } 893 894 /* 895 * Map interface branch filters onto LBR filters 896 */ 897 static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { 898 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, 899 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, 900 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, 901 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, 902 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP 903 | LBR_IND_JMP | LBR_FAR, 904 /* 905 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches 906 */ 907 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = 908 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, 909 /* 910 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL 911 */ 912 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, 913 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, 914 [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, 915 }; 916 917 static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { 918 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, 919 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, 920 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, 921 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, 922 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, 923 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL 924 | LBR_FAR, 925 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, 926 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, 927 [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, 928 [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, 929 }; 930 931 static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { 932 [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, 933 [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, 934 [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, 935 [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, 936 [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, 937 [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL 938 | LBR_FAR, 939 [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, 940 [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, 941 [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL 942 | LBR_RETURN | LBR_CALL_STACK, 943 [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, 944 [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, 945 }; 946 947 /* core */ 948 void __init intel_pmu_lbr_init_core(void) 949 { 950 x86_pmu.lbr_nr = 4; 951 x86_pmu.lbr_tos = MSR_LBR_TOS; 952 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 953 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 954 955 /* 956 * SW branch filter usage: 957 * - compensate for lack of HW filter 958 */ 959 pr_cont("4-deep LBR, "); 960 } 961 962 /* nehalem/westmere */ 963 void __init intel_pmu_lbr_init_nhm(void) 964 { 965 x86_pmu.lbr_nr = 16; 966 x86_pmu.lbr_tos = MSR_LBR_TOS; 967 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 968 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 969 970 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 971 x86_pmu.lbr_sel_map = nhm_lbr_sel_map; 972 973 /* 974 * SW branch filter usage: 975 * - workaround LBR_SEL errata (see above) 976 * - support syscall, sysret capture. 977 * That requires LBR_FAR but that means far 978 * jmp need to be filtered out 979 */ 980 pr_cont("16-deep LBR, "); 981 } 982 983 /* sandy bridge */ 984 void __init intel_pmu_lbr_init_snb(void) 985 { 986 x86_pmu.lbr_nr = 16; 987 x86_pmu.lbr_tos = MSR_LBR_TOS; 988 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 989 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 990 991 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 992 x86_pmu.lbr_sel_map = snb_lbr_sel_map; 993 994 /* 995 * SW branch filter usage: 996 * - support syscall, sysret capture. 997 * That requires LBR_FAR but that means far 998 * jmp need to be filtered out 999 */ 1000 pr_cont("16-deep LBR, "); 1001 } 1002 1003 /* haswell */ 1004 void intel_pmu_lbr_init_hsw(void) 1005 { 1006 x86_pmu.lbr_nr = 16; 1007 x86_pmu.lbr_tos = MSR_LBR_TOS; 1008 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 1009 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 1010 1011 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1012 x86_pmu.lbr_sel_map = hsw_lbr_sel_map; 1013 1014 pr_cont("16-deep LBR, "); 1015 } 1016 1017 /* skylake */ 1018 __init void intel_pmu_lbr_init_skl(void) 1019 { 1020 x86_pmu.lbr_nr = 32; 1021 x86_pmu.lbr_tos = MSR_LBR_TOS; 1022 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 1023 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 1024 1025 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1026 x86_pmu.lbr_sel_map = hsw_lbr_sel_map; 1027 1028 /* 1029 * SW branch filter usage: 1030 * - support syscall, sysret capture. 1031 * That requires LBR_FAR but that means far 1032 * jmp need to be filtered out 1033 */ 1034 pr_cont("32-deep LBR, "); 1035 } 1036 1037 /* atom */ 1038 void __init intel_pmu_lbr_init_atom(void) 1039 { 1040 /* 1041 * only models starting at stepping 10 seems 1042 * to have an operational LBR which can freeze 1043 * on PMU interrupt 1044 */ 1045 if (boot_cpu_data.x86_model == 28 1046 && boot_cpu_data.x86_mask < 10) { 1047 pr_cont("LBR disabled due to erratum"); 1048 return; 1049 } 1050 1051 x86_pmu.lbr_nr = 8; 1052 x86_pmu.lbr_tos = MSR_LBR_TOS; 1053 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 1054 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 1055 1056 /* 1057 * SW branch filter usage: 1058 * - compensate for lack of HW filter 1059 */ 1060 pr_cont("8-deep LBR, "); 1061 } 1062 1063 /* slm */ 1064 void __init intel_pmu_lbr_init_slm(void) 1065 { 1066 x86_pmu.lbr_nr = 8; 1067 x86_pmu.lbr_tos = MSR_LBR_TOS; 1068 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 1069 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 1070 1071 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1072 x86_pmu.lbr_sel_map = nhm_lbr_sel_map; 1073 1074 /* 1075 * SW branch filter usage: 1076 * - compensate for lack of HW filter 1077 */ 1078 pr_cont("8-deep LBR, "); 1079 } 1080 1081 /* Knights Landing */ 1082 void intel_pmu_lbr_init_knl(void) 1083 { 1084 x86_pmu.lbr_nr = 8; 1085 x86_pmu.lbr_tos = MSR_LBR_TOS; 1086 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 1087 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 1088 1089 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1090 x86_pmu.lbr_sel_map = snb_lbr_sel_map; 1091 1092 pr_cont("8-deep LBR, "); 1093 } 1094