1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * 4 * Copyright (C) 2007 Alan Stern 5 * Copyright (C) 2009 IBM Corporation 6 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> 7 * 8 * Authors: Alan Stern <stern@rowland.harvard.edu> 9 * K.Prasad <prasad@linux.vnet.ibm.com> 10 * Frederic Weisbecker <fweisbec@gmail.com> 11 */ 12 13 /* 14 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, 15 * using the CPU's debug registers. 16 */ 17 18 #include <linux/perf_event.h> 19 #include <linux/hw_breakpoint.h> 20 #include <linux/irqflags.h> 21 #include <linux/notifier.h> 22 #include <linux/kallsyms.h> 23 #include <linux/kprobes.h> 24 #include <linux/percpu.h> 25 #include <linux/kdebug.h> 26 #include <linux/kernel.h> 27 #include <linux/kvm_types.h> 28 #include <linux/export.h> 29 #include <linux/sched.h> 30 #include <linux/smp.h> 31 32 #include <asm/hw_breakpoint.h> 33 #include <asm/processor.h> 34 #include <asm/debugreg.h> 35 #include <asm/user.h> 36 #include <asm/desc.h> 37 #include <asm/tlbflush.h> 38 39 /* Per cpu debug control register value */ 40 DEFINE_PER_CPU(unsigned long, cpu_dr7); 41 EXPORT_PER_CPU_SYMBOL(cpu_dr7); 42 43 /* Per cpu debug address registers values */ 44 static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); 45 46 /* 47 * Stores the breakpoints currently in use on each breakpoint address 48 * register for each cpus 49 */ 50 static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); 51 52 53 static inline unsigned long 54 __encode_dr7(int drnum, unsigned int len, unsigned int type) 55 { 56 unsigned long bp_info; 57 58 bp_info = (len | type) & 0xf; 59 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); 60 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); 61 62 return bp_info; 63 } 64 65 /* 66 * Encode the length, type, Exact, and Enable bits for a particular breakpoint 67 * as stored in debug register 7. 68 */ 69 unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) 70 { 71 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; 72 } 73 74 /* 75 * Decode the length and type bits for a particular breakpoint as 76 * stored in debug register 7. Return the "enabled" status. 77 */ 78 int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) 79 { 80 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); 81 82 *len = (bp_info & 0xc) | 0x40; 83 *type = (bp_info & 0x3) | 0x80; 84 85 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; 86 } 87 88 /* 89 * Install a perf counter breakpoint. 90 * 91 * We seek a free debug address register and use it for this 92 * breakpoint. Eventually we enable it in the debug control register. 93 * 94 * Atomic: we hold the counter->ctx->lock and we only handle variables 95 * and registers local to this cpu. 96 */ 97 int arch_install_hw_breakpoint(struct perf_event *bp) 98 { 99 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 100 unsigned long *dr7; 101 int i; 102 103 lockdep_assert_irqs_disabled(); 104 105 for (i = 0; i < HBP_NUM; i++) { 106 struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); 107 108 if (!*slot) { 109 *slot = bp; 110 break; 111 } 112 } 113 114 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) 115 return -EBUSY; 116 117 set_debugreg(info->address, i); 118 __this_cpu_write(cpu_debugreg[i], info->address); 119 120 dr7 = this_cpu_ptr(&cpu_dr7); 121 *dr7 |= encode_dr7(i, info->len, info->type); 122 123 /* 124 * Ensure we first write cpu_dr7 before we set the DR7 register. 125 * This ensures an NMI never see cpu_dr7 0 when DR7 is not. 126 */ 127 barrier(); 128 129 set_debugreg(*dr7, 7); 130 if (info->mask) 131 amd_set_dr_addr_mask(info->mask, i); 132 133 return 0; 134 } 135 136 /* 137 * Uninstall the breakpoint contained in the given counter. 138 * 139 * First we search the debug address register it uses and then we disable 140 * it. 141 * 142 * Atomic: we hold the counter->ctx->lock and we only handle variables 143 * and registers local to this cpu. 144 */ 145 void arch_uninstall_hw_breakpoint(struct perf_event *bp) 146 { 147 struct arch_hw_breakpoint *info = counter_arch_bp(bp); 148 unsigned long dr7; 149 int i; 150 151 lockdep_assert_irqs_disabled(); 152 153 for (i = 0; i < HBP_NUM; i++) { 154 struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); 155 156 if (*slot == bp) { 157 *slot = NULL; 158 break; 159 } 160 } 161 162 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) 163 return; 164 165 dr7 = this_cpu_read(cpu_dr7); 166 dr7 &= ~__encode_dr7(i, info->len, info->type); 167 168 set_debugreg(dr7, 7); 169 if (info->mask) 170 amd_set_dr_addr_mask(0, i); 171 172 /* 173 * Ensure the write to cpu_dr7 is after we've set the DR7 register. 174 * This ensures an NMI never see cpu_dr7 0 when DR7 is not. 175 */ 176 barrier(); 177 178 this_cpu_write(cpu_dr7, dr7); 179 } 180 181 static int arch_bp_generic_len(int x86_len) 182 { 183 switch (x86_len) { 184 case X86_BREAKPOINT_LEN_1: 185 return HW_BREAKPOINT_LEN_1; 186 case X86_BREAKPOINT_LEN_2: 187 return HW_BREAKPOINT_LEN_2; 188 case X86_BREAKPOINT_LEN_4: 189 return HW_BREAKPOINT_LEN_4; 190 #ifdef CONFIG_X86_64 191 case X86_BREAKPOINT_LEN_8: 192 return HW_BREAKPOINT_LEN_8; 193 #endif 194 default: 195 return -EINVAL; 196 } 197 } 198 199 int arch_bp_generic_fields(int x86_len, int x86_type, 200 int *gen_len, int *gen_type) 201 { 202 int len; 203 204 /* Type */ 205 switch (x86_type) { 206 case X86_BREAKPOINT_EXECUTE: 207 if (x86_len != X86_BREAKPOINT_LEN_X) 208 return -EINVAL; 209 210 *gen_type = HW_BREAKPOINT_X; 211 *gen_len = sizeof(long); 212 return 0; 213 case X86_BREAKPOINT_WRITE: 214 *gen_type = HW_BREAKPOINT_W; 215 break; 216 case X86_BREAKPOINT_RW: 217 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; 218 break; 219 default: 220 return -EINVAL; 221 } 222 223 /* Len */ 224 len = arch_bp_generic_len(x86_len); 225 if (len < 0) 226 return -EINVAL; 227 *gen_len = len; 228 229 return 0; 230 } 231 232 /* 233 * Check for virtual address in kernel space. 234 */ 235 int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) 236 { 237 unsigned long va; 238 int len; 239 240 va = hw->address; 241 len = arch_bp_generic_len(hw->len); 242 WARN_ON_ONCE(len < 0); 243 244 /* 245 * We don't need to worry about va + len - 1 overflowing: 246 * we already require that va is aligned to a multiple of len. 247 */ 248 return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); 249 } 250 251 /* 252 * Checks whether the range [addr, end], overlaps the area [base, base + size). 253 */ 254 static inline bool within_area(unsigned long addr, unsigned long end, 255 unsigned long base, unsigned long size) 256 { 257 return end >= base && addr < (base + size); 258 } 259 260 /* 261 * Checks whether the range from addr to end, inclusive, overlaps the fixed 262 * mapped CPU entry area range or other ranges used for CPU entry. 263 */ 264 static inline bool within_cpu_entry(unsigned long addr, unsigned long end) 265 { 266 int cpu; 267 268 /* CPU entry erea is always used for CPU entry */ 269 if (within_area(addr, end, CPU_ENTRY_AREA_BASE, 270 CPU_ENTRY_AREA_MAP_SIZE)) 271 return true; 272 273 /* 274 * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU 275 * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. 276 */ 277 #ifdef CONFIG_SMP 278 if (within_area(addr, end, (unsigned long)__per_cpu_offset, 279 sizeof(unsigned long) * nr_cpu_ids)) 280 return true; 281 #else 282 if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, 283 sizeof(pcpu_unit_offsets))) 284 return true; 285 #endif 286 287 for_each_possible_cpu(cpu) { 288 /* The original rw GDT is being used after load_direct_gdt() */ 289 if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), 290 GDT_SIZE)) 291 return true; 292 293 /* 294 * cpu_tss_rw is not directly referenced by hardware, but 295 * cpu_tss_rw is also used in CPU entry code, 296 */ 297 if (within_area(addr, end, 298 (unsigned long)&per_cpu(cpu_tss_rw, cpu), 299 sizeof(struct tss_struct))) 300 return true; 301 302 /* 303 * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. 304 * If a data breakpoint on it, it will cause an unwanted #DB. 305 * Protect the full cpu_tlbstate structure to be sure. 306 */ 307 if (within_area(addr, end, 308 (unsigned long)&per_cpu(cpu_tlbstate, cpu), 309 sizeof(struct tlb_state))) 310 return true; 311 312 /* 313 * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() 314 * will read per-cpu cpu_dr7 before clear dr7 register. 315 */ 316 if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), 317 sizeof(cpu_dr7))) 318 return true; 319 } 320 321 return false; 322 } 323 324 static int arch_build_bp_info(struct perf_event *bp, 325 const struct perf_event_attr *attr, 326 struct arch_hw_breakpoint *hw) 327 { 328 unsigned long bp_end; 329 330 bp_end = attr->bp_addr + attr->bp_len - 1; 331 if (bp_end < attr->bp_addr) 332 return -EINVAL; 333 334 /* 335 * Prevent any breakpoint of any type that overlaps the CPU 336 * entry area and data. This protects the IST stacks and also 337 * reduces the chance that we ever find out what happens if 338 * there's a data breakpoint on the GDT, IDT, or TSS. 339 */ 340 if (within_cpu_entry(attr->bp_addr, bp_end)) 341 return -EINVAL; 342 343 hw->address = attr->bp_addr; 344 hw->mask = 0; 345 346 /* Type */ 347 switch (attr->bp_type) { 348 case HW_BREAKPOINT_W: 349 hw->type = X86_BREAKPOINT_WRITE; 350 break; 351 case HW_BREAKPOINT_W | HW_BREAKPOINT_R: 352 hw->type = X86_BREAKPOINT_RW; 353 break; 354 case HW_BREAKPOINT_X: 355 /* 356 * We don't allow kernel breakpoints in places that are not 357 * acceptable for kprobes. On non-kprobes kernels, we don't 358 * allow kernel breakpoints at all. 359 */ 360 if (attr->bp_addr >= TASK_SIZE_MAX) { 361 if (within_kprobe_blacklist(attr->bp_addr)) 362 return -EINVAL; 363 } 364 365 hw->type = X86_BREAKPOINT_EXECUTE; 366 /* 367 * x86 inst breakpoints need to have a specific undefined len. 368 * But we still need to check userspace is not trying to setup 369 * an unsupported length, to get a range breakpoint for example. 370 */ 371 if (attr->bp_len == sizeof(long)) { 372 hw->len = X86_BREAKPOINT_LEN_X; 373 return 0; 374 } 375 fallthrough; 376 default: 377 return -EINVAL; 378 } 379 380 /* Len */ 381 switch (attr->bp_len) { 382 case HW_BREAKPOINT_LEN_1: 383 hw->len = X86_BREAKPOINT_LEN_1; 384 break; 385 case HW_BREAKPOINT_LEN_2: 386 hw->len = X86_BREAKPOINT_LEN_2; 387 break; 388 case HW_BREAKPOINT_LEN_4: 389 hw->len = X86_BREAKPOINT_LEN_4; 390 break; 391 #ifdef CONFIG_X86_64 392 case HW_BREAKPOINT_LEN_8: 393 hw->len = X86_BREAKPOINT_LEN_8; 394 break; 395 #endif 396 default: 397 /* AMD range breakpoint */ 398 if (!is_power_of_2(attr->bp_len)) 399 return -EINVAL; 400 if (attr->bp_addr & (attr->bp_len - 1)) 401 return -EINVAL; 402 403 if (!boot_cpu_has(X86_FEATURE_BPEXT)) 404 return -EOPNOTSUPP; 405 406 /* 407 * It's impossible to use a range breakpoint to fake out 408 * user vs kernel detection because bp_len - 1 can't 409 * have the high bit set. If we ever allow range instruction 410 * breakpoints, then we'll have to check for kprobe-blacklisted 411 * addresses anywhere in the range. 412 */ 413 hw->mask = attr->bp_len - 1; 414 hw->len = X86_BREAKPOINT_LEN_1; 415 } 416 417 return 0; 418 } 419 420 /* 421 * Validate the arch-specific HW Breakpoint register settings 422 */ 423 int hw_breakpoint_arch_parse(struct perf_event *bp, 424 const struct perf_event_attr *attr, 425 struct arch_hw_breakpoint *hw) 426 { 427 unsigned int align; 428 int ret; 429 430 431 ret = arch_build_bp_info(bp, attr, hw); 432 if (ret) 433 return ret; 434 435 switch (hw->len) { 436 case X86_BREAKPOINT_LEN_1: 437 align = 0; 438 if (hw->mask) 439 align = hw->mask; 440 break; 441 case X86_BREAKPOINT_LEN_2: 442 align = 1; 443 break; 444 case X86_BREAKPOINT_LEN_4: 445 align = 3; 446 break; 447 #ifdef CONFIG_X86_64 448 case X86_BREAKPOINT_LEN_8: 449 align = 7; 450 break; 451 #endif 452 default: 453 WARN_ON_ONCE(1); 454 return -EINVAL; 455 } 456 457 /* 458 * Check that the low-order bits of the address are appropriate 459 * for the alignment implied by len. 460 */ 461 if (hw->address & align) 462 return -EINVAL; 463 464 return 0; 465 } 466 467 /* 468 * Release the user breakpoints used by ptrace 469 */ 470 void flush_ptrace_hw_breakpoint(struct task_struct *tsk) 471 { 472 int i; 473 struct thread_struct *t = &tsk->thread; 474 475 for (i = 0; i < HBP_NUM; i++) { 476 unregister_hw_breakpoint(t->ptrace_bps[i]); 477 t->ptrace_bps[i] = NULL; 478 } 479 480 t->virtual_dr6 = 0; 481 t->ptrace_dr7 = 0; 482 } 483 484 void hw_breakpoint_restore(void) 485 { 486 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); 487 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); 488 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); 489 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); 490 set_debugreg(DR6_RESERVED, 6); 491 set_debugreg(__this_cpu_read(cpu_dr7), 7); 492 } 493 EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore); 494 495 /* 496 * Handle debug exception notifications. 497 * 498 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. 499 * 500 * NOTIFY_DONE returned if one of the following conditions is true. 501 * i) When the causative address is from user-space and the exception 502 * is a valid one, i.e. not triggered as a result of lazy debug register 503 * switching 504 * ii) When there are more bits than trap<n> set in DR6 register (such 505 * as BD, BS or BT) indicating that more than one debug condition is 506 * met and requires some more action in do_debug(). 507 * 508 * NOTIFY_STOP returned for all other cases 509 * 510 */ 511 static int hw_breakpoint_handler(struct die_args *args) 512 { 513 int i, rc = NOTIFY_STOP; 514 struct perf_event *bp; 515 unsigned long *dr6_p; 516 unsigned long dr6; 517 bool bpx; 518 519 /* The DR6 value is pointed by args->err */ 520 dr6_p = (unsigned long *)ERR_PTR(args->err); 521 dr6 = *dr6_p; 522 523 /* Do an early return if no trap bits are set in DR6 */ 524 if ((dr6 & DR_TRAP_BITS) == 0) 525 return NOTIFY_DONE; 526 527 /* Handle all the breakpoints that were triggered */ 528 for (i = 0; i < HBP_NUM; ++i) { 529 if (likely(!(dr6 & (DR_TRAP0 << i)))) 530 continue; 531 532 bp = this_cpu_read(bp_per_reg[i]); 533 if (!bp) 534 continue; 535 536 bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; 537 538 /* 539 * TF and data breakpoints are traps and can be merged, however 540 * instruction breakpoints are faults and will be raised 541 * separately. 542 * 543 * However DR6 can indicate both TF and instruction 544 * breakpoints. In that case take TF as that has precedence and 545 * delay the instruction breakpoint for the next exception. 546 */ 547 if (bpx && (dr6 & DR_STEP)) 548 continue; 549 550 /* 551 * Reset the 'i'th TRAP bit in dr6 to denote completion of 552 * exception handling 553 */ 554 (*dr6_p) &= ~(DR_TRAP0 << i); 555 556 perf_bp_event(bp, args->regs); 557 558 /* 559 * Set up resume flag to avoid breakpoint recursion when 560 * returning back to origin. 561 */ 562 if (bpx) 563 args->regs->flags |= X86_EFLAGS_RF; 564 } 565 566 /* 567 * Further processing in do_debug() is needed for a) user-space 568 * breakpoints (to generate signals) and b) when the system has 569 * taken exception due to multiple causes 570 */ 571 if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || 572 (dr6 & (~DR_TRAP_BITS))) 573 rc = NOTIFY_DONE; 574 575 return rc; 576 } 577 578 /* 579 * Handle debug exception notifications. 580 */ 581 int hw_breakpoint_exceptions_notify( 582 struct notifier_block *unused, unsigned long val, void *data) 583 { 584 if (val != DIE_DEBUG) 585 return NOTIFY_DONE; 586 587 return hw_breakpoint_handler(data); 588 } 589 590 void hw_breakpoint_pmu_read(struct perf_event *bp) 591 { 592 /* TODO */ 593 } 594