// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>

/* Set once all boot-time alternative patching has completed. */
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

/* Maximum number of bytes a single patch site may carry. */
#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

/* Boot parameter "debug-alternative": verbose output via DPRINTK/DUMP_BYTES. */
static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

/* Boot parameter "noreplace-smp": never downgrade LOCK prefixes, even on UP. */
static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

/* Debug printk(), active only when "debug-alternative" was given on boot. */
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)

/* Hex-dump @len bytes of @buf, behind the same debug switch. */
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

/*
 * Optimal NOP encodings of length 1..8, concatenated back to back;
 * x86_nops[] below indexes into this blob.
 */
static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
};

/* x86_nops[n] points at the optimal n-byte NOP (1 <= n <= ASM_NOP_MAX). */
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, x86_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}

/* Section boundaries provided by the linker script. */
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement.
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

/*
 * Recompute the displacement of a 5-byte JMP from the replacement buffer so
 * that it still reaches its target once copied to the original site, and
 * shrink it to a 2-byte JMP.d8 when the target is close enough.
 */
static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	/* Only a 5-byte JMP.d32 needs its displacement recomputed. */
	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insn_buff + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insn_buff[0] = 0xeb;	/* JMP rel8 opcode */
	insn_buff[1] = (s8)n_dspl;
	/* Pad the remaining 3 bytes of the original 5-byte slot. */
	add_nops(insn_buff + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insn_buff[0] = 0xe9;	/* JMP rel32 opcode */
	*(s32 *)&insn_buff[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
/*
 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
 *
 * @instr: instruction byte stream
 * @instrlen: length of the above
 * @off: offset within @instr where the first NOP has been detected
 *
 * Return: number of NOPs found (and replaced).
 */
static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
{
	unsigned long flags;
	int i = off, nnops;

	/* Count the run of single-byte NOPs starting at @off. */
	while (i < instrlen) {
		if (instr[i] != 0x90)
			break;

		i++;
	}

	nnops = i - off;

	/* A lone NOP cannot be improved. */
	if (nnops <= 1)
		return nnops;

	local_irq_save(flags);
	add_nops(instr + off, nnops);
	local_irq_restore(flags);

	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);

	return nnops;
}

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
	struct insn insn;
	int i = 0;

	/*
	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
	 * ones.
	 */
	for (;;) {
		if (insn_decode_kernel(&insn, &instr[i]))
			return;

		/*
		 * See if this and any potentially following NOPs can be
		 * optimized.
		 */
		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
			i += optimize_nops_range(instr, len, i);
		else
			i += insn.length;

		if (i >= len)
			return;
	}
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;
		/* Mask away "NOT" flag bit for feature to test. */
		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;

		/* Offsets in alt_instr are relative to the field itself. */
		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);

		/*
		 * Patch if either:
		 * - feature is present
		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
			goto next;

		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
			feature >> 5,
			feature & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative jump; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		/* Relative JMPs need their displacement adjusted as well. */
		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		/* Pad the remainder of the original site with 1-byte NOPs. */
		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
			insn_buff[insn_buff_sz] = 0x90;

		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insn_buff, insn_buff_sz);

next:
		optimize_nops(instr, a->instrlen);
	}
}

#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION)

/*
 * CALL/JMP *%\reg
 *
 * Emit the 2-3 byte encoding of an indirect CALL or JMP through register
 * @reg into @bytes; returns the number of bytes emitted, or -1 on a bogus
 * opcode.
 */
static int emit_indirect(int op, int reg, u8 *bytes)
{
	int i = 0;
	u8 modrm;

	switch (op) {
	case CALL_INSN_OPCODE:
		modrm = 0x10; /* Reg = 2; CALL r/m */
		break;

	case JMP32_INSN_OPCODE:
		modrm = 0x20; /* Reg = 4; JMP r/m */
		break;

	default:
		WARN_ON_ONCE(1);
		return -1;
	}

	/* r8..r15 need a REX.B prefix. */
	if (reg >= 8) {
		bytes[i++] = 0x41; /* REX.B prefix */
		reg -= 8;
	}

	modrm |= 0xc0; /* Mod = 3 */
	modrm += reg;

	bytes[i++] = 0xff; /* opcode */
	bytes[i++] = modrm;

	return i;
}
/*
 * Rewrite the compiler generated retpoline thunk calls.
 *
 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 * indirect instructions, avoiding the extra indirection.
 *
 * For example, convert:
 *
 *   CALL __x86_indirect_thunk_\reg
 *
 * into:
 *
 *   CALL *%\reg
 *
 * It also tries to inline spectre_v2=retpoline,amd when size permits.
 */
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
{
	retpoline_thunk_t *target;
	int reg, ret, i = 0;
	u8 op, cc;

	/* Which thunk is targeted determines the register operand. */
	target = addr + insn->length + insn->immediate.value;
	reg = target - __x86_indirect_thunk_array;

	if (WARN_ON_ONCE(reg & ~0xf))
		return -1;

	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
	BUG_ON(reg == 4);

	/* Plain (non-AMD) retpoline must keep using the out-of-line thunk. */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD))
		return -1;

	op = insn->opcode.bytes[0];

	/*
	 * Convert:
	 *
	 *   Jcc.d32 __x86_indirect_thunk_\reg
	 *
	 * into:
	 *
	 *   Jncc.d8 1f
	 *   [ LFENCE ]
	 *   JMP *%\reg
	 *   [ NOP ]
	 * 1:
	 */
	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
	if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
		cc = insn->opcode.bytes[1] & 0xf;
		cc ^= 1; /* invert condition */

		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */

		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
		op = JMP32_INSN_OPCODE;
	}

	/*
	 * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE.
	 */
	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) {
		bytes[i++] = 0x0f;
		bytes[i++] = 0xae;
		bytes[i++] = 0xe8; /* LFENCE */
	}

	ret = emit_indirect(op, reg, bytes + i);
	if (ret < 0)
		return ret;
	i += ret;

	/* Pad whatever is left of the original instruction with NOPs. */
	for (; i < insn->length;)
		bytes[i++] = BYTES_NOP1;

	return i;
}

/*
 * Generated by 'objtool --retpoline'.
 *
 * Walk the retpoline call sites recorded in [start, end) and rewrite each
 * one in place when patch_retpoline() produces a same-length replacement.
 */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
{
	s32 *s;

	for (s = start; s < end; s++) {
		/* Entries are self-relative offsets to the call site. */
		void *addr = (void *)s + *s;
		struct insn insn;
		int len, ret;
		u8 bytes[16];
		u8 op1, op2;

		ret = insn_decode_kernel(&insn, addr);
		if (WARN_ON_ONCE(ret < 0))
			continue;

		op1 = insn.opcode.bytes[0];
		op2 = insn.opcode.bytes[1];

		/* Only CALL, JMP.d32 and Jcc.d32 can target a thunk. */
		switch (op1) {
		case CALL_INSN_OPCODE:
		case JMP32_INSN_OPCODE:
			break;

		case 0x0f: /* escape */
			if (op2 >= 0x80 && op2 <= 0x8f)
				break;
			fallthrough;
		default:
			WARN_ON_ONCE(1);
			continue;
		}

		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
			addr, addr, insn.length,
			addr + insn.length + insn.immediate.value);

		len = patch_retpoline(addr, &insn, bytes);
		if (len == insn.length) {
			optimize_nops(bytes, len);
			DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
		}
	}
}

#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */

void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }

#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */

#ifdef CONFIG_SMP
/*
 * Turn the DS segment override prefixes recorded in .smp_locks back into
 * real LOCK prefixes (SMP operation).
 */
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		/* Skip null entries and sites outside [text, text_end). */
		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
}

/* Inverse of alternatives_smp_lock(): LOCK -> DS override (UP operation). */
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
}

struct smp_alt_module {
	/* owning module; NULL for the core kernel (see alternative_instructions()) */
	struct module *mod;
	char *name;

	/* ptrs to lock prefixes */
	const s32 *locks;
	const s32 *locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8 *text;
	u8 *text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

/*
 * Register a module's LOCK-prefix sites. While still running UP the sites
 * are immediately downgraded, and remembered on smp_alt_modules when the
 * downgrade may need undoing later (more than one possible CPU).
 */
void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text, void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod = mod;
	smp->name = name;
	smp->locks = locks;
	smp->locks_end = locks_end;
	smp->text = text;
	smp->text_end = text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}

/* Forget a module's entry on smp_alt_modules (module unload). */
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Upgrade all remembered sites back to real LOCK prefixes; called while
 * still single-threaded (num_online_cpus() == 1 is asserted below).
 */
void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}
/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		/* Skip modules whose text does not overlap [start, end]. */
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
/*
 * Run the paravirt patcher over every site in [start, end); whatever part
 * of a site it does not rewrite is padded with NOPs.
 */
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insn_buff[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insn_buff, p->instr, p->len);
		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insn_buff + used, p->len - used);
		text_poke_early(p->instr, insn_buff, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_magic, @function\n"
"int3_magic:\n"
" movl $1, (%" _ASM_ARG1 ")\n"
	ASM_RET
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);

extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */

/*
 * Die notifier: when the INT3 planted by int3_selftest() triggers, emulate
 * the intended CALL to int3_magic(); any other breakpoint is left alone.
 */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	/* Only react to the selftest's own INT3. */
	if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}

static void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call = int3_exception_notify,
		.priority = INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * Stick the address of the INT3 instruction into int3_selftest_ip,
	 * then trigger the INT3, padded with NOPs to match a CALL instruction
	 * length.
	 */
	asm volatile ("1: int3; nop; nop; nop; nop\n\t"
		      ".pushsection .init.data,\"aw\"\n\t"
		      ".align " __ASM_SEL(4, 8) "\n\t"
		      ".type int3_selftest_ip, @object\n\t"
		      ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
		      "int3_selftest_ip:\n\t"
		      __ASM_SEL(.long, .quad) " 1b\n\t"
		      ".popsection\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	/* The emulated CALL must have stored 1 through the pointer. */
	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the to be patched code.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during to code
	 * patching.
	 */

	/*
	 * Paravirt patching and alternative patching can be combined to
	 * replace a function call with a short direct code sequence (e.g.
	 * by setting a constant return value instead of doing that in an
	 * external function).
	 * In order to make this work the following sequence is required:
	 * 1. set (artificial) features depending on used paravirt
	 *    functions which can later influence alternative patching
	 * 2. apply paravirt patching (generally replacing an indirect
	 *    function call with a direct one)
	 * 3. apply alternative patching (e.g. replacing a direct function
	 *    call with a custom code sequence)
	 * Doing paravirt patching after alternative patching would clobber
	 * the optimization of the custom code with a function call again.
	 */
	paravirt_set_cap();

	/*
	 * First patch paravirt functions, such that we overwrite the indirect
	 * call with the direct call.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);

	/*
	 * Then patch alternatives, such that those paravirt calls that are in
	 * alternatives can be overwritten by their immediate fragments.
	 */
	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	/* Release .smp_locks when the UP patching can never be undone. */
	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;
}
/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		local_irq_restore(flags);
		sync_core();

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}

/* Saved mm to return to after a temporary-mm excursion. */
typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;

/*
 * Using a temporary mm allows to set temporary mappings that are not accessible
 * by other CPUs. Such mappings are needed to perform sensitive memory writes
 * that override the kernel memory protections (e.g., W^X), without exposing the
 * temporary page-table mappings that are required for these write operations to
 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
 * mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
	temp_mm_state_t temp_state;

	lockdep_assert_irqs_disabled();

	/*
	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
	 * with a stale address space WITHOUT being in lazy mode after
	 * restoring the previous mm.
	 */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm(smp_processor_id());

	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return temp_state;
}

/* Undo use_temporary_mm(): reload the previous mm and HW breakpoints. */
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}

/* Dedicated mm and virtual address used for all text poking; set up elsewhere. */
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

/*
 * Write @len bytes of @opcode at kernel text address @addr by mapping the
 * backing page(s) writable at poking_addr inside poking_mm, so the kernel's
 * own W^X mapping of the text is never made writable.
 */
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows to avoid open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is in this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}
1078 */ 1079 void *text_poke(void *addr, const void *opcode, size_t len) 1080 { 1081 lockdep_assert_held(&text_mutex); 1082 1083 return __text_poke(addr, opcode, len); 1084 } 1085 1086 /** 1087 * text_poke_kgdb - Update instructions on a live kernel by kgdb 1088 * @addr: address to modify 1089 * @opcode: source of the copy 1090 * @len: length to copy 1091 * 1092 * Only atomic text poke/set should be allowed when not doing early patching. 1093 * It means the size must be writable atomically and the address must be aligned 1094 * in a way that permits an atomic write. It also makes sure we fit on a single 1095 * page. 1096 * 1097 * Context: should only be used by kgdb, which ensures no other core is running, 1098 * despite the fact it does not hold the text_mutex. 1099 */ 1100 void *text_poke_kgdb(void *addr, const void *opcode, size_t len) 1101 { 1102 return __text_poke(addr, opcode, len); 1103 } 1104 1105 /** 1106 * text_poke_copy - Copy instructions into (an unused part of) RX memory 1107 * @addr: address to modify 1108 * @opcode: source of the copy 1109 * @len: length to copy, could be more than 2x PAGE_SIZE 1110 * 1111 * Not safe against concurrent execution; useful for JITs to dump 1112 * new code blocks into unused regions of RX memory. Can be used in 1113 * conjunction with synchronize_rcu_tasks() to wait for existing 1114 * execution to quiesce after having made sure no existing functions 1115 * pointers are live. 
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	/* Refuse to touch core kernel text; this is for JIT/module regions only. */
	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		/*
		 * Each __text_poke() call can cover at most two pages, so cap
		 * the chunk at the remainder of the current two-page window.
		 */
		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke((void *)ptr, opcode + patched, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}

static void do_sync_core(void *info)
{
	sync_core();
}

/* Force a serializing instruction (sync_core()) on every online CPU. */
void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

/* One queued instruction-patch site; see text_poke_bp_batch(). */
struct text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	/* branch displacement used when emulating CALL/JMP from the INT3 handler */
	s32 disp;
	u8 len;
	u8 opcode;
	/* the replacement bytes to be written */
	const u8 text[POKE_MAX_OPCODE_SIZE];
	/* original first byte, saved in the first patching step; see text_poke_bp_batch() */
	u8 old;
};

struct bp_patching_desc {
	struct text_poke_loc *vec;
	int nr_entries;
	/* keeps the descriptor alive while INT3 handlers reference it */
	atomic_t refs;
};

static struct bp_patching_desc *bp_desc;

/*
 * Open-coded rcu_dereference() + refcount grab: returns the descriptor with
 * an extra reference held, or NULL if patching already finished.
 */
static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
	/* rcu_dereference */
	struct bp_patching_desc *desc = __READ_ONCE(*descp);

	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
		return NULL;

	return desc;
}

static __always_inline void put_desc(struct bp_patching_desc *desc)
{
	/* order all prior accesses before the release; pairs with the acquire wait */
	smp_mb__before_atomic();
	arch_atomic_dec(&desc->refs);
}

static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}

/* bsearch comparator: orders text_poke_loc entries by their patch address. */
static __always_inline int patch_cmp(const void *key, const void *elt)
{
	struct text_poke_loc *tp = (struct text_poke_loc *) elt;

	if (key < text_poke_addr(tp))
		return -1;
	if (key > text_poke_addr(tp))
		return 1;

	return 0;
}

/*
 * INT3 trap handler for in-flight text patching: returns 1 if the trap hit a
 * site we are currently patching (and was emulated), 0 otherwise.
 */
noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int ret = 0;
	void *ip;

	/* Patching only targets kernel text; user INT3s are not ours. */
	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc:
	 *
	 *	bp_desc = desc			INT3
	 *	WMB				RMB
	 *	write INT3			if (desc)
	 */
	smp_rmb();

	desc = try_get_desc(&bp_desc);
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	/* Emulation resumes at the instruction following the patched range. */
	ip += tp->len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->disp);
		break;

	default:
		BUG();
	}

	/* Trap consumed: tell the caller not to treat this as a real INT3. */
	ret = 1;

out_put:
	put_desc(desc);
	return ret;
}

/* Batch queue for text_poke_queue(); flushed by text_poke_flush(). */
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add a int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	/* On-stack descriptor; INT3 handlers pin it via the refcount. */
	struct bp_patching_desc desc = {
		.vec = tp,
		.nr_entries = nr_entries,
		.refs = ATOMIC_INIT(1),
	};
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	lockdep_assert_held(&text_mutex);

	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add a int3 trap to the address that will be patched.
	 */
	for (i = 0; i < nr_entries; i++) {
		/* Save the original first byte for the perf record below. */
		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
	}

	text_poke_sync();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		/* Reconstruct the old instruction: saved first byte + current tail. */
		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
		int len = tp[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);
			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
				  (const char *)tp[i].text + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);
			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3. After the
		 * text poke event timestamp, the decoder will see either the
		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
		 * use the timestamp as the point at which to modify the
		 * executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
				     tp[i].text, len);
	}

	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		text_poke_sync();
	}

	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		/* An INT3 first byte is already in place; nothing to write. */
		if (tp[i].text[0] == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		text_poke_sync();

	/*
	 * Remove and synchronize_rcu(), except we have a very primitive
	 * refcount based completion.
	 */
	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
	if (!atomic_dec_and_test(&desc.refs))
		atomic_cond_read_acquire(&desc.refs, !VAL);
}

/*
 * Fill in a text_poke_loc for @addr/@opcode/@len; @emulate (or @opcode when
 * NULL) is decoded to determine how the INT3 handler must emulate the site.
 */
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
			       const void *opcode, size_t len, const void *emulate)
{
	struct insn insn;
	int ret, i;

	memcpy((void *)tp->text, opcode, len);
	if (!emulate)
		emulate = opcode;

	ret = insn_decode_kernel(&insn, emulate);
	BUG_ON(ret < 0);

	tp->rel_addr = addr - (void *)_stext;
	tp->len = len;
	tp->opcode = insn.opcode.bytes[0];

	switch (tp->opcode) {
	case RET_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		/*
		 * Control flow instructions without implied execution of the
		 * next instruction can be padded with INT3.
1438 */ 1439 for (i = insn.length; i < len; i++) 1440 BUG_ON(tp->text[i] != INT3_INSN_OPCODE); 1441 break; 1442 1443 default: 1444 BUG_ON(len != insn.length); 1445 }; 1446 1447 1448 switch (tp->opcode) { 1449 case INT3_INSN_OPCODE: 1450 case RET_INSN_OPCODE: 1451 break; 1452 1453 case CALL_INSN_OPCODE: 1454 case JMP32_INSN_OPCODE: 1455 case JMP8_INSN_OPCODE: 1456 tp->disp = insn.immediate.value; 1457 break; 1458 1459 default: /* assume NOP */ 1460 switch (len) { 1461 case 2: /* NOP2 -- emulate as JMP8+0 */ 1462 BUG_ON(memcmp(emulate, x86_nops[len], len)); 1463 tp->opcode = JMP8_INSN_OPCODE; 1464 tp->disp = 0; 1465 break; 1466 1467 case 5: /* NOP5 -- emulate as JMP32+0 */ 1468 BUG_ON(memcmp(emulate, x86_nops[len], len)); 1469 tp->opcode = JMP32_INSN_OPCODE; 1470 tp->disp = 0; 1471 break; 1472 1473 default: /* unknown instruction */ 1474 BUG(); 1475 } 1476 break; 1477 } 1478 } 1479 1480 /* 1481 * We hard rely on the tp_vec being ordered; ensure this is so by flushing 1482 * early if needed. 
1483 */ 1484 static bool tp_order_fail(void *addr) 1485 { 1486 struct text_poke_loc *tp; 1487 1488 if (!tp_vec_nr) 1489 return false; 1490 1491 if (!addr) /* force */ 1492 return true; 1493 1494 tp = &tp_vec[tp_vec_nr - 1]; 1495 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) 1496 return true; 1497 1498 return false; 1499 } 1500 1501 static void text_poke_flush(void *addr) 1502 { 1503 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { 1504 text_poke_bp_batch(tp_vec, tp_vec_nr); 1505 tp_vec_nr = 0; 1506 } 1507 } 1508 1509 void text_poke_finish(void) 1510 { 1511 text_poke_flush(NULL); 1512 } 1513 1514 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) 1515 { 1516 struct text_poke_loc *tp; 1517 1518 if (unlikely(system_state == SYSTEM_BOOTING)) { 1519 text_poke_early(addr, opcode, len); 1520 return; 1521 } 1522 1523 text_poke_flush(addr); 1524 1525 tp = &tp_vec[tp_vec_nr++]; 1526 text_poke_loc_init(tp, addr, opcode, len, emulate); 1527 } 1528 1529 /** 1530 * text_poke_bp() -- update instructions on live kernel on SMP 1531 * @addr: address to patch 1532 * @opcode: opcode of new instruction 1533 * @len: length to copy 1534 * @emulate: instruction to be emulated 1535 * 1536 * Update a single instruction with the vector in the stack, avoiding 1537 * dynamically allocated memory. This function should be used when it is 1538 * not possible to allocate memory. 1539 */ 1540 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) 1541 { 1542 struct text_poke_loc tp; 1543 1544 if (unlikely(system_state == SYSTEM_BOOTING)) { 1545 text_poke_early(addr, opcode, len); 1546 return; 1547 } 1548 1549 text_poke_loc_init(&tp, addr, opcode, len, emulate); 1550 text_poke_bp_batch(&tp, 1); 1551 } 1552