1 // SPDX-License-Identifier: GPL-2.0-only 2 #define pr_fmt(fmt) "SMP alternatives: " fmt 3 4 #include <linux/module.h> 5 #include <linux/sched.h> 6 #include <linux/perf_event.h> 7 #include <linux/mutex.h> 8 #include <linux/list.h> 9 #include <linux/stringify.h> 10 #include <linux/highmem.h> 11 #include <linux/mm.h> 12 #include <linux/vmalloc.h> 13 #include <linux/memory.h> 14 #include <linux/stop_machine.h> 15 #include <linux/slab.h> 16 #include <linux/kdebug.h> 17 #include <linux/kprobes.h> 18 #include <linux/mmu_context.h> 19 #include <linux/bsearch.h> 20 #include <linux/sync_core.h> 21 #include <asm/text-patching.h> 22 #include <asm/alternative.h> 23 #include <asm/sections.h> 24 #include <asm/mce.h> 25 #include <asm/nmi.h> 26 #include <asm/cacheflush.h> 27 #include <asm/tlbflush.h> 28 #include <asm/insn.h> 29 #include <asm/io.h> 30 #include <asm/fixmap.h> 31 #include <asm/paravirt.h> 32 #include <asm/asm-prototypes.h> 33 #include <asm/cfi.h> 34 35 int __read_mostly alternatives_patched; 36 37 EXPORT_SYMBOL_GPL(alternatives_patched); 38 39 #define MAX_PATCH_LEN (255-1) 40 41 #define DA_ALL (~0) 42 #define DA_ALT 0x01 43 #define DA_RET 0x02 44 #define DA_RETPOLINE 0x04 45 #define DA_ENDBR 0x08 46 #define DA_SMP 0x10 47 48 static unsigned int __initdata_or_module debug_alternative; 49 50 static int __init debug_alt(char *str) 51 { 52 if (str && *str == '=') 53 str++; 54 55 if (!str || kstrtouint(str, 0, &debug_alternative)) 56 debug_alternative = DA_ALL; 57 58 return 1; 59 } 60 __setup("debug-alternative", debug_alt); 61 62 static int noreplace_smp; 63 64 static int __init setup_noreplace_smp(char *str) 65 { 66 noreplace_smp = 1; 67 return 1; 68 } 69 __setup("noreplace-smp", setup_noreplace_smp); 70 71 #define DPRINTK(type, fmt, args...) \ 72 do { \ 73 if (debug_alternative & DA_##type) \ 74 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ 75 } while (0) 76 77 #define DUMP_BYTES(type, buf, len, fmt, args...) \ 78 do { \ 79 if (unlikely(debug_alternative & DA_##type)) { \ 80 int j; \ 81 \ 82 if (!(len)) \ 83 break; \ 84 \ 85 printk(KERN_DEBUG pr_fmt(fmt), ##args); \ 86 for (j = 0; j < (len) - 1; j++) \ 87 printk(KERN_CONT "%02hhx ", buf[j]); \ 88 printk(KERN_CONT "%02hhx\n", buf[j]); \ 89 } \ 90 } while (0) 91 92 static const unsigned char x86nops[] = 93 { 94 BYTES_NOP1, 95 BYTES_NOP2, 96 BYTES_NOP3, 97 BYTES_NOP4, 98 BYTES_NOP5, 99 BYTES_NOP6, 100 BYTES_NOP7, 101 BYTES_NOP8, 102 #ifdef CONFIG_64BIT 103 BYTES_NOP9, 104 BYTES_NOP10, 105 BYTES_NOP11, 106 #endif 107 }; 108 109 const unsigned char * const x86_nops[ASM_NOP_MAX+1] = 110 { 111 NULL, 112 x86nops, 113 x86nops + 1, 114 x86nops + 1 + 2, 115 x86nops + 1 + 2 + 3, 116 x86nops + 1 + 2 + 3 + 4, 117 x86nops + 1 + 2 + 3 + 4 + 5, 118 x86nops + 1 + 2 + 3 + 4 + 5 + 6, 119 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 120 #ifdef CONFIG_64BIT 121 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, 122 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, 123 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, 124 #endif 125 }; 126 127 /* 128 * Fill the buffer with a single effective instruction of size @len. 129 * 130 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) 131 * for every single-byte NOP, try to generate the maximally available NOP of 132 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for 133 * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and 134 * *jump* over instead of executing long and daft NOPs. 
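 *
 * For example (illustrative sizes, not taken from a real call site): a
 * 40-byte fill becomes a two-byte JMP8 to the end of the range followed by
 * 38 INT3 bytes, while an 8-byte fill is simply BYTES_NOP8.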
135 */ 136 static void __init_or_module add_nop(u8 *instr, unsigned int len) 137 { 138 u8 *target = instr + len; 139 140 if (!len) 141 return; 142 143 if (len <= ASM_NOP_MAX) { 144 memcpy(instr, x86_nops[len], len); 145 return; 146 } 147 148 if (len < 128) { 149 __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); 150 instr += JMP8_INSN_SIZE; 151 } else { 152 __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); 153 instr += JMP32_INSN_SIZE; 154 } 155 156 for (;instr < target; instr++) 157 *instr = INT3_INSN_OPCODE; 158 } 159 160 extern s32 __retpoline_sites[], __retpoline_sites_end[]; 161 extern s32 __return_sites[], __return_sites_end[]; 162 extern s32 __cfi_sites[], __cfi_sites_end[]; 163 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; 164 extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 165 extern s32 __smp_locks[], __smp_locks_end[]; 166 void text_poke_early(void *addr, const void *opcode, size_t len); 167 168 /* 169 * Matches NOP and NOPL, not any of the other possible NOPs. 170 */ 171 static bool insn_is_nop(struct insn *insn) 172 { 173 /* Anything NOP, but no REP NOP */ 174 if (insn->opcode.bytes[0] == 0x90 && 175 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) 176 return true; 177 178 /* NOPL */ 179 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) 180 return true; 181 182 /* TODO: more nops */ 183 184 return false; 185 } 186 187 /* 188 * Find the offset of the first non-NOP instruction starting at @offset 189 * but no further than @len. 190 */ 191 static int skip_nops(u8 *instr, int offset, int len) 192 { 193 struct insn insn; 194 195 for (; offset < len; offset += insn.length) { 196 if (insn_decode_kernel(&insn, &instr[offset])) 197 break; 198 199 if (!insn_is_nop(&insn)) 200 break; 201 } 202 203 return offset; 204 } 205 206 /* 207 * Optimize a sequence of NOPs, possibly preceded by an unconditional jump 208 * to the end of the NOP sequence into a single NOP. 209 */ 210 static bool __init_or_module 211 __optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) 212 { 213 int i = *next - insn->length; 214 215 switch (insn->opcode.bytes[0]) { 216 case JMP8_INSN_OPCODE: 217 case JMP32_INSN_OPCODE: 218 *prev = i; 219 *target = *next + insn->immediate.value; 220 return false; 221 } 222 223 if (insn_is_nop(insn)) { 224 int nop = i; 225 226 *next = skip_nops(instr, *next, len); 227 if (*target && *next == *target) 228 nop = *prev; 229 230 add_nop(instr + nop, *next - nop); 231 DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); 232 return true; 233 } 234 235 *target = 0; 236 return false; 237 } 238 239 /* 240 * "noinline" to cause control flow change and thus invalidate I$ and 241 * cause refetch after modification. 242 */ 243 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) 244 { 245 int prev, target = 0; 246 247 for (int next, i = 0; i < len; i = next) { 248 struct insn insn; 249 250 if (insn_decode_kernel(&insn, &instr[i])) 251 return; 252 253 next = i + insn.length; 254 255 __optimize_nops(instr, len, &insn, &next, &prev, &target); 256 } 257 } 258 259 /* 260 * In this context, "source" is where the instructions are placed in the 261 * section .altinstr_replacement, for example during kernel build by the 262 * toolchain. 263 * "Destination" is where the instructions are being patched in by this 264 * machinery. 
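 *
 * As a small worked instance (addresses assumed purely for illustration):
 * a CALL copied from src = 0x1000 to dst = 0x3000 needs its rel32 immediate
 * adjusted by src - dst = -0x2000 to keep hitting the same absolute target;
 * the derivation below shows why that single displacement is all that matters.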
265 * 266 * The source offset is: 267 * 268 * src_imm = target - src_next_ip (1) 269 * 270 * and the target offset is: 271 * 272 * dst_imm = target - dst_next_ip (2) 273 * 274 * so rework (1) as an expression for target like: 275 * 276 * target = src_imm + src_next_ip (1a) 277 * 278 * and substitute in (2) to get: 279 * 280 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) 281 * 282 * Now, since the instruction stream is 'identical' at src and dst (it 283 * is being copied after all) it can be stated that: 284 * 285 * src_next_ip = src + ip_offset 286 * dst_next_ip = dst + ip_offset (4) 287 * 288 * Substitute (4) in (3) and observe ip_offset being cancelled out to 289 * obtain: 290 * 291 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) 292 * = src_imm + src - dst + ip_offset - ip_offset 293 * = src_imm + src - dst (5) 294 * 295 * IOW, only the relative displacement of the code block matters. 296 */ 297 298 #define apply_reloc_n(n_, p_, d_) \ 299 do { \ 300 s32 v = *(s##n_ *)(p_); \ 301 v += (d_); \ 302 BUG_ON((v >> 31) != (v >> (n_-1))); \ 303 *(s##n_ *)(p_) = (s##n_)v; \ 304 } while (0) 305 306 307 static __always_inline 308 void apply_reloc(int n, void *ptr, uintptr_t diff) 309 { 310 switch (n) { 311 case 1: apply_reloc_n(8, ptr, diff); break; 312 case 2: apply_reloc_n(16, ptr, diff); break; 313 case 4: apply_reloc_n(32, ptr, diff); break; 314 default: BUG(); 315 } 316 } 317 318 static __always_inline 319 bool need_reloc(unsigned long offset, u8 *src, size_t src_len) 320 { 321 u8 *target = src + offset; 322 /* 323 * If the target is inside the patched block, it's relative to the 324 * block itself and does not need relocation. 325 */ 326 return (target < src || target > src + src_len); 327 } 328 329 static void __init_or_module noinline 330 apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) 331 { 332 int prev, target = 0; 333 334 for (int next, i = 0; i < len; i = next) { 335 struct insn insn; 336 337 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) 338 return; 339 340 next = i + insn.length; 341 342 if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) 343 continue; 344 345 switch (insn.opcode.bytes[0]) { 346 case 0x0f: 347 if (insn.opcode.bytes[1] < 0x80 || 348 insn.opcode.bytes[1] > 0x8f) 349 break; 350 351 fallthrough; /* Jcc.d32 */ 352 case 0x70 ... 0x7f: /* Jcc.d8 */ 353 case JMP8_INSN_OPCODE: 354 case JMP32_INSN_OPCODE: 355 case CALL_INSN_OPCODE: 356 if (need_reloc(next + insn.immediate.value, src, src_len)) { 357 apply_reloc(insn.immediate.nbytes, 358 buf + i + insn_offset_immediate(&insn), 359 src - dest); 360 } 361 362 /* 363 * Where possible, convert JMP.d32 into JMP.d8. 364 */ 365 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { 366 s32 imm = insn.immediate.value; 367 imm += src - dest; 368 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; 369 if ((imm >> 31) == (imm >> 7)) { 370 buf[i+0] = JMP8_INSN_OPCODE; 371 buf[i+1] = (s8)imm; 372 373 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); 374 } 375 } 376 break; 377 } 378 379 if (insn_rip_relative(&insn)) { 380 if (need_reloc(next + insn.displacement.value, src, src_len)) { 381 apply_reloc(insn.displacement.nbytes, 382 buf + i + insn_offset_displacement(&insn), 383 src - dest); 384 } 385 } 386 } 387 } 388 389 /* 390 * Replace instructions with better alternatives for this CPU type. This runs 391 * before SMP is initialized to avoid SMP problems with self modifying code. 
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK(ALT, "alt table %px, -> %px", start, end);

	/*
	 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
	 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
	 * During the process, KASAN becomes confused seeing partial LA57
	 * conversion and triggers a false-positive out-of-bound report.
	 *
	 * Disable KASAN until the patching is complete.
	 */
	kasan_disable_current();

	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insn_buff_sz = 0;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insn_buff));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);

		/*
		 * Patch if either:
		 * - feature is present
		 * - feature not present but ALT_FLAG_NOT is set to mean:
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
			optimize_nops(instr, a->instrlen);
			continue;
		}

		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->flags & ALT_FLAG_NOT) ? "!"
: "", 448 a->cpuid >> 5, 449 a->cpuid & 0x1f, 450 instr, instr, a->instrlen, 451 replacement, a->replacementlen); 452 453 memcpy(insn_buff, replacement, a->replacementlen); 454 insn_buff_sz = a->replacementlen; 455 456 for (; insn_buff_sz < a->instrlen; insn_buff_sz++) 457 insn_buff[insn_buff_sz] = 0x90; 458 459 apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); 460 461 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); 462 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); 463 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); 464 465 text_poke_early(instr, insn_buff, insn_buff_sz); 466 } 467 468 kasan_enable_current(); 469 } 470 471 static inline bool is_jcc32(struct insn *insn) 472 { 473 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ 474 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; 475 } 476 477 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) 478 479 /* 480 * CALL/JMP *%\reg 481 */ 482 static int emit_indirect(int op, int reg, u8 *bytes) 483 { 484 int i = 0; 485 u8 modrm; 486 487 switch (op) { 488 case CALL_INSN_OPCODE: 489 modrm = 0x10; /* Reg = 2; CALL r/m */ 490 break; 491 492 case JMP32_INSN_OPCODE: 493 modrm = 0x20; /* Reg = 4; JMP r/m */ 494 break; 495 496 default: 497 WARN_ON_ONCE(1); 498 return -1; 499 } 500 501 if (reg >= 8) { 502 bytes[i++] = 0x41; /* REX.B prefix */ 503 reg -= 8; 504 } 505 506 modrm |= 0xc0; /* Mod = 3 */ 507 modrm += reg; 508 509 bytes[i++] = 0xff; /* opcode */ 510 bytes[i++] = modrm; 511 512 return i; 513 } 514 515 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) 516 { 517 u8 op = insn->opcode.bytes[0]; 518 int i = 0; 519 520 /* 521 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional 522 * tail-calls. Deal with them. 523 */ 524 if (is_jcc32(insn)) { 525 bytes[i++] = op; 526 op = insn->opcode.bytes[1]; 527 goto clang_jcc; 528 } 529 530 if (insn->length == 6) 531 bytes[i++] = 0x2e; /* CS-prefix */ 532 533 switch (op) { 534 case CALL_INSN_OPCODE: 535 __text_gen_insn(bytes+i, op, addr+i, 536 __x86_indirect_call_thunk_array[reg], 537 CALL_INSN_SIZE); 538 i += CALL_INSN_SIZE; 539 break; 540 541 case JMP32_INSN_OPCODE: 542 clang_jcc: 543 __text_gen_insn(bytes+i, op, addr+i, 544 __x86_indirect_jump_thunk_array[reg], 545 JMP32_INSN_SIZE); 546 i += JMP32_INSN_SIZE; 547 break; 548 549 default: 550 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); 551 return -1; 552 } 553 554 WARN_ON_ONCE(i != insn->length); 555 556 return i; 557 } 558 559 /* 560 * Rewrite the compiler generated retpoline thunk calls. 561 * 562 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate 563 * indirect instructions, avoiding the extra indirection. 564 * 565 * For example, convert: 566 * 567 * CALL __x86_indirect_thunk_\reg 568 * 569 * into: 570 * 571 * CALL *%\reg 572 * 573 * It also tries to inline spectre_v2=retpoline,lfence when size permits. 574 */ 575 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) 576 { 577 retpoline_thunk_t *target; 578 int reg, ret, i = 0; 579 u8 op, cc; 580 581 target = addr + insn->length + insn->immediate.value; 582 reg = target - __x86_indirect_thunk_array; 583 584 if (WARN_ON_ONCE(reg & ~0xf)) 585 return -1; 586 587 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. 
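 * (Register number 4 is %rsp, so such an encoding would indirect through the
 * stack pointer itself.)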
*/ 588 BUG_ON(reg == 4); 589 590 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && 591 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 592 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) 593 return emit_call_track_retpoline(addr, insn, reg, bytes); 594 595 return -1; 596 } 597 598 op = insn->opcode.bytes[0]; 599 600 /* 601 * Convert: 602 * 603 * Jcc.d32 __x86_indirect_thunk_\reg 604 * 605 * into: 606 * 607 * Jncc.d8 1f 608 * [ LFENCE ] 609 * JMP *%\reg 610 * [ NOP ] 611 * 1: 612 */ 613 if (is_jcc32(insn)) { 614 cc = insn->opcode.bytes[1] & 0xf; 615 cc ^= 1; /* invert condition */ 616 617 bytes[i++] = 0x70 + cc; /* Jcc.d8 */ 618 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ 619 620 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ 621 op = JMP32_INSN_OPCODE; 622 } 623 624 /* 625 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. 626 */ 627 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 628 bytes[i++] = 0x0f; 629 bytes[i++] = 0xae; 630 bytes[i++] = 0xe8; /* LFENCE */ 631 } 632 633 ret = emit_indirect(op, reg, bytes + i); 634 if (ret < 0) 635 return ret; 636 i += ret; 637 638 /* 639 * The compiler is supposed to EMIT an INT3 after every unconditional 640 * JMP instruction due to AMD BTC. However, if the compiler is too old 641 * or SLS isn't enabled, we still need an INT3 after indirect JMPs 642 * even on Intel. 643 */ 644 if (op == JMP32_INSN_OPCODE && i < insn->length) 645 bytes[i++] = INT3_INSN_OPCODE; 646 647 for (; i < insn->length;) 648 bytes[i++] = BYTES_NOP1; 649 650 return i; 651 } 652 653 /* 654 * Generated by 'objtool --retpoline'. 655 */ 656 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) 657 { 658 s32 *s; 659 660 for (s = start; s < end; s++) { 661 void *addr = (void *)s + *s; 662 struct insn insn; 663 int len, ret; 664 u8 bytes[16]; 665 u8 op1, op2; 666 667 ret = insn_decode_kernel(&insn, addr); 668 if (WARN_ON_ONCE(ret < 0)) 669 continue; 670 671 op1 = insn.opcode.bytes[0]; 672 op2 = insn.opcode.bytes[1]; 673 674 switch (op1) { 675 case CALL_INSN_OPCODE: 676 case JMP32_INSN_OPCODE: 677 break; 678 679 case 0x0f: /* escape */ 680 if (op2 >= 0x80 && op2 <= 0x8f) 681 break; 682 fallthrough; 683 default: 684 WARN_ON_ONCE(1); 685 continue; 686 } 687 688 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", 689 addr, addr, insn.length, 690 addr + insn.length + insn.immediate.value); 691 692 len = patch_retpoline(addr, &insn, bytes); 693 if (len == insn.length) { 694 optimize_nops(bytes, len); 695 DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); 696 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); 697 text_poke_early(addr, bytes, len); 698 } 699 } 700 } 701 702 #ifdef CONFIG_RETHUNK 703 704 /* 705 * Rewrite the compiler generated return thunk tail-calls. 706 * 707 * For example, convert: 708 * 709 * JMP __x86_return_thunk 710 * 711 * into: 712 * 713 * RET 714 */ 715 static int patch_return(void *addr, struct insn *insn, u8 *bytes) 716 { 717 int i = 0; 718 719 /* Patch the custom return thunks... */ 720 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { 721 i = JMP32_INSN_SIZE; 722 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); 723 } else { 724 /* ... or patch them out if not needed. 
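 * (a bare 0xc3 RET; the loop below pads the rest with INT3)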
*/ 725 bytes[i++] = RET_INSN_OPCODE; 726 } 727 728 for (; i < insn->length;) 729 bytes[i++] = INT3_INSN_OPCODE; 730 return i; 731 } 732 733 void __init_or_module noinline apply_returns(s32 *start, s32 *end) 734 { 735 s32 *s; 736 737 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) 738 static_call_force_reinit(); 739 740 for (s = start; s < end; s++) { 741 void *dest = NULL, *addr = (void *)s + *s; 742 struct insn insn; 743 int len, ret; 744 u8 bytes[16]; 745 u8 op; 746 747 ret = insn_decode_kernel(&insn, addr); 748 if (WARN_ON_ONCE(ret < 0)) 749 continue; 750 751 op = insn.opcode.bytes[0]; 752 if (op == JMP32_INSN_OPCODE) 753 dest = addr + insn.length + insn.immediate.value; 754 755 if (__static_call_fixup(addr, op, dest) || 756 WARN_ONCE(dest != &__x86_return_thunk, 757 "missing return thunk: %pS-%pS: %*ph", 758 addr, dest, 5, addr)) 759 continue; 760 761 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", 762 addr, addr, insn.length, 763 addr + insn.length + insn.immediate.value); 764 765 len = patch_return(addr, &insn, bytes); 766 if (len == insn.length) { 767 DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); 768 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); 769 text_poke_early(addr, bytes, len); 770 } 771 } 772 } 773 #else 774 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 775 #endif /* CONFIG_RETHUNK */ 776 777 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ 778 779 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } 780 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 781 782 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ 783 784 #ifdef CONFIG_X86_KERNEL_IBT 785 786 static void poison_cfi(void *addr); 787 788 static void __init_or_module poison_endbr(void *addr, bool warn) 789 { 790 u32 endbr, poison = gen_endbr_poison(); 791 792 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) 793 return; 794 795 if (!is_endbr(endbr)) { 796 WARN_ON_ONCE(warn); 797 return; 798 } 799 800 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); 801 802 /* 803 * When we have IBT, the lack of ENDBR will trigger #CP 804 */ 805 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); 806 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); 807 text_poke_early(addr, &poison, 4); 808 } 809 810 /* 811 * Generated by: objtool --ibt 812 * 813 * Seal the functions for indirect calls by clobbering the ENDBR instructions 814 * and the kCFI hash value. 815 */ 816 void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) 817 { 818 s32 *s; 819 820 for (s = start; s < end; s++) { 821 void *addr = (void *)s + *s; 822 823 poison_endbr(addr, true); 824 if (IS_ENABLED(CONFIG_FINEIBT)) 825 poison_cfi(addr - 16); 826 } 827 } 828 829 #else 830 831 void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } 832 833 #endif /* CONFIG_X86_KERNEL_IBT */ 834 835 #ifdef CONFIG_FINEIBT 836 #define __CFI_DEFAULT CFI_DEFAULT 837 #elif defined(CONFIG_CFI_CLANG) 838 #define __CFI_DEFAULT CFI_KCFI 839 #else 840 #define __CFI_DEFAULT CFI_OFF 841 #endif 842 843 enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; 844 845 #ifdef CONFIG_CFI_CLANG 846 struct bpf_insn; 847 848 /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ 849 extern unsigned int __bpf_prog_runX(const void *ctx, 850 const struct bpf_insn *insn); 851 852 /* 853 * Force a reference to the external symbol so the compiler generates 854 * __kcfi_typid. 
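 * (that is, the __kcfi_typeid___bpf_prog_runX symbol which the asm block
 * below copies into cfi_bpf_hash)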
855 */ 856 __ADDRESSABLE(__bpf_prog_runX); 857 858 /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ 859 asm ( 860 " .pushsection .data..ro_after_init,\"aw\",@progbits \n" 861 " .type cfi_bpf_hash,@object \n" 862 " .globl cfi_bpf_hash \n" 863 " .p2align 2, 0x0 \n" 864 "cfi_bpf_hash: \n" 865 " .long __kcfi_typeid___bpf_prog_runX \n" 866 " .size cfi_bpf_hash, 4 \n" 867 " .popsection \n" 868 ); 869 870 /* Must match bpf_callback_t */ 871 extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); 872 873 __ADDRESSABLE(__bpf_callback_fn); 874 875 /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ 876 asm ( 877 " .pushsection .data..ro_after_init,\"aw\",@progbits \n" 878 " .type cfi_bpf_subprog_hash,@object \n" 879 " .globl cfi_bpf_subprog_hash \n" 880 " .p2align 2, 0x0 \n" 881 "cfi_bpf_subprog_hash: \n" 882 " .long __kcfi_typeid___bpf_callback_fn \n" 883 " .size cfi_bpf_subprog_hash, 4 \n" 884 " .popsection \n" 885 ); 886 887 u32 cfi_get_func_hash(void *func) 888 { 889 u32 hash; 890 891 func -= cfi_get_offset(); 892 switch (cfi_mode) { 893 case CFI_FINEIBT: 894 func += 7; 895 break; 896 case CFI_KCFI: 897 func += 1; 898 break; 899 default: 900 return 0; 901 } 902 903 if (get_kernel_nofault(hash, func)) 904 return 0; 905 906 return hash; 907 } 908 #endif 909 910 #ifdef CONFIG_FINEIBT 911 912 static bool cfi_rand __ro_after_init = true; 913 static u32 cfi_seed __ro_after_init; 914 915 /* 916 * Re-hash the CFI hash with a boot-time seed while making sure the result is 917 * not a valid ENDBR instruction. 918 */ 919 static u32 cfi_rehash(u32 hash) 920 { 921 hash ^= cfi_seed; 922 while (unlikely(is_endbr(hash) || is_endbr(-hash))) { 923 bool lsb = hash & 1; 924 hash >>= 1; 925 if (lsb) 926 hash ^= 0x80200003; 927 } 928 return hash; 929 } 930 931 static __init int cfi_parse_cmdline(char *str) 932 { 933 if (!str) 934 return -EINVAL; 935 936 while (str) { 937 char *next = strchr(str, ','); 938 if (next) { 939 *next = 0; 940 next++; 941 } 942 943 if (!strcmp(str, "auto")) { 944 cfi_mode = CFI_DEFAULT; 945 } else if (!strcmp(str, "off")) { 946 cfi_mode = CFI_OFF; 947 cfi_rand = false; 948 } else if (!strcmp(str, "kcfi")) { 949 cfi_mode = CFI_KCFI; 950 } else if (!strcmp(str, "fineibt")) { 951 cfi_mode = CFI_FINEIBT; 952 } else if (!strcmp(str, "norand")) { 953 cfi_rand = false; 954 } else { 955 pr_err("Ignoring unknown cfi option (%s).", str); 956 } 957 958 str = next; 959 } 960 961 return 0; 962 } 963 early_param("cfi", cfi_parse_cmdline); 964 965 /* 966 * kCFI FineIBT 967 * 968 * __cfi_\func: __cfi_\func: 969 * movl $0x12345678,%eax // 5 endbr64 // 4 970 * nop subl $0x12345678,%r10d // 7 971 * nop jz 1f // 2 972 * nop ud2 // 2 973 * nop 1: nop // 1 974 * nop 975 * nop 976 * nop 977 * nop 978 * nop 979 * nop 980 * nop 981 * 982 * 983 * caller: caller: 984 * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 985 * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 986 * je 1f // 2 nop4 // 4 987 * ud2 // 2 988 * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 989 * 990 */ 991 992 asm( ".pushsection .rodata \n" 993 "fineibt_preamble_start: \n" 994 " endbr64 \n" 995 " subl $0x12345678, %r10d \n" 996 " je fineibt_preamble_end \n" 997 " ud2 \n" 998 " nop \n" 999 "fineibt_preamble_end: \n" 1000 ".popsection\n" 1001 ); 1002 1003 extern u8 fineibt_preamble_start[]; 1004 extern u8 fineibt_preamble_end[]; 1005 1006 #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) 1007 #define fineibt_preamble_hash 7 1008 1009 asm( 
".pushsection .rodata \n" 1010 "fineibt_caller_start: \n" 1011 " movl $0x12345678, %r10d \n" 1012 " sub $16, %r11 \n" 1013 ASM_NOP4 1014 "fineibt_caller_end: \n" 1015 ".popsection \n" 1016 ); 1017 1018 extern u8 fineibt_caller_start[]; 1019 extern u8 fineibt_caller_end[]; 1020 1021 #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) 1022 #define fineibt_caller_hash 2 1023 1024 #define fineibt_caller_jmp (fineibt_caller_size - 2) 1025 1026 static u32 decode_preamble_hash(void *addr) 1027 { 1028 u8 *p = addr; 1029 1030 /* b8 78 56 34 12 mov $0x12345678,%eax */ 1031 if (p[0] == 0xb8) 1032 return *(u32 *)(addr + 1); 1033 1034 return 0; /* invalid hash value */ 1035 } 1036 1037 static u32 decode_caller_hash(void *addr) 1038 { 1039 u8 *p = addr; 1040 1041 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ 1042 if (p[0] == 0x41 && p[1] == 0xba) 1043 return -*(u32 *)(addr + 2); 1044 1045 /* e8 0c 78 56 34 12 jmp.d8 +12 */ 1046 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) 1047 return -*(u32 *)(addr + 2); 1048 1049 return 0; /* invalid hash value */ 1050 } 1051 1052 /* .retpoline_sites */ 1053 static int cfi_disable_callers(s32 *start, s32 *end) 1054 { 1055 /* 1056 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate 1057 * in tact for later usage. Also see decode_caller_hash() and 1058 * cfi_rewrite_callers(). 1059 */ 1060 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; 1061 s32 *s; 1062 1063 for (s = start; s < end; s++) { 1064 void *addr = (void *)s + *s; 1065 u32 hash; 1066 1067 addr -= fineibt_caller_size; 1068 hash = decode_caller_hash(addr); 1069 if (!hash) /* nocfi callers */ 1070 continue; 1071 1072 text_poke_early(addr, jmp, 2); 1073 } 1074 1075 return 0; 1076 } 1077 1078 static int cfi_enable_callers(s32 *start, s32 *end) 1079 { 1080 /* 1081 * Re-enable kCFI, undo what cfi_disable_callers() did. 
1082 */ 1083 const u8 mov[] = { 0x41, 0xba }; 1084 s32 *s; 1085 1086 for (s = start; s < end; s++) { 1087 void *addr = (void *)s + *s; 1088 u32 hash; 1089 1090 addr -= fineibt_caller_size; 1091 hash = decode_caller_hash(addr); 1092 if (!hash) /* nocfi callers */ 1093 continue; 1094 1095 text_poke_early(addr, mov, 2); 1096 } 1097 1098 return 0; 1099 } 1100 1101 /* .cfi_sites */ 1102 static int cfi_rand_preamble(s32 *start, s32 *end) 1103 { 1104 s32 *s; 1105 1106 for (s = start; s < end; s++) { 1107 void *addr = (void *)s + *s; 1108 u32 hash; 1109 1110 hash = decode_preamble_hash(addr); 1111 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1112 addr, addr, 5, addr)) 1113 return -EINVAL; 1114 1115 hash = cfi_rehash(hash); 1116 text_poke_early(addr + 1, &hash, 4); 1117 } 1118 1119 return 0; 1120 } 1121 1122 static int cfi_rewrite_preamble(s32 *start, s32 *end) 1123 { 1124 s32 *s; 1125 1126 for (s = start; s < end; s++) { 1127 void *addr = (void *)s + *s; 1128 u32 hash; 1129 1130 hash = decode_preamble_hash(addr); 1131 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1132 addr, addr, 5, addr)) 1133 return -EINVAL; 1134 1135 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); 1136 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); 1137 text_poke_early(addr + fineibt_preamble_hash, &hash, 4); 1138 } 1139 1140 return 0; 1141 } 1142 1143 static void cfi_rewrite_endbr(s32 *start, s32 *end) 1144 { 1145 s32 *s; 1146 1147 for (s = start; s < end; s++) { 1148 void *addr = (void *)s + *s; 1149 1150 poison_endbr(addr+16, false); 1151 } 1152 } 1153 1154 /* .retpoline_sites */ 1155 static int cfi_rand_callers(s32 *start, s32 *end) 1156 { 1157 s32 *s; 1158 1159 for (s = start; s < end; s++) { 1160 void *addr = (void *)s + *s; 1161 u32 hash; 1162 1163 addr -= fineibt_caller_size; 1164 hash = decode_caller_hash(addr); 1165 if (hash) { 1166 hash = -cfi_rehash(hash); 1167 text_poke_early(addr + 2, &hash, 4); 1168 } 1169 } 1170 1171 return 0; 1172 } 1173 1174 static int cfi_rewrite_callers(s32 *start, s32 *end) 1175 { 1176 s32 *s; 1177 1178 for (s = start; s < end; s++) { 1179 void *addr = (void *)s + *s; 1180 u32 hash; 1181 1182 addr -= fineibt_caller_size; 1183 hash = decode_caller_hash(addr); 1184 if (hash) { 1185 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); 1186 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); 1187 text_poke_early(addr + fineibt_caller_hash, &hash, 4); 1188 } 1189 /* rely on apply_retpolines() */ 1190 } 1191 1192 return 0; 1193 } 1194 1195 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1196 s32 *start_cfi, s32 *end_cfi, bool builtin) 1197 { 1198 int ret; 1199 1200 if (WARN_ONCE(fineibt_preamble_size != 16, 1201 "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) 1202 return; 1203 1204 if (cfi_mode == CFI_DEFAULT) { 1205 cfi_mode = CFI_KCFI; 1206 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) 1207 cfi_mode = CFI_FINEIBT; 1208 } 1209 1210 /* 1211 * Rewrite the callers to not use the __cfi_ stubs, such that we might 1212 * rewrite them. This disables all CFI. If this succeeds but any of the 1213 * later stages fails, we're without CFI. 
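 * (i.e. the kernel then simply runs with no CFI checking at all, rather than
 * with a half-rewritten mix of kCFI and FineIBT)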
1214 */ 1215 ret = cfi_disable_callers(start_retpoline, end_retpoline); 1216 if (ret) 1217 goto err; 1218 1219 if (cfi_rand) { 1220 if (builtin) { 1221 cfi_seed = get_random_u32(); 1222 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); 1223 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); 1224 } 1225 1226 ret = cfi_rand_preamble(start_cfi, end_cfi); 1227 if (ret) 1228 goto err; 1229 1230 ret = cfi_rand_callers(start_retpoline, end_retpoline); 1231 if (ret) 1232 goto err; 1233 } 1234 1235 switch (cfi_mode) { 1236 case CFI_OFF: 1237 if (builtin) 1238 pr_info("Disabling CFI\n"); 1239 return; 1240 1241 case CFI_KCFI: 1242 ret = cfi_enable_callers(start_retpoline, end_retpoline); 1243 if (ret) 1244 goto err; 1245 1246 if (builtin) 1247 pr_info("Using kCFI\n"); 1248 return; 1249 1250 case CFI_FINEIBT: 1251 /* place the FineIBT preamble at func()-16 */ 1252 ret = cfi_rewrite_preamble(start_cfi, end_cfi); 1253 if (ret) 1254 goto err; 1255 1256 /* rewrite the callers to target func()-16 */ 1257 ret = cfi_rewrite_callers(start_retpoline, end_retpoline); 1258 if (ret) 1259 goto err; 1260 1261 /* now that nobody targets func()+0, remove ENDBR there */ 1262 cfi_rewrite_endbr(start_cfi, end_cfi); 1263 1264 if (builtin) 1265 pr_info("Using FineIBT CFI\n"); 1266 return; 1267 1268 default: 1269 break; 1270 } 1271 1272 err: 1273 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); 1274 } 1275 1276 static inline void poison_hash(void *addr) 1277 { 1278 *(u32 *)addr = 0; 1279 } 1280 1281 static void poison_cfi(void *addr) 1282 { 1283 switch (cfi_mode) { 1284 case CFI_FINEIBT: 1285 /* 1286 * __cfi_\func: 1287 * osp nopl (%rax) 1288 * subl $0, %r10d 1289 * jz 1f 1290 * ud2 1291 * 1: nop 1292 */ 1293 poison_endbr(addr, false); 1294 poison_hash(addr + fineibt_preamble_hash); 1295 break; 1296 1297 case CFI_KCFI: 1298 /* 1299 * __cfi_\func: 1300 * movl $0, %eax 1301 * .skip 11, 0x90 1302 */ 1303 poison_hash(addr + 1); 1304 break; 1305 1306 default: 1307 break; 1308 } 1309 } 1310 1311 #else 1312 1313 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1314 s32 *start_cfi, s32 *end_cfi, bool builtin) 1315 { 1316 } 1317 1318 #ifdef CONFIG_X86_KERNEL_IBT 1319 static void poison_cfi(void *addr) { } 1320 #endif 1321 1322 #endif 1323 1324 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1325 s32 *start_cfi, s32 *end_cfi) 1326 { 1327 return __apply_fineibt(start_retpoline, end_retpoline, 1328 start_cfi, end_cfi, 1329 /* .builtin = */ false); 1330 } 1331 1332 #ifdef CONFIG_SMP 1333 static void alternatives_smp_lock(const s32 *start, const s32 *end, 1334 u8 *text, u8 *text_end) 1335 { 1336 const s32 *poff; 1337 1338 for (poff = start; poff < end; poff++) { 1339 u8 *ptr = (u8 *)poff + *poff; 1340 1341 if (!*poff || ptr < text || ptr >= text_end) 1342 continue; 1343 /* turn DS segment override prefix into lock prefix */ 1344 if (*ptr == 0x3e) 1345 text_poke(ptr, ((unsigned char []){0xf0}), 1); 1346 } 1347 } 1348 1349 static void alternatives_smp_unlock(const s32 *start, const s32 *end, 1350 u8 *text, u8 *text_end) 1351 { 1352 const s32 *poff; 1353 1354 for (poff = start; poff < end; poff++) { 1355 u8 *ptr = (u8 *)poff + *poff; 1356 1357 if (!*poff || ptr < text || ptr >= text_end) 1358 continue; 1359 /* turn lock prefix into DS segment override prefix */ 1360 if (*ptr == 0xf0) 1361 text_poke(ptr, ((unsigned char []){0x3E}), 1); 1362 } 1363 } 1364 1365 struct smp_alt_module { 1366 /* what is this ??? 
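 * Bookkeeping for every module (and the core kernel) that registered
 * LOCK-prefix fixup sites, so alternatives_enable_smp() can turn the
 * DS-prefix padding back into LOCK prefixes when the system leaves
 * uniprocessor mode.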
*/ 1367 struct module *mod; 1368 char *name; 1369 1370 /* ptrs to lock prefixes */ 1371 const s32 *locks; 1372 const s32 *locks_end; 1373 1374 /* .text segment, needed to avoid patching init code ;) */ 1375 u8 *text; 1376 u8 *text_end; 1377 1378 struct list_head next; 1379 }; 1380 static LIST_HEAD(smp_alt_modules); 1381 static bool uniproc_patched = false; /* protected by text_mutex */ 1382 1383 void __init_or_module alternatives_smp_module_add(struct module *mod, 1384 char *name, 1385 void *locks, void *locks_end, 1386 void *text, void *text_end) 1387 { 1388 struct smp_alt_module *smp; 1389 1390 mutex_lock(&text_mutex); 1391 if (!uniproc_patched) 1392 goto unlock; 1393 1394 if (num_possible_cpus() == 1) 1395 /* Don't bother remembering, we'll never have to undo it. */ 1396 goto smp_unlock; 1397 1398 smp = kzalloc(sizeof(*smp), GFP_KERNEL); 1399 if (NULL == smp) 1400 /* we'll run the (safe but slow) SMP code then ... */ 1401 goto unlock; 1402 1403 smp->mod = mod; 1404 smp->name = name; 1405 smp->locks = locks; 1406 smp->locks_end = locks_end; 1407 smp->text = text; 1408 smp->text_end = text_end; 1409 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", 1410 smp->locks, smp->locks_end, 1411 smp->text, smp->text_end, smp->name); 1412 1413 list_add_tail(&smp->next, &smp_alt_modules); 1414 smp_unlock: 1415 alternatives_smp_unlock(locks, locks_end, text, text_end); 1416 unlock: 1417 mutex_unlock(&text_mutex); 1418 } 1419 1420 void __init_or_module alternatives_smp_module_del(struct module *mod) 1421 { 1422 struct smp_alt_module *item; 1423 1424 mutex_lock(&text_mutex); 1425 list_for_each_entry(item, &smp_alt_modules, next) { 1426 if (mod != item->mod) 1427 continue; 1428 list_del(&item->next); 1429 kfree(item); 1430 break; 1431 } 1432 mutex_unlock(&text_mutex); 1433 } 1434 1435 void alternatives_enable_smp(void) 1436 { 1437 struct smp_alt_module *mod; 1438 1439 /* Why bother if there are no other CPUs? */ 1440 BUG_ON(num_possible_cpus() == 1); 1441 1442 mutex_lock(&text_mutex); 1443 1444 if (uniproc_patched) { 1445 pr_info("switching to SMP code\n"); 1446 BUG_ON(num_online_cpus() != 1); 1447 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 1448 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 1449 list_for_each_entry(mod, &smp_alt_modules, next) 1450 alternatives_smp_lock(mod->locks, mod->locks_end, 1451 mod->text, mod->text_end); 1452 uniproc_patched = false; 1453 } 1454 mutex_unlock(&text_mutex); 1455 } 1456 1457 /* 1458 * Return 1 if the address range is reserved for SMP-alternatives. 1459 * Must hold text_mutex. 1460 */ 1461 int alternatives_text_reserved(void *start, void *end) 1462 { 1463 struct smp_alt_module *mod; 1464 const s32 *poff; 1465 u8 *text_start = start; 1466 u8 *text_end = end; 1467 1468 lockdep_assert_held(&text_mutex); 1469 1470 list_for_each_entry(mod, &smp_alt_modules, next) { 1471 if (mod->text > text_end || mod->text_end < text_start) 1472 continue; 1473 for (poff = mod->locks; poff < mod->locks_end; poff++) { 1474 const u8 *ptr = (const u8 *)poff + *poff; 1475 1476 if (text_start <= ptr && text_end > ptr) 1477 return 1; 1478 } 1479 } 1480 1481 return 0; 1482 } 1483 #endif /* CONFIG_SMP */ 1484 1485 #ifdef CONFIG_PARAVIRT 1486 1487 /* Use this to add nops to a buffer, then text_poke the whole buffer. 
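 * Unlike add_nop() above, runs longer than ASM_NOP_MAX are filled with
 * several consecutive NOPs rather than a JMP over INT3 padding.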
*/ 1488 static void __init_or_module add_nops(void *insns, unsigned int len) 1489 { 1490 while (len > 0) { 1491 unsigned int noplen = len; 1492 if (noplen > ASM_NOP_MAX) 1493 noplen = ASM_NOP_MAX; 1494 memcpy(insns, x86_nops[noplen], noplen); 1495 insns += noplen; 1496 len -= noplen; 1497 } 1498 } 1499 1500 void __init_or_module apply_paravirt(struct paravirt_patch_site *start, 1501 struct paravirt_patch_site *end) 1502 { 1503 struct paravirt_patch_site *p; 1504 char insn_buff[MAX_PATCH_LEN]; 1505 1506 for (p = start; p < end; p++) { 1507 unsigned int used; 1508 1509 BUG_ON(p->len > MAX_PATCH_LEN); 1510 /* prep the buffer with the original instructions */ 1511 memcpy(insn_buff, p->instr, p->len); 1512 used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); 1513 1514 BUG_ON(used > p->len); 1515 1516 /* Pad the rest with nops */ 1517 add_nops(insn_buff + used, p->len - used); 1518 text_poke_early(p->instr, insn_buff, p->len); 1519 } 1520 } 1521 extern struct paravirt_patch_site __start_parainstructions[], 1522 __stop_parainstructions[]; 1523 #endif /* CONFIG_PARAVIRT */ 1524 1525 /* 1526 * Self-test for the INT3 based CALL emulation code. 1527 * 1528 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up 1529 * properly and that there is a stack gap between the INT3 frame and the 1530 * previous context. Without this gap doing a virtual PUSH on the interrupted 1531 * stack would corrupt the INT3 IRET frame. 1532 * 1533 * See entry_{32,64}.S for more details. 1534 */ 1535 1536 /* 1537 * We define the int3_magic() function in assembly to control the calling 1538 * convention such that we can 'call' it from assembly. 1539 */ 1540 1541 extern void int3_magic(unsigned int *ptr); /* defined in asm */ 1542 1543 asm ( 1544 " .pushsection .init.text, \"ax\", @progbits\n" 1545 " .type int3_magic, @function\n" 1546 "int3_magic:\n" 1547 ANNOTATE_NOENDBR 1548 " movl $1, (%" _ASM_ARG1 ")\n" 1549 ASM_RET 1550 " .size int3_magic, .-int3_magic\n" 1551 " .popsection\n" 1552 ); 1553 1554 extern void int3_selftest_ip(void); /* defined in asm below */ 1555 1556 static int __init 1557 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) 1558 { 1559 unsigned long selftest = (unsigned long)&int3_selftest_ip; 1560 struct die_args *args = data; 1561 struct pt_regs *regs = args->regs; 1562 1563 OPTIMIZER_HIDE_VAR(selftest); 1564 1565 if (!regs || user_mode(regs)) 1566 return NOTIFY_DONE; 1567 1568 if (val != DIE_INT3) 1569 return NOTIFY_DONE; 1570 1571 if (regs->ip - INT3_INSN_SIZE != selftest) 1572 return NOTIFY_DONE; 1573 1574 int3_emulate_call(regs, (unsigned long)&int3_magic); 1575 return NOTIFY_STOP; 1576 } 1577 1578 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ 1579 static noinline void __init int3_selftest(void) 1580 { 1581 static __initdata struct notifier_block int3_exception_nb = { 1582 .notifier_call = int3_exception_notify, 1583 .priority = INT_MAX-1, /* last */ 1584 }; 1585 unsigned int val = 0; 1586 1587 BUG_ON(register_die_notifier(&int3_exception_nb)); 1588 1589 /* 1590 * Basically: int3_magic(&val); but really complicated :-) 1591 * 1592 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb 1593 * notifier above will emulate CALL for us. 
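 * The INT3 plus four single-byte NOPs span CALL_INSN_SIZE bytes, so the
 * emulated call returns right after the NOPs, exactly where a real
 * CALL int3_magic would have returned.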
1594 */ 1595 asm volatile ("int3_selftest_ip:\n\t" 1596 ANNOTATE_NOENDBR 1597 " int3; nop; nop; nop; nop\n\t" 1598 : ASM_CALL_CONSTRAINT 1599 : __ASM_SEL_RAW(a, D) (&val) 1600 : "memory"); 1601 1602 BUG_ON(val != 1); 1603 1604 unregister_die_notifier(&int3_exception_nb); 1605 } 1606 1607 static __initdata int __alt_reloc_selftest_addr; 1608 1609 extern void __init __alt_reloc_selftest(void *arg); 1610 __visible noinline void __init __alt_reloc_selftest(void *arg) 1611 { 1612 WARN_ON(arg != &__alt_reloc_selftest_addr); 1613 } 1614 1615 static noinline void __init alt_reloc_selftest(void) 1616 { 1617 /* 1618 * Tests apply_relocation(). 1619 * 1620 * This has a relative immediate (CALL) in a place other than the first 1621 * instruction and additionally on x86_64 we get a RIP-relative LEA: 1622 * 1623 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c 1624 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 1625 * 1626 * Getting this wrong will either crash and burn or tickle the WARN 1627 * above. 1628 */ 1629 asm_inline volatile ( 1630 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) 1631 : /* output */ 1632 : [mem] "m" (__alt_reloc_selftest_addr) 1633 : _ASM_ARG1 1634 ); 1635 } 1636 1637 void __init alternative_instructions(void) 1638 { 1639 int3_selftest(); 1640 1641 /* 1642 * The patching is not fully atomic, so try to avoid local 1643 * interruptions that might execute the to be patched code. 1644 * Other CPUs are not running. 1645 */ 1646 stop_nmi(); 1647 1648 /* 1649 * Don't stop machine check exceptions while patching. 1650 * MCEs only happen when something got corrupted and in this 1651 * case we must do something about the corruption. 1652 * Ignoring it is worse than an unlikely patching race. 1653 * Also machine checks tend to be broadcast and if one CPU 1654 * goes into machine check the others follow quickly, so we don't 1655 * expect a machine check to cause undue problems during to code 1656 * patching. 1657 */ 1658 1659 /* 1660 * Paravirt patching and alternative patching can be combined to 1661 * replace a function call with a short direct code sequence (e.g. 1662 * by setting a constant return value instead of doing that in an 1663 * external function). 1664 * In order to make this work the following sequence is required: 1665 * 1. set (artificial) features depending on used paravirt 1666 * functions which can later influence alternative patching 1667 * 2. apply paravirt patching (generally replacing an indirect 1668 * function call with a direct one) 1669 * 3. apply alternative patching (e.g. replacing a direct function 1670 * call with a custom code sequence) 1671 * Doing paravirt patching after alternative patching would clobber 1672 * the optimization of the custom code with a function call again. 1673 */ 1674 paravirt_set_cap(); 1675 1676 /* 1677 * First patch paravirt functions, such that we overwrite the indirect 1678 * call with the direct call. 1679 */ 1680 apply_paravirt(__parainstructions, __parainstructions_end); 1681 1682 __apply_fineibt(__retpoline_sites, __retpoline_sites_end, 1683 __cfi_sites, __cfi_sites_end, true); 1684 1685 /* 1686 * Rewrite the retpolines, must be done before alternatives since 1687 * those can rewrite the retpoline thunks. 
1688 */ 1689 apply_retpolines(__retpoline_sites, __retpoline_sites_end); 1690 apply_returns(__return_sites, __return_sites_end); 1691 1692 /* 1693 * Then patch alternatives, such that those paravirt calls that are in 1694 * alternatives can be overwritten by their immediate fragments. 1695 */ 1696 apply_alternatives(__alt_instructions, __alt_instructions_end); 1697 1698 /* 1699 * Now all calls are established. Apply the call thunks if 1700 * required. 1701 */ 1702 callthunks_patch_builtin_calls(); 1703 1704 /* 1705 * Seal all functions that do not have their address taken. 1706 */ 1707 apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); 1708 1709 #ifdef CONFIG_SMP 1710 /* Patch to UP if other cpus not imminent. */ 1711 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { 1712 uniproc_patched = true; 1713 alternatives_smp_module_add(NULL, "core kernel", 1714 __smp_locks, __smp_locks_end, 1715 _text, _etext); 1716 } 1717 1718 if (!uniproc_patched || num_possible_cpus() == 1) { 1719 free_init_pages("SMP alternatives", 1720 (unsigned long)__smp_locks, 1721 (unsigned long)__smp_locks_end); 1722 } 1723 #endif 1724 1725 restart_nmi(); 1726 alternatives_patched = 1; 1727 1728 alt_reloc_selftest(); 1729 } 1730 1731 /** 1732 * text_poke_early - Update instructions on a live kernel at boot time 1733 * @addr: address to modify 1734 * @opcode: source of the copy 1735 * @len: length to copy 1736 * 1737 * When you use this code to patch more than one byte of an instruction 1738 * you need to make sure that other CPUs cannot execute this code in parallel. 1739 * Also no thread must be currently preempted in the middle of these 1740 * instructions. And on the local CPU you need to be protected against NMI or 1741 * MCE handlers seeing an inconsistent instruction while you patch. 1742 */ 1743 void __init_or_module text_poke_early(void *addr, const void *opcode, 1744 size_t len) 1745 { 1746 unsigned long flags; 1747 1748 if (boot_cpu_has(X86_FEATURE_NX) && 1749 is_module_text_address((unsigned long)addr)) { 1750 /* 1751 * Modules text is marked initially as non-executable, so the 1752 * code cannot be running and speculative code-fetches are 1753 * prevented. Just change the code. 1754 */ 1755 memcpy(addr, opcode, len); 1756 } else { 1757 local_irq_save(flags); 1758 memcpy(addr, opcode, len); 1759 local_irq_restore(flags); 1760 sync_core(); 1761 1762 /* 1763 * Could also do a CLFLUSH here to speed up CPU recovery; but 1764 * that causes hangs on some VIA CPUs. 1765 */ 1766 } 1767 } 1768 1769 typedef struct { 1770 struct mm_struct *mm; 1771 } temp_mm_state_t; 1772 1773 /* 1774 * Using a temporary mm allows to set temporary mappings that are not accessible 1775 * by other CPUs. Such mappings are needed to perform sensitive memory writes 1776 * that override the kernel memory protections (e.g., W^X), without exposing the 1777 * temporary page-table mappings that are required for these write operations to 1778 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the 1779 * mapping is torn down. 1780 * 1781 * Context: The temporary mm needs to be used exclusively by a single core. To 1782 * harden security IRQs must be disabled while the temporary mm is 1783 * loaded, thereby preventing interrupt handler bugs from overriding 1784 * the kernel memory protection. 
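 * In this file, __text_poke() is the only caller of the pair and wraps it
 * inside local_irq_save()/local_irq_restore().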
1785 */ 1786 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) 1787 { 1788 temp_mm_state_t temp_state; 1789 1790 lockdep_assert_irqs_disabled(); 1791 1792 /* 1793 * Make sure not to be in TLB lazy mode, as otherwise we'll end up 1794 * with a stale address space WITHOUT being in lazy mode after 1795 * restoring the previous mm. 1796 */ 1797 if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) 1798 leave_mm(smp_processor_id()); 1799 1800 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); 1801 switch_mm_irqs_off(NULL, mm, current); 1802 1803 /* 1804 * If breakpoints are enabled, disable them while the temporary mm is 1805 * used. Userspace might set up watchpoints on addresses that are used 1806 * in the temporary mm, which would lead to wrong signals being sent or 1807 * crashes. 1808 * 1809 * Note that breakpoints are not disabled selectively, which also causes 1810 * kernel breakpoints (e.g., perf's) to be disabled. This might be 1811 * undesirable, but still seems reasonable as the code that runs in the 1812 * temporary mm should be short. 1813 */ 1814 if (hw_breakpoint_active()) 1815 hw_breakpoint_disable(); 1816 1817 return temp_state; 1818 } 1819 1820 static inline void unuse_temporary_mm(temp_mm_state_t prev_state) 1821 { 1822 lockdep_assert_irqs_disabled(); 1823 switch_mm_irqs_off(NULL, prev_state.mm, current); 1824 1825 /* 1826 * Restore the breakpoints if they were disabled before the temporary mm 1827 * was loaded. 1828 */ 1829 if (hw_breakpoint_active()) 1830 hw_breakpoint_restore(); 1831 } 1832 1833 __ro_after_init struct mm_struct *poking_mm; 1834 __ro_after_init unsigned long poking_addr; 1835 1836 static void text_poke_memcpy(void *dst, const void *src, size_t len) 1837 { 1838 memcpy(dst, src, len); 1839 } 1840 1841 static void text_poke_memset(void *dst, const void *src, size_t len) 1842 { 1843 int c = *(const int *)src; 1844 1845 memset(dst, c, len); 1846 } 1847 1848 typedef void text_poke_f(void *dst, const void *src, size_t len); 1849 1850 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) 1851 { 1852 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; 1853 struct page *pages[2] = {NULL}; 1854 temp_mm_state_t prev; 1855 unsigned long flags; 1856 pte_t pte, *ptep; 1857 spinlock_t *ptl; 1858 pgprot_t pgprot; 1859 1860 /* 1861 * While boot memory allocator is running we cannot use struct pages as 1862 * they are not yet initialized. There is no way to recover. 1863 */ 1864 BUG_ON(!after_bootmem); 1865 1866 if (!core_kernel_text((unsigned long)addr)) { 1867 pages[0] = vmalloc_to_page(addr); 1868 if (cross_page_boundary) 1869 pages[1] = vmalloc_to_page(addr + PAGE_SIZE); 1870 } else { 1871 pages[0] = virt_to_page(addr); 1872 WARN_ON(!PageReserved(pages[0])); 1873 if (cross_page_boundary) 1874 pages[1] = virt_to_page(addr + PAGE_SIZE); 1875 } 1876 /* 1877 * If something went wrong, crash and burn since recovery paths are not 1878 * implemented. 1879 */ 1880 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); 1881 1882 /* 1883 * Map the page without the global bit, as TLB flushing is done with 1884 * flush_tlb_mm_range(), which is intended for non-global PTEs. 1885 */ 1886 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); 1887 1888 /* 1889 * The lock is not really needed, but this allows to avoid open-coding. 1890 */ 1891 ptep = get_locked_pte(poking_mm, poking_addr, &ptl); 1892 1893 /* 1894 * This must not fail; preallocated in poking_init(). 
 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	func((u8 *)poking_addr + offset_in_page(addr), src, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as is the case at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	if (func == text_poke_memcpy) {
		/*
		 * If the text does not match what we just wrote then something is
		 * fundamentally screwy; there's nothing we can really do about that.
		 */
		BUG_ON(memcmp(addr, src, len));
	}

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
1992 */ 1993 void *text_poke_kgdb(void *addr, const void *opcode, size_t len) 1994 { 1995 return __text_poke(text_poke_memcpy, addr, opcode, len); 1996 } 1997 1998 void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, 1999 bool core_ok) 2000 { 2001 unsigned long start = (unsigned long)addr; 2002 size_t patched = 0; 2003 2004 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) 2005 return NULL; 2006 2007 while (patched < len) { 2008 unsigned long ptr = start + patched; 2009 size_t s; 2010 2011 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 2012 2013 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); 2014 patched += s; 2015 } 2016 return addr; 2017 } 2018 2019 /** 2020 * text_poke_copy - Copy instructions into (an unused part of) RX memory 2021 * @addr: address to modify 2022 * @opcode: source of the copy 2023 * @len: length to copy, could be more than 2x PAGE_SIZE 2024 * 2025 * Not safe against concurrent execution; useful for JITs to dump 2026 * new code blocks into unused regions of RX memory. Can be used in 2027 * conjunction with synchronize_rcu_tasks() to wait for existing 2028 * execution to quiesce after having made sure no existing functions 2029 * pointers are live. 2030 */ 2031 void *text_poke_copy(void *addr, const void *opcode, size_t len) 2032 { 2033 mutex_lock(&text_mutex); 2034 addr = text_poke_copy_locked(addr, opcode, len, false); 2035 mutex_unlock(&text_mutex); 2036 return addr; 2037 } 2038 2039 /** 2040 * text_poke_set - memset into (an unused part of) RX memory 2041 * @addr: address to modify 2042 * @c: the byte to fill the area with 2043 * @len: length to copy, could be more than 2x PAGE_SIZE 2044 * 2045 * This is useful to overwrite unused regions of RX memory with illegal 2046 * instructions. 2047 */ 2048 void *text_poke_set(void *addr, int c, size_t len) 2049 { 2050 unsigned long start = (unsigned long)addr; 2051 size_t patched = 0; 2052 2053 if (WARN_ON_ONCE(core_kernel_text(start))) 2054 return NULL; 2055 2056 mutex_lock(&text_mutex); 2057 while (patched < len) { 2058 unsigned long ptr = start + patched; 2059 size_t s; 2060 2061 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 2062 2063 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); 2064 patched += s; 2065 } 2066 mutex_unlock(&text_mutex); 2067 return addr; 2068 } 2069 2070 static void do_sync_core(void *info) 2071 { 2072 sync_core(); 2073 } 2074 2075 void text_poke_sync(void) 2076 { 2077 on_each_cpu(do_sync_core, NULL, 1); 2078 } 2079 2080 /* 2081 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of 2082 * this thing. When len == 6 everything is prefixed with 0x0f and we map 2083 * opcode to Jcc.d8, using len to distinguish. 
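 * For example (relying on text_poke_loc_init(), which is not shown here):
 * a six-byte 'jnz rel32' (0f 85 xx xx xx xx) is recorded with len == 6 and
 * the Jcc.d8 opcode 0x75, and text_poke_bp_batch() re-emits the leading
 * 0x0f when it writes the instruction tail.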
2084 */ 2085 struct text_poke_loc { 2086 /* addr := _stext + rel_addr */ 2087 s32 rel_addr; 2088 s32 disp; 2089 u8 len; 2090 u8 opcode; 2091 const u8 text[POKE_MAX_OPCODE_SIZE]; 2092 /* see text_poke_bp_batch() */ 2093 u8 old; 2094 }; 2095 2096 struct bp_patching_desc { 2097 struct text_poke_loc *vec; 2098 int nr_entries; 2099 atomic_t refs; 2100 }; 2101 2102 static struct bp_patching_desc bp_desc; 2103 2104 static __always_inline 2105 struct bp_patching_desc *try_get_desc(void) 2106 { 2107 struct bp_patching_desc *desc = &bp_desc; 2108 2109 if (!raw_atomic_inc_not_zero(&desc->refs)) 2110 return NULL; 2111 2112 return desc; 2113 } 2114 2115 static __always_inline void put_desc(void) 2116 { 2117 struct bp_patching_desc *desc = &bp_desc; 2118 2119 smp_mb__before_atomic(); 2120 raw_atomic_dec(&desc->refs); 2121 } 2122 2123 static __always_inline void *text_poke_addr(struct text_poke_loc *tp) 2124 { 2125 return _stext + tp->rel_addr; 2126 } 2127 2128 static __always_inline int patch_cmp(const void *key, const void *elt) 2129 { 2130 struct text_poke_loc *tp = (struct text_poke_loc *) elt; 2131 2132 if (key < text_poke_addr(tp)) 2133 return -1; 2134 if (key > text_poke_addr(tp)) 2135 return 1; 2136 return 0; 2137 } 2138 2139 noinstr int poke_int3_handler(struct pt_regs *regs) 2140 { 2141 struct bp_patching_desc *desc; 2142 struct text_poke_loc *tp; 2143 int ret = 0; 2144 void *ip; 2145 2146 if (user_mode(regs)) 2147 return 0; 2148 2149 /* 2150 * Having observed our INT3 instruction, we now must observe 2151 * bp_desc with non-zero refcount: 2152 * 2153 * bp_desc.refs = 1 INT3 2154 * WMB RMB 2155 * write INT3 if (bp_desc.refs != 0) 2156 */ 2157 smp_rmb(); 2158 2159 desc = try_get_desc(); 2160 if (!desc) 2161 return 0; 2162 2163 /* 2164 * Discount the INT3. See text_poke_bp_batch(). 2165 */ 2166 ip = (void *) regs->ip - INT3_INSN_SIZE; 2167 2168 /* 2169 * Skip the binary search if there is a single member in the vector. 2170 */ 2171 if (unlikely(desc->nr_entries > 1)) { 2172 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, 2173 sizeof(struct text_poke_loc), 2174 patch_cmp); 2175 if (!tp) 2176 goto out_put; 2177 } else { 2178 tp = desc->vec; 2179 if (text_poke_addr(tp) != ip) 2180 goto out_put; 2181 } 2182 2183 ip += tp->len; 2184 2185 switch (tp->opcode) { 2186 case INT3_INSN_OPCODE: 2187 /* 2188 * Someone poked an explicit INT3, they'll want to handle it, 2189 * do not consume. 2190 */ 2191 goto out_put; 2192 2193 case RET_INSN_OPCODE: 2194 int3_emulate_ret(regs); 2195 break; 2196 2197 case CALL_INSN_OPCODE: 2198 int3_emulate_call(regs, (long)ip + tp->disp); 2199 break; 2200 2201 case JMP32_INSN_OPCODE: 2202 case JMP8_INSN_OPCODE: 2203 int3_emulate_jmp(regs, (long)ip + tp->disp); 2204 break; 2205 2206 case 0x70 ... 0x7f: /* Jcc */ 2207 int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); 2208 break; 2209 2210 default: 2211 BUG(); 2212 } 2213 2214 ret = 1; 2215 2216 out_put: 2217 put_desc(); 2218 return ret; 2219 } 2220 2221 #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) 2222 static struct text_poke_loc tp_vec[TP_VEC_MAX]; 2223 static int tp_vec_nr; 2224 2225 /** 2226 * text_poke_bp_batch() -- update instructions on live kernel on SMP 2227 * @tp: vector of instructions to patch 2228 * @nr_entries: number of entries in the vector 2229 * 2230 * Modify multi-byte instruction by using int3 breakpoint on SMP. 2231 * We completely avoid stop_machine() here, and achieve the 2232 * synchronization using int3 breakpoint. 
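 *
 * Callers do not normally invoke this directly: updates are queued through
 * text_poke_queue() and flushed with text_poke_finish(), or a single site
 * is patched with text_poke_bp(). A minimal sketch, with text_mutex held
 * (site[], insn[] and insn_len[] are hypothetical):
 *
 *	for (i = 0; i < nr_sites; i++)
 *		text_poke_queue(site[i], insn[i], insn_len[i], NULL);
 *	text_poke_finish();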
2233 *
2234 * The way it is done:
2235 * - For each entry in the vector:
2236 * - add an int3 trap to the address that will be patched
2237 * - sync cores
2238 * - For each entry in the vector:
2239 * - update all but the first byte of the patched range
2240 * - sync cores
2241 * - For each entry in the vector:
2242 * - replace the first byte (int3) with the first byte of the
2243 * replacement opcode
2244 * - sync cores
2245 */
2246 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2247 {
2248 unsigned char int3 = INT3_INSN_OPCODE;
2249 unsigned int i;
2250 int do_sync;
2251
2252 lockdep_assert_held(&text_mutex);
2253
2254 bp_desc.vec = tp;
2255 bp_desc.nr_entries = nr_entries;
2256
2257 /*
2258 * Corresponds to the implicit memory barrier in try_get_desc() to
2259 * ensure reading a non-zero refcount provides up-to-date bp_desc data.
2260 */
2261 atomic_set_release(&bp_desc.refs, 1);
2262
2263 /*
2264 * Function tracing can enable thousands of places that need to be
2265 * updated. This can take quite some time, and with full kernel debugging
2266 * enabled, this could cause the softlockup watchdog to trigger.
2267 * This function gets called every 256 entries added to be patched.
2268 * Call cond_resched() here to make sure that other tasks can get scheduled
2269 * while processing all the functions being patched.
2270 */
2271 cond_resched();
2272
2273 /*
2274 * Corresponding read barrier in the int3 notifier for making sure the
2275 * nr_entries and handler are correctly ordered wrt. patching.
2276 */
2277 smp_wmb();
2278
2279 /*
2280 * First step: add an int3 trap to the address that will be patched.
2281 */
2282 for (i = 0; i < nr_entries; i++) {
2283 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2284 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2285 }
2286
2287 text_poke_sync();
2288
2289 /*
2290 * Second step: update all but the first byte of the patched range.
2291 */
2292 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2293 u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2294 u8 _new[POKE_MAX_OPCODE_SIZE+1];
2295 const u8 *new = tp[i].text;
2296 int len = tp[i].len;
2297
2298 if (len - INT3_INSN_SIZE > 0) {
2299 memcpy(old + INT3_INSN_SIZE,
2300 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2301 len - INT3_INSN_SIZE);
2302
2303 if (len == 6) {
2304 _new[0] = 0x0f;
2305 memcpy(_new + 1, new, 5);
2306 new = _new;
2307 }
2308
2309 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2310 new + INT3_INSN_SIZE,
2311 len - INT3_INSN_SIZE);
2312
2313 do_sync++;
2314 }
2315
2316 /*
2317 * Emit a perf event to record the text poke, primarily to
2318 * support Intel PT decoding which must walk the executable code
2319 * to reconstruct the trace. The flow up to here is:
2320 * - write INT3 byte
2321 * - IPI-SYNC
2322 * - write instruction tail
2323 * At this point the actual control flow will be through the
2324 * INT3 and handler and not hit the old or new instruction.
2325 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2326 * can still be decoded. Subsequently:
2327 * - emit RECORD_TEXT_POKE with the new instruction
2328 * - IPI-SYNC
2329 * - write first byte
2330 * - IPI-SYNC
2331 * So before the text poke event timestamp, the decoder will see
2332 * either the old instruction flow or FUP/TIP of INT3. After the
2333 * text poke event timestamp, the decoder will see either the
2334 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2335 * use the timestamp as the point at which to modify the
2336 * executable code.
2337 * The old instruction is recorded so that the event can be 2338 * processed forwards or backwards. 2339 */ 2340 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); 2341 } 2342 2343 if (do_sync) { 2344 /* 2345 * According to Intel, this core syncing is very likely 2346 * not necessary and we'd be safe even without it. But 2347 * better safe than sorry (plus there's not only Intel). 2348 */ 2349 text_poke_sync(); 2350 } 2351 2352 /* 2353 * Third step: replace the first byte (int3) by the first byte of 2354 * replacing opcode. 2355 */ 2356 for (do_sync = 0, i = 0; i < nr_entries; i++) { 2357 u8 byte = tp[i].text[0]; 2358 2359 if (tp[i].len == 6) 2360 byte = 0x0f; 2361 2362 if (byte == INT3_INSN_OPCODE) 2363 continue; 2364 2365 text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); 2366 do_sync++; 2367 } 2368 2369 if (do_sync) 2370 text_poke_sync(); 2371 2372 /* 2373 * Remove and wait for refs to be zero. 2374 */ 2375 if (!atomic_dec_and_test(&bp_desc.refs)) 2376 atomic_cond_read_acquire(&bp_desc.refs, !VAL); 2377 } 2378 2379 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, 2380 const void *opcode, size_t len, const void *emulate) 2381 { 2382 struct insn insn; 2383 int ret, i = 0; 2384 2385 if (len == 6) 2386 i = 1; 2387 memcpy((void *)tp->text, opcode+i, len-i); 2388 if (!emulate) 2389 emulate = opcode; 2390 2391 ret = insn_decode_kernel(&insn, emulate); 2392 BUG_ON(ret < 0); 2393 2394 tp->rel_addr = addr - (void *)_stext; 2395 tp->len = len; 2396 tp->opcode = insn.opcode.bytes[0]; 2397 2398 if (is_jcc32(&insn)) { 2399 /* 2400 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. 2401 */ 2402 tp->opcode = insn.opcode.bytes[1] - 0x10; 2403 } 2404 2405 switch (tp->opcode) { 2406 case RET_INSN_OPCODE: 2407 case JMP32_INSN_OPCODE: 2408 case JMP8_INSN_OPCODE: 2409 /* 2410 * Control flow instructions without implied execution of the 2411 * next instruction can be padded with INT3. 2412 */ 2413 for (i = insn.length; i < len; i++) 2414 BUG_ON(tp->text[i] != INT3_INSN_OPCODE); 2415 break; 2416 2417 default: 2418 BUG_ON(len != insn.length); 2419 } 2420 2421 switch (tp->opcode) { 2422 case INT3_INSN_OPCODE: 2423 case RET_INSN_OPCODE: 2424 break; 2425 2426 case CALL_INSN_OPCODE: 2427 case JMP32_INSN_OPCODE: 2428 case JMP8_INSN_OPCODE: 2429 case 0x70 ... 0x7f: /* Jcc */ 2430 tp->disp = insn.immediate.value; 2431 break; 2432 2433 default: /* assume NOP */ 2434 switch (len) { 2435 case 2: /* NOP2 -- emulate as JMP8+0 */ 2436 BUG_ON(memcmp(emulate, x86_nops[len], len)); 2437 tp->opcode = JMP8_INSN_OPCODE; 2438 tp->disp = 0; 2439 break; 2440 2441 case 5: /* NOP5 -- emulate as JMP32+0 */ 2442 BUG_ON(memcmp(emulate, x86_nops[len], len)); 2443 tp->opcode = JMP32_INSN_OPCODE; 2444 tp->disp = 0; 2445 break; 2446 2447 default: /* unknown instruction */ 2448 BUG(); 2449 } 2450 break; 2451 } 2452 } 2453 2454 /* 2455 * We hard rely on the tp_vec being ordered; ensure this is so by flushing 2456 * early if needed. 
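 *
 * The ordering matters because poke_int3_handler() bsearch()es tp_vec by
 * address. An out-of-order @addr passed to text_poke_queue() (or the NULL
 * passed by text_poke_finish()) therefore flushes the pending batch before
 * anything new is queued. Expected pattern (addresses and the elided
 * arguments are illustrative):
 *
 *	text_poke_queue(addr_lo,  ...);		queued
 *	text_poke_queue(addr_hi,  ...);		queued, addr_hi > addr_lo
 *	text_poke_queue(addr_mid, ...);		addr_mid < addr_hi: flush, then queue
 *	text_poke_finish();			flush the remainder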
2457 */ 2458 static bool tp_order_fail(void *addr) 2459 { 2460 struct text_poke_loc *tp; 2461 2462 if (!tp_vec_nr) 2463 return false; 2464 2465 if (!addr) /* force */ 2466 return true; 2467 2468 tp = &tp_vec[tp_vec_nr - 1]; 2469 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) 2470 return true; 2471 2472 return false; 2473 } 2474 2475 static void text_poke_flush(void *addr) 2476 { 2477 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { 2478 text_poke_bp_batch(tp_vec, tp_vec_nr); 2479 tp_vec_nr = 0; 2480 } 2481 } 2482 2483 void text_poke_finish(void) 2484 { 2485 text_poke_flush(NULL); 2486 } 2487 2488 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) 2489 { 2490 struct text_poke_loc *tp; 2491 2492 text_poke_flush(addr); 2493 2494 tp = &tp_vec[tp_vec_nr++]; 2495 text_poke_loc_init(tp, addr, opcode, len, emulate); 2496 } 2497 2498 /** 2499 * text_poke_bp() -- update instructions on live kernel on SMP 2500 * @addr: address to patch 2501 * @opcode: opcode of new instruction 2502 * @len: length to copy 2503 * @emulate: instruction to be emulated 2504 * 2505 * Update a single instruction with the vector in the stack, avoiding 2506 * dynamically allocated memory. This function should be used when it is 2507 * not possible to allocate memory. 2508 */ 2509 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) 2510 { 2511 struct text_poke_loc tp; 2512 2513 text_poke_loc_init(&tp, addr, opcode, len, emulate); 2514 text_poke_bp_batch(&tp, 1); 2515 } 2516
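/*
 * Example: redirecting a 5-byte call site to a new target, in the style of
 * the static_call/ftrace users of this machinery. This is only a sketch:
 * "site" and "new_target" are hypothetical, text_mutex must already be held,
 * and text_gen_insn() from <asm/text-patching.h> is assumed to build the
 * CALL_INSN_SIZE bytes of a "call new_target" relative to @site:
 *
 *	const void *insn = text_gen_insn(CALL_INSN_OPCODE, site, new_target);
 *
 *	text_poke_bp(site, insn, CALL_INSN_SIZE, NULL);
 *
 * Any CPU that hits the transient INT3 lands in poke_int3_handler(), which
 * emulates the *new* call via int3_emulate_call(), so no context ever
 * executes a half-written instruction.
 */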