1 // SPDX-License-Identifier: GPL-2.0-only 2 #define pr_fmt(fmt) "SMP alternatives: " fmt 3 4 #include <linux/mmu_context.h> 5 #include <linux/perf_event.h> 6 #include <linux/vmalloc.h> 7 #include <linux/memory.h> 8 #include <linux/execmem.h> 9 10 #include <asm/text-patching.h> 11 #include <asm/insn.h> 12 #include <asm/ibt.h> 13 #include <asm/set_memory.h> 14 #include <asm/nmi.h> 15 16 int __read_mostly alternatives_patched; 17 18 EXPORT_SYMBOL_GPL(alternatives_patched); 19 20 #define MAX_PATCH_LEN (255-1) 21 22 #define DA_ALL (~0) 23 #define DA_ALT 0x01 24 #define DA_RET 0x02 25 #define DA_RETPOLINE 0x04 26 #define DA_ENDBR 0x08 27 #define DA_SMP 0x10 28 29 static unsigned int debug_alternative; 30 31 static int __init debug_alt(char *str) 32 { 33 if (str && *str == '=') 34 str++; 35 36 if (!str || kstrtouint(str, 0, &debug_alternative)) 37 debug_alternative = DA_ALL; 38 39 return 1; 40 } 41 __setup("debug-alternative", debug_alt); 42 43 static int noreplace_smp; 44 45 static int __init setup_noreplace_smp(char *str) 46 { 47 noreplace_smp = 1; 48 return 1; 49 } 50 __setup("noreplace-smp", setup_noreplace_smp); 51 52 #define DPRINTK(type, fmt, args...) \ 53 do { \ 54 if (debug_alternative & DA_##type) \ 55 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ 56 } while (0) 57 58 #define DUMP_BYTES(type, buf, len, fmt, args...) \ 59 do { \ 60 if (unlikely(debug_alternative & DA_##type)) { \ 61 int j; \ 62 \ 63 if (!(len)) \ 64 break; \ 65 \ 66 printk(KERN_DEBUG pr_fmt(fmt), ##args); \ 67 for (j = 0; j < (len) - 1; j++) \ 68 printk(KERN_CONT "%02hhx ", buf[j]); \ 69 printk(KERN_CONT "%02hhx\n", buf[j]); \ 70 } \ 71 } while (0) 72 73 static const unsigned char x86nops[] = 74 { 75 BYTES_NOP1, 76 BYTES_NOP2, 77 BYTES_NOP3, 78 BYTES_NOP4, 79 BYTES_NOP5, 80 BYTES_NOP6, 81 BYTES_NOP7, 82 BYTES_NOP8, 83 #ifdef CONFIG_64BIT 84 BYTES_NOP9, 85 BYTES_NOP10, 86 BYTES_NOP11, 87 #endif 88 }; 89 90 const unsigned char * const x86_nops[ASM_NOP_MAX+1] = 91 { 92 NULL, 93 x86nops, 94 x86nops + 1, 95 x86nops + 1 + 2, 96 x86nops + 1 + 2 + 3, 97 x86nops + 1 + 2 + 3 + 4, 98 x86nops + 1 + 2 + 3 + 4 + 5, 99 x86nops + 1 + 2 + 3 + 4 + 5 + 6, 100 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 101 #ifdef CONFIG_64BIT 102 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, 103 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, 104 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, 105 #endif 106 }; 107 108 #ifdef CONFIG_FINEIBT 109 static bool cfi_paranoid __ro_after_init; 110 #endif 111 112 #ifdef CONFIG_MITIGATION_ITS 113 114 static struct module *its_mod; 115 static void *its_page; 116 static unsigned int its_offset; 117 118 /* Initialize a thunk with the "jmp *reg; int3" instructions. */ 119 static void *its_init_thunk(void *thunk, int reg) 120 { 121 u8 *bytes = thunk; 122 int offset = 0; 123 int i = 0; 124 125 #ifdef CONFIG_FINEIBT 126 if (cfi_paranoid) { 127 /* 128 * When ITS uses indirect branch thunk the fineibt_paranoid 129 * caller sequence doesn't fit in the caller site. So put the 130 * remaining part of the sequence (<ea> + JNE) into the ITS 131 * thunk. 132 */ 133 bytes[i++] = 0xea; /* invalid instruction */ 134 bytes[i++] = 0x75; /* JNE */ 135 bytes[i++] = 0xfd; 136 137 offset = 1; 138 } 139 #endif 140 141 if (reg >= 8) { 142 bytes[i++] = 0x41; /* REX.B prefix */ 143 reg -= 8; 144 } 145 bytes[i++] = 0xff; 146 bytes[i++] = 0xe0 + reg; /* jmp *reg */ 147 bytes[i++] = 0xcc; 148 149 return thunk + offset; 150 } 151 152 void its_init_mod(struct module *mod) 153 { 154 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) 155 return; 156 157 mutex_lock(&text_mutex); 158 its_mod = mod; 159 its_page = NULL; 160 } 161 162 void its_fini_mod(struct module *mod) 163 { 164 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) 165 return; 166 167 WARN_ON_ONCE(its_mod != mod); 168 169 its_mod = NULL; 170 its_page = NULL; 171 mutex_unlock(&text_mutex); 172 173 for (int i = 0; i < mod->its_num_pages; i++) { 174 void *page = mod->its_page_array[i]; 175 execmem_restore_rox(page, PAGE_SIZE); 176 } 177 } 178 179 void its_free_mod(struct module *mod) 180 { 181 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) 182 return; 183 184 for (int i = 0; i < mod->its_num_pages; i++) { 185 void *page = mod->its_page_array[i]; 186 execmem_free(page); 187 } 188 kfree(mod->its_page_array); 189 } 190 191 static void *its_alloc(void) 192 { 193 void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); 194 195 if (!page) 196 return NULL; 197 198 if (its_mod) { 199 void *tmp = krealloc(its_mod->its_page_array, 200 (its_mod->its_num_pages+1) * sizeof(void *), 201 GFP_KERNEL); 202 if (!tmp) 203 return NULL; 204 205 its_mod->its_page_array = tmp; 206 its_mod->its_page_array[its_mod->its_num_pages++] = page; 207 208 execmem_make_temp_rw(page, PAGE_SIZE); 209 } 210 211 return no_free_ptr(page); 212 } 213 214 static void *its_allocate_thunk(int reg) 215 { 216 int size = 3 + (reg / 8); 217 void *thunk; 218 219 #ifdef CONFIG_FINEIBT 220 /* 221 * The ITS thunk contains an indirect jump and an int3 instruction so 222 * its size is 3 or 4 bytes depending on the register used. If CFI 223 * paranoid is used then 3 extra bytes are added in the ITS thunk to 224 * complete the fineibt_paranoid caller sequence. 225 */ 226 if (cfi_paranoid) 227 size += 3; 228 #endif 229 230 if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { 231 its_page = its_alloc(); 232 if (!its_page) { 233 pr_err("ITS page allocation failed\n"); 234 return NULL; 235 } 236 memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); 237 its_offset = 32; 238 } 239 240 /* 241 * If the indirect branch instruction will be in the lower half 242 * of a cacheline, then update the offset to reach the upper half. 243 */ 244 if ((its_offset + size - 1) % 64 < 32) 245 its_offset = ((its_offset - 1) | 0x3F) + 33; 246 247 thunk = its_page + its_offset; 248 its_offset += size; 249 250 return its_init_thunk(thunk, reg); 251 } 252 253 u8 *its_static_thunk(int reg) 254 { 255 u8 *thunk = __x86_indirect_its_thunk_array[reg]; 256 257 #ifdef CONFIG_FINEIBT 258 /* Paranoid thunk starts 2 bytes before */ 259 if (cfi_paranoid) 260 return thunk - 2; 261 #endif 262 return thunk; 263 } 264 265 #endif 266 267 /* 268 * Nomenclature for variable names to simplify and clarify this code and ease 269 * any potential staring at it: 270 * 271 * @instr: source address of the original instructions in the kernel text as 272 * generated by the compiler. 273 * 274 * @buf: temporary buffer on which the patching operates. This buffer is 275 * eventually text-poked into the kernel image. 276 * 277 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located 278 * in the .altinstr_replacement section. 279 */ 280 281 /* 282 * Fill the buffer with a single effective instruction of size @len. 283 * 284 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) 285 * for every single-byte NOP, try to generate the maximally available NOP of 286 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for 287 * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and 288 * *jump* over instead of executing long and daft NOPs. 289 */ 290 static void add_nop(u8 *buf, unsigned int len) 291 { 292 u8 *target = buf + len; 293 294 if (!len) 295 return; 296 297 if (len <= ASM_NOP_MAX) { 298 memcpy(buf, x86_nops[len], len); 299 return; 300 } 301 302 if (len < 128) { 303 __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); 304 buf += JMP8_INSN_SIZE; 305 } else { 306 __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); 307 buf += JMP32_INSN_SIZE; 308 } 309 310 for (;buf < target; buf++) 311 *buf = INT3_INSN_OPCODE; 312 } 313 314 /* 315 * Matches NOP and NOPL, not any of the other possible NOPs. 316 */ 317 static bool insn_is_nop(struct insn *insn) 318 { 319 /* Anything NOP, but no REP NOP */ 320 if (insn->opcode.bytes[0] == 0x90 && 321 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) 322 return true; 323 324 /* NOPL */ 325 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) 326 return true; 327 328 /* TODO: more nops */ 329 330 return false; 331 } 332 333 /* 334 * Find the offset of the first non-NOP instruction starting at @offset 335 * but no further than @len. 336 */ 337 static int skip_nops(u8 *buf, int offset, int len) 338 { 339 struct insn insn; 340 341 for (; offset < len; offset += insn.length) { 342 if (insn_decode_kernel(&insn, &buf[offset])) 343 break; 344 345 if (!insn_is_nop(&insn)) 346 break; 347 } 348 349 return offset; 350 } 351 352 /* 353 * "noinline" to cause control flow change and thus invalidate I$ and 354 * cause refetch after modification. 355 */ 356 static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) 357 { 358 for (int next, i = 0; i < len; i = next) { 359 struct insn insn; 360 361 if (insn_decode_kernel(&insn, &buf[i])) 362 return; 363 364 next = i + insn.length; 365 366 if (insn_is_nop(&insn)) { 367 int nop = i; 368 369 /* Has the NOP already been optimized? */ 370 if (i + insn.length == len) 371 return; 372 373 next = skip_nops(buf, next, len); 374 375 add_nop(buf + nop, next - nop); 376 DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); 377 } 378 } 379 } 380 381 /* 382 * In this context, "source" is where the instructions are placed in the 383 * section .altinstr_replacement, for example during kernel build by the 384 * toolchain. 385 * "Destination" is where the instructions are being patched in by this 386 * machinery. 387 * 388 * The source offset is: 389 * 390 * src_imm = target - src_next_ip (1) 391 * 392 * and the target offset is: 393 * 394 * dst_imm = target - dst_next_ip (2) 395 * 396 * so rework (1) as an expression for target like: 397 * 398 * target = src_imm + src_next_ip (1a) 399 * 400 * and substitute in (2) to get: 401 * 402 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) 403 * 404 * Now, since the instruction stream is 'identical' at src and dst (it 405 * is being copied after all) it can be stated that: 406 * 407 * src_next_ip = src + ip_offset 408 * dst_next_ip = dst + ip_offset (4) 409 * 410 * Substitute (4) in (3) and observe ip_offset being cancelled out to 411 * obtain: 412 * 413 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) 414 * = src_imm + src - dst + ip_offset - ip_offset 415 * = src_imm + src - dst (5) 416 * 417 * IOW, only the relative displacement of the code block matters. 418 */ 419 420 #define apply_reloc_n(n_, p_, d_) \ 421 do { \ 422 s32 v = *(s##n_ *)(p_); \ 423 v += (d_); \ 424 BUG_ON((v >> 31) != (v >> (n_-1))); \ 425 *(s##n_ *)(p_) = (s##n_)v; \ 426 } while (0) 427 428 429 static __always_inline 430 void apply_reloc(int n, void *ptr, uintptr_t diff) 431 { 432 switch (n) { 433 case 1: apply_reloc_n(8, ptr, diff); break; 434 case 2: apply_reloc_n(16, ptr, diff); break; 435 case 4: apply_reloc_n(32, ptr, diff); break; 436 default: BUG(); 437 } 438 } 439 440 static __always_inline 441 bool need_reloc(unsigned long offset, u8 *src, size_t src_len) 442 { 443 u8 *target = src + offset; 444 /* 445 * If the target is inside the patched block, it's relative to the 446 * block itself and does not need relocation. 447 */ 448 return (target < src || target > src + src_len); 449 } 450 451 static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) 452 { 453 for (int next, i = 0; i < instrlen; i = next) { 454 struct insn insn; 455 456 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) 457 return; 458 459 next = i + insn.length; 460 461 switch (insn.opcode.bytes[0]) { 462 case 0x0f: 463 if (insn.opcode.bytes[1] < 0x80 || 464 insn.opcode.bytes[1] > 0x8f) 465 break; 466 467 fallthrough; /* Jcc.d32 */ 468 case 0x70 ... 0x7f: /* Jcc.d8 */ 469 case JMP8_INSN_OPCODE: 470 case JMP32_INSN_OPCODE: 471 case CALL_INSN_OPCODE: 472 if (need_reloc(next + insn.immediate.value, repl, repl_len)) { 473 apply_reloc(insn.immediate.nbytes, 474 buf + i + insn_offset_immediate(&insn), 475 repl - instr); 476 } 477 478 /* 479 * Where possible, convert JMP.d32 into JMP.d8. 480 */ 481 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { 482 s32 imm = insn.immediate.value; 483 imm += repl - instr; 484 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; 485 if ((imm >> 31) == (imm >> 7)) { 486 buf[i+0] = JMP8_INSN_OPCODE; 487 buf[i+1] = (s8)imm; 488 489 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); 490 } 491 } 492 break; 493 } 494 495 if (insn_rip_relative(&insn)) { 496 if (need_reloc(next + insn.displacement.value, repl, repl_len)) { 497 apply_reloc(insn.displacement.nbytes, 498 buf + i + insn_offset_displacement(&insn), 499 repl - instr); 500 } 501 } 502 } 503 } 504 505 void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) 506 { 507 __apply_relocation(buf, instr, instrlen, repl, repl_len); 508 optimize_nops(instr, buf, instrlen); 509 } 510 511 /* Low-level backend functions usable from alternative code replacements. */ 512 DEFINE_ASM_FUNC(nop_func, "", .entry.text); 513 EXPORT_SYMBOL_GPL(nop_func); 514 515 noinstr void BUG_func(void) 516 { 517 BUG(); 518 } 519 EXPORT_SYMBOL(BUG_func); 520 521 #define CALL_RIP_REL_OPCODE 0xff 522 #define CALL_RIP_REL_MODRM 0x15 523 524 /* 525 * Rewrite the "call BUG_func" replacement to point to the target of the 526 * indirect pv_ops call "call *disp(%ip)". 527 */ 528 static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) 529 { 530 void *target, *bug = &BUG_func; 531 s32 disp; 532 533 if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { 534 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); 535 BUG(); 536 } 537 538 if (a->instrlen != 6 || 539 instr[0] != CALL_RIP_REL_OPCODE || 540 instr[1] != CALL_RIP_REL_MODRM) { 541 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); 542 BUG(); 543 } 544 545 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ 546 disp = *(s32 *)(instr + 2); 547 #ifdef CONFIG_X86_64 548 /* ff 15 00 00 00 00 call *0x0(%rip) */ 549 /* target address is stored at "next instruction + disp". */ 550 target = *(void **)(instr + a->instrlen + disp); 551 #else 552 /* ff 15 00 00 00 00 call *0x0 */ 553 /* target address is stored at disp. */ 554 target = *(void **)disp; 555 #endif 556 if (!target) 557 target = bug; 558 559 /* (BUG_func - .) + (target - BUG_func) := target - . */ 560 *(s32 *)(insn_buff + 1) += target - bug; 561 562 if (target == &nop_func) 563 return 0; 564 565 return 5; 566 } 567 568 static inline u8 * instr_va(struct alt_instr *i) 569 { 570 return (u8 *)&i->instr_offset + i->instr_offset; 571 } 572 573 /* 574 * Replace instructions with better alternatives for this CPU type. This runs 575 * before SMP is initialized to avoid SMP problems with self modifying code. 576 * This implies that asymmetric systems where APs have less capabilities than 577 * the boot processor are not handled. Tough. Make sure you disable such 578 * features by hand. 579 * 580 * Marked "noinline" to cause control flow change and thus insn cache 581 * to refetch changed I$ lines. 582 */ 583 void __init_or_module noinline apply_alternatives(struct alt_instr *start, 584 struct alt_instr *end) 585 { 586 u8 insn_buff[MAX_PATCH_LEN]; 587 u8 *instr, *replacement; 588 struct alt_instr *a, *b; 589 590 DPRINTK(ALT, "alt table %px, -> %px", start, end); 591 592 /* 593 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using 594 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. 595 * During the process, KASAN becomes confused seeing partial LA57 596 * conversion and triggers a false-positive out-of-bound report. 597 * 598 * Disable KASAN until the patching is complete. 599 */ 600 kasan_disable_current(); 601 602 /* 603 * The scan order should be from start to end. A later scanned 604 * alternative code can overwrite previously scanned alternative code. 605 * Some kernel functions (e.g. memcpy, memset, etc) use this order to 606 * patch code. 607 * 608 * So be careful if you want to change the scan order to any other 609 * order. 610 */ 611 for (a = start; a < end; a++) { 612 int insn_buff_sz = 0; 613 614 /* 615 * In case of nested ALTERNATIVE()s the outer alternative might 616 * add more padding. To ensure consistent patching find the max 617 * padding for all alt_instr entries for this site (nested 618 * alternatives result in consecutive entries). 619 */ 620 for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { 621 u8 len = max(a->instrlen, b->instrlen); 622 a->instrlen = b->instrlen = len; 623 } 624 625 instr = instr_va(a); 626 replacement = (u8 *)&a->repl_offset + a->repl_offset; 627 BUG_ON(a->instrlen > sizeof(insn_buff)); 628 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 629 630 /* 631 * Patch if either: 632 * - feature is present 633 * - feature not present but ALT_FLAG_NOT is set to mean, 634 * patch if feature is *NOT* present. 635 */ 636 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { 637 memcpy(insn_buff, instr, a->instrlen); 638 optimize_nops(instr, insn_buff, a->instrlen); 639 text_poke_early(instr, insn_buff, a->instrlen); 640 continue; 641 } 642 643 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", 644 a->cpuid >> 5, 645 a->cpuid & 0x1f, 646 instr, instr, a->instrlen, 647 replacement, a->replacementlen, a->flags); 648 649 memcpy(insn_buff, replacement, a->replacementlen); 650 insn_buff_sz = a->replacementlen; 651 652 if (a->flags & ALT_FLAG_DIRECT_CALL) { 653 insn_buff_sz = alt_replace_call(instr, insn_buff, a); 654 if (insn_buff_sz < 0) 655 continue; 656 } 657 658 for (; insn_buff_sz < a->instrlen; insn_buff_sz++) 659 insn_buff[insn_buff_sz] = 0x90; 660 661 text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); 662 663 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); 664 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); 665 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); 666 667 text_poke_early(instr, insn_buff, insn_buff_sz); 668 } 669 670 kasan_enable_current(); 671 } 672 673 static inline bool is_jcc32(struct insn *insn) 674 { 675 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ 676 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; 677 } 678 679 #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) 680 681 /* 682 * CALL/JMP *%\reg 683 */ 684 static int emit_indirect(int op, int reg, u8 *bytes) 685 { 686 int i = 0; 687 u8 modrm; 688 689 switch (op) { 690 case CALL_INSN_OPCODE: 691 modrm = 0x10; /* Reg = 2; CALL r/m */ 692 break; 693 694 case JMP32_INSN_OPCODE: 695 modrm = 0x20; /* Reg = 4; JMP r/m */ 696 break; 697 698 default: 699 WARN_ON_ONCE(1); 700 return -1; 701 } 702 703 if (reg >= 8) { 704 bytes[i++] = 0x41; /* REX.B prefix */ 705 reg -= 8; 706 } 707 708 modrm |= 0xc0; /* Mod = 3 */ 709 modrm += reg; 710 711 bytes[i++] = 0xff; /* opcode */ 712 bytes[i++] = modrm; 713 714 return i; 715 } 716 717 static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, 718 void *call_dest, void *jmp_dest) 719 { 720 u8 op = insn->opcode.bytes[0]; 721 int i = 0; 722 723 /* 724 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional 725 * tail-calls. Deal with them. 726 */ 727 if (is_jcc32(insn)) { 728 bytes[i++] = op; 729 op = insn->opcode.bytes[1]; 730 goto clang_jcc; 731 } 732 733 if (insn->length == 6) 734 bytes[i++] = 0x2e; /* CS-prefix */ 735 736 switch (op) { 737 case CALL_INSN_OPCODE: 738 __text_gen_insn(bytes+i, op, addr+i, 739 call_dest, 740 CALL_INSN_SIZE); 741 i += CALL_INSN_SIZE; 742 break; 743 744 case JMP32_INSN_OPCODE: 745 clang_jcc: 746 __text_gen_insn(bytes+i, op, addr+i, 747 jmp_dest, 748 JMP32_INSN_SIZE); 749 i += JMP32_INSN_SIZE; 750 break; 751 752 default: 753 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); 754 return -1; 755 } 756 757 WARN_ON_ONCE(i != insn->length); 758 759 return i; 760 } 761 762 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) 763 { 764 return __emit_trampoline(addr, insn, bytes, 765 __x86_indirect_call_thunk_array[reg], 766 __x86_indirect_jump_thunk_array[reg]); 767 } 768 769 #ifdef CONFIG_MITIGATION_ITS 770 static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) 771 { 772 u8 *thunk = __x86_indirect_its_thunk_array[reg]; 773 u8 *tmp = its_allocate_thunk(reg); 774 775 if (tmp) 776 thunk = tmp; 777 778 return __emit_trampoline(addr, insn, bytes, thunk, thunk); 779 } 780 781 /* Check if an indirect branch is at ITS-unsafe address */ 782 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) 783 { 784 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) 785 return false; 786 787 /* Indirect branch opcode is 2 or 3 bytes depending on reg */ 788 addr += 1 + reg / 8; 789 790 /* Lower-half of the cacheline? */ 791 return !(addr & 0x20); 792 } 793 #else /* CONFIG_MITIGATION_ITS */ 794 795 #ifdef CONFIG_FINEIBT 796 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) 797 { 798 return false; 799 } 800 #endif 801 802 #endif /* CONFIG_MITIGATION_ITS */ 803 804 /* 805 * Rewrite the compiler generated retpoline thunk calls. 806 * 807 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate 808 * indirect instructions, avoiding the extra indirection. 809 * 810 * For example, convert: 811 * 812 * CALL __x86_indirect_thunk_\reg 813 * 814 * into: 815 * 816 * CALL *%\reg 817 * 818 * It also tries to inline spectre_v2=retpoline,lfence when size permits. 819 */ 820 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) 821 { 822 retpoline_thunk_t *target; 823 int reg, ret, i = 0; 824 u8 op, cc; 825 826 target = addr + insn->length + insn->immediate.value; 827 reg = target - __x86_indirect_thunk_array; 828 829 if (WARN_ON_ONCE(reg & ~0xf)) 830 return -1; 831 832 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ 833 BUG_ON(reg == 4); 834 835 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && 836 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 837 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) 838 return emit_call_track_retpoline(addr, insn, reg, bytes); 839 840 return -1; 841 } 842 843 op = insn->opcode.bytes[0]; 844 845 /* 846 * Convert: 847 * 848 * Jcc.d32 __x86_indirect_thunk_\reg 849 * 850 * into: 851 * 852 * Jncc.d8 1f 853 * [ LFENCE ] 854 * JMP *%\reg 855 * [ NOP ] 856 * 1: 857 */ 858 if (is_jcc32(insn)) { 859 cc = insn->opcode.bytes[1] & 0xf; 860 cc ^= 1; /* invert condition */ 861 862 bytes[i++] = 0x70 + cc; /* Jcc.d8 */ 863 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ 864 865 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ 866 op = JMP32_INSN_OPCODE; 867 } 868 869 /* 870 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. 871 */ 872 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { 873 bytes[i++] = 0x0f; 874 bytes[i++] = 0xae; 875 bytes[i++] = 0xe8; /* LFENCE */ 876 } 877 878 #ifdef CONFIG_MITIGATION_ITS 879 /* 880 * Check if the address of last byte of emitted-indirect is in 881 * lower-half of the cacheline. Such branches need ITS mitigation. 882 */ 883 if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) 884 return emit_its_trampoline(addr, insn, reg, bytes); 885 #endif 886 887 ret = emit_indirect(op, reg, bytes + i); 888 if (ret < 0) 889 return ret; 890 i += ret; 891 892 /* 893 * The compiler is supposed to EMIT an INT3 after every unconditional 894 * JMP instruction due to AMD BTC. However, if the compiler is too old 895 * or MITIGATION_SLS isn't enabled, we still need an INT3 after 896 * indirect JMPs even on Intel. 897 */ 898 if (op == JMP32_INSN_OPCODE && i < insn->length) 899 bytes[i++] = INT3_INSN_OPCODE; 900 901 for (; i < insn->length;) 902 bytes[i++] = BYTES_NOP1; 903 904 return i; 905 } 906 907 /* 908 * Generated by 'objtool --retpoline'. 909 */ 910 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) 911 { 912 s32 *s; 913 914 for (s = start; s < end; s++) { 915 void *addr = (void *)s + *s; 916 struct insn insn; 917 int len, ret; 918 u8 bytes[16]; 919 u8 op1, op2; 920 u8 *dest; 921 922 ret = insn_decode_kernel(&insn, addr); 923 if (WARN_ON_ONCE(ret < 0)) 924 continue; 925 926 op1 = insn.opcode.bytes[0]; 927 op2 = insn.opcode.bytes[1]; 928 929 switch (op1) { 930 case 0x70 ... 0x7f: /* Jcc.d8 */ 931 /* See cfi_paranoid. */ 932 WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); 933 continue; 934 935 case CALL_INSN_OPCODE: 936 case JMP32_INSN_OPCODE: 937 /* Check for cfi_paranoid + ITS */ 938 dest = addr + insn.length + insn.immediate.value; 939 if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { 940 WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); 941 continue; 942 } 943 break; 944 945 case 0x0f: /* escape */ 946 if (op2 >= 0x80 && op2 <= 0x8f) 947 break; 948 fallthrough; 949 default: 950 WARN_ON_ONCE(1); 951 continue; 952 } 953 954 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", 955 addr, addr, insn.length, 956 addr + insn.length + insn.immediate.value); 957 958 len = patch_retpoline(addr, &insn, bytes); 959 if (len == insn.length) { 960 optimize_nops(addr, bytes, len); 961 DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); 962 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); 963 text_poke_early(addr, bytes, len); 964 } 965 } 966 } 967 968 #ifdef CONFIG_MITIGATION_RETHUNK 969 970 bool cpu_wants_rethunk(void) 971 { 972 return cpu_feature_enabled(X86_FEATURE_RETHUNK); 973 } 974 975 bool cpu_wants_rethunk_at(void *addr) 976 { 977 if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) 978 return false; 979 if (x86_return_thunk != its_return_thunk) 980 return true; 981 982 return !((unsigned long)addr & 0x20); 983 } 984 985 /* 986 * Rewrite the compiler generated return thunk tail-calls. 987 * 988 * For example, convert: 989 * 990 * JMP __x86_return_thunk 991 * 992 * into: 993 * 994 * RET 995 */ 996 static int patch_return(void *addr, struct insn *insn, u8 *bytes) 997 { 998 int i = 0; 999 1000 /* Patch the custom return thunks... */ 1001 if (cpu_wants_rethunk_at(addr)) { 1002 i = JMP32_INSN_SIZE; 1003 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); 1004 } else { 1005 /* ... or patch them out if not needed. */ 1006 bytes[i++] = RET_INSN_OPCODE; 1007 } 1008 1009 for (; i < insn->length;) 1010 bytes[i++] = INT3_INSN_OPCODE; 1011 return i; 1012 } 1013 1014 void __init_or_module noinline apply_returns(s32 *start, s32 *end) 1015 { 1016 s32 *s; 1017 1018 if (cpu_wants_rethunk()) 1019 static_call_force_reinit(); 1020 1021 for (s = start; s < end; s++) { 1022 void *dest = NULL, *addr = (void *)s + *s; 1023 struct insn insn; 1024 int len, ret; 1025 u8 bytes[16]; 1026 u8 op; 1027 1028 ret = insn_decode_kernel(&insn, addr); 1029 if (WARN_ON_ONCE(ret < 0)) 1030 continue; 1031 1032 op = insn.opcode.bytes[0]; 1033 if (op == JMP32_INSN_OPCODE) 1034 dest = addr + insn.length + insn.immediate.value; 1035 1036 if (__static_call_fixup(addr, op, dest) || 1037 WARN_ONCE(dest != &__x86_return_thunk, 1038 "missing return thunk: %pS-%pS: %*ph", 1039 addr, dest, 5, addr)) 1040 continue; 1041 1042 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", 1043 addr, addr, insn.length, 1044 addr + insn.length + insn.immediate.value); 1045 1046 len = patch_return(addr, &insn, bytes); 1047 if (len == insn.length) { 1048 DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); 1049 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); 1050 text_poke_early(addr, bytes, len); 1051 } 1052 } 1053 } 1054 #else /* !CONFIG_MITIGATION_RETHUNK: */ 1055 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 1056 #endif /* !CONFIG_MITIGATION_RETHUNK */ 1057 1058 #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ 1059 1060 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } 1061 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 1062 1063 #endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ 1064 1065 #ifdef CONFIG_X86_KERNEL_IBT 1066 1067 __noendbr bool is_endbr(u32 *val) 1068 { 1069 u32 endbr; 1070 1071 __get_kernel_nofault(&endbr, val, u32, Efault); 1072 return __is_endbr(endbr); 1073 1074 Efault: 1075 return false; 1076 } 1077 1078 #ifdef CONFIG_FINEIBT 1079 1080 static __noendbr bool exact_endbr(u32 *val) 1081 { 1082 u32 endbr; 1083 1084 __get_kernel_nofault(&endbr, val, u32, Efault); 1085 return endbr == gen_endbr(); 1086 1087 Efault: 1088 return false; 1089 } 1090 1091 #endif 1092 1093 static void poison_cfi(void *addr); 1094 1095 static void __init_or_module poison_endbr(void *addr) 1096 { 1097 u32 poison = gen_endbr_poison(); 1098 1099 if (WARN_ON_ONCE(!is_endbr(addr))) 1100 return; 1101 1102 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); 1103 1104 /* 1105 * When we have IBT, the lack of ENDBR will trigger #CP 1106 */ 1107 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); 1108 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); 1109 text_poke_early(addr, &poison, 4); 1110 } 1111 1112 /* 1113 * Generated by: objtool --ibt 1114 * 1115 * Seal the functions for indirect calls by clobbering the ENDBR instructions 1116 * and the kCFI hash value. 1117 */ 1118 void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) 1119 { 1120 s32 *s; 1121 1122 for (s = start; s < end; s++) { 1123 void *addr = (void *)s + *s; 1124 1125 poison_endbr(addr); 1126 if (IS_ENABLED(CONFIG_FINEIBT)) 1127 poison_cfi(addr - 16); 1128 } 1129 } 1130 1131 #else /* !CONFIG_X86_KERNEL_IBT: */ 1132 1133 void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } 1134 1135 #endif /* !CONFIG_X86_KERNEL_IBT */ 1136 1137 #ifdef CONFIG_CFI_AUTO_DEFAULT 1138 # define __CFI_DEFAULT CFI_AUTO 1139 #elif defined(CONFIG_CFI_CLANG) 1140 # define __CFI_DEFAULT CFI_KCFI 1141 #else 1142 # define __CFI_DEFAULT CFI_OFF 1143 #endif 1144 1145 enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; 1146 1147 #ifdef CONFIG_FINEIBT_BHI 1148 bool cfi_bhi __ro_after_init = false; 1149 #endif 1150 1151 #ifdef CONFIG_CFI_CLANG 1152 struct bpf_insn; 1153 1154 /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ 1155 extern unsigned int __bpf_prog_runX(const void *ctx, 1156 const struct bpf_insn *insn); 1157 1158 KCFI_REFERENCE(__bpf_prog_runX); 1159 1160 /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ 1161 asm ( 1162 " .pushsection .data..ro_after_init,\"aw\",@progbits \n" 1163 " .type cfi_bpf_hash,@object \n" 1164 " .globl cfi_bpf_hash \n" 1165 " .p2align 2, 0x0 \n" 1166 "cfi_bpf_hash: \n" 1167 " .long __kcfi_typeid___bpf_prog_runX \n" 1168 " .size cfi_bpf_hash, 4 \n" 1169 " .popsection \n" 1170 ); 1171 1172 /* Must match bpf_callback_t */ 1173 extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); 1174 1175 KCFI_REFERENCE(__bpf_callback_fn); 1176 1177 /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ 1178 asm ( 1179 " .pushsection .data..ro_after_init,\"aw\",@progbits \n" 1180 " .type cfi_bpf_subprog_hash,@object \n" 1181 " .globl cfi_bpf_subprog_hash \n" 1182 " .p2align 2, 0x0 \n" 1183 "cfi_bpf_subprog_hash: \n" 1184 " .long __kcfi_typeid___bpf_callback_fn \n" 1185 " .size cfi_bpf_subprog_hash, 4 \n" 1186 " .popsection \n" 1187 ); 1188 1189 u32 cfi_get_func_hash(void *func) 1190 { 1191 u32 hash; 1192 1193 func -= cfi_get_offset(); 1194 switch (cfi_mode) { 1195 case CFI_FINEIBT: 1196 func += 7; 1197 break; 1198 case CFI_KCFI: 1199 func += 1; 1200 break; 1201 default: 1202 return 0; 1203 } 1204 1205 if (get_kernel_nofault(hash, func)) 1206 return 0; 1207 1208 return hash; 1209 } 1210 1211 int cfi_get_func_arity(void *func) 1212 { 1213 bhi_thunk *target; 1214 s32 disp; 1215 1216 if (cfi_mode != CFI_FINEIBT && !cfi_bhi) 1217 return 0; 1218 1219 if (get_kernel_nofault(disp, func - 4)) 1220 return 0; 1221 1222 target = func + disp; 1223 return target - __bhi_args; 1224 } 1225 #endif 1226 1227 #ifdef CONFIG_FINEIBT 1228 1229 static bool cfi_rand __ro_after_init = true; 1230 static u32 cfi_seed __ro_after_init; 1231 1232 /* 1233 * Re-hash the CFI hash with a boot-time seed while making sure the result is 1234 * not a valid ENDBR instruction. 1235 */ 1236 static u32 cfi_rehash(u32 hash) 1237 { 1238 hash ^= cfi_seed; 1239 while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { 1240 bool lsb = hash & 1; 1241 hash >>= 1; 1242 if (lsb) 1243 hash ^= 0x80200003; 1244 } 1245 return hash; 1246 } 1247 1248 static __init int cfi_parse_cmdline(char *str) 1249 { 1250 if (!str) 1251 return -EINVAL; 1252 1253 while (str) { 1254 char *next = strchr(str, ','); 1255 if (next) { 1256 *next = 0; 1257 next++; 1258 } 1259 1260 if (!strcmp(str, "auto")) { 1261 cfi_mode = CFI_AUTO; 1262 } else if (!strcmp(str, "off")) { 1263 cfi_mode = CFI_OFF; 1264 cfi_rand = false; 1265 } else if (!strcmp(str, "kcfi")) { 1266 cfi_mode = CFI_KCFI; 1267 } else if (!strcmp(str, "fineibt")) { 1268 cfi_mode = CFI_FINEIBT; 1269 } else if (!strcmp(str, "norand")) { 1270 cfi_rand = false; 1271 } else if (!strcmp(str, "warn")) { 1272 pr_alert("CFI mismatch non-fatal!\n"); 1273 cfi_warn = true; 1274 } else if (!strcmp(str, "paranoid")) { 1275 if (cfi_mode == CFI_FINEIBT) { 1276 cfi_paranoid = true; 1277 } else { 1278 pr_err("Ignoring paranoid; depends on fineibt.\n"); 1279 } 1280 } else if (!strcmp(str, "bhi")) { 1281 #ifdef CONFIG_FINEIBT_BHI 1282 if (cfi_mode == CFI_FINEIBT) { 1283 cfi_bhi = true; 1284 } else { 1285 pr_err("Ignoring bhi; depends on fineibt.\n"); 1286 } 1287 #else 1288 pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); 1289 #endif 1290 } else { 1291 pr_err("Ignoring unknown cfi option (%s).", str); 1292 } 1293 1294 str = next; 1295 } 1296 1297 return 0; 1298 } 1299 early_param("cfi", cfi_parse_cmdline); 1300 1301 /* 1302 * kCFI FineIBT 1303 * 1304 * __cfi_\func: __cfi_\func: 1305 * movl $0x12345678,%eax // 5 endbr64 // 4 1306 * nop subl $0x12345678,%r10d // 7 1307 * nop jne __cfi_\func+6 // 2 1308 * nop nop3 // 3 1309 * nop 1310 * nop 1311 * nop 1312 * nop 1313 * nop 1314 * nop 1315 * nop 1316 * nop 1317 * 1318 * 1319 * caller: caller: 1320 * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 1321 * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 1322 * je 1f // 2 nop4 // 4 1323 * ud2 // 2 1324 * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 1325 * 1326 */ 1327 1328 /* 1329 * <fineibt_preamble_start>: 1330 * 0: f3 0f 1e fa endbr64 1331 * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d 1332 * b: 75 f9 jne 6 <fineibt_preamble_start+0x6> 1333 * d: 0f 1f 00 nopl (%rax) 1334 * 1335 * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as 1336 * (bad) on x86_64 and raises #UD. 1337 */ 1338 asm( ".pushsection .rodata \n" 1339 "fineibt_preamble_start: \n" 1340 " endbr64 \n" 1341 " subl $0x12345678, %r10d \n" 1342 "fineibt_preamble_bhi: \n" 1343 " jne fineibt_preamble_start+6 \n" 1344 ASM_NOP3 1345 "fineibt_preamble_end: \n" 1346 ".popsection\n" 1347 ); 1348 1349 extern u8 fineibt_preamble_start[]; 1350 extern u8 fineibt_preamble_bhi[]; 1351 extern u8 fineibt_preamble_end[]; 1352 1353 #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) 1354 #define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) 1355 #define fineibt_preamble_ud 6 1356 #define fineibt_preamble_hash 7 1357 1358 /* 1359 * <fineibt_caller_start>: 1360 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 1361 * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 1362 * a: 0f 1f 40 00 nopl 0x0(%rax) 1363 */ 1364 asm( ".pushsection .rodata \n" 1365 "fineibt_caller_start: \n" 1366 " movl $0x12345678, %r10d \n" 1367 " lea -0x10(%r11), %r11 \n" 1368 ASM_NOP4 1369 "fineibt_caller_end: \n" 1370 ".popsection \n" 1371 ); 1372 1373 extern u8 fineibt_caller_start[]; 1374 extern u8 fineibt_caller_end[]; 1375 1376 #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) 1377 #define fineibt_caller_hash 2 1378 1379 #define fineibt_caller_jmp (fineibt_caller_size - 2) 1380 1381 /* 1382 * Since FineIBT does hash validation on the callee side it is prone to 1383 * circumvention attacks where a 'naked' ENDBR instruction exists that 1384 * is not part of the fineibt_preamble sequence. 1385 * 1386 * Notably the x86 entry points must be ENDBR and equally cannot be 1387 * fineibt_preamble. 1388 * 1389 * The fineibt_paranoid caller sequence adds additional caller side 1390 * hash validation. This stops such circumvention attacks dead, but at the cost 1391 * of adding a load. 1392 * 1393 * <fineibt_paranoid_start>: 1394 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 1395 * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d 1396 * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11 1397 * e: 75 fd jne d <fineibt_paranoid_start+0xd> 1398 * 10: 41 ff d3 call *%r11 1399 * 13: 90 nop 1400 * 1401 * Notably LEA does not modify flags and can be reordered with the CMP, 1402 * avoiding a dependency. Again, using a non-taken (backwards) branch 1403 * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the 1404 * Jcc.d8, causing #UD. 1405 */ 1406 asm( ".pushsection .rodata \n" 1407 "fineibt_paranoid_start: \n" 1408 " movl $0x12345678, %r10d \n" 1409 " cmpl -9(%r11), %r10d \n" 1410 " lea -0x10(%r11), %r11 \n" 1411 " jne fineibt_paranoid_start+0xd \n" 1412 "fineibt_paranoid_ind: \n" 1413 " call *%r11 \n" 1414 " nop \n" 1415 "fineibt_paranoid_end: \n" 1416 ".popsection \n" 1417 ); 1418 1419 extern u8 fineibt_paranoid_start[]; 1420 extern u8 fineibt_paranoid_ind[]; 1421 extern u8 fineibt_paranoid_end[]; 1422 1423 #define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) 1424 #define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) 1425 #define fineibt_paranoid_ud 0xd 1426 1427 static u32 decode_preamble_hash(void *addr, int *reg) 1428 { 1429 u8 *p = addr; 1430 1431 /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ 1432 if (p[0] >= 0xb8 && p[0] < 0xc0) { 1433 if (reg) 1434 *reg = p[0] - 0xb8; 1435 return *(u32 *)(addr + 1); 1436 } 1437 1438 return 0; /* invalid hash value */ 1439 } 1440 1441 static u32 decode_caller_hash(void *addr) 1442 { 1443 u8 *p = addr; 1444 1445 /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ 1446 if (p[0] == 0x41 && p[1] == 0xba) 1447 return -*(u32 *)(addr + 2); 1448 1449 /* e8 0c 88 a9 cb ed jmp.d8 +12 */ 1450 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) 1451 return -*(u32 *)(addr + 2); 1452 1453 return 0; /* invalid hash value */ 1454 } 1455 1456 /* .retpoline_sites */ 1457 static int cfi_disable_callers(s32 *start, s32 *end) 1458 { 1459 /* 1460 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate 1461 * in tact for later usage. Also see decode_caller_hash() and 1462 * cfi_rewrite_callers(). 1463 */ 1464 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; 1465 s32 *s; 1466 1467 for (s = start; s < end; s++) { 1468 void *addr = (void *)s + *s; 1469 u32 hash; 1470 1471 addr -= fineibt_caller_size; 1472 hash = decode_caller_hash(addr); 1473 if (!hash) /* nocfi callers */ 1474 continue; 1475 1476 text_poke_early(addr, jmp, 2); 1477 } 1478 1479 return 0; 1480 } 1481 1482 static int cfi_enable_callers(s32 *start, s32 *end) 1483 { 1484 /* 1485 * Re-enable kCFI, undo what cfi_disable_callers() did. 1486 */ 1487 const u8 mov[] = { 0x41, 0xba }; 1488 s32 *s; 1489 1490 for (s = start; s < end; s++) { 1491 void *addr = (void *)s + *s; 1492 u32 hash; 1493 1494 addr -= fineibt_caller_size; 1495 hash = decode_caller_hash(addr); 1496 if (!hash) /* nocfi callers */ 1497 continue; 1498 1499 text_poke_early(addr, mov, 2); 1500 } 1501 1502 return 0; 1503 } 1504 1505 /* .cfi_sites */ 1506 static int cfi_rand_preamble(s32 *start, s32 *end) 1507 { 1508 s32 *s; 1509 1510 for (s = start; s < end; s++) { 1511 void *addr = (void *)s + *s; 1512 u32 hash; 1513 1514 hash = decode_preamble_hash(addr, NULL); 1515 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1516 addr, addr, 5, addr)) 1517 return -EINVAL; 1518 1519 hash = cfi_rehash(hash); 1520 text_poke_early(addr + 1, &hash, 4); 1521 } 1522 1523 return 0; 1524 } 1525 1526 static void cfi_fineibt_bhi_preamble(void *addr, int arity) 1527 { 1528 if (!arity) 1529 return; 1530 1531 if (!cfi_warn && arity == 1) { 1532 /* 1533 * Crazy scheme to allow arity-1 inline: 1534 * 1535 * __cfi_foo: 1536 * 0: f3 0f 1e fa endbr64 1537 * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d 1538 * b: 49 0f 45 fa cmovne %r10, %rdi 1539 * f: 75 f5 jne __cfi_foo+6 1540 * 11: 0f 1f 00 nopl (%rax) 1541 * 1542 * Code that direct calls to foo()+0, decodes the tail end as: 1543 * 1544 * foo: 1545 * 0: f5 cmc 1546 * 1: 0f 1f 00 nopl (%rax) 1547 * 1548 * which clobbers CF, but does not affect anything ABI 1549 * wise. 1550 * 1551 * Notably, this scheme is incompatible with permissive CFI 1552 * because the CMOVcc is unconditional and RDI will have been 1553 * clobbered. 1554 */ 1555 const u8 magic[9] = { 1556 0x49, 0x0f, 0x45, 0xfa, 1557 0x75, 0xf5, 1558 BYTES_NOP3, 1559 }; 1560 1561 text_poke_early(addr + fineibt_preamble_bhi, magic, 9); 1562 1563 return; 1564 } 1565 1566 text_poke_early(addr + fineibt_preamble_bhi, 1567 text_gen_insn(CALL_INSN_OPCODE, 1568 addr + fineibt_preamble_bhi, 1569 __bhi_args[arity]), 1570 CALL_INSN_SIZE); 1571 } 1572 1573 static int cfi_rewrite_preamble(s32 *start, s32 *end) 1574 { 1575 s32 *s; 1576 1577 for (s = start; s < end; s++) { 1578 void *addr = (void *)s + *s; 1579 int arity; 1580 u32 hash; 1581 1582 /* 1583 * When the function doesn't start with ENDBR the compiler will 1584 * have determined there are no indirect calls to it and we 1585 * don't need no CFI either. 1586 */ 1587 if (!is_endbr(addr + 16)) 1588 continue; 1589 1590 hash = decode_preamble_hash(addr, &arity); 1591 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1592 addr, addr, 5, addr)) 1593 return -EINVAL; 1594 1595 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); 1596 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); 1597 text_poke_early(addr + fineibt_preamble_hash, &hash, 4); 1598 1599 WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, 1600 "kCFI preamble has wrong register at: %pS %*ph\n", 1601 addr, 5, addr); 1602 1603 if (cfi_bhi) 1604 cfi_fineibt_bhi_preamble(addr, arity); 1605 } 1606 1607 return 0; 1608 } 1609 1610 static void cfi_rewrite_endbr(s32 *start, s32 *end) 1611 { 1612 s32 *s; 1613 1614 for (s = start; s < end; s++) { 1615 void *addr = (void *)s + *s; 1616 1617 if (!exact_endbr(addr + 16)) 1618 continue; 1619 1620 poison_endbr(addr + 16); 1621 } 1622 } 1623 1624 /* .retpoline_sites */ 1625 static int cfi_rand_callers(s32 *start, s32 *end) 1626 { 1627 s32 *s; 1628 1629 for (s = start; s < end; s++) { 1630 void *addr = (void *)s + *s; 1631 u32 hash; 1632 1633 addr -= fineibt_caller_size; 1634 hash = decode_caller_hash(addr); 1635 if (hash) { 1636 hash = -cfi_rehash(hash); 1637 text_poke_early(addr + 2, &hash, 4); 1638 } 1639 } 1640 1641 return 0; 1642 } 1643 1644 static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) 1645 { 1646 u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; 1647 1648 #ifdef CONFIG_MITIGATION_ITS 1649 u8 *tmp = its_allocate_thunk(reg); 1650 if (tmp) 1651 thunk = tmp; 1652 #endif 1653 1654 return __emit_trampoline(addr, insn, bytes, thunk, thunk); 1655 } 1656 1657 static int cfi_rewrite_callers(s32 *start, s32 *end) 1658 { 1659 s32 *s; 1660 1661 BUG_ON(fineibt_paranoid_size != 20); 1662 1663 for (s = start; s < end; s++) { 1664 void *addr = (void *)s + *s; 1665 struct insn insn; 1666 u8 bytes[20]; 1667 u32 hash; 1668 int ret; 1669 u8 op; 1670 1671 addr -= fineibt_caller_size; 1672 hash = decode_caller_hash(addr); 1673 if (!hash) 1674 continue; 1675 1676 if (!cfi_paranoid) { 1677 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); 1678 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); 1679 text_poke_early(addr + fineibt_caller_hash, &hash, 4); 1680 /* rely on apply_retpolines() */ 1681 continue; 1682 } 1683 1684 /* cfi_paranoid */ 1685 ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); 1686 if (WARN_ON_ONCE(ret < 0)) 1687 continue; 1688 1689 op = insn.opcode.bytes[0]; 1690 if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { 1691 WARN_ON_ONCE(1); 1692 continue; 1693 } 1694 1695 memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); 1696 memcpy(bytes + fineibt_caller_hash, &hash, 4); 1697 1698 if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { 1699 emit_paranoid_trampoline(addr + fineibt_caller_size, 1700 &insn, 11, bytes + fineibt_caller_size); 1701 } else { 1702 ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); 1703 if (WARN_ON_ONCE(ret != 3)) 1704 continue; 1705 } 1706 1707 text_poke_early(addr, bytes, fineibt_paranoid_size); 1708 } 1709 1710 return 0; 1711 } 1712 1713 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1714 s32 *start_cfi, s32 *end_cfi, bool builtin) 1715 { 1716 int ret; 1717 1718 if (WARN_ONCE(fineibt_preamble_size != 16, 1719 "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) 1720 return; 1721 1722 if (cfi_mode == CFI_AUTO) { 1723 cfi_mode = CFI_KCFI; 1724 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { 1725 /* 1726 * FRED has much saner context on exception entry and 1727 * is less easy to take advantage of. 1728 */ 1729 if (!cpu_feature_enabled(X86_FEATURE_FRED)) 1730 cfi_paranoid = true; 1731 cfi_mode = CFI_FINEIBT; 1732 } 1733 } 1734 1735 /* 1736 * Rewrite the callers to not use the __cfi_ stubs, such that we might 1737 * rewrite them. This disables all CFI. If this succeeds but any of the 1738 * later stages fails, we're without CFI. 1739 */ 1740 ret = cfi_disable_callers(start_retpoline, end_retpoline); 1741 if (ret) 1742 goto err; 1743 1744 if (cfi_rand) { 1745 if (builtin) { 1746 cfi_seed = get_random_u32(); 1747 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); 1748 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); 1749 } 1750 1751 ret = cfi_rand_preamble(start_cfi, end_cfi); 1752 if (ret) 1753 goto err; 1754 1755 ret = cfi_rand_callers(start_retpoline, end_retpoline); 1756 if (ret) 1757 goto err; 1758 } 1759 1760 switch (cfi_mode) { 1761 case CFI_OFF: 1762 if (builtin) 1763 pr_info("Disabling CFI\n"); 1764 return; 1765 1766 case CFI_KCFI: 1767 ret = cfi_enable_callers(start_retpoline, end_retpoline); 1768 if (ret) 1769 goto err; 1770 1771 if (builtin) 1772 pr_info("Using kCFI\n"); 1773 return; 1774 1775 case CFI_FINEIBT: 1776 /* place the FineIBT preamble at func()-16 */ 1777 ret = cfi_rewrite_preamble(start_cfi, end_cfi); 1778 if (ret) 1779 goto err; 1780 1781 /* rewrite the callers to target func()-16 */ 1782 ret = cfi_rewrite_callers(start_retpoline, end_retpoline); 1783 if (ret) 1784 goto err; 1785 1786 /* now that nobody targets func()+0, remove ENDBR there */ 1787 cfi_rewrite_endbr(start_cfi, end_cfi); 1788 1789 if (builtin) { 1790 pr_info("Using %sFineIBT%s CFI\n", 1791 cfi_paranoid ? "paranoid " : "", 1792 cfi_bhi ? "+BHI" : ""); 1793 } 1794 return; 1795 1796 default: 1797 break; 1798 } 1799 1800 err: 1801 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); 1802 } 1803 1804 static inline void poison_hash(void *addr) 1805 { 1806 *(u32 *)addr = 0; 1807 } 1808 1809 static void poison_cfi(void *addr) 1810 { 1811 /* 1812 * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, 1813 * some (static) functions for which they can determine the address 1814 * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. 1815 * 1816 * As such, these functions will get sealed, but we need to be careful 1817 * to not unconditionally scribble the previous function. 1818 */ 1819 switch (cfi_mode) { 1820 case CFI_FINEIBT: 1821 /* 1822 * FineIBT prefix should start with an ENDBR. 1823 */ 1824 if (!is_endbr(addr)) 1825 break; 1826 1827 /* 1828 * __cfi_\func: 1829 * osp nopl (%rax) 1830 * subl $0, %r10d 1831 * jz 1f 1832 * ud2 1833 * 1: nop 1834 */ 1835 poison_endbr(addr); 1836 poison_hash(addr + fineibt_preamble_hash); 1837 break; 1838 1839 case CFI_KCFI: 1840 /* 1841 * kCFI prefix should start with a valid hash. 1842 */ 1843 if (!decode_preamble_hash(addr, NULL)) 1844 break; 1845 1846 /* 1847 * __cfi_\func: 1848 * movl $0, %eax 1849 * .skip 11, 0x90 1850 */ 1851 poison_hash(addr + 1); 1852 break; 1853 1854 default: 1855 break; 1856 } 1857 } 1858 1859 /* 1860 * When regs->ip points to a 0xEA byte in the FineIBT preamble, 1861 * return true and fill out target and type. 1862 * 1863 * We check the preamble by checking for the ENDBR instruction relative to the 1864 * 0xEA instruction. 1865 */ 1866 static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) 1867 { 1868 unsigned long addr = regs->ip - fineibt_preamble_ud; 1869 u32 hash; 1870 1871 if (!exact_endbr((void *)addr)) 1872 return false; 1873 1874 *target = addr + fineibt_preamble_size; 1875 1876 __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); 1877 *type = (u32)regs->r10 + hash; 1878 1879 /* 1880 * Since regs->ip points to the middle of an instruction; it cannot 1881 * continue with the normal fixup. 1882 */ 1883 regs->ip = *target; 1884 1885 return true; 1886 1887 Efault: 1888 return false; 1889 } 1890 1891 /* 1892 * regs->ip points to one of the UD2 in __bhi_args[]. 1893 */ 1894 static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) 1895 { 1896 unsigned long addr; 1897 u32 hash; 1898 1899 if (!cfi_bhi) 1900 return false; 1901 1902 if (regs->ip < (unsigned long)__bhi_args || 1903 regs->ip >= (unsigned long)__bhi_args_end) 1904 return false; 1905 1906 /* 1907 * Fetch the return address from the stack, this points to the 1908 * FineIBT preamble. Since the CALL instruction is in the 5 last 1909 * bytes of the preamble, the return address is in fact the target 1910 * address. 1911 */ 1912 __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); 1913 *target = addr; 1914 1915 addr -= fineibt_preamble_size; 1916 if (!exact_endbr((void *)addr)) 1917 return false; 1918 1919 __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); 1920 *type = (u32)regs->r10 + hash; 1921 1922 /* 1923 * The UD2 sites are constructed with a RET immediately following, 1924 * as such the non-fatal case can use the regular fixup. 1925 */ 1926 return true; 1927 1928 Efault: 1929 return false; 1930 } 1931 1932 static bool is_paranoid_thunk(unsigned long addr) 1933 { 1934 u32 thunk; 1935 1936 __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); 1937 return (thunk & 0x00FFFFFF) == 0xfd75ea; 1938 1939 Efault: 1940 return false; 1941 } 1942 1943 /* 1944 * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] 1945 * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS 1946 * thunk. 1947 */ 1948 static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) 1949 { 1950 unsigned long addr = regs->ip - fineibt_paranoid_ud; 1951 1952 if (!cfi_paranoid) 1953 return false; 1954 1955 if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { 1956 *target = regs->r11 + fineibt_preamble_size; 1957 *type = regs->r10; 1958 1959 /* 1960 * Since the trapping instruction is the exact, but LOCK prefixed, 1961 * Jcc.d8 that got us here, the normal fixup will work. 1962 */ 1963 return true; 1964 } 1965 1966 /* 1967 * The cfi_paranoid + ITS thunk combination results in: 1968 * 1969 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 1970 * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d 1971 * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 1972 * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 1973 * 1974 * Where the paranoid_thunk looks like: 1975 * 1976 * 1d: <ea> (bad) 1977 * __x86_indirect_paranoid_thunk_r11: 1978 * 1e: 75 fd jne 1d 1979 * __x86_indirect_its_thunk_r11: 1980 * 20: 41 ff eb jmp *%r11 1981 * 23: cc int3 1982 * 1983 */ 1984 if (is_paranoid_thunk(regs->ip)) { 1985 *target = regs->r11 + fineibt_preamble_size; 1986 *type = regs->r10; 1987 1988 regs->ip = *target; 1989 return true; 1990 } 1991 1992 return false; 1993 } 1994 1995 bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) 1996 { 1997 if (decode_fineibt_paranoid(regs, target, type)) 1998 return true; 1999 2000 if (decode_fineibt_bhi(regs, target, type)) 2001 return true; 2002 2003 return decode_fineibt_preamble(regs, target, type); 2004 } 2005 2006 #else /* !CONFIG_FINEIBT: */ 2007 2008 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 2009 s32 *start_cfi, s32 *end_cfi, bool builtin) 2010 { 2011 } 2012 2013 #ifdef CONFIG_X86_KERNEL_IBT 2014 static void poison_cfi(void *addr) { } 2015 #endif 2016 2017 #endif /* !CONFIG_FINEIBT */ 2018 2019 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 2020 s32 *start_cfi, s32 *end_cfi) 2021 { 2022 return __apply_fineibt(start_retpoline, end_retpoline, 2023 start_cfi, end_cfi, 2024 /* .builtin = */ false); 2025 } 2026 2027 #ifdef CONFIG_SMP 2028 static void alternatives_smp_lock(const s32 *start, const s32 *end, 2029 u8 *text, u8 *text_end) 2030 { 2031 const s32 *poff; 2032 2033 for (poff = start; poff < end; poff++) { 2034 u8 *ptr = (u8 *)poff + *poff; 2035 2036 if (!*poff || ptr < text || ptr >= text_end) 2037 continue; 2038 /* turn DS segment override prefix into lock prefix */ 2039 if (*ptr == 0x3e) 2040 text_poke(ptr, ((unsigned char []){0xf0}), 1); 2041 } 2042 } 2043 2044 static void alternatives_smp_unlock(const s32 *start, const s32 *end, 2045 u8 *text, u8 *text_end) 2046 { 2047 const s32 *poff; 2048 2049 for (poff = start; poff < end; poff++) { 2050 u8 *ptr = (u8 *)poff + *poff; 2051 2052 if (!*poff || ptr < text || ptr >= text_end) 2053 continue; 2054 /* turn lock prefix into DS segment override prefix */ 2055 if (*ptr == 0xf0) 2056 text_poke(ptr, ((unsigned char []){0x3E}), 1); 2057 } 2058 } 2059 2060 struct smp_alt_module { 2061 /* what is this ??? */ 2062 struct module *mod; 2063 char *name; 2064 2065 /* ptrs to lock prefixes */ 2066 const s32 *locks; 2067 const s32 *locks_end; 2068 2069 /* .text segment, needed to avoid patching init code ;) */ 2070 u8 *text; 2071 u8 *text_end; 2072 2073 struct list_head next; 2074 }; 2075 static LIST_HEAD(smp_alt_modules); 2076 static bool uniproc_patched = false; /* protected by text_mutex */ 2077 2078 void __init_or_module alternatives_smp_module_add(struct module *mod, 2079 char *name, 2080 void *locks, void *locks_end, 2081 void *text, void *text_end) 2082 { 2083 struct smp_alt_module *smp; 2084 2085 mutex_lock(&text_mutex); 2086 if (!uniproc_patched) 2087 goto unlock; 2088 2089 if (num_possible_cpus() == 1) 2090 /* Don't bother remembering, we'll never have to undo it. */ 2091 goto smp_unlock; 2092 2093 smp = kzalloc(sizeof(*smp), GFP_KERNEL); 2094 if (NULL == smp) 2095 /* we'll run the (safe but slow) SMP code then ... */ 2096 goto unlock; 2097 2098 smp->mod = mod; 2099 smp->name = name; 2100 smp->locks = locks; 2101 smp->locks_end = locks_end; 2102 smp->text = text; 2103 smp->text_end = text_end; 2104 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", 2105 smp->locks, smp->locks_end, 2106 smp->text, smp->text_end, smp->name); 2107 2108 list_add_tail(&smp->next, &smp_alt_modules); 2109 smp_unlock: 2110 alternatives_smp_unlock(locks, locks_end, text, text_end); 2111 unlock: 2112 mutex_unlock(&text_mutex); 2113 } 2114 2115 void __init_or_module alternatives_smp_module_del(struct module *mod) 2116 { 2117 struct smp_alt_module *item; 2118 2119 mutex_lock(&text_mutex); 2120 list_for_each_entry(item, &smp_alt_modules, next) { 2121 if (mod != item->mod) 2122 continue; 2123 list_del(&item->next); 2124 kfree(item); 2125 break; 2126 } 2127 mutex_unlock(&text_mutex); 2128 } 2129 2130 void alternatives_enable_smp(void) 2131 { 2132 struct smp_alt_module *mod; 2133 2134 /* Why bother if there are no other CPUs? */ 2135 BUG_ON(num_possible_cpus() == 1); 2136 2137 mutex_lock(&text_mutex); 2138 2139 if (uniproc_patched) { 2140 pr_info("switching to SMP code\n"); 2141 BUG_ON(num_online_cpus() != 1); 2142 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 2143 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 2144 list_for_each_entry(mod, &smp_alt_modules, next) 2145 alternatives_smp_lock(mod->locks, mod->locks_end, 2146 mod->text, mod->text_end); 2147 uniproc_patched = false; 2148 } 2149 mutex_unlock(&text_mutex); 2150 } 2151 2152 /* 2153 * Return 1 if the address range is reserved for SMP-alternatives. 2154 * Must hold text_mutex. 2155 */ 2156 int alternatives_text_reserved(void *start, void *end) 2157 { 2158 struct smp_alt_module *mod; 2159 const s32 *poff; 2160 u8 *text_start = start; 2161 u8 *text_end = end; 2162 2163 lockdep_assert_held(&text_mutex); 2164 2165 list_for_each_entry(mod, &smp_alt_modules, next) { 2166 if (mod->text > text_end || mod->text_end < text_start) 2167 continue; 2168 for (poff = mod->locks; poff < mod->locks_end; poff++) { 2169 const u8 *ptr = (const u8 *)poff + *poff; 2170 2171 if (text_start <= ptr && text_end > ptr) 2172 return 1; 2173 } 2174 } 2175 2176 return 0; 2177 } 2178 #endif /* CONFIG_SMP */ 2179 2180 /* 2181 * Self-test for the INT3 based CALL emulation code. 2182 * 2183 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up 2184 * properly and that there is a stack gap between the INT3 frame and the 2185 * previous context. Without this gap doing a virtual PUSH on the interrupted 2186 * stack would corrupt the INT3 IRET frame. 2187 * 2188 * See entry_{32,64}.S for more details. 2189 */ 2190 2191 /* 2192 * We define the int3_magic() function in assembly to control the calling 2193 * convention such that we can 'call' it from assembly. 2194 */ 2195 2196 extern void int3_magic(unsigned int *ptr); /* defined in asm */ 2197 2198 asm ( 2199 " .pushsection .init.text, \"ax\", @progbits\n" 2200 " .type int3_magic, @function\n" 2201 "int3_magic:\n" 2202 ANNOTATE_NOENDBR 2203 " movl $1, (%" _ASM_ARG1 ")\n" 2204 ASM_RET 2205 " .size int3_magic, .-int3_magic\n" 2206 " .popsection\n" 2207 ); 2208 2209 extern void int3_selftest_ip(void); /* defined in asm below */ 2210 2211 static int __init 2212 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) 2213 { 2214 unsigned long selftest = (unsigned long)&int3_selftest_ip; 2215 struct die_args *args = data; 2216 struct pt_regs *regs = args->regs; 2217 2218 OPTIMIZER_HIDE_VAR(selftest); 2219 2220 if (!regs || user_mode(regs)) 2221 return NOTIFY_DONE; 2222 2223 if (val != DIE_INT3) 2224 return NOTIFY_DONE; 2225 2226 if (regs->ip - INT3_INSN_SIZE != selftest) 2227 return NOTIFY_DONE; 2228 2229 int3_emulate_call(regs, (unsigned long)&int3_magic); 2230 return NOTIFY_STOP; 2231 } 2232 2233 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ 2234 static noinline void __init int3_selftest(void) 2235 { 2236 static __initdata struct notifier_block int3_exception_nb = { 2237 .notifier_call = int3_exception_notify, 2238 .priority = INT_MAX-1, /* last */ 2239 }; 2240 unsigned int val = 0; 2241 2242 BUG_ON(register_die_notifier(&int3_exception_nb)); 2243 2244 /* 2245 * Basically: int3_magic(&val); but really complicated :-) 2246 * 2247 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb 2248 * notifier above will emulate CALL for us. 2249 */ 2250 asm volatile ("int3_selftest_ip:\n\t" 2251 ANNOTATE_NOENDBR 2252 " int3; nop; nop; nop; nop\n\t" 2253 : ASM_CALL_CONSTRAINT 2254 : __ASM_SEL_RAW(a, D) (&val) 2255 : "memory"); 2256 2257 BUG_ON(val != 1); 2258 2259 unregister_die_notifier(&int3_exception_nb); 2260 } 2261 2262 static __initdata int __alt_reloc_selftest_addr; 2263 2264 extern void __init __alt_reloc_selftest(void *arg); 2265 __visible noinline void __init __alt_reloc_selftest(void *arg) 2266 { 2267 WARN_ON(arg != &__alt_reloc_selftest_addr); 2268 } 2269 2270 static noinline void __init alt_reloc_selftest(void) 2271 { 2272 /* 2273 * Tests text_poke_apply_relocation(). 2274 * 2275 * This has a relative immediate (CALL) in a place other than the first 2276 * instruction and additionally on x86_64 we get a RIP-relative LEA: 2277 * 2278 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c 2279 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 2280 * 2281 * Getting this wrong will either crash and burn or tickle the WARN 2282 * above. 2283 */ 2284 asm_inline volatile ( 2285 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) 2286 : ASM_CALL_CONSTRAINT 2287 : [mem] "m" (__alt_reloc_selftest_addr) 2288 : _ASM_ARG1 2289 ); 2290 } 2291 2292 void __init alternative_instructions(void) 2293 { 2294 u64 ibt; 2295 2296 int3_selftest(); 2297 2298 /* 2299 * The patching is not fully atomic, so try to avoid local 2300 * interruptions that might execute the to be patched code. 2301 * Other CPUs are not running. 2302 */ 2303 stop_nmi(); 2304 2305 /* 2306 * Don't stop machine check exceptions while patching. 2307 * MCEs only happen when something got corrupted and in this 2308 * case we must do something about the corruption. 2309 * Ignoring it is worse than an unlikely patching race. 2310 * Also machine checks tend to be broadcast and if one CPU 2311 * goes into machine check the others follow quickly, so we don't 2312 * expect a machine check to cause undue problems during to code 2313 * patching. 2314 */ 2315 2316 /* 2317 * Make sure to set (artificial) features depending on used paravirt 2318 * functions which can later influence alternative patching. 2319 */ 2320 paravirt_set_cap(); 2321 2322 /* Keep CET-IBT disabled until caller/callee are patched */ 2323 ibt = ibt_save(/*disable*/ true); 2324 2325 __apply_fineibt(__retpoline_sites, __retpoline_sites_end, 2326 __cfi_sites, __cfi_sites_end, true); 2327 2328 /* 2329 * Rewrite the retpolines, must be done before alternatives since 2330 * those can rewrite the retpoline thunks. 2331 */ 2332 apply_retpolines(__retpoline_sites, __retpoline_sites_end); 2333 apply_returns(__return_sites, __return_sites_end); 2334 2335 /* 2336 * Adjust all CALL instructions to point to func()-10, including 2337 * those in .altinstr_replacement. 2338 */ 2339 callthunks_patch_builtin_calls(); 2340 2341 apply_alternatives(__alt_instructions, __alt_instructions_end); 2342 2343 /* 2344 * Seal all functions that do not have their address taken. 2345 */ 2346 apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); 2347 2348 ibt_restore(ibt); 2349 2350 #ifdef CONFIG_SMP 2351 /* Patch to UP if other cpus not imminent. */ 2352 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { 2353 uniproc_patched = true; 2354 alternatives_smp_module_add(NULL, "core kernel", 2355 __smp_locks, __smp_locks_end, 2356 _text, _etext); 2357 } 2358 2359 if (!uniproc_patched || num_possible_cpus() == 1) { 2360 free_init_pages("SMP alternatives", 2361 (unsigned long)__smp_locks, 2362 (unsigned long)__smp_locks_end); 2363 } 2364 #endif 2365 2366 restart_nmi(); 2367 alternatives_patched = 1; 2368 2369 alt_reloc_selftest(); 2370 } 2371 2372 /** 2373 * text_poke_early - Update instructions on a live kernel at boot time 2374 * @addr: address to modify 2375 * @opcode: source of the copy 2376 * @len: length to copy 2377 * 2378 * When you use this code to patch more than one byte of an instruction 2379 * you need to make sure that other CPUs cannot execute this code in parallel. 2380 * Also no thread must be currently preempted in the middle of these 2381 * instructions. And on the local CPU you need to be protected against NMI or 2382 * MCE handlers seeing an inconsistent instruction while you patch. 2383 */ 2384 void __init_or_module text_poke_early(void *addr, const void *opcode, 2385 size_t len) 2386 { 2387 unsigned long flags; 2388 2389 if (boot_cpu_has(X86_FEATURE_NX) && 2390 is_module_text_address((unsigned long)addr)) { 2391 /* 2392 * Modules text is marked initially as non-executable, so the 2393 * code cannot be running and speculative code-fetches are 2394 * prevented. Just change the code. 2395 */ 2396 memcpy(addr, opcode, len); 2397 } else { 2398 local_irq_save(flags); 2399 memcpy(addr, opcode, len); 2400 sync_core(); 2401 local_irq_restore(flags); 2402 2403 /* 2404 * Could also do a CLFLUSH here to speed up CPU recovery; but 2405 * that causes hangs on some VIA CPUs. 2406 */ 2407 } 2408 } 2409 2410 __ro_after_init struct mm_struct *text_poke_mm; 2411 __ro_after_init unsigned long text_poke_mm_addr; 2412 2413 static void text_poke_memcpy(void *dst, const void *src, size_t len) 2414 { 2415 memcpy(dst, src, len); 2416 } 2417 2418 static void text_poke_memset(void *dst, const void *src, size_t len) 2419 { 2420 int c = *(const int *)src; 2421 2422 memset(dst, c, len); 2423 } 2424 2425 typedef void text_poke_f(void *dst, const void *src, size_t len); 2426 2427 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) 2428 { 2429 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; 2430 struct page *pages[2] = {NULL}; 2431 struct mm_struct *prev_mm; 2432 unsigned long flags; 2433 pte_t pte, *ptep; 2434 spinlock_t *ptl; 2435 pgprot_t pgprot; 2436 2437 /* 2438 * While boot memory allocator is running we cannot use struct pages as 2439 * they are not yet initialized. There is no way to recover. 2440 */ 2441 BUG_ON(!after_bootmem); 2442 2443 if (!core_kernel_text((unsigned long)addr)) { 2444 pages[0] = vmalloc_to_page(addr); 2445 if (cross_page_boundary) 2446 pages[1] = vmalloc_to_page(addr + PAGE_SIZE); 2447 } else { 2448 pages[0] = virt_to_page(addr); 2449 WARN_ON(!PageReserved(pages[0])); 2450 if (cross_page_boundary) 2451 pages[1] = virt_to_page(addr + PAGE_SIZE); 2452 } 2453 /* 2454 * If something went wrong, crash and burn since recovery paths are not 2455 * implemented. 2456 */ 2457 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); 2458 2459 /* 2460 * Map the page without the global bit, as TLB flushing is done with 2461 * flush_tlb_mm_range(), which is intended for non-global PTEs. 2462 */ 2463 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); 2464 2465 /* 2466 * The lock is not really needed, but this allows to avoid open-coding. 2467 */ 2468 ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); 2469 2470 /* 2471 * This must not fail; preallocated in poking_init(). 2472 */ 2473 VM_BUG_ON(!ptep); 2474 2475 local_irq_save(flags); 2476 2477 pte = mk_pte(pages[0], pgprot); 2478 set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); 2479 2480 if (cross_page_boundary) { 2481 pte = mk_pte(pages[1], pgprot); 2482 set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); 2483 } 2484 2485 /* 2486 * Loading the temporary mm behaves as a compiler barrier, which 2487 * guarantees that the PTE will be set at the time memcpy() is done. 2488 */ 2489 prev_mm = use_temporary_mm(text_poke_mm); 2490 2491 kasan_disable_current(); 2492 func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); 2493 kasan_enable_current(); 2494 2495 /* 2496 * Ensure that the PTE is only cleared after the instructions of memcpy 2497 * were issued by using a compiler barrier. 2498 */ 2499 barrier(); 2500 2501 pte_clear(text_poke_mm, text_poke_mm_addr, ptep); 2502 if (cross_page_boundary) 2503 pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); 2504 2505 /* 2506 * Loading the previous page-table hierarchy requires a serializing 2507 * instruction that already allows the core to see the updated version. 2508 * Xen-PV is assumed to serialize execution in a similar manner. 2509 */ 2510 unuse_temporary_mm(prev_mm); 2511 2512 /* 2513 * Flushing the TLB might involve IPIs, which would require enabled 2514 * IRQs, but not if the mm is not used, as it is in this point. 2515 */ 2516 flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + 2517 (cross_page_boundary ? 2 : 1) * PAGE_SIZE, 2518 PAGE_SHIFT, false); 2519 2520 if (func == text_poke_memcpy) { 2521 /* 2522 * If the text does not match what we just wrote then something is 2523 * fundamentally screwy; there's nothing we can really do about that. 2524 */ 2525 BUG_ON(memcmp(addr, src, len)); 2526 } 2527 2528 local_irq_restore(flags); 2529 pte_unmap_unlock(ptep, ptl); 2530 return addr; 2531 } 2532 2533 /** 2534 * text_poke - Update instructions on a live kernel 2535 * @addr: address to modify 2536 * @opcode: source of the copy 2537 * @len: length to copy 2538 * 2539 * Only atomic text poke/set should be allowed when not doing early patching. 2540 * It means the size must be writable atomically and the address must be aligned 2541 * in a way that permits an atomic write. It also makes sure we fit on a single 2542 * page. 2543 * 2544 * Note that the caller must ensure that if the modified code is part of a 2545 * module, the module would not be removed during poking. This can be achieved 2546 * by registering a module notifier, and ordering module removal and patching 2547 * through a mutex. 2548 */ 2549 void *text_poke(void *addr, const void *opcode, size_t len) 2550 { 2551 lockdep_assert_held(&text_mutex); 2552 2553 return __text_poke(text_poke_memcpy, addr, opcode, len); 2554 } 2555 2556 /** 2557 * text_poke_kgdb - Update instructions on a live kernel by kgdb 2558 * @addr: address to modify 2559 * @opcode: source of the copy 2560 * @len: length to copy 2561 * 2562 * Only atomic text poke/set should be allowed when not doing early patching. 2563 * It means the size must be writable atomically and the address must be aligned 2564 * in a way that permits an atomic write. It also makes sure we fit on a single 2565 * page. 2566 * 2567 * Context: should only be used by kgdb, which ensures no other core is running, 2568 * despite the fact it does not hold the text_mutex. 2569 */ 2570 void *text_poke_kgdb(void *addr, const void *opcode, size_t len) 2571 { 2572 return __text_poke(text_poke_memcpy, addr, opcode, len); 2573 } 2574 2575 void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, 2576 bool core_ok) 2577 { 2578 unsigned long start = (unsigned long)addr; 2579 size_t patched = 0; 2580 2581 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) 2582 return NULL; 2583 2584 while (patched < len) { 2585 unsigned long ptr = start + patched; 2586 size_t s; 2587 2588 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 2589 2590 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); 2591 patched += s; 2592 } 2593 return addr; 2594 } 2595 2596 /** 2597 * text_poke_copy - Copy instructions into (an unused part of) RX memory 2598 * @addr: address to modify 2599 * @opcode: source of the copy 2600 * @len: length to copy, could be more than 2x PAGE_SIZE 2601 * 2602 * Not safe against concurrent execution; useful for JITs to dump 2603 * new code blocks into unused regions of RX memory. Can be used in 2604 * conjunction with synchronize_rcu_tasks() to wait for existing 2605 * execution to quiesce after having made sure no existing functions 2606 * pointers are live. 2607 */ 2608 void *text_poke_copy(void *addr, const void *opcode, size_t len) 2609 { 2610 mutex_lock(&text_mutex); 2611 addr = text_poke_copy_locked(addr, opcode, len, false); 2612 mutex_unlock(&text_mutex); 2613 return addr; 2614 } 2615 2616 /** 2617 * text_poke_set - memset into (an unused part of) RX memory 2618 * @addr: address to modify 2619 * @c: the byte to fill the area with 2620 * @len: length to copy, could be more than 2x PAGE_SIZE 2621 * 2622 * This is useful to overwrite unused regions of RX memory with illegal 2623 * instructions. 2624 */ 2625 void *text_poke_set(void *addr, int c, size_t len) 2626 { 2627 unsigned long start = (unsigned long)addr; 2628 size_t patched = 0; 2629 2630 if (WARN_ON_ONCE(core_kernel_text(start))) 2631 return NULL; 2632 2633 mutex_lock(&text_mutex); 2634 while (patched < len) { 2635 unsigned long ptr = start + patched; 2636 size_t s; 2637 2638 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); 2639 2640 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); 2641 patched += s; 2642 } 2643 mutex_unlock(&text_mutex); 2644 return addr; 2645 } 2646 2647 static void do_sync_core(void *info) 2648 { 2649 sync_core(); 2650 } 2651 2652 void smp_text_poke_sync_each_cpu(void) 2653 { 2654 on_each_cpu(do_sync_core, NULL, 1); 2655 } 2656 2657 /* 2658 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of 2659 * this thing. When len == 6 everything is prefixed with 0x0f and we map 2660 * opcode to Jcc.d8, using len to distinguish. 2661 */ 2662 struct smp_text_poke_loc { 2663 /* addr := _stext + rel_addr */ 2664 s32 rel_addr; 2665 s32 disp; 2666 u8 len; 2667 u8 opcode; 2668 const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; 2669 /* see smp_text_poke_batch_finish() */ 2670 u8 old; 2671 }; 2672 2673 #define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) 2674 2675 static struct smp_text_poke_array { 2676 struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; 2677 int nr_entries; 2678 } text_poke_array; 2679 2680 static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); 2681 2682 /* 2683 * These four __always_inline annotations imply noinstr, necessary 2684 * due to smp_text_poke_int3_handler() being noinstr: 2685 */ 2686 2687 static __always_inline bool try_get_text_poke_array(void) 2688 { 2689 atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); 2690 2691 if (!raw_atomic_inc_not_zero(refs)) 2692 return false; 2693 2694 return true; 2695 } 2696 2697 static __always_inline void put_text_poke_array(void) 2698 { 2699 atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); 2700 2701 smp_mb__before_atomic(); 2702 raw_atomic_dec(refs); 2703 } 2704 2705 static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) 2706 { 2707 return _stext + tpl->rel_addr; 2708 } 2709 2710 static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) 2711 { 2712 if (tpl_a < text_poke_addr(tpl_b)) 2713 return -1; 2714 if (tpl_a > text_poke_addr(tpl_b)) 2715 return 1; 2716 return 0; 2717 } 2718 2719 noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) 2720 { 2721 struct smp_text_poke_loc *tpl; 2722 int ret = 0; 2723 void *ip; 2724 2725 if (user_mode(regs)) 2726 return 0; 2727 2728 /* 2729 * Having observed our INT3 instruction, we now must observe 2730 * text_poke_array with non-zero refcount: 2731 * 2732 * text_poke_array_refs = 1 INT3 2733 * WMB RMB 2734 * write INT3 if (text_poke_array_refs != 0) 2735 */ 2736 smp_rmb(); 2737 2738 if (!try_get_text_poke_array()) 2739 return 0; 2740 2741 /* 2742 * Discount the INT3. See smp_text_poke_batch_finish(). 2743 */ 2744 ip = (void *) regs->ip - INT3_INSN_SIZE; 2745 2746 /* 2747 * Skip the binary search if there is a single member in the vector. 2748 */ 2749 if (unlikely(text_poke_array.nr_entries > 1)) { 2750 tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, 2751 sizeof(struct smp_text_poke_loc), 2752 patch_cmp); 2753 if (!tpl) 2754 goto out_put; 2755 } else { 2756 tpl = text_poke_array.vec; 2757 if (text_poke_addr(tpl) != ip) 2758 goto out_put; 2759 } 2760 2761 ip += tpl->len; 2762 2763 switch (tpl->opcode) { 2764 case INT3_INSN_OPCODE: 2765 /* 2766 * Someone poked an explicit INT3, they'll want to handle it, 2767 * do not consume. 2768 */ 2769 goto out_put; 2770 2771 case RET_INSN_OPCODE: 2772 int3_emulate_ret(regs); 2773 break; 2774 2775 case CALL_INSN_OPCODE: 2776 int3_emulate_call(regs, (long)ip + tpl->disp); 2777 break; 2778 2779 case JMP32_INSN_OPCODE: 2780 case JMP8_INSN_OPCODE: 2781 int3_emulate_jmp(regs, (long)ip + tpl->disp); 2782 break; 2783 2784 case 0x70 ... 0x7f: /* Jcc */ 2785 int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); 2786 break; 2787 2788 default: 2789 BUG(); 2790 } 2791 2792 ret = 1; 2793 2794 out_put: 2795 put_text_poke_array(); 2796 return ret; 2797 } 2798 2799 /** 2800 * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP 2801 * 2802 * Input state: 2803 * text_poke_array.vec: vector of instructions to patch 2804 * text_poke_array.nr_entries: number of entries in the vector 2805 * 2806 * Modify multi-byte instructions by using INT3 breakpoints on SMP. 2807 * We completely avoid using stop_machine() here, and achieve the 2808 * synchronization using INT3 breakpoints and SMP cross-calls. 2809 * 2810 * The way it is done: 2811 * - For each entry in the vector: 2812 * - add an INT3 trap to the address that will be patched 2813 * - SMP sync all CPUs 2814 * - For each entry in the vector: 2815 * - update all but the first byte of the patched range 2816 * - SMP sync all CPUs 2817 * - For each entry in the vector: 2818 * - replace the first byte (INT3) by the first byte of the 2819 * replacing opcode 2820 * - SMP sync all CPUs 2821 */ 2822 void smp_text_poke_batch_finish(void) 2823 { 2824 unsigned char int3 = INT3_INSN_OPCODE; 2825 unsigned int i; 2826 int do_sync; 2827 2828 if (!text_poke_array.nr_entries) 2829 return; 2830 2831 lockdep_assert_held(&text_mutex); 2832 2833 /* 2834 * Corresponds to the implicit memory barrier in try_get_text_poke_array() to 2835 * ensure reading a non-zero refcount provides up to date text_poke_array data. 2836 */ 2837 for_each_possible_cpu(i) 2838 atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); 2839 2840 /* 2841 * Function tracing can enable thousands of places that need to be 2842 * updated. This can take quite some time, and with full kernel debugging 2843 * enabled, this could cause the softlockup watchdog to trigger. 2844 * This function gets called every 256 entries added to be patched. 2845 * Call cond_resched() here to make sure that other tasks can get scheduled 2846 * while processing all the functions being patched. 2847 */ 2848 cond_resched(); 2849 2850 /* 2851 * Corresponding read barrier in INT3 notifier for making sure the 2852 * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. 2853 */ 2854 smp_wmb(); 2855 2856 /* 2857 * First step: add a INT3 trap to the address that will be patched. 2858 */ 2859 for (i = 0; i < text_poke_array.nr_entries; i++) { 2860 text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); 2861 text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); 2862 } 2863 2864 smp_text_poke_sync_each_cpu(); 2865 2866 /* 2867 * Second step: update all but the first byte of the patched range. 2868 */ 2869 for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { 2870 u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; 2871 u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; 2872 const u8 *new = text_poke_array.vec[i].text; 2873 int len = text_poke_array.vec[i].len; 2874 2875 if (len - INT3_INSN_SIZE > 0) { 2876 memcpy(old + INT3_INSN_SIZE, 2877 text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, 2878 len - INT3_INSN_SIZE); 2879 2880 if (len == 6) { 2881 _new[0] = 0x0f; 2882 memcpy(_new + 1, new, 5); 2883 new = _new; 2884 } 2885 2886 text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, 2887 new + INT3_INSN_SIZE, 2888 len - INT3_INSN_SIZE); 2889 2890 do_sync++; 2891 } 2892 2893 /* 2894 * Emit a perf event to record the text poke, primarily to 2895 * support Intel PT decoding which must walk the executable code 2896 * to reconstruct the trace. The flow up to here is: 2897 * - write INT3 byte 2898 * - IPI-SYNC 2899 * - write instruction tail 2900 * At this point the actual control flow will be through the 2901 * INT3 and handler and not hit the old or new instruction. 2902 * Intel PT outputs FUP/TIP packets for the INT3, so the flow 2903 * can still be decoded. Subsequently: 2904 * - emit RECORD_TEXT_POKE with the new instruction 2905 * - IPI-SYNC 2906 * - write first byte 2907 * - IPI-SYNC 2908 * So before the text poke event timestamp, the decoder will see 2909 * either the old instruction flow or FUP/TIP of INT3. After the 2910 * text poke event timestamp, the decoder will see either the 2911 * new instruction flow or FUP/TIP of INT3. Thus decoders can 2912 * use the timestamp as the point at which to modify the 2913 * executable code. 2914 * The old instruction is recorded so that the event can be 2915 * processed forwards or backwards. 2916 */ 2917 perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); 2918 } 2919 2920 if (do_sync) { 2921 /* 2922 * According to Intel, this core syncing is very likely 2923 * not necessary and we'd be safe even without it. But 2924 * better safe than sorry (plus there's not only Intel). 2925 */ 2926 smp_text_poke_sync_each_cpu(); 2927 } 2928 2929 /* 2930 * Third step: replace the first byte (INT3) by the first byte of the 2931 * replacing opcode. 2932 */ 2933 for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { 2934 u8 byte = text_poke_array.vec[i].text[0]; 2935 2936 if (text_poke_array.vec[i].len == 6) 2937 byte = 0x0f; 2938 2939 if (byte == INT3_INSN_OPCODE) 2940 continue; 2941 2942 text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); 2943 do_sync++; 2944 } 2945 2946 if (do_sync) 2947 smp_text_poke_sync_each_cpu(); 2948 2949 /* 2950 * Remove and wait for refs to be zero. 2951 * 2952 * Notably, if after step-3 above the INT3 got removed, then the 2953 * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 2954 * handlers and the below spin-wait will not happen. 2955 * 2956 * IOW. unless the replacement instruction is INT3, this case goes 2957 * unused. 2958 */ 2959 for_each_possible_cpu(i) { 2960 atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); 2961 2962 if (unlikely(!atomic_dec_and_test(refs))) 2963 atomic_cond_read_acquire(refs, !VAL); 2964 } 2965 2966 /* They are all completed: */ 2967 text_poke_array.nr_entries = 0; 2968 } 2969 2970 static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) 2971 { 2972 struct smp_text_poke_loc *tpl; 2973 struct insn insn; 2974 int ret, i = 0; 2975 2976 tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; 2977 2978 if (len == 6) 2979 i = 1; 2980 memcpy((void *)tpl->text, opcode+i, len-i); 2981 if (!emulate) 2982 emulate = opcode; 2983 2984 ret = insn_decode_kernel(&insn, emulate); 2985 BUG_ON(ret < 0); 2986 2987 tpl->rel_addr = addr - (void *)_stext; 2988 tpl->len = len; 2989 tpl->opcode = insn.opcode.bytes[0]; 2990 2991 if (is_jcc32(&insn)) { 2992 /* 2993 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. 2994 */ 2995 tpl->opcode = insn.opcode.bytes[1] - 0x10; 2996 } 2997 2998 switch (tpl->opcode) { 2999 case RET_INSN_OPCODE: 3000 case JMP32_INSN_OPCODE: 3001 case JMP8_INSN_OPCODE: 3002 /* 3003 * Control flow instructions without implied execution of the 3004 * next instruction can be padded with INT3. 3005 */ 3006 for (i = insn.length; i < len; i++) 3007 BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); 3008 break; 3009 3010 default: 3011 BUG_ON(len != insn.length); 3012 } 3013 3014 switch (tpl->opcode) { 3015 case INT3_INSN_OPCODE: 3016 case RET_INSN_OPCODE: 3017 break; 3018 3019 case CALL_INSN_OPCODE: 3020 case JMP32_INSN_OPCODE: 3021 case JMP8_INSN_OPCODE: 3022 case 0x70 ... 0x7f: /* Jcc */ 3023 tpl->disp = insn.immediate.value; 3024 break; 3025 3026 default: /* assume NOP */ 3027 switch (len) { 3028 case 2: /* NOP2 -- emulate as JMP8+0 */ 3029 BUG_ON(memcmp(emulate, x86_nops[len], len)); 3030 tpl->opcode = JMP8_INSN_OPCODE; 3031 tpl->disp = 0; 3032 break; 3033 3034 case 5: /* NOP5 -- emulate as JMP32+0 */ 3035 BUG_ON(memcmp(emulate, x86_nops[len], len)); 3036 tpl->opcode = JMP32_INSN_OPCODE; 3037 tpl->disp = 0; 3038 break; 3039 3040 default: /* unknown instruction */ 3041 BUG(); 3042 } 3043 break; 3044 } 3045 } 3046 3047 /* 3048 * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing 3049 * early if needed. 3050 */ 3051 static bool text_poke_addr_ordered(void *addr) 3052 { 3053 WARN_ON_ONCE(!addr); 3054 3055 if (!text_poke_array.nr_entries) 3056 return true; 3057 3058 /* 3059 * If the last current entry's address is higher than the 3060 * new entry's address we'd like to add, then ordering 3061 * is violated and we must first flush all pending patching 3062 * requests: 3063 */ 3064 if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) 3065 return false; 3066 3067 return true; 3068 } 3069 3070 /** 3071 * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched 3072 * @addr: address to patch 3073 * @opcode: opcode of new instruction 3074 * @len: length to copy 3075 * @emulate: instruction to be emulated 3076 * 3077 * Add a new instruction to the current queue of to-be-patched instructions 3078 * the kernel maintains. The patching request will not be executed immediately, 3079 * but becomes part of an array of patching requests, optimized for batched 3080 * execution. All pending patching requests will be executed on the next 3081 * smp_text_poke_batch_finish() call. 3082 */ 3083 void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) 3084 { 3085 if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) 3086 smp_text_poke_batch_finish(); 3087 __smp_text_poke_batch_add(addr, opcode, len, emulate); 3088 } 3089 3090 /** 3091 * smp_text_poke_single() -- update instruction on live kernel on SMP immediately 3092 * @addr: address to patch 3093 * @opcode: opcode of new instruction 3094 * @len: length to copy 3095 * @emulate: instruction to be emulated 3096 * 3097 * Update a single instruction with the vector in the stack, avoiding 3098 * dynamically allocated memory. This function should be used when it is 3099 * not possible to allocate memory for a vector. The single instruction 3100 * is patched in immediately. 3101 */ 3102 void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) 3103 { 3104 __smp_text_poke_batch_add(addr, opcode, len, emulate); 3105 smp_text_poke_batch_finish(); 3106 } 3107