xref: /linux/arch/x86/kernel/alternative.c (revision 7a9b709e7cc5ce1ffb84ce07bf6d157e1de758df)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #define pr_fmt(fmt) "SMP alternatives: " fmt
3 
4 #include <linux/module.h>
5 #include <linux/sched.h>
6 #include <linux/perf_event.h>
7 #include <linux/mutex.h>
8 #include <linux/list.h>
9 #include <linux/stringify.h>
10 #include <linux/highmem.h>
11 #include <linux/mm.h>
12 #include <linux/vmalloc.h>
13 #include <linux/memory.h>
14 #include <linux/stop_machine.h>
15 #include <linux/slab.h>
16 #include <linux/kdebug.h>
17 #include <linux/kprobes.h>
18 #include <linux/mmu_context.h>
19 #include <linux/bsearch.h>
20 #include <linux/sync_core.h>
21 #include <linux/execmem.h>
22 #include <asm/text-patching.h>
23 #include <asm/alternative.h>
24 #include <asm/sections.h>
25 #include <asm/mce.h>
26 #include <asm/nmi.h>
27 #include <asm/cacheflush.h>
28 #include <asm/tlbflush.h>
29 #include <asm/insn.h>
30 #include <asm/io.h>
31 #include <asm/fixmap.h>
32 #include <asm/paravirt.h>
33 #include <asm/asm-prototypes.h>
34 #include <asm/cfi.h>
35 #include <asm/ibt.h>
36 #include <asm/set_memory.h>
37 
38 int __read_mostly alternatives_patched;
39 
40 EXPORT_SYMBOL_GPL(alternatives_patched);
41 
42 #define MAX_PATCH_LEN (255-1)
43 
44 #define DA_ALL		(~0)
45 #define DA_ALT		0x01
46 #define DA_RET		0x02
47 #define DA_RETPOLINE	0x04
48 #define DA_ENDBR	0x08
49 #define DA_SMP		0x10
50 
51 static unsigned int debug_alternative;
52 
53 static int __init debug_alt(char *str)
54 {
55 	if (str && *str == '=')
56 		str++;
57 
58 	if (!str || kstrtouint(str, 0, &debug_alternative))
59 		debug_alternative = DA_ALL;
60 
61 	return 1;
62 }
63 __setup("debug-alternative", debug_alt);
64 
65 static int noreplace_smp;
66 
67 static int __init setup_noreplace_smp(char *str)
68 {
69 	noreplace_smp = 1;
70 	return 1;
71 }
72 __setup("noreplace-smp", setup_noreplace_smp);
73 
74 #define DPRINTK(type, fmt, args...)					\
75 do {									\
76 	if (debug_alternative & DA_##type)				\
77 		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
78 } while (0)
79 
80 #define DUMP_BYTES(type, buf, len, fmt, args...)			\
81 do {									\
82 	if (unlikely(debug_alternative & DA_##type)) {			\
83 		int j;							\
84 									\
85 		if (!(len))						\
86 			break;						\
87 									\
88 		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
89 		for (j = 0; j < (len) - 1; j++)				\
90 			printk(KERN_CONT "%02hhx ", buf[j]);		\
91 		printk(KERN_CONT "%02hhx\n", buf[j]);			\
92 	}								\
93 } while (0)
94 
95 static const unsigned char x86nops[] =
96 {
97 	BYTES_NOP1,
98 	BYTES_NOP2,
99 	BYTES_NOP3,
100 	BYTES_NOP4,
101 	BYTES_NOP5,
102 	BYTES_NOP6,
103 	BYTES_NOP7,
104 	BYTES_NOP8,
105 #ifdef CONFIG_64BIT
106 	BYTES_NOP9,
107 	BYTES_NOP10,
108 	BYTES_NOP11,
109 #endif
110 };
111 
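/*
 * x86_nops[len] points at a single NOP instruction of exactly 'len' bytes
 * inside the concatenated x86nops[] blob above; index 0 is unused.
 */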
112 const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
113 {
114 	NULL,
115 	x86nops,
116 	x86nops + 1,
117 	x86nops + 1 + 2,
118 	x86nops + 1 + 2 + 3,
119 	x86nops + 1 + 2 + 3 + 4,
120 	x86nops + 1 + 2 + 3 + 4 + 5,
121 	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
122 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
123 #ifdef CONFIG_64BIT
124 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
125 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
126 	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
127 #endif
128 };
129 
130 #ifdef CONFIG_FINEIBT
131 static bool cfi_paranoid __ro_after_init;
132 #endif
133 
134 #ifdef CONFIG_MITIGATION_ITS
135 
136 static struct module *its_mod;
137 static void *its_page;
138 static unsigned int its_offset;
139 
140 /* Initialize a thunk with the "jmp *reg; int3" instructions. */
141 static void *its_init_thunk(void *thunk, int reg)
142 {
143 	u8 *bytes = thunk;
144 	int offset = 0;
145 	int i = 0;
146 
147 #ifdef CONFIG_FINEIBT
148 	if (cfi_paranoid) {
149 		/*
150 		 * When ITS uses an indirect branch thunk, the fineibt_paranoid
151 		 * caller sequence doesn't fit in the caller site. So put the
152 		 * remaining part of the sequence (<ea> + JNE) into the ITS
153 		 * thunk.
154 		 */
155 		bytes[i++] = 0xea; /* invalid instruction */
156 		bytes[i++] = 0x75; /* JNE */
157 		bytes[i++] = 0xfd;
158 
159 		offset = 1;
160 	}
161 #endif
162 
163 	if (reg >= 8) {
164 		bytes[i++] = 0x41; /* REX.B prefix */
165 		reg -= 8;
166 	}
167 	bytes[i++] = 0xff;
168 	bytes[i++] = 0xe0 + reg; /* jmp *reg */
169 	bytes[i++] = 0xcc;
170 
171 	return thunk + offset;
172 }
173 
174 void its_init_mod(struct module *mod)
175 {
176 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
177 		return;
178 
179 	mutex_lock(&text_mutex);
180 	its_mod = mod;
181 	its_page = NULL;
182 }
183 
184 void its_fini_mod(struct module *mod)
185 {
186 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
187 		return;
188 
189 	WARN_ON_ONCE(its_mod != mod);
190 
191 	its_mod = NULL;
192 	its_page = NULL;
193 	mutex_unlock(&text_mutex);
194 
195 	for (int i = 0; i < mod->its_num_pages; i++) {
196 		void *page = mod->its_page_array[i];
197 		execmem_restore_rox(page, PAGE_SIZE);
198 	}
199 }
200 
201 void its_free_mod(struct module *mod)
202 {
203 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
204 		return;
205 
206 	for (int i = 0; i < mod->its_num_pages; i++) {
207 		void *page = mod->its_page_array[i];
208 		execmem_free(page);
209 	}
210 	kfree(mod->its_page_array);
211 }
212 
213 static void *its_alloc(void)
214 {
215 	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
216 
217 	if (!page)
218 		return NULL;
219 
220 	if (its_mod) {
221 		void *tmp = krealloc(its_mod->its_page_array,
222 				     (its_mod->its_num_pages+1) * sizeof(void *),
223 				     GFP_KERNEL);
224 		if (!tmp)
225 			return NULL;
226 
227 		its_mod->its_page_array = tmp;
228 		its_mod->its_page_array[its_mod->its_num_pages++] = page;
229 
230 		execmem_make_temp_rw(page, PAGE_SIZE);
231 	}
232 
233 	return no_free_ptr(page);
234 }
235 
236 static void *its_allocate_thunk(int reg)
237 {
238 	int size = 3 + (reg / 8);
239 	void *thunk;
240 
241 #ifdef CONFIG_FINEIBT
242 	/*
243 	 * The ITS thunk contains an indirect jump and an int3 instruction so
244 	 * its size is 3 or 4 bytes depending on the register used. If CFI
245 	 * paranoid is used then 3 extra bytes are added in the ITS thunk to
246 	 * complete the fineibt_paranoid caller sequence.
247 	 */
248 	if (cfi_paranoid)
249 		size += 3;
250 #endif
251 
252 	if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
253 		its_page = its_alloc();
254 		if (!its_page) {
255 			pr_err("ITS page allocation failed\n");
256 			return NULL;
257 		}
258 		memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
259 		its_offset = 32;
260 	}
261 
262 	/*
263 	 * If the indirect branch instruction will be in the lower half
264 	 * of a cacheline, then update the offset to reach the upper half.
265 	 */
266 	if ((its_offset + size - 1) % 64 < 32)
267 		its_offset = ((its_offset - 1) | 0x3F) + 33;
268 
269 	thunk = its_page + its_offset;
270 	its_offset += size;
271 
272 	return its_init_thunk(thunk, reg);
273 }
274 
275 u8 *its_static_thunk(int reg)
276 {
277 	u8 *thunk = __x86_indirect_its_thunk_array[reg];
278 
279 #ifdef CONFIG_FINEIBT
280 	/* Paranoid thunk starts 2 bytes before */
281 	if (cfi_paranoid)
282 		return thunk - 2;
283 #endif
284 	return thunk;
285 }
286 
287 #endif
288 
289 /*
290  * Nomenclature for variable names to simplify and clarify this code and ease
291  * any potential staring at it:
292  *
293  * @instr: source address of the original instructions in the kernel text as
294  * generated by the compiler.
295  *
296  * @buf: temporary buffer on which the patching operates. This buffer is
297  * eventually text-poked into the kernel image.
298  *
299  * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
300  * in the .altinstr_replacement section.
301  */
302 
303 /*
304  * Fill the buffer with a single effective instruction of size @len.
305  *
306  * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
307  * for every single-byte NOP, try to generate the largest available NOP of
308  * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
309  * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
310  * *jump* over instead of executing long and daft NOPs.
311  */
312 static void add_nop(u8 *buf, unsigned int len)
313 {
314 	u8 *target = buf + len;
315 
316 	if (!len)
317 		return;
318 
319 	if (len <= ASM_NOP_MAX) {
320 		memcpy(buf, x86_nops[len], len);
321 		return;
322 	}
323 
324 	if (len < 128) {
325 		__text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
326 		buf += JMP8_INSN_SIZE;
327 	} else {
328 		__text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
329 		buf += JMP32_INSN_SIZE;
330 	}
331 
332 	for (; buf < target; buf++)
333 		*buf = INT3_INSN_OPCODE;
334 }
335 
336 extern s32 __retpoline_sites[], __retpoline_sites_end[];
337 extern s32 __return_sites[], __return_sites_end[];
338 extern s32 __cfi_sites[], __cfi_sites_end[];
339 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
340 extern s32 __smp_locks[], __smp_locks_end[];
341 void text_poke_early(void *addr, const void *opcode, size_t len);
342 
343 /*
344  * Matches NOP and NOPL, not any of the other possible NOPs.
345  */
346 static bool insn_is_nop(struct insn *insn)
347 {
348 	/* Anything NOP, but no REP NOP */
349 	if (insn->opcode.bytes[0] == 0x90 &&
350 	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
351 		return true;
352 
353 	/* NOPL */
354 	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
355 		return true;
356 
357 	/* TODO: more nops */
358 
359 	return false;
360 }
361 
362 /*
363  * Find the offset of the first non-NOP instruction starting at @offset
364  * but no further than @len.
365  */
366 static int skip_nops(u8 *buf, int offset, int len)
367 {
368 	struct insn insn;
369 
370 	for (; offset < len; offset += insn.length) {
371 		if (insn_decode_kernel(&insn, &buf[offset]))
372 			break;
373 
374 		if (!insn_is_nop(&insn))
375 			break;
376 	}
377 
378 	return offset;
379 }
380 
381 /*
382  * "noinline" to cause control flow change and thus invalidate I$ and
383  * cause refetch after modification.
384  */
385 static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
386 {
387 	for (int next, i = 0; i < len; i = next) {
388 		struct insn insn;
389 
390 		if (insn_decode_kernel(&insn, &buf[i]))
391 			return;
392 
393 		next = i + insn.length;
394 
395 		if (insn_is_nop(&insn)) {
396 			int nop = i;
397 
398 			/* Has the NOP already been optimized? */
399 			if (i + insn.length == len)
400 				return;
401 
402 			next = skip_nops(buf, next, len);
403 
404 			add_nop(buf + nop, next - nop);
405 			DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
406 		}
407 	}
408 }
409 
410 /*
411  * In this context, "source" is where the instructions are placed in the
412  * section .altinstr_replacement, for example during kernel build by the
413  * toolchain.
414  * "Destination" is where the instructions are being patched in by this
415  * machinery.
416  *
417  * The source offset is:
418  *
419  *   src_imm = target - src_next_ip                  (1)
420  *
421  * and the target offset is:
422  *
423  *   dst_imm = target - dst_next_ip                  (2)
424  *
425  * so rework (1) as an expression for target like:
426  *
427  *   target = src_imm + src_next_ip                  (1a)
428  *
429  * and substitute in (2) to get:
430  *
431  *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
432  *
433  * Now, since the instruction stream is 'identical' at src and dst (it
434  * is being copied after all) it can be stated that:
435  *
436  *   src_next_ip = src + ip_offset
437  *   dst_next_ip = dst + ip_offset                   (4)
438  *
439  * Substitute (4) in (3) and observe ip_offset being cancelled out to
440  * obtain:
441  *
442  *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
443  *           = src_imm + src - dst + ip_offset - ip_offset
444  *           = src_imm + src - dst                   (5)
445  *
446  * IOW, only the relative displacement of the code block matters.
447  */
448 
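/*
 * Add @d_ to the n_-bit signed value at @p_ and store it back, with a
 * BUG_ON() if the result no longer fits in n_ bits.
 */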
449 #define apply_reloc_n(n_, p_, d_)				\
450 	do {							\
451 		s32 v = *(s##n_ *)(p_);				\
452 		v += (d_);					\
453 		BUG_ON((v >> 31) != (v >> (n_-1)));		\
454 		*(s##n_ *)(p_) = (s##n_)v;			\
455 	} while (0)
456 
457 
458 static __always_inline
459 void apply_reloc(int n, void *ptr, uintptr_t diff)
460 {
461 	switch (n) {
462 	case 1: apply_reloc_n(8, ptr, diff); break;
463 	case 2: apply_reloc_n(16, ptr, diff); break;
464 	case 4: apply_reloc_n(32, ptr, diff); break;
465 	default: BUG();
466 	}
467 }
468 
469 static __always_inline
470 bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
471 {
472 	u8 *target = src + offset;
473 	/*
474 	 * If the target is inside the patched block, it's relative to the
475 	 * block itself and does not need relocation.
476 	 */
477 	return (target < src || target > src + src_len);
478 }
479 
480 static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
481 {
482 	for (int next, i = 0; i < instrlen; i = next) {
483 		struct insn insn;
484 
485 		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
486 			return;
487 
488 		next = i + insn.length;
489 
490 		switch (insn.opcode.bytes[0]) {
491 		case 0x0f:
492 			if (insn.opcode.bytes[1] < 0x80 ||
493 			    insn.opcode.bytes[1] > 0x8f)
494 				break;
495 
496 			fallthrough;	/* Jcc.d32 */
497 		case 0x70 ... 0x7f:	/* Jcc.d8 */
498 		case JMP8_INSN_OPCODE:
499 		case JMP32_INSN_OPCODE:
500 		case CALL_INSN_OPCODE:
501 			if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
502 				apply_reloc(insn.immediate.nbytes,
503 					    buf + i + insn_offset_immediate(&insn),
504 					    repl - instr);
505 			}
506 
507 			/*
508 			 * Where possible, convert JMP.d32 into JMP.d8.
509 			 */
510 			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
511 				s32 imm = insn.immediate.value;
512 				imm += repl - instr;
513 				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
514 				if ((imm >> 31) == (imm >> 7)) {
515 					buf[i+0] = JMP8_INSN_OPCODE;
516 					buf[i+1] = (s8)imm;
517 
518 					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
519 				}
520 			}
521 			break;
522 		}
523 
524 		if (insn_rip_relative(&insn)) {
525 			if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
526 				apply_reloc(insn.displacement.nbytes,
527 					    buf + i + insn_offset_displacement(&insn),
528 					    repl - instr);
529 			}
530 		}
531 	}
532 }
533 
534 void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
535 {
536 	__apply_relocation(buf, instr, instrlen, repl, repl_len);
537 	optimize_nops(instr, buf, instrlen);
538 }
539 
540 /* Low-level backend functions usable from alternative code replacements. */
541 DEFINE_ASM_FUNC(nop_func, "", .entry.text);
542 EXPORT_SYMBOL_GPL(nop_func);
543 
544 noinstr void BUG_func(void)
545 {
546 	BUG();
547 }
548 EXPORT_SYMBOL(BUG_func);
549 
550 #define CALL_RIP_REL_OPCODE	0xff
551 #define CALL_RIP_REL_MODRM	0x15
552 
553 /*
554  * Rewrite the "call BUG_func" replacement to point to the target of the
555  * indirect pv_ops call "call *disp(%ip)".
556  */
557 static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
558 {
559 	void *target, *bug = &BUG_func;
560 	s32 disp;
561 
562 	if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
563 		pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
564 		BUG();
565 	}
566 
567 	if (a->instrlen != 6 ||
568 	    instr[0] != CALL_RIP_REL_OPCODE ||
569 	    instr[1] != CALL_RIP_REL_MODRM) {
570 		pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
571 		BUG();
572 	}
573 
574 	/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
575 	disp = *(s32 *)(instr + 2);
576 #ifdef CONFIG_X86_64
577 	/* ff 15 00 00 00 00   call   *0x0(%rip) */
578 	/* target address is stored at "next instruction + disp". */
579 	target = *(void **)(instr + a->instrlen + disp);
580 #else
581 	/* ff 15 00 00 00 00   call   *0x0 */
582 	/* target address is stored at disp. */
583 	target = *(void **)disp;
584 #endif
585 	if (!target)
586 		target = bug;
587 
588 	/* (BUG_func - .) + (target - BUG_func) := target - . */
589 	*(s32 *)(insn_buff + 1) += target - bug;
590 
591 	if (target == &nop_func)
592 		return 0;
593 
594 	return 5;
595 }
596 
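/*
 * alt_instr entries store the original instruction address as an offset
 * relative to the instr_offset field itself; resolve it back to a virtual
 * address.
 */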
597 static inline u8 * instr_va(struct alt_instr *i)
598 {
599 	return (u8 *)&i->instr_offset + i->instr_offset;
600 }
601 
602 /*
603  * Replace instructions with better alternatives for this CPU type. This runs
604  * before SMP is initialized to avoid SMP problems with self modifying code.
605  * This implies that asymmetric systems where APs have fewer capabilities than
606  * the boot processor are not handled. Tough. Make sure you disable such
607  * features by hand.
608  *
609  * Marked "noinline" to cause control flow change and thus insn cache
610  * to refetch changed I$ lines.
611  */
612 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
613 						  struct alt_instr *end)
614 {
615 	u8 insn_buff[MAX_PATCH_LEN];
616 	u8 *instr, *replacement;
617 	struct alt_instr *a, *b;
618 
619 	DPRINTK(ALT, "alt table %px, -> %px", start, end);
620 
621 	/*
622 	 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
623 	 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
624 	 * During the process, KASAN becomes confused seeing partial LA57
625 	 * conversion and triggers a false-positive out-of-bound report.
626 	 * conversion and triggers a false-positive out-of-bounds report.
627 	 * Disable KASAN until the patching is complete.
628 	 */
629 	kasan_disable_current();
630 
631 	/*
632 	 * The scan order should be from start to end. Alternative code scanned
633 	 * later can overwrite alternative code scanned earlier.
634 	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
635 	 * patch code.
636 	 *
637 	 * So be careful if you want to change the scan order to any other
638 	 * order.
639 	 */
640 	for (a = start; a < end; a++) {
641 		int insn_buff_sz = 0;
642 
643 		/*
644 		 * In case of nested ALTERNATIVE()s the outer alternative might
645 		 * add more padding. To ensure consistent patching find the max
646 		 * padding for all alt_instr entries for this site (nested
647 		 * alternatives result in consecutive entries).
648 		 */
649 		for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
650 			u8 len = max(a->instrlen, b->instrlen);
651 			a->instrlen = b->instrlen = len;
652 		}
653 
654 		instr = instr_va(a);
655 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
656 		BUG_ON(a->instrlen > sizeof(insn_buff));
657 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
658 
659 		/*
660 		 * Patch if either:
661 		 * - feature is present
662 		 * - feature not present but ALT_FLAG_NOT is set to mean,
663 		 *   patch if feature is *NOT* present.
664 		 */
665 		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
666 			memcpy(insn_buff, instr, a->instrlen);
667 			optimize_nops(instr, insn_buff, a->instrlen);
668 			text_poke_early(instr, insn_buff, a->instrlen);
669 			continue;
670 		}
671 
672 		DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
673 			a->cpuid >> 5,
674 			a->cpuid & 0x1f,
675 			instr, instr, a->instrlen,
676 			replacement, a->replacementlen, a->flags);
677 
678 		memcpy(insn_buff, replacement, a->replacementlen);
679 		insn_buff_sz = a->replacementlen;
680 
681 		if (a->flags & ALT_FLAG_DIRECT_CALL) {
682 			insn_buff_sz = alt_replace_call(instr, insn_buff, a);
683 			if (insn_buff_sz < 0)
684 				continue;
685 		}
686 
687 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
688 			insn_buff[insn_buff_sz] = 0x90;
689 
690 		apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
691 
692 		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
693 		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
694 		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
695 
696 		text_poke_early(instr, insn_buff, insn_buff_sz);
697 	}
698 
699 	kasan_enable_current();
700 }
701 
702 static inline bool is_jcc32(struct insn *insn)
703 {
704 	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
705 	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
706 }
707 
708 #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
709 
710 /*
711  * CALL/JMP *%\reg
712  */
713 static int emit_indirect(int op, int reg, u8 *bytes)
714 {
715 	int i = 0;
716 	u8 modrm;
717 
718 	switch (op) {
719 	case CALL_INSN_OPCODE:
720 		modrm = 0x10; /* Reg = 2; CALL r/m */
721 		break;
722 
723 	case JMP32_INSN_OPCODE:
724 		modrm = 0x20; /* Reg = 4; JMP r/m */
725 		break;
726 
727 	default:
728 		WARN_ON_ONCE(1);
729 		return -1;
730 	}
731 
732 	if (reg >= 8) {
733 		bytes[i++] = 0x41; /* REX.B prefix */
734 		reg -= 8;
735 	}
736 
737 	modrm |= 0xc0; /* Mod = 3 */
738 	modrm += reg;
739 
740 	bytes[i++] = 0xff; /* opcode */
741 	bytes[i++] = modrm;
742 
743 	return i;
744 }
745 
746 static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
747 			     void *call_dest, void *jmp_dest)
748 {
749 	u8 op = insn->opcode.bytes[0];
750 	int i = 0;
751 
752 	/*
753 	 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
754 	 * tail-calls. Deal with them.
755 	 */
756 	if (is_jcc32(insn)) {
757 		bytes[i++] = op;
758 		op = insn->opcode.bytes[1];
759 		goto clang_jcc;
760 	}
761 
762 	if (insn->length == 6)
763 		bytes[i++] = 0x2e; /* CS-prefix */
764 
765 	switch (op) {
766 	case CALL_INSN_OPCODE:
767 		__text_gen_insn(bytes+i, op, addr+i,
768 				call_dest,
769 				CALL_INSN_SIZE);
770 		i += CALL_INSN_SIZE;
771 		break;
772 
773 	case JMP32_INSN_OPCODE:
774 clang_jcc:
775 		__text_gen_insn(bytes+i, op, addr+i,
776 				jmp_dest,
777 				JMP32_INSN_SIZE);
778 		i += JMP32_INSN_SIZE;
779 		break;
780 
781 	default:
782 		WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
783 		return -1;
784 	}
785 
786 	WARN_ON_ONCE(i != insn->length);
787 
788 	return i;
789 }
790 
791 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
792 {
793 	return __emit_trampoline(addr, insn, bytes,
794 				 __x86_indirect_call_thunk_array[reg],
795 				 __x86_indirect_jump_thunk_array[reg]);
796 }
797 
798 #ifdef CONFIG_MITIGATION_ITS
799 static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
800 {
801 	u8 *thunk = __x86_indirect_its_thunk_array[reg];
802 	u8 *tmp = its_allocate_thunk(reg);
803 
804 	if (tmp)
805 		thunk = tmp;
806 
807 	return __emit_trampoline(addr, insn, bytes, thunk, thunk);
808 }
809 
810 /* Check if an indirect branch is at an ITS-unsafe address */
811 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
812 {
813 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
814 		return false;
815 
816 	/* Indirect branch opcode is 2 or 3 bytes depending on reg */
817 	addr += 1 + reg / 8;
818 
819 	/* Lower-half of the cacheline? */
820 	return !(addr & 0x20);
821 }
822 #else /* CONFIG_MITIGATION_ITS */
823 
824 #ifdef CONFIG_FINEIBT
825 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
826 {
827 	return false;
828 }
829 #endif
830 
831 #endif /* CONFIG_MITIGATION_ITS */
832 
833 /*
834  * Rewrite the compiler generated retpoline thunk calls.
835  *
836  * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
837  * indirect instructions, avoiding the extra indirection.
838  *
839  * For example, convert:
840  *
841  *   CALL __x86_indirect_thunk_\reg
842  *
843  * into:
844  *
845  *   CALL *%\reg
846  *
847  * It also tries to inline spectre_v2=retpoline,lfence when size permits.
848  */
849 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
850 {
851 	retpoline_thunk_t *target;
852 	int reg, ret, i = 0;
853 	u8 op, cc;
854 
855 	target = addr + insn->length + insn->immediate.value;
856 	reg = target - __x86_indirect_thunk_array;
857 
858 	if (WARN_ON_ONCE(reg & ~0xf))
859 		return -1;
860 
861 	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
862 	BUG_ON(reg == 4);
863 
864 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
865 	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
866 		if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
867 			return emit_call_track_retpoline(addr, insn, reg, bytes);
868 
869 		return -1;
870 	}
871 
872 	op = insn->opcode.bytes[0];
873 
874 	/*
875 	 * Convert:
876 	 *
877 	 *   Jcc.d32 __x86_indirect_thunk_\reg
878 	 *
879 	 * into:
880 	 *
881 	 *   Jncc.d8 1f
882 	 *   [ LFENCE ]
883 	 *   JMP *%\reg
884 	 *   [ NOP ]
885 	 * 1:
886 	 */
887 	if (is_jcc32(insn)) {
888 		cc = insn->opcode.bytes[1] & 0xf;
889 		cc ^= 1; /* invert condition */
890 
891 		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
892 		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
893 
894 		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
895 		op = JMP32_INSN_OPCODE;
896 	}
897 
898 	/*
899 	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
900 	 */
901 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
902 		bytes[i++] = 0x0f;
903 		bytes[i++] = 0xae;
904 		bytes[i++] = 0xe8; /* LFENCE */
905 	}
906 
907 #ifdef CONFIG_MITIGATION_ITS
908 	 * Check if the address of the last byte of the emitted indirect branch
909 	 * is in the lower half of the cacheline. Such branches need ITS mitigation.
910 	 * lower-half of the cacheline. Such branches need ITS mitigation.
911 	 */
912 	if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
913 		return emit_its_trampoline(addr, insn, reg, bytes);
914 #endif
915 
916 	ret = emit_indirect(op, reg, bytes + i);
917 	if (ret < 0)
918 		return ret;
919 	i += ret;
920 
921 	/*
922 	 * The compiler is supposed to EMIT an INT3 after every unconditional
923 	 * JMP instruction due to AMD BTC. However, if the compiler is too old
924 	 * or MITIGATION_SLS isn't enabled, we still need an INT3 after
925 	 * indirect JMPs even on Intel.
926 	 */
927 	if (op == JMP32_INSN_OPCODE && i < insn->length)
928 		bytes[i++] = INT3_INSN_OPCODE;
929 
930 	for (; i < insn->length;)
931 		bytes[i++] = BYTES_NOP1;
932 
933 	return i;
934 }
935 
936 /*
937  * Generated by 'objtool --retpoline'.
938  */
939 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
940 {
941 	s32 *s;
942 
943 	for (s = start; s < end; s++) {
944 		void *addr = (void *)s + *s;
945 		struct insn insn;
946 		int len, ret;
947 		u8 bytes[16];
948 		u8 op1, op2;
949 		u8 *dest;
950 
951 		ret = insn_decode_kernel(&insn, addr);
952 		if (WARN_ON_ONCE(ret < 0))
953 			continue;
954 
955 		op1 = insn.opcode.bytes[0];
956 		op2 = insn.opcode.bytes[1];
957 
958 		switch (op1) {
959 		case 0x70 ... 0x7f:	/* Jcc.d8 */
960 			/* See cfi_paranoid. */
961 			WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
962 			continue;
963 
964 		case CALL_INSN_OPCODE:
965 		case JMP32_INSN_OPCODE:
966 			/* Check for cfi_paranoid + ITS */
967 			dest = addr + insn.length + insn.immediate.value;
968 			if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
969 				WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
970 				continue;
971 			}
972 			break;
973 
974 		case 0x0f: /* escape */
975 			if (op2 >= 0x80 && op2 <= 0x8f)
976 				break;
977 			fallthrough;
978 		default:
979 			WARN_ON_ONCE(1);
980 			continue;
981 		}
982 
983 		DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
984 			addr, addr, insn.length,
985 			addr + insn.length + insn.immediate.value);
986 
987 		len = patch_retpoline(addr, &insn, bytes);
988 		if (len == insn.length) {
989 			optimize_nops(addr, bytes, len);
990 			DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
991 			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
992 			text_poke_early(addr, bytes, len);
993 		}
994 	}
995 }
996 
997 #ifdef CONFIG_MITIGATION_RETHUNK
998 
999 bool cpu_wants_rethunk(void)
1000 {
1001 	return cpu_feature_enabled(X86_FEATURE_RETHUNK);
1002 }
1003 
1004 bool cpu_wants_rethunk_at(void *addr)
1005 {
1006 	if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
1007 		return false;
1008 	if (x86_return_thunk != its_return_thunk)
1009 		return true;
1010 
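	/*
	 * With the ITS return thunk installed, only returns whose address is
	 * in the lower half of a cacheline need to go through the thunk.
	 */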
1011 	return !((unsigned long)addr & 0x20);
1012 }
1013 
1014 /*
1015  * Rewrite the compiler generated return thunk tail-calls.
1016  *
1017  * For example, convert:
1018  *
1019  *   JMP __x86_return_thunk
1020  *
1021  * into:
1022  *
1023  *   RET
1024  */
1025 static int patch_return(void *addr, struct insn *insn, u8 *bytes)
1026 {
1027 	int i = 0;
1028 
1029 	/* Patch the custom return thunks... */
1030 	if (cpu_wants_rethunk_at(addr)) {
1031 		i = JMP32_INSN_SIZE;
1032 		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
1033 	} else {
1034 		/* ... or patch them out if not needed. */
1035 		bytes[i++] = RET_INSN_OPCODE;
1036 	}
1037 
1038 	for (; i < insn->length;)
1039 		bytes[i++] = INT3_INSN_OPCODE;
1040 	return i;
1041 }
1042 
1043 void __init_or_module noinline apply_returns(s32 *start, s32 *end)
1044 {
1045 	s32 *s;
1046 
1047 	if (cpu_wants_rethunk())
1048 		static_call_force_reinit();
1049 
1050 	for (s = start; s < end; s++) {
1051 		void *dest = NULL, *addr = (void *)s + *s;
1052 		struct insn insn;
1053 		int len, ret;
1054 		u8 bytes[16];
1055 		u8 op;
1056 
1057 		ret = insn_decode_kernel(&insn, addr);
1058 		if (WARN_ON_ONCE(ret < 0))
1059 			continue;
1060 
1061 		op = insn.opcode.bytes[0];
1062 		if (op == JMP32_INSN_OPCODE)
1063 			dest = addr + insn.length + insn.immediate.value;
1064 
1065 		if (__static_call_fixup(addr, op, dest) ||
1066 		    WARN_ONCE(dest != &__x86_return_thunk,
1067 			      "missing return thunk: %pS-%pS: %*ph",
1068 			      addr, dest, 5, addr))
1069 			continue;
1070 
1071 		DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
1072 			addr, addr, insn.length,
1073 			addr + insn.length + insn.immediate.value);
1074 
1075 		len = patch_return(addr, &insn, bytes);
1076 		if (len == insn.length) {
1077 			DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
1078 			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
1079 			text_poke_early(addr, bytes, len);
1080 		}
1081 	}
1082 }
1083 #else /* !CONFIG_MITIGATION_RETHUNK: */
1084 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1085 #endif /* !CONFIG_MITIGATION_RETHUNK */
1086 
1087 #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1088 
1089 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
1090 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1091 
1092 #endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1093 
1094 #ifdef CONFIG_X86_KERNEL_IBT
1095 
1096 __noendbr bool is_endbr(u32 *val)
1097 {
1098 	u32 endbr;
1099 
1100 	__get_kernel_nofault(&endbr, val, u32, Efault);
1101 	return __is_endbr(endbr);
1102 
1103 Efault:
1104 	return false;
1105 }
1106 
1107 #ifdef CONFIG_FINEIBT
1108 
1109 static __noendbr bool exact_endbr(u32 *val)
1110 {
1111 	u32 endbr;
1112 
1113 	__get_kernel_nofault(&endbr, val, u32, Efault);
1114 	return endbr == gen_endbr();
1115 
1116 Efault:
1117 	return false;
1118 }
1119 
1120 #endif
1121 
1122 static void poison_cfi(void *addr);
1123 
1124 static void __init_or_module poison_endbr(void *addr)
1125 {
1126 	u32 poison = gen_endbr_poison();
1127 
1128 	if (WARN_ON_ONCE(!is_endbr(addr)))
1129 		return;
1130 
1131 	DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
1132 
1133 	/*
1134 	 * When we have IBT, the lack of ENDBR will trigger #CP
1135 	 */
1136 	DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
1137 	DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
1138 	text_poke_early(addr, &poison, 4);
1139 }
1140 
1141 /*
1142  * Generated by: objtool --ibt
1143  *
1144  * Seal functions against indirect calls by clobbering their ENDBR instructions
1145  * and the kCFI hash value.
1146  */
1147 void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
1148 {
1149 	s32 *s;
1150 
1151 	for (s = start; s < end; s++) {
1152 		void *addr = (void *)s + *s;
1153 
1154 		poison_endbr(addr);
1155 		if (IS_ENABLED(CONFIG_FINEIBT))
1156 			poison_cfi(addr - 16);
1157 	}
1158 }
1159 
1160 #else /* !CONFIG_X86_KERNEL_IBT: */
1161 
1162 void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
1163 
1164 #endif /* !CONFIG_X86_KERNEL_IBT */
1165 
1166 #ifdef CONFIG_CFI_AUTO_DEFAULT
1167 # define __CFI_DEFAULT CFI_AUTO
1168 #elif defined(CONFIG_CFI_CLANG)
1169 # define __CFI_DEFAULT CFI_KCFI
1170 #else
1171 # define __CFI_DEFAULT CFI_OFF
1172 #endif
1173 
1174 enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
1175 
1176 #ifdef CONFIG_FINEIBT_BHI
1177 bool cfi_bhi __ro_after_init = false;
1178 #endif
1179 
1180 #ifdef CONFIG_CFI_CLANG
1181 struct bpf_insn;
1182 
1183 /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
1184 extern unsigned int __bpf_prog_runX(const void *ctx,
1185 				    const struct bpf_insn *insn);
1186 
1187 KCFI_REFERENCE(__bpf_prog_runX);
1188 
1189 /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
1190 asm (
1191 "	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
1192 "	.type	cfi_bpf_hash,@object				\n"
1193 "	.globl	cfi_bpf_hash					\n"
1194 "	.p2align	2, 0x0					\n"
1195 "cfi_bpf_hash:							\n"
1196 "	.long	__kcfi_typeid___bpf_prog_runX			\n"
1197 "	.size	cfi_bpf_hash, 4					\n"
1198 "	.popsection						\n"
1199 );
1200 
1201 /* Must match bpf_callback_t */
1202 extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
1203 
1204 KCFI_REFERENCE(__bpf_callback_fn);
1205 
1206 /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
1207 asm (
1208 "	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
1209 "	.type	cfi_bpf_subprog_hash,@object			\n"
1210 "	.globl	cfi_bpf_subprog_hash				\n"
1211 "	.p2align	2, 0x0					\n"
1212 "cfi_bpf_subprog_hash:						\n"
1213 "	.long	__kcfi_typeid___bpf_callback_fn			\n"
1214 "	.size	cfi_bpf_subprog_hash, 4				\n"
1215 "	.popsection						\n"
1216 );
1217 
1218 u32 cfi_get_func_hash(void *func)
1219 {
1220 	u32 hash;
1221 
1222 	func -= cfi_get_offset();
1223 	switch (cfi_mode) {
1224 	case CFI_FINEIBT:
1225 		func += 7;
1226 		break;
1227 	case CFI_KCFI:
1228 		func += 1;
1229 		break;
1230 	default:
1231 		return 0;
1232 	}
1233 
1234 	if (get_kernel_nofault(hash, func))
1235 		return 0;
1236 
1237 	return hash;
1238 }
1239 
1240 int cfi_get_func_arity(void *func)
1241 {
1242 	bhi_thunk *target;
1243 	s32 disp;
1244 
1245 	if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
1246 		return 0;
1247 
1248 	if (get_kernel_nofault(disp, func - 4))
1249 		return 0;
1250 
1251 	target = func + disp;
1252 	return target - __bhi_args;
1253 }
1254 #endif
1255 
1256 #ifdef CONFIG_FINEIBT
1257 
1258 static bool cfi_rand __ro_after_init = true;
1259 static u32  cfi_seed __ro_after_init;
1260 
1261 /*
1262  * Re-hash the CFI hash with a boot-time seed while making sure the result is
1263  * not a valid ENDBR instruction.
1264  */
1265 static u32 cfi_rehash(u32 hash)
1266 {
1267 	hash ^= cfi_seed;
1268 	while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
1269 		bool lsb = hash & 1;
1270 		hash >>= 1;
1271 		if (lsb)
1272 			hash ^= 0x80200003;
1273 	}
1274 	return hash;
1275 }
1276 
1277 static __init int cfi_parse_cmdline(char *str)
1278 {
1279 	if (!str)
1280 		return -EINVAL;
1281 
1282 	while (str) {
1283 		char *next = strchr(str, ',');
1284 		if (next) {
1285 			*next = 0;
1286 			next++;
1287 		}
1288 
1289 		if (!strcmp(str, "auto")) {
1290 			cfi_mode = CFI_AUTO;
1291 		} else if (!strcmp(str, "off")) {
1292 			cfi_mode = CFI_OFF;
1293 			cfi_rand = false;
1294 		} else if (!strcmp(str, "kcfi")) {
1295 			cfi_mode = CFI_KCFI;
1296 		} else if (!strcmp(str, "fineibt")) {
1297 			cfi_mode = CFI_FINEIBT;
1298 		} else if (!strcmp(str, "norand")) {
1299 			cfi_rand = false;
1300 		} else if (!strcmp(str, "warn")) {
1301 			pr_alert("CFI mismatch non-fatal!\n");
1302 			cfi_warn = true;
1303 		} else if (!strcmp(str, "paranoid")) {
1304 			if (cfi_mode == CFI_FINEIBT) {
1305 				cfi_paranoid = true;
1306 			} else {
1307 				pr_err("Ignoring paranoid; depends on fineibt.\n");
1308 			}
1309 		} else if (!strcmp(str, "bhi")) {
1310 #ifdef CONFIG_FINEIBT_BHI
1311 			if (cfi_mode == CFI_FINEIBT) {
1312 				cfi_bhi = true;
1313 			} else {
1314 				pr_err("Ignoring bhi; depends on fineibt.\n");
1315 			}
1316 #else
1317 			pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
1318 #endif
1319 		} else {
1320 			pr_err("Ignoring unknown cfi option (%s).", str);
1321 		}
1322 
1323 		str = next;
1324 	}
1325 
1326 	return 0;
1327 }
1328 early_param("cfi", cfi_parse_cmdline);
1329 
1330 /*
1331  * kCFI						FineIBT
1332  *
1333  * __cfi_\func:					__cfi_\func:
1334  *	movl   $0x12345678,%eax		// 5	     endbr64			// 4
1335  *	nop					     subl   $0x12345678,%r10d   // 7
1336  *	nop					     jne    __cfi_\func+6	// 2
1337  *	nop					     nop3			// 3
1338  *	nop
1339  *	nop
1340  *	nop
1341  *	nop
1342  *	nop
1343  *	nop
1344  *	nop
1345  *	nop
1346  *
1347  *
1348  * caller:					caller:
1349  *	movl	$(-0x12345678),%r10d	 // 6	     movl   $0x12345678,%r10d	// 6
1350  *	addl	$-15(%r11),%r10d	 // 4	     lea    -0x10(%r11),%r11	// 4
1351  *	je	1f			 // 2	     nop4			// 4
1352  *	ud2				 // 2
1353  * 1:	cs call	__x86_indirect_thunk_r11 // 6	     call   *%r11; nop3;	// 6
1354  *
1355  */
1356 
1357 /*
1358  * <fineibt_preamble_start>:
1359  *  0:   f3 0f 1e fa             endbr64
1360  *  4:   41 81 <ea> 78 56 34 12  sub    $0x12345678, %r10d
1361  *  b:   75 f9                   jne    6 <fineibt_preamble_start+0x6>
1362  *  d:   0f 1f 00                nopl   (%rax)
1363  *
1364  * Note that the JNE target is the 0xEA byte inside the SUB; it decodes as
1365  * (bad) on x86_64 and raises #UD.
1366  */
1367 asm(	".pushsection .rodata				\n"
1368 	"fineibt_preamble_start:			\n"
1369 	"	endbr64					\n"
1370 	"	subl	$0x12345678, %r10d		\n"
1371 	"fineibt_preamble_bhi:				\n"
1372 	"	jne	fineibt_preamble_start+6	\n"
1373 	ASM_NOP3
1374 	"fineibt_preamble_end:				\n"
1375 	".popsection\n"
1376 );
1377 
1378 extern u8 fineibt_preamble_start[];
1379 extern u8 fineibt_preamble_bhi[];
1380 extern u8 fineibt_preamble_end[];
1381 
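/*
 * Byte offsets into the preamble above: the 0xEA byte targeted by the JNE is
 * at offset 6, the hash immediate of the SUB starts at offset 7.
 */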
1382 #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1383 #define fineibt_preamble_bhi  (fineibt_preamble_bhi - fineibt_preamble_start)
1384 #define fineibt_preamble_ud   6
1385 #define fineibt_preamble_hash 7
1386 
1387 /*
1388  * <fineibt_caller_start>:
1389  *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1390  *  6:   4d 8d 5b f0             lea    -0x10(%r11), %r11
1391  *  a:   0f 1f 40 00             nopl   0x0(%rax)
1392  */
1393 asm(	".pushsection .rodata			\n"
1394 	"fineibt_caller_start:			\n"
1395 	"	movl	$0x12345678, %r10d	\n"
1396 	"	lea	-0x10(%r11), %r11	\n"
1397 	ASM_NOP4
1398 	"fineibt_caller_end:			\n"
1399 	".popsection				\n"
1400 );
1401 
1402 extern u8 fineibt_caller_start[];
1403 extern u8 fineibt_caller_end[];
1404 
1405 #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1406 #define fineibt_caller_hash 2
1407 
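/*
 * Displacement of the JMP.d8 that cfi_disable_callers() patches over the MOV:
 * it jumps over the rest of the caller sequence, bypassing the hash check.
 */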
1408 #define fineibt_caller_jmp (fineibt_caller_size - 2)
1409 
1410 /*
1411  * Since FineIBT does hash validation on the callee side it is prone to
1412  * circumvention attacks where a 'naked' ENDBR instruction exists that
1413  * is not part of the fineibt_preamble sequence.
1414  *
1415  * Notably the x86 entry points must be ENDBR and equally cannot be
1416  * fineibt_preamble.
1417  *
1418  * The fineibt_paranoid caller sequence adds additional caller side
1419  * hash validation. This stops such circumvention attacks dead, but at the cost
1420  * of adding a load.
1421  *
1422  * <fineibt_paranoid_start>:
1423  *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1424  *  6:   45 3b 53 f7             cmp    -0x9(%r11), %r10d
1425  *  a:   4d 8d 5b <f0>           lea    -0x10(%r11), %r11
1426  *  e:   75 fd                   jne    d <fineibt_paranoid_start+0xd>
1427  * 10:   41 ff d3                call   *%r11
1428  * 13:   90                      nop
1429  *
1430  * Notably LEA does not modify flags and can be reordered with the CMP,
1431  * avoiding a dependency. Again, a non-taken (backwards) branch is used
1432  * for the failure case, abusing LEA's immediate 0xf0 as a LOCK prefix for
1433  * the Jcc.d8, which causes #UD.
1434  */
1435 asm(	".pushsection .rodata				\n"
1436 	"fineibt_paranoid_start:			\n"
1437 	"	movl	$0x12345678, %r10d		\n"
1438 	"	cmpl	-9(%r11), %r10d			\n"
1439 	"	lea	-0x10(%r11), %r11		\n"
1440 	"	jne	fineibt_paranoid_start+0xd	\n"
1441 	"fineibt_paranoid_ind:				\n"
1442 	"	call	*%r11				\n"
1443 	"	nop					\n"
1444 	"fineibt_paranoid_end:				\n"
1445 	".popsection					\n"
1446 );
1447 
1448 extern u8 fineibt_paranoid_start[];
1449 extern u8 fineibt_paranoid_ind[];
1450 extern u8 fineibt_paranoid_end[];
1451 
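/*
 * Offset 0xd is the <f0> byte of the LEA: on a hash mismatch the JNE lands
 * there and "f0 75 fd" decodes as a LOCK-prefixed Jcc.d8, raising #UD.
 */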
1452 #define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
1453 #define fineibt_paranoid_ind  (fineibt_paranoid_ind - fineibt_paranoid_start)
1454 #define fineibt_paranoid_ud   0xd
1455 
1456 static u32 decode_preamble_hash(void *addr, int *reg)
1457 {
1458 	u8 *p = addr;
1459 
1460 	/* b8+reg 78 56 34 12          movl    $0x12345678,\reg */
1461 	if (p[0] >= 0xb8 && p[0] < 0xc0) {
1462 		if (reg)
1463 			*reg = p[0] - 0xb8;
1464 		return *(u32 *)(addr + 1);
1465 	}
1466 
1467 	return 0; /* invalid hash value */
1468 }
1469 
1470 static u32 decode_caller_hash(void *addr)
1471 {
1472 	u8 *p = addr;
1473 
1474 	/* 41 ba 88 a9 cb ed       mov    $(-0x12345678),%r10d */
1475 	if (p[0] == 0x41 && p[1] == 0xba)
1476 		return -*(u32 *)(addr + 2);
1477 
1478 	/* eb 0c 88 a9 cb ed	   jmp.d8  +12 */
1479 	if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1480 		return -*(u32 *)(addr + 2);
1481 
1482 	return 0; /* invalid hash value */
1483 }
1484 
1485 /* .retpoline_sites */
1486 static int cfi_disable_callers(s32 *start, s32 *end)
1487 {
1488 	/*
1489 	 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
1490 	 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
1491 	 * intact for later usage. Also see decode_caller_hash() and
1492 	 */
1493 	const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1494 	s32 *s;
1495 
1496 	for (s = start; s < end; s++) {
1497 		void *addr = (void *)s + *s;
1498 		u32 hash;
1499 
1500 		addr -= fineibt_caller_size;
1501 		hash = decode_caller_hash(addr);
1502 		if (!hash) /* nocfi callers */
1503 			continue;
1504 
1505 		text_poke_early(addr, jmp, 2);
1506 	}
1507 
1508 	return 0;
1509 }
1510 
1511 static int cfi_enable_callers(s32 *start, s32 *end)
1512 {
1513 	/*
1514 	 * Re-enable kCFI, undo what cfi_disable_callers() did.
1515 	 */
1516 	const u8 mov[] = { 0x41, 0xba };
1517 	s32 *s;
1518 
1519 	for (s = start; s < end; s++) {
1520 		void *addr = (void *)s + *s;
1521 		u32 hash;
1522 
1523 		addr -= fineibt_caller_size;
1524 		hash = decode_caller_hash(addr);
1525 		if (!hash) /* nocfi callers */
1526 			continue;
1527 
1528 		text_poke_early(addr, mov, 2);
1529 	}
1530 
1531 	return 0;
1532 }
1533 
1534 /* .cfi_sites */
1535 static int cfi_rand_preamble(s32 *start, s32 *end)
1536 {
1537 	s32 *s;
1538 
1539 	for (s = start; s < end; s++) {
1540 		void *addr = (void *)s + *s;
1541 		u32 hash;
1542 
1543 		hash = decode_preamble_hash(addr, NULL);
1544 		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1545 			 addr, addr, 5, addr))
1546 			return -EINVAL;
1547 
1548 		hash = cfi_rehash(hash);
1549 		text_poke_early(addr + 1, &hash, 4);
1550 	}
1551 
1552 	return 0;
1553 }
1554 
1555 static void cfi_fineibt_bhi_preamble(void *addr, int arity)
1556 {
1557 	if (!arity)
1558 		return;
1559 
1560 	if (!cfi_warn && arity == 1) {
1561 		/*
1562 		 * Crazy scheme to allow arity-1 inline:
1563 		 *
1564 		 * __cfi_foo:
1565 		 *  0: f3 0f 1e fa             endbr64
1566 		 *  4: 41 81 <ea> 78 56 34 12  sub     0x12345678, %r10d
1567 		 *  b: 49 0f 45 fa             cmovne  %r10, %rdi
1568 		 *  f: 75 f5                   jne     __cfi_foo+6
1569 		 * 11: 0f 1f 00                nopl    (%rax)
1570 		 *
1571 		 * Code that directly calls foo()+0 decodes the tail end as:
1572 		 *
1573 		 * foo:
1574 		 *  0: f5                      cmc
1575 		 *  1: 0f 1f 00                nopl    (%rax)
1576 		 *
1577 		 * which clobbers CF, but does not affect anything ABI
1578 		 * wise.
1579 		 *
1580 		 * Notably, this scheme is incompatible with permissive CFI
1581 		 * because the CMOVcc is unconditional and RDI will have been
1582 		 * clobbered.
1583 		 */
1584 		const u8 magic[9] = {
1585 			0x49, 0x0f, 0x45, 0xfa,
1586 			0x75, 0xf5,
1587 			BYTES_NOP3,
1588 		};
1589 
1590 		text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
1591 
1592 		return;
1593 	}
1594 
1595 	text_poke_early(addr + fineibt_preamble_bhi,
1596 			text_gen_insn(CALL_INSN_OPCODE,
1597 				      addr + fineibt_preamble_bhi,
1598 				      __bhi_args[arity]),
1599 			CALL_INSN_SIZE);
1600 }
1601 
1602 static int cfi_rewrite_preamble(s32 *start, s32 *end)
1603 {
1604 	s32 *s;
1605 
1606 	for (s = start; s < end; s++) {
1607 		void *addr = (void *)s + *s;
1608 		int arity;
1609 		u32 hash;
1610 
1611 		/*
1612 		 * When the function doesn't start with ENDBR the compiler will
1613 		 * have determined there are no indirect calls to it and we
1614 		 * don't need CFI either.
1615 		 */
1616 		if (!is_endbr(addr + 16))
1617 			continue;
1618 
1619 		hash = decode_preamble_hash(addr, &arity);
1620 		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1621 			 addr, addr, 5, addr))
1622 			return -EINVAL;
1623 
1624 		text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1625 		WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1626 		text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1627 
1628 		WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
1629 			  "kCFI preamble has wrong register at: %pS %*ph\n",
1630 			  addr, 5, addr);
1631 
1632 		if (cfi_bhi)
1633 			cfi_fineibt_bhi_preamble(addr, arity);
1634 	}
1635 
1636 	return 0;
1637 }
1638 
1639 static void cfi_rewrite_endbr(s32 *start, s32 *end)
1640 {
1641 	s32 *s;
1642 
1643 	for (s = start; s < end; s++) {
1644 		void *addr = (void *)s + *s;
1645 
1646 		if (!exact_endbr(addr + 16))
1647 			continue;
1648 
1649 		poison_endbr(addr + 16);
1650 	}
1651 }
1652 
1653 /* .retpoline_sites */
1654 static int cfi_rand_callers(s32 *start, s32 *end)
1655 {
1656 	s32 *s;
1657 
1658 	for (s = start; s < end; s++) {
1659 		void *addr = (void *)s + *s;
1660 		u32 hash;
1661 
1662 		addr -= fineibt_caller_size;
1663 		hash = decode_caller_hash(addr);
1664 		if (hash) {
1665 			hash = -cfi_rehash(hash);
1666 			text_poke_early(addr + 2, &hash, 4);
1667 		}
1668 	}
1669 
1670 	return 0;
1671 }
1672 
1673 static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
1674 {
1675 	u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
1676 
1677 #ifdef CONFIG_MITIGATION_ITS
1678 	u8 *tmp = its_allocate_thunk(reg);
1679 	if (tmp)
1680 		thunk = tmp;
1681 #endif
1682 
1683 	return __emit_trampoline(addr, insn, bytes, thunk, thunk);
1684 }
1685 
1686 static int cfi_rewrite_callers(s32 *start, s32 *end)
1687 {
1688 	s32 *s;
1689 
1690 	BUG_ON(fineibt_paranoid_size != 20);
1691 
1692 	for (s = start; s < end; s++) {
1693 		void *addr = (void *)s + *s;
1694 		struct insn insn;
1695 		u8 bytes[20];
1696 		u32 hash;
1697 		int ret;
1698 		u8 op;
1699 
1700 		addr -= fineibt_caller_size;
1701 		hash = decode_caller_hash(addr);
1702 		if (!hash)
1703 			continue;
1704 
1705 		if (!cfi_paranoid) {
1706 			text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1707 			WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1708 			text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1709 			/* rely on apply_retpolines() */
1710 			continue;
1711 		}
1712 
1713 		/* cfi_paranoid */
1714 		ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
1715 		if (WARN_ON_ONCE(ret < 0))
1716 			continue;
1717 
1718 		op = insn.opcode.bytes[0];
1719 		if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
1720 			WARN_ON_ONCE(1);
1721 			continue;
1722 		}
1723 
1724 		memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
1725 		memcpy(bytes + fineibt_caller_hash, &hash, 4);
1726 
1727 		if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
1728 			emit_paranoid_trampoline(addr + fineibt_caller_size,
1729 						 &insn, 11, bytes + fineibt_caller_size);
1730 		} else {
1731 			ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
1732 			if (WARN_ON_ONCE(ret != 3))
1733 				continue;
1734 		}
1735 
1736 		text_poke_early(addr, bytes, fineibt_paranoid_size);
1737 	}
1738 
1739 	return 0;
1740 }
1741 
1742 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1743 			    s32 *start_cfi, s32 *end_cfi, bool builtin)
1744 {
1745 	int ret;
1746 
1747 	if (WARN_ONCE(fineibt_preamble_size != 16,
1748 		      "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1749 		return;
1750 
1751 	if (cfi_mode == CFI_AUTO) {
1752 		cfi_mode = CFI_KCFI;
1753 		if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
1754 			/*
1755 			 * FRED has much saner context on exception entry and
1756 			 * is less easy to take advantage of.
1757 			 */
1758 			if (!cpu_feature_enabled(X86_FEATURE_FRED))
1759 				cfi_paranoid = true;
1760 			cfi_mode = CFI_FINEIBT;
1761 		}
1762 	}
1763 
1764 	/*
1765 	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1766 	 * rewrite them. This disables all CFI. If this succeeds but any of the
1767 	 * later stages fails, we're without CFI.
1768 	 */
1769 	ret = cfi_disable_callers(start_retpoline, end_retpoline);
1770 	if (ret)
1771 		goto err;
1772 
1773 	if (cfi_rand) {
1774 		if (builtin) {
1775 			cfi_seed = get_random_u32();
1776 			cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1777 			cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1778 		}
1779 
1780 		ret = cfi_rand_preamble(start_cfi, end_cfi);
1781 		if (ret)
1782 			goto err;
1783 
1784 		ret = cfi_rand_callers(start_retpoline, end_retpoline);
1785 		if (ret)
1786 			goto err;
1787 	}
1788 
1789 	switch (cfi_mode) {
1790 	case CFI_OFF:
1791 		if (builtin)
1792 			pr_info("Disabling CFI\n");
1793 		return;
1794 
1795 	case CFI_KCFI:
1796 		ret = cfi_enable_callers(start_retpoline, end_retpoline);
1797 		if (ret)
1798 			goto err;
1799 
1800 		if (builtin)
1801 			pr_info("Using kCFI\n");
1802 		return;
1803 
1804 	case CFI_FINEIBT:
1805 		/* place the FineIBT preamble at func()-16 */
1806 		ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1807 		if (ret)
1808 			goto err;
1809 
1810 		/* rewrite the callers to target func()-16 */
1811 		ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1812 		if (ret)
1813 			goto err;
1814 
1815 		/* now that nobody targets func()+0, remove ENDBR there */
1816 		cfi_rewrite_endbr(start_cfi, end_cfi);
1817 
1818 		if (builtin) {
1819 			pr_info("Using %sFineIBT%s CFI\n",
1820 				cfi_paranoid ? "paranoid " : "",
1821 				cfi_bhi ? "+BHI" : "");
1822 		}
1823 		return;
1824 
1825 	default:
1826 		break;
1827 	}
1828 
1829 err:
1830 	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1831 }
1832 
1833 static inline void poison_hash(void *addr)
1834 {
1835 	*(u32 *)addr = 0;
1836 }
1837 
1838 static void poison_cfi(void *addr)
1839 {
1840 	/*
1841 	 * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
1842 	 * some (static) functions for which they can determine the address
1843 	 * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
1844 	 *
1845 	 * As such, these functions will get sealed, but we need to be careful
1846 	 * to not unconditionally scribble the previous function.
1847 	 */
1848 	switch (cfi_mode) {
1849 	case CFI_FINEIBT:
1850 		/*
1851 		 * FineIBT prefix should start with an ENDBR.
1852 		 */
1853 		if (!is_endbr(addr))
1854 			break;
1855 
1856 		/*
1857 		 * __cfi_\func:
1858 		 *	osp nopl (%rax)
1859 		 *	subl	$0, %r10d
1860 		 *	jz	1f
1861 		 *	ud2
1862 		 * 1:	nop
1863 		 */
1864 		poison_endbr(addr);
1865 		poison_hash(addr + fineibt_preamble_hash);
1866 		break;
1867 
1868 	case CFI_KCFI:
1869 		/*
1870 		 * kCFI prefix should start with a valid hash.
1871 		 */
1872 		if (!decode_preamble_hash(addr, NULL))
1873 			break;
1874 
1875 		/*
1876 		 * __cfi_\func:
1877 		 *	movl	$0, %eax
1878 		 *	.skip	11, 0x90
1879 		 */
1880 		poison_hash(addr + 1);
1881 		break;
1882 
1883 	default:
1884 		break;
1885 	}
1886 }
1887 
1888 /*
1889  * When regs->ip points to a 0xEA byte in the FineIBT preamble,
1890  * return true and fill out target and type.
1891  *
1892  * We check the preamble by checking for the ENDBR instruction relative to the
1893  * 0xEA instruction.
1894  */
1895 static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
1896 {
1897 	unsigned long addr = regs->ip - fineibt_preamble_ud;
1898 	u32 hash;
1899 
1900 	if (!exact_endbr((void *)addr))
1901 		return false;
1902 
1903 	*target = addr + fineibt_preamble_size;
1904 
1905 	__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1906 	*type = (u32)regs->r10 + hash;
1907 
1908 	/*
1909 	 * Since regs->ip points to the middle of an instruction, it cannot
1910 	 * continue with the normal fixup.
1911 	 */
1912 	regs->ip = *target;
1913 
1914 	return true;
1915 
1916 Efault:
1917 	return false;
1918 }
1919 
1920 /*
1921  * regs->ip points to one of the UD2 in __bhi_args[].
1922  */
1923 static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
1924 {
1925 	unsigned long addr;
1926 	u32 hash;
1927 
1928 	if (!cfi_bhi)
1929 		return false;
1930 
1931 	if (regs->ip < (unsigned long)__bhi_args ||
1932 	    regs->ip >= (unsigned long)__bhi_args_end)
1933 		return false;
1934 
1935 	/*
1936 	 * Fetch the return address from the stack; this points to the
1937 	 * FineIBT preamble. Since the CALL instruction is in the last 5
1938 	 * bytes of the preamble, the return address is in fact the target
1939 	 * address.
1940 	 */
1941 	__get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
1942 	*target = addr;
1943 
1944 	addr -= fineibt_preamble_size;
1945 	if (!exact_endbr((void *)addr))
1946 		return false;
1947 
1948 	__get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1949 	*type = (u32)regs->r10 + hash;
1950 
1951 	/*
1952 	 * The UD2 sites are constructed with a RET immediately following,
1953 	 * so the non-fatal case can use the regular fixup.
1954 	 */
1955 	return true;
1956 
1957 Efault:
1958 	return false;
1959 }
1960 
1961 static bool is_paranoid_thunk(unsigned long addr)
1962 {
1963 	u32 thunk;
1964 
1965 	__get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
1966 	return (thunk & 0x00FFFFFF) == 0xfd75ea;
1967 
1968 Efault:
1969 	return false;
1970 }
1971 
1972 /*
1973  * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
1974  * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
1975  * thunk.
1976  */
1977 static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
1978 {
1979 	unsigned long addr = regs->ip - fineibt_paranoid_ud;
1980 
1981 	if (!cfi_paranoid)
1982 		return false;
1983 
1984 	if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
1985 		*target = regs->r11 + fineibt_preamble_size;
1986 		*type = regs->r10;
1987 
1988 		/*
1989 		 * Since the trapping instruction is the exact, but LOCK prefixed,
1990 		 * Jcc.d8 that got us here, the normal fixup will work.
1991 		 */
1992 		return true;
1993 	}
1994 
1995 	/*
1996 	 * The cfi_paranoid + ITS thunk combination results in:
1997 	 *
1998 	 *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1999 	 *  6:   45 3b 53 f7             cmp    -0x9(%r11), %r10d
2000 	 *  a:   4d 8d 5b f0             lea    -0x10(%r11), %r11
2001 	 *  e:   2e e8 XX XX XX XX	 cs call __x86_indirect_paranoid_thunk_r11
2002 	 *
2003 	 * Where the paranoid_thunk looks like:
2004 	 *
2005 	 *  1d:  <ea>                    (bad)
2006 	 *  __x86_indirect_paranoid_thunk_r11:
2007 	 *  1e:  75 fd                   jne 1d
2008 	 *  __x86_indirect_its_thunk_r11:
2009 	 *  20:  41 ff eb                jmp *%r11
2010 	 *  23:  cc                      int3
2011 	 *
2012 	 */
2013 	if (is_paranoid_thunk(regs->ip)) {
2014 		*target = regs->r11 + fineibt_preamble_size;
2015 		*type = regs->r10;
2016 
2017 		regs->ip = *target;
2018 		return true;
2019 	}
2020 
2021 	return false;
2022 }
2023 
2024 bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
2025 {
2026 	if (decode_fineibt_paranoid(regs, target, type))
2027 		return true;
2028 
2029 	if (decode_fineibt_bhi(regs, target, type))
2030 		return true;
2031 
2032 	return decode_fineibt_preamble(regs, target, type);
2033 }
2034 
2035 #else /* !CONFIG_FINEIBT: */
2036 
2037 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2038 			    s32 *start_cfi, s32 *end_cfi, bool builtin)
2039 {
2040 }
2041 
2042 #ifdef CONFIG_X86_KERNEL_IBT
2043 static void poison_cfi(void *addr) { }
2044 #endif
2045 
2046 #endif /* !CONFIG_FINEIBT */
2047 
2048 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2049 		   s32 *start_cfi, s32 *end_cfi)
2050 {
2051 	return __apply_fineibt(start_retpoline, end_retpoline,
2052 			       start_cfi, end_cfi,
2053 			       /* .builtin = */ false);
2054 }
2055 
2056 #ifdef CONFIG_SMP
2057 static void alternatives_smp_lock(const s32 *start, const s32 *end,
2058 				  u8 *text, u8 *text_end)
2059 {
2060 	const s32 *poff;
2061 
2062 	for (poff = start; poff < end; poff++) {
2063 		u8 *ptr = (u8 *)poff + *poff;
2064 
2065 		if (!*poff || ptr < text || ptr >= text_end)
2066 			continue;
2067 		/* turn DS segment override prefix into lock prefix */
2068 		if (*ptr == 0x3e)
2069 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
2070 	}
2071 }
2072 
2073 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
2074 				    u8 *text, u8 *text_end)
2075 {
2076 	const s32 *poff;
2077 
2078 	for (poff = start; poff < end; poff++) {
2079 		u8 *ptr = (u8 *)poff + *poff;
2080 
2081 		if (!*poff || ptr < text || ptr >= text_end)
2082 			continue;
2083 		/* turn lock prefix into DS segment override prefix */
2084 		if (*ptr == 0xf0)
2085 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
2086 	}
2087 }
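/*
 * Illustrative example (the instruction below is hypothetical, not taken
 * from a real .smp_locks entry): a recorded site is simply the LOCK
 * prefix byte of a LOCK-prefixed instruction, e.g.
 *
 *	f0 83 04 24 00		lock addl $0x0,(%rsp)
 *
 * On UP the f0 byte is rewritten to 3e (a DS segment override, which is
 * effectively a no-op here), and alternatives_smp_lock() restores f0
 * again once a second CPU can come online.
 */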
2088 
2089 struct smp_alt_module {
2090 	/* the module owning these lock prefixes, NULL for the core kernel */
2091 	struct module	*mod;
2092 	char		*name;
2093 
2094 	/* ptrs to lock prefixes */
2095 	const s32	*locks;
2096 	const s32	*locks_end;
2097 
2098 	/* .text segment, needed to avoid patching init code ;) */
2099 	u8		*text;
2100 	u8		*text_end;
2101 
2102 	struct list_head next;
2103 };
2104 static LIST_HEAD(smp_alt_modules);
2105 static bool uniproc_patched = false;	/* protected by text_mutex */
2106 
2107 void __init_or_module alternatives_smp_module_add(struct module *mod,
2108 						  char *name,
2109 						  void *locks, void *locks_end,
2110 						  void *text,  void *text_end)
2111 {
2112 	struct smp_alt_module *smp;
2113 
2114 	mutex_lock(&text_mutex);
2115 	if (!uniproc_patched)
2116 		goto unlock;
2117 
2118 	if (num_possible_cpus() == 1)
2119 		/* Don't bother remembering, we'll never have to undo it. */
2120 		goto smp_unlock;
2121 
2122 	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
2123 	if (!smp)
2124 		/* we'll run the (safe but slow) SMP code then ... */
2125 		goto unlock;
2126 
2127 	smp->mod	= mod;
2128 	smp->name	= name;
2129 	smp->locks	= locks;
2130 	smp->locks_end	= locks_end;
2131 	smp->text	= text;
2132 	smp->text_end	= text_end;
2133 	DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s",
2134 		smp->locks, smp->locks_end,
2135 		smp->text, smp->text_end, smp->name);
2136 
2137 	list_add_tail(&smp->next, &smp_alt_modules);
2138 smp_unlock:
2139 	alternatives_smp_unlock(locks, locks_end, text, text_end);
2140 unlock:
2141 	mutex_unlock(&text_mutex);
2142 }
2143 
2144 void __init_or_module alternatives_smp_module_del(struct module *mod)
2145 {
2146 	struct smp_alt_module *item;
2147 
2148 	mutex_lock(&text_mutex);
2149 	list_for_each_entry(item, &smp_alt_modules, next) {
2150 		if (mod != item->mod)
2151 			continue;
2152 		list_del(&item->next);
2153 		kfree(item);
2154 		break;
2155 	}
2156 	mutex_unlock(&text_mutex);
2157 }
2158 
2159 void alternatives_enable_smp(void)
2160 {
2161 	struct smp_alt_module *mod;
2162 
2163 	/* Why bother if there are no other CPUs? */
2164 	BUG_ON(num_possible_cpus() == 1);
2165 
2166 	mutex_lock(&text_mutex);
2167 
2168 	if (uniproc_patched) {
2169 		pr_info("switching to SMP code\n");
2170 		BUG_ON(num_online_cpus() != 1);
2171 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
2172 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
2173 		list_for_each_entry(mod, &smp_alt_modules, next)
2174 			alternatives_smp_lock(mod->locks, mod->locks_end,
2175 					      mod->text, mod->text_end);
2176 		uniproc_patched = false;
2177 	}
2178 	mutex_unlock(&text_mutex);
2179 }
2180 
2181 /*
2182  * Return 1 if the address range is reserved for SMP-alternatives.
2183  * Must hold text_mutex.
2184  */
2185 int alternatives_text_reserved(void *start, void *end)
2186 {
2187 	struct smp_alt_module *mod;
2188 	const s32 *poff;
2189 	u8 *text_start = start;
2190 	u8 *text_end = end;
2191 
2192 	lockdep_assert_held(&text_mutex);
2193 
2194 	list_for_each_entry(mod, &smp_alt_modules, next) {
2195 		if (mod->text > text_end || mod->text_end < text_start)
2196 			continue;
2197 		for (poff = mod->locks; poff < mod->locks_end; poff++) {
2198 			const u8 *ptr = (const u8 *)poff + *poff;
2199 
2200 			if (text_start <= ptr && text_end > ptr)
2201 				return 1;
2202 		}
2203 	}
2204 
2205 	return 0;
2206 }
2207 #endif /* CONFIG_SMP */
2208 
2209 /*
2210  * Self-test for the INT3 based CALL emulation code.
2211  *
2212  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
2213  * properly and that there is a stack gap between the INT3 frame and the
2214  * previous context. Without this gap doing a virtual PUSH on the interrupted
2215  * stack would corrupt the INT3 IRET frame.
2216  *
2217  * See entry_{32,64}.S for more details.
2218  */
2219 
2220 /*
2221  * We define the int3_magic() function in assembly to control the calling
2222  * convention such that we can 'call' it from assembly.
2223  */
2224 
2225 extern void int3_magic(unsigned int *ptr); /* defined in asm */
2226 
2227 asm (
2228 "	.pushsection	.init.text, \"ax\", @progbits\n"
2229 "	.type		int3_magic, @function\n"
2230 "int3_magic:\n"
2231 	ANNOTATE_NOENDBR
2232 "	movl	$1, (%" _ASM_ARG1 ")\n"
2233 	ASM_RET
2234 "	.size		int3_magic, .-int3_magic\n"
2235 "	.popsection\n"
2236 );
2237 
2238 extern void int3_selftest_ip(void); /* defined in asm below */
2239 
2240 static int __init
2241 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
2242 {
2243 	unsigned long selftest = (unsigned long)&int3_selftest_ip;
2244 	struct die_args *args = data;
2245 	struct pt_regs *regs = args->regs;
2246 
2247 	OPTIMIZER_HIDE_VAR(selftest);
2248 
2249 	if (!regs || user_mode(regs))
2250 		return NOTIFY_DONE;
2251 
2252 	if (val != DIE_INT3)
2253 		return NOTIFY_DONE;
2254 
2255 	if (regs->ip - INT3_INSN_SIZE != selftest)
2256 		return NOTIFY_DONE;
2257 
2258 	int3_emulate_call(regs, (unsigned long)&int3_magic);
2259 	return NOTIFY_STOP;
2260 }
2261 
2262 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */
2263 static noinline void __init int3_selftest(void)
2264 {
2265 	static __initdata struct notifier_block int3_exception_nb = {
2266 		.notifier_call	= int3_exception_notify,
2267 		.priority	= INT_MAX-1, /* last */
2268 	};
2269 	unsigned int val = 0;
2270 
2271 	BUG_ON(register_die_notifier(&int3_exception_nb));
2272 
2273 	/*
2274 	 * Basically: int3_magic(&val); but really complicated :-)
2275 	 *
2276 	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
2277 	 * notifier above will emulate CALL for us.
2278 	 */
2279 	asm volatile ("int3_selftest_ip:\n\t"
2280 		      ANNOTATE_NOENDBR
2281 		      "    int3; nop; nop; nop; nop\n\t"
2282 		      : ASM_CALL_CONSTRAINT
2283 		      : __ASM_SEL_RAW(a, D) (&val)
2284 		      : "memory");
2285 
2286 	BUG_ON(val != 1);
2287 
2288 	unregister_die_notifier(&int3_exception_nb);
2289 }
2290 
2291 static __initdata int __alt_reloc_selftest_addr;
2292 
2293 extern void __init __alt_reloc_selftest(void *arg);
2294 __visible noinline void __init __alt_reloc_selftest(void *arg)
2295 {
2296 	WARN_ON(arg != &__alt_reloc_selftest_addr);
2297 }
2298 
2299 static noinline void __init alt_reloc_selftest(void)
2300 {
2301 	/*
2302 	 * Tests apply_relocation().
2303 	 *
2304 	 * This has a relative immediate (CALL) in a place other than the first
2305 	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
2306 	 *
2307 	 *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
2308 	 *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
2309 	 *
2310 	 * Getting this wrong will either crash and burn or tickle the WARN
2311 	 * above.
2312 	 */
2313 	asm_inline volatile (
2314 		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
2315 		: ASM_CALL_CONSTRAINT
2316 		: [mem] "m" (__alt_reloc_selftest_addr)
2317 		: _ASM_ARG1
2318 	);
2319 }
2320 
2321 void __init alternative_instructions(void)
2322 {
2323 	u64 ibt;
2324 
2325 	int3_selftest();
2326 
2327 	/*
2328 	 * The patching is not fully atomic, so try to avoid local
2329 	 * interruptions that might execute the code being patched.
2330 	 * Other CPUs are not running.
2331 	 */
2332 	stop_nmi();
2333 
2334 	/*
2335 	 * Don't stop machine check exceptions while patching.
2336 	 * MCEs only happen when something got corrupted and in this
2337 	 * case we must do something about the corruption.
2338 	 * Ignoring it is worse than an unlikely patching race.
2339 	 * Also machine checks tend to be broadcast and if one CPU
2340 	 * Also, machine checks tend to be broadcast, and if one CPU
2341 	 * goes into machine check the others follow quickly, so we don't
2342 	 * expect a machine check to cause undue problems during code
2343 	 */
2344 
2345 	/*
2346 	 * Make sure to set (artificial) features depending on used paravirt
2347 	 * functions which can later influence alternative patching.
2348 	 */
2349 	paravirt_set_cap();
2350 
2351 	/* Keep CET-IBT disabled until caller/callee are patched */
2352 	ibt = ibt_save(/*disable*/ true);
2353 
2354 	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
2355 			__cfi_sites, __cfi_sites_end, true);
2356 
2357 	/*
2358 	 * Rewrite the retpolines, must be done before alternatives since
2359 	 * those can rewrite the retpoline thunks.
2360 	 */
2361 	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
2362 	apply_returns(__return_sites, __return_sites_end);
2363 
2364 	/*
2365 	 * Adjust all CALL instructions to point to func()-10, including
2366 	 * those in .altinstr_replacement.
2367 	 */
2368 	callthunks_patch_builtin_calls();
2369 
2370 	apply_alternatives(__alt_instructions, __alt_instructions_end);
2371 
2372 	/*
2373 	 * Seal all functions that do not have their address taken.
2374 	 */
2375 	apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
2376 
2377 	ibt_restore(ibt);
2378 
2379 #ifdef CONFIG_SMP
2380 	/* Patch to UP if no other CPUs are imminent. */
2381 	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
2382 		uniproc_patched = true;
2383 		alternatives_smp_module_add(NULL, "core kernel",
2384 					    __smp_locks, __smp_locks_end,
2385 					    _text, _etext);
2386 	}
2387 
2388 	if (!uniproc_patched || num_possible_cpus() == 1) {
2389 		free_init_pages("SMP alternatives",
2390 				(unsigned long)__smp_locks,
2391 				(unsigned long)__smp_locks_end);
2392 	}
2393 #endif
2394 
2395 	restart_nmi();
2396 	alternatives_patched = 1;
2397 
2398 	alt_reloc_selftest();
2399 }
2400 
2401 /**
2402  * text_poke_early - Update instructions on a live kernel at boot time
2403  * @addr: address to modify
2404  * @opcode: source of the copy
2405  * @len: length to copy
2406  *
2407  * When you use this code to patch more than one byte of an instruction
2408  * you need to make sure that other CPUs cannot execute this code in parallel.
2409  * Also, no thread may currently be preempted in the middle of these
2410  * instructions. And on the local CPU you need to be protected against NMI or
2411  * MCE handlers seeing an inconsistent instruction while you patch.
2412  */
2413 void __init_or_module text_poke_early(void *addr, const void *opcode,
2414 				      size_t len)
2415 {
2416 	unsigned long flags;
2417 
2418 	if (boot_cpu_has(X86_FEATURE_NX) &&
2419 	    is_module_text_address((unsigned long)addr)) {
2420 		/*
2421 		 * Modules text is marked initially as non-executable, so the
2422 		 * code cannot be running and speculative code-fetches are
2423 		 * prevented. Just change the code.
2424 		 */
2425 		memcpy(addr, opcode, len);
2426 	} else {
2427 		local_irq_save(flags);
2428 		memcpy(addr, opcode, len);
2429 		sync_core();
2430 		local_irq_restore(flags);
2431 
2432 		/*
2433 		 * Could also do a CLFLUSH here to speed up CPU recovery; but
2434 		 * that causes hangs on some VIA CPUs.
2435 		 */
2436 	}
2437 }
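/*
 * Illustrative usage sketch (variable names are assumptions, not taken
 * from this file): boot-time and module-load patching paths hand this
 * function a fully prepared replacement buffer, roughly:
 *
 *	text_poke_early(instr, insn_buff, insn_buff_sz);
 *
 * where instr is the instruction being replaced and insn_buff holds the
 * already-relocated replacement bytes.
 */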
2438 
2439 typedef struct {
2440 	struct mm_struct *mm;
2441 } temp_mm_state_t;
2442 
2443 /*
2444  * Using a temporary mm allows setting up temporary mappings that are not accessible
2445  * by other CPUs. Such mappings are needed to perform sensitive memory writes
2446  * that override the kernel memory protections (e.g., W^X), without exposing the
2447  * temporary page-table mappings that are required for these write operations to
2448  * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
2449  * mapping is torn down.
2450  *
2451  * Context: The temporary mm needs to be used exclusively by a single core. To
2452  *          harden security, IRQs must be disabled while the temporary mm is
2453  *          loaded, thereby preventing interrupt handler bugs from overriding
2454  *          the kernel memory protection.
2455  */
2456 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
2457 {
2458 	temp_mm_state_t temp_state;
2459 
2460 	lockdep_assert_irqs_disabled();
2461 
2462 	/*
2463 	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
2464 	 * with a stale address space WITHOUT being in lazy mode after
2465 	 * restoring the previous mm.
2466 	 */
2467 	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
2468 		leave_mm();
2469 
2470 	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
2471 	switch_mm_irqs_off(NULL, mm, current);
2472 
2473 	/*
2474 	 * If breakpoints are enabled, disable them while the temporary mm is
2475 	 * used. Userspace might set up watchpoints on addresses that are used
2476 	 * in the temporary mm, which would lead to wrong signals being sent or
2477 	 * crashes.
2478 	 *
2479 	 * Note that breakpoints are not disabled selectively, which also causes
2480 	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
2481 	 * undesirable, but still seems reasonable as the code that runs in the
2482 	 * temporary mm should be short.
2483 	 */
2484 	if (hw_breakpoint_active())
2485 		hw_breakpoint_disable();
2486 
2487 	return temp_state;
2488 }
2489 
2490 __ro_after_init struct mm_struct *poking_mm;
2491 __ro_after_init unsigned long poking_addr;
2492 
2493 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
2494 {
2495 	lockdep_assert_irqs_disabled();
2496 
2497 	switch_mm_irqs_off(NULL, prev_state.mm, current);
2498 
2499 	/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
2500 	cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
2501 
2502 	/*
2503 	 * Restore the breakpoints if they were disabled before the temporary mm
2504 	 * was loaded.
2505 	 */
2506 	if (hw_breakpoint_active())
2507 		hw_breakpoint_restore();
2508 }
2509 
2510 static void text_poke_memcpy(void *dst, const void *src, size_t len)
2511 {
2512 	memcpy(dst, src, len);
2513 }
2514 
2515 static void text_poke_memset(void *dst, const void *src, size_t len)
2516 {
2517 	int c = *(const int *)src;
2518 
2519 	memset(dst, c, len);
2520 }
2521 
2522 typedef void text_poke_f(void *dst, const void *src, size_t len);
2523 
2524 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
2525 {
2526 	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
2527 	struct page *pages[2] = {NULL};
2528 	temp_mm_state_t prev;
2529 	unsigned long flags;
2530 	pte_t pte, *ptep;
2531 	spinlock_t *ptl;
2532 	pgprot_t pgprot;
2533 
2534 	/*
2535 	 * While the boot memory allocator is running we cannot use struct pages as
2536 	 * they are not yet initialized. There is no way to recover.
2537 	 */
2538 	BUG_ON(!after_bootmem);
2539 
2540 	if (!core_kernel_text((unsigned long)addr)) {
2541 		pages[0] = vmalloc_to_page(addr);
2542 		if (cross_page_boundary)
2543 			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
2544 	} else {
2545 		pages[0] = virt_to_page(addr);
2546 		WARN_ON(!PageReserved(pages[0]));
2547 		if (cross_page_boundary)
2548 			pages[1] = virt_to_page(addr + PAGE_SIZE);
2549 	}
2550 	/*
2551 	 * If something went wrong, crash and burn since recovery paths are not
2552 	 * implemented.
2553 	 */
2554 	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
2555 
2556 	/*
2557 	 * Map the page without the global bit, as TLB flushing is done with
2558 	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
2559 	 */
2560 	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
2561 
2562 	/*
2563 	 * The lock is not really needed, but using get_locked_pte() avoids open-coding the page-table walk.
2564 	 */
2565 	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
2566 
2567 	/*
2568 	 * This must not fail; preallocated in poking_init().
2569 	 */
2570 	VM_BUG_ON(!ptep);
2571 
2572 	local_irq_save(flags);
2573 
2574 	pte = mk_pte(pages[0], pgprot);
2575 	set_pte_at(poking_mm, poking_addr, ptep, pte);
2576 
2577 	if (cross_page_boundary) {
2578 		pte = mk_pte(pages[1], pgprot);
2579 		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
2580 	}
2581 
2582 	/*
2583 	 * Loading the temporary mm behaves as a compiler barrier, which
2584 	 * guarantees that the PTE will be set at the time memcpy() is done.
2585 	 */
2586 	prev = use_temporary_mm(poking_mm);
2587 
2588 	kasan_disable_current();
2589 	func((u8 *)poking_addr + offset_in_page(addr), src, len);
2590 	kasan_enable_current();
2591 
2592 	/*
2593 	 * Use a compiler barrier to ensure that the PTE is only cleared after
2594 	 * the memcpy() stores have been issued.
2595 	 */
2596 	barrier();
2597 
2598 	pte_clear(poking_mm, poking_addr, ptep);
2599 	if (cross_page_boundary)
2600 		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
2601 
2602 	/*
2603 	 * Loading the previous page-table hierarchy requires a serializing
2604 	 * instruction that already allows the core to see the updated version.
2605 	 * Xen-PV is assumed to serialize execution in a similar manner.
2606 	 */
2607 	unuse_temporary_mm(prev);
2608 
2609 	/*
2610 	 * Flushing the TLB might involve IPIs, which would require enabled
2611 	 * IRQs, but none are needed since no CPU is using the temporary mm at this point.
2612 	 */
2613 	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
2614 			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
2615 			   PAGE_SHIFT, false);
2616 
2617 	if (func == text_poke_memcpy) {
2618 		/*
2619 		 * If the text does not match what we just wrote then something is
2620 		 * fundamentally screwy; there's nothing we can really do about that.
2621 		 */
2622 		BUG_ON(memcmp(addr, src, len));
2623 	}
2624 
2625 	local_irq_restore(flags);
2626 	pte_unmap_unlock(ptep, ptl);
2627 	return addr;
2628 }
2629 
2630 /**
2631  * text_poke - Update instructions on a live kernel
2632  * @addr: address to modify
2633  * @opcode: source of the copy
2634  * @len: length to copy
2635  *
2636  * Only atomic text poke/set should be allowed when not doing early patching.
2637  * It means the size must be writable atomically and the address must be aligned
2638  * in a way that permits an atomic write. It also makes sure we fit on a single
2639  * page.
2640  *
2641  * Note that the caller must ensure that if the modified code is part of a
2642  * module, the module would not be removed during poking. This can be achieved
2643  * by registering a module notifier, and ordering module removal and patching
2644  * through a mutex.
2645  */
2646 void *text_poke(void *addr, const void *opcode, size_t len)
2647 {
2648 	lockdep_assert_held(&text_mutex);
2649 
2650 	return __text_poke(text_poke_memcpy, addr, opcode, len);
2651 }
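/*
 * Illustrative usage sketch (site_addr is hypothetical): poke a single
 * byte while holding text_mutex, as the lockdep assertion above requires:
 *
 *	u8 nop = 0x90;
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(site_addr, &nop, 1);
 *	mutex_unlock(&text_mutex);
 */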
2652 
2653 /**
2654  * text_poke_kgdb - Update instructions on a live kernel by kgdb
2655  * @addr: address to modify
2656  * @opcode: source of the copy
2657  * @len: length to copy
2658  *
2659  * Only atomic text poke/set should be allowed when not doing early patching.
2660  * It means the size must be writable atomically and the address must be aligned
2661  * in a way that permits an atomic write. It also makes sure we fit on a single
2662  * page.
2663  *
2664  * Context: should only be used by kgdb, which ensures no other core is running,
2665  *	    despite the fact it does not hold the text_mutex.
2666  */
2667 void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2668 {
2669 	return __text_poke(text_poke_memcpy, addr, opcode, len);
2670 }
2671 
2672 void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2673 			    bool core_ok)
2674 {
2675 	unsigned long start = (unsigned long)addr;
2676 	size_t patched = 0;
2677 
2678 	if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2679 		return NULL;
2680 
2681 	while (patched < len) {
2682 		unsigned long ptr = start + patched;
2683 		size_t s;
2684 
2685 		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2686 
2687 		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2688 		patched += s;
2689 	}
2690 	return addr;
2691 }
2692 
2693 /**
2694  * text_poke_copy - Copy instructions into (an unused part of) RX memory
2695  * @addr: address to modify
2696  * @opcode: source of the copy
2697  * @len: length to copy, could be more than 2x PAGE_SIZE
2698  *
2699  * Not safe against concurrent execution; useful for JITs to dump
2700  * new code blocks into unused regions of RX memory. Can be used in
2701  * conjunction with synchronize_rcu_tasks() to wait for existing
2702  * execution to quiesce after having made sure no existing function
2703  * pointers are live.
2704  */
2705 void *text_poke_copy(void *addr, const void *opcode, size_t len)
2706 {
2707 	mutex_lock(&text_mutex);
2708 	addr = text_poke_copy_locked(addr, opcode, len, false);
2709 	mutex_unlock(&text_mutex);
2710 	return addr;
2711 }
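/*
 * Illustrative usage sketch (buffer names are hypothetical): a JIT that
 * prepared an image in ordinary RW memory copies it into its RX
 * destination before publishing any pointer to it:
 *
 *	if (!text_poke_copy(rx_dst, rw_image, image_size))
 *		return -EINVAL;
 *
 * Only after the copy completes may rx_dst be handed out as executable
 * code.
 */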
2712 
2713 /**
2714  * text_poke_set - memset into (an unused part of) RX memory
2715  * @addr: address to modify
2716  * @c: the byte to fill the area with
2717  * @len: length to copy, could be more than 2x PAGE_SIZE
2718  *
2719  * This is useful to overwrite unused regions of RX memory with illegal
2720  * instructions.
2721  */
2722 void *text_poke_set(void *addr, int c, size_t len)
2723 {
2724 	unsigned long start = (unsigned long)addr;
2725 	size_t patched = 0;
2726 
2727 	if (WARN_ON_ONCE(core_kernel_text(start)))
2728 		return NULL;
2729 
2730 	mutex_lock(&text_mutex);
2731 	while (patched < len) {
2732 		unsigned long ptr = start + patched;
2733 		size_t s;
2734 
2735 		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2736 
2737 		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2738 		patched += s;
2739 	}
2740 	mutex_unlock(&text_mutex);
2741 	return addr;
2742 }
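/*
 * Illustrative usage sketch (rx_area and area_size are hypothetical):
 * poison a region that is being released so that any stale reference
 * traps on an INT3 instead of executing leftover bytes:
 *
 *	text_poke_set(rx_area, INT3_INSN_OPCODE, area_size);
 */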
2743 
2744 static void do_sync_core(void *info)
2745 {
2746 	sync_core();
2747 }
2748 
2749 void text_poke_sync(void)
2750 {
2751 	on_each_cpu(do_sync_core, NULL, 1);
2752 }
2753 
2754 /*
2755  * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
2756  * this thing. When len == 6 everything is prefixed with 0x0f and we map
2757  * opcode to Jcc.d8, using len to distinguish.
2758  */
2759 struct text_poke_loc {
2760 	/* addr := _stext + rel_addr */
2761 	s32 rel_addr;
2762 	s32 disp;
2763 	u8 len;
2764 	u8 opcode;
2765 	const u8 text[POKE_MAX_OPCODE_SIZE];
2766 	/* see text_poke_bp_batch() */
2767 	u8 old;
2768 };
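/*
 * Illustrative example of the len == 6 case (the displacement is made
 * up): a Jcc.d32 such as "jne" with a 32-bit displacement of 0x1234
 * encodes as 0f 85 34 12 00 00. Only the trailing five bytes
 * (85 34 12 00 00) are stored in text[], opcode is mapped down to the
 * matching Jcc.d8 opcode (0x75), and the leading 0f byte is
 * re-materialized when the site is written back, keyed off len == 6.
 */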
2769 
2770 struct bp_patching_desc {
2771 	struct text_poke_loc *vec;
2772 	int nr_entries;
2773 	atomic_t refs;
2774 };
2775 
2776 static struct bp_patching_desc bp_desc;
2777 
2778 static __always_inline
2779 struct bp_patching_desc *try_get_desc(void)
2780 {
2781 	struct bp_patching_desc *desc = &bp_desc;
2782 
2783 	if (!raw_atomic_inc_not_zero(&desc->refs))
2784 		return NULL;
2785 
2786 	return desc;
2787 }
2788 
2789 static __always_inline void put_desc(void)
2790 {
2791 	struct bp_patching_desc *desc = &bp_desc;
2792 
2793 	smp_mb__before_atomic();
2794 	raw_atomic_dec(&desc->refs);
2795 }
2796 
2797 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
2798 {
2799 	return _stext + tp->rel_addr;
2800 }
2801 
2802 static __always_inline int patch_cmp(const void *key, const void *elt)
2803 {
2804 	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
2805 
2806 	if (key < text_poke_addr(tp))
2807 		return -1;
2808 	if (key > text_poke_addr(tp))
2809 		return 1;
2810 	return 0;
2811 }
2812 
2813 noinstr int poke_int3_handler(struct pt_regs *regs)
2814 {
2815 	struct bp_patching_desc *desc;
2816 	struct text_poke_loc *tp;
2817 	int ret = 0;
2818 	void *ip;
2819 
2820 	if (user_mode(regs))
2821 		return 0;
2822 
2823 	/*
2824 	 * Having observed our INT3 instruction, we now must observe
2825 	 * bp_desc with non-zero refcount:
2826 	 *
2827 	 *	bp_desc.refs = 1		INT3
2828 	 *	WMB				RMB
2829 	 *	write INT3			if (bp_desc.refs != 0)
2830 	 */
2831 	smp_rmb();
2832 
2833 	desc = try_get_desc();
2834 	if (!desc)
2835 		return 0;
2836 
2837 	/*
2838 	 * Discount the INT3. See text_poke_bp_batch().
2839 	 */
2840 	ip = (void *) regs->ip - INT3_INSN_SIZE;
2841 
2842 	/*
2843 	 * Skip the binary search if there is a single member in the vector.
2844 	 */
2845 	if (unlikely(desc->nr_entries > 1)) {
2846 		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2847 				      sizeof(struct text_poke_loc),
2848 				      patch_cmp);
2849 		if (!tp)
2850 			goto out_put;
2851 	} else {
2852 		tp = desc->vec;
2853 		if (text_poke_addr(tp) != ip)
2854 			goto out_put;
2855 	}
2856 
2857 	ip += tp->len;
2858 
2859 	switch (tp->opcode) {
2860 	case INT3_INSN_OPCODE:
2861 		/*
2862 		 * Someone poked an explicit INT3, they'll want to handle it,
2863 		 * do not consume.
2864 		 */
2865 		goto out_put;
2866 
2867 	case RET_INSN_OPCODE:
2868 		int3_emulate_ret(regs);
2869 		break;
2870 
2871 	case CALL_INSN_OPCODE:
2872 		int3_emulate_call(regs, (long)ip + tp->disp);
2873 		break;
2874 
2875 	case JMP32_INSN_OPCODE:
2876 	case JMP8_INSN_OPCODE:
2877 		int3_emulate_jmp(regs, (long)ip + tp->disp);
2878 		break;
2879 
2880 	case 0x70 ... 0x7f: /* Jcc */
2881 		int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2882 		break;
2883 
2884 	default:
2885 		BUG();
2886 	}
2887 
2888 	ret = 1;
2889 
2890 out_put:
2891 	put_desc();
2892 	return ret;
2893 }
2894 
2895 #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2896 static struct text_poke_loc tp_vec[TP_VEC_MAX];
2897 static int tp_vec_nr;
2898 
2899 /**
2900  * text_poke_bp_batch() -- update instructions on live kernel on SMP
2901  * @tp:			vector of instructions to patch
2902  * @nr_entries:		number of entries in the vector
2903  *
2904  * Modify multi-byte instructions by using an int3 breakpoint on SMP.
2905  * We completely avoid stop_machine() here, and achieve the
2906  * synchronization using an int3 breakpoint.
2907  *
2908  * The way it is done:
2909  *	- For each entry in the vector:
2910  *		- add an int3 trap to the address that will be patched
2911  *	- sync cores
2912  *	- For each entry in the vector:
2913  *		- update all but the first byte of the patched range
2914  *	- sync cores
2915  *	- For each entry in the vector:
2916  *		- replace the first byte (int3) with the first byte of
2917  *		  the replacement opcode
2918  *	- sync cores
2919  */
2920 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2921 {
2922 	unsigned char int3 = INT3_INSN_OPCODE;
2923 	unsigned int i;
2924 	int do_sync;
2925 
2926 	lockdep_assert_held(&text_mutex);
2927 
2928 	bp_desc.vec = tp;
2929 	bp_desc.nr_entries = nr_entries;
2930 
2931 	/*
2932 	 * Corresponds to the implicit memory barrier in try_get_desc() to
2933 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
2934 	 */
2935 	atomic_set_release(&bp_desc.refs, 1);
2936 
2937 	/*
2938 	 * Function tracing can enable thousands of places that need to be
2939 	 * updated. This can take quite some time, and with full kernel debugging
2940 	 * enabled, this could cause the softlockup watchdog to trigger.
2941 	 * This function is called once for every 256 entries added to be patched.
2942 	 * Call cond_resched() here to make sure that other tasks can get scheduled
2943 	 * while processing all the functions being patched.
2944 	 */
2945 	cond_resched();
2946 
2947 	/*
2948 	 * Corresponding read barrier in int3 notifier for making sure the
2949 	 * nr_entries and handler are correctly ordered wrt. patching.
2950 	 */
2951 	smp_wmb();
2952 
2953 	/*
2954 	 * First step: add an int3 trap to the address that will be patched.
2955 	 */
2956 	for (i = 0; i < nr_entries; i++) {
2957 		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2958 		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2959 	}
2960 
2961 	text_poke_sync();
2962 
2963 	/*
2964 	 * Second step: update all but the first byte of the patched range.
2965 	 */
2966 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
2967 		u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2968 		u8 _new[POKE_MAX_OPCODE_SIZE+1];
2969 		const u8 *new = tp[i].text;
2970 		int len = tp[i].len;
2971 
2972 		if (len - INT3_INSN_SIZE > 0) {
2973 			memcpy(old + INT3_INSN_SIZE,
2974 			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2975 			       len - INT3_INSN_SIZE);
2976 
2977 			if (len == 6) {
2978 				_new[0] = 0x0f;
2979 				memcpy(_new + 1, new, 5);
2980 				new = _new;
2981 			}
2982 
2983 			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2984 				  new + INT3_INSN_SIZE,
2985 				  len - INT3_INSN_SIZE);
2986 
2987 			do_sync++;
2988 		}
2989 
2990 		/*
2991 		 * Emit a perf event to record the text poke, primarily to
2992 		 * support Intel PT decoding which must walk the executable code
2993 		 * to reconstruct the trace. The flow up to here is:
2994 		 *   - write INT3 byte
2995 		 *   - IPI-SYNC
2996 		 *   - write instruction tail
2997 		 * At this point the actual control flow will be through the
2998 		 * INT3 and handler and not hit the old or new instruction.
2999 		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
3000 		 * can still be decoded. Subsequently:
3001 		 *   - emit RECORD_TEXT_POKE with the new instruction
3002 		 *   - IPI-SYNC
3003 		 *   - write first byte
3004 		 *   - IPI-SYNC
3005 		 * So before the text poke event timestamp, the decoder will see
3006 		 * either the old instruction flow or FUP/TIP of INT3. After the
3007 		 * text poke event timestamp, the decoder will see either the
3008 		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
3009 		 * use the timestamp as the point at which to modify the
3010 		 * executable code.
3011 		 * The old instruction is recorded so that the event can be
3012 		 * processed forwards or backwards.
3013 		 */
3014 		perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
3015 	}
3016 
3017 	if (do_sync) {
3018 		/*
3019 		 * According to Intel, this core syncing is very likely
3020 		 * not necessary and we'd be safe even without it. But
3021 		 * better safe than sorry (plus there's not only Intel).
3022 		 */
3023 		text_poke_sync();
3024 	}
3025 
3026 	/*
3027 	 * Third step: replace the first byte (int3) with the first byte of
3028 	 * the replacement opcode.
3029 	 */
3030 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
3031 		u8 byte = tp[i].text[0];
3032 
3033 		if (tp[i].len == 6)
3034 			byte = 0x0f;
3035 
3036 		if (byte == INT3_INSN_OPCODE)
3037 			continue;
3038 
3039 		text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
3040 		do_sync++;
3041 	}
3042 
3043 	if (do_sync)
3044 		text_poke_sync();
3045 
3046 	/*
3047 	 * Remove and wait for refs to be zero.
3048 	 */
3049 	if (!atomic_dec_and_test(&bp_desc.refs))
3050 		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
3051 }
3052 
3053 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
3054 			       const void *opcode, size_t len, const void *emulate)
3055 {
3056 	struct insn insn;
3057 	int ret, i = 0;
3058 
3059 	if (len == 6)
3060 		i = 1;
3061 	memcpy((void *)tp->text, opcode+i, len-i);
3062 	if (!emulate)
3063 		emulate = opcode;
3064 
3065 	ret = insn_decode_kernel(&insn, emulate);
3066 	BUG_ON(ret < 0);
3067 
3068 	tp->rel_addr = addr - (void *)_stext;
3069 	tp->len = len;
3070 	tp->opcode = insn.opcode.bytes[0];
3071 
3072 	if (is_jcc32(&insn)) {
3073 		/*
3074 		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
3075 		 */
3076 		tp->opcode = insn.opcode.bytes[1] - 0x10;
3077 	}
3078 
3079 	switch (tp->opcode) {
3080 	case RET_INSN_OPCODE:
3081 	case JMP32_INSN_OPCODE:
3082 	case JMP8_INSN_OPCODE:
3083 		/*
3084 		 * Control flow instructions without implied execution of the
3085 		 * next instruction can be padded with INT3.
3086 		 */
3087 		for (i = insn.length; i < len; i++)
3088 			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
3089 		break;
3090 
3091 	default:
3092 		BUG_ON(len != insn.length);
3093 	}
3094 
3095 	switch (tp->opcode) {
3096 	case INT3_INSN_OPCODE:
3097 	case RET_INSN_OPCODE:
3098 		break;
3099 
3100 	case CALL_INSN_OPCODE:
3101 	case JMP32_INSN_OPCODE:
3102 	case JMP8_INSN_OPCODE:
3103 	case 0x70 ... 0x7f: /* Jcc */
3104 		tp->disp = insn.immediate.value;
3105 		break;
3106 
3107 	default: /* assume NOP */
3108 		switch (len) {
3109 		case 2: /* NOP2 -- emulate as JMP8+0 */
3110 			BUG_ON(memcmp(emulate, x86_nops[len], len));
3111 			tp->opcode = JMP8_INSN_OPCODE;
3112 			tp->disp = 0;
3113 			break;
3114 
3115 		case 5: /* NOP5 -- emulate as JMP32+0 */
3116 			BUG_ON(memcmp(emulate, x86_nops[len], len));
3117 			tp->opcode = JMP32_INSN_OPCODE;
3118 			tp->disp = 0;
3119 			break;
3120 
3121 		default: /* unknown instruction */
3122 			BUG();
3123 		}
3124 		break;
3125 	}
3126 }
3127 
3128 /*
3129  * We rely hard on tp_vec being ordered; ensure this is so by flushing
3130  * early if needed.
3131  */
3132 static bool tp_order_fail(void *addr)
3133 {
3134 	struct text_poke_loc *tp;
3135 
3136 	if (!tp_vec_nr)
3137 		return false;
3138 
3139 	if (!addr) /* force */
3140 		return true;
3141 
3142 	tp = &tp_vec[tp_vec_nr - 1];
3143 	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
3144 		return true;
3145 
3146 	return false;
3147 }
3148 
3149 static void text_poke_flush(void *addr)
3150 {
3151 	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
3152 		text_poke_bp_batch(tp_vec, tp_vec_nr);
3153 		tp_vec_nr = 0;
3154 	}
3155 }
3156 
3157 void text_poke_finish(void)
3158 {
3159 	text_poke_flush(NULL);
3160 }
3161 
3162 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
3163 {
3164 	struct text_poke_loc *tp;
3165 
3166 	text_poke_flush(addr);
3167 
3168 	tp = &tp_vec[tp_vec_nr++];
3169 	text_poke_loc_init(tp, addr, opcode, len, emulate);
3170 }
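/*
 * Illustrative usage sketch (the sites[] array is hypothetical): queue a
 * series of patch sites, ideally in ascending address order so the whole
 * batch is flushed in as few INT3 rounds as possible, then finish:
 *
 *	mutex_lock(&text_mutex);
 *	for (i = 0; i < nr_sites; i++)
 *		text_poke_queue(sites[i].addr, sites[i].insn, sites[i].len, NULL);
 *	text_poke_finish();
 *	mutex_unlock(&text_mutex);
 */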
3171 
3172 /**
3173  * text_poke_bp() -- update instructions on live kernel on SMP
3174  * @addr:	address to patch
3175  * @opcode:	opcode of new instruction
3176  * @len:	length to copy
3177  * @emulate:	instruction to be emulated
3178  *
3179  * Update a single instruction with a vector on the stack, avoiding
3180  * dynamically allocated memory. This function should be used when it is
3181  * not possible to allocate memory.
3182  */
3183 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
3184 {
3185 	struct text_poke_loc tp;
3186 
3187 	text_poke_loc_init(&tp, addr, opcode, len, emulate);
3188 	text_poke_bp_batch(&tp, 1);
3189 }
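/*
 * Illustrative usage sketch (ip and target are hypothetical): rewrite a
 * 5-byte site into "call target" while other CPUs may be executing it;
 * any CPU that hits the transient INT3 is handled by poke_int3_handler():
 *
 *	u8 insn[CALL_INSN_SIZE];
 *
 *	insn[0] = CALL_INSN_OPCODE;
 *	*(s32 *)(insn + 1) = (long)target - (long)ip - CALL_INSN_SIZE;
 *
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(ip, insn, CALL_INSN_SIZE, NULL);
 *	mutex_unlock(&text_mutex);
 */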
3190