xref: /freebsd/sys/cddl/dev/kinst/amd64/kinst_isa.c (revision 7ef62cebc2f965b0f640263e179276928885e33d)
1 /*
2  * SPDX-License-Identifier: CDDL 1.0
3  *
4  * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org>
5  * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org>
6  * Copyright (c) 2023 The FreeBSD Foundation
7  *
8  * Portions of this software were developed by Christos Margiolis
9  * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
10  */
11 
12 #include <sys/param.h>
13 #include <sys/pcpu.h>
14 
15 #include <machine/cpufunc.h>
16 #include <machine/md_var.h>
17 
18 #include <sys/dtrace.h>
19 #include <cddl/dev/dtrace/dtrace_cddl.h>
20 #include <dis_tables.h>
21 
22 #include "kinst.h"
23 
/* One-byte opcodes that kinst_make_probe() looks for while scanning a function. */
#define KINST_PUSHL_RBP		0x55
#define KINST_POPL_RBP		0x5d
#define KINST_STI		0xfb
#define KINST_POPF		0x9d

/* ModR/M byte fields: mod in bits 7:6, reg in bits 5:3, r/m in bits 2:0. */
#define KINST_MODRM_MOD(b)	(((b) >> 6) & 0x03)
#define KINST_MODRM_REG(b)	(((b) >> 3) & 0x07)
#define KINST_MODRM_RM(b)	((b) & 0x07)

/* SIB byte fields: scale in bits 7:6, index in bits 5:3, base in bits 2:0. */
#define KINST_SIB_SCALE(s)	(((s) >> 6) & 0x03)
#define KINST_SIB_INDEX(s)	(((s) >> 3) & 0x07)
#define KINST_SIB_BASE(s)	((s) & 0x07)

/* REX prefix flag bits: W is bit 3, R is bit 2, X is bit 1, B is bit 0. */
#define KINST_REX_W(r)		(((r) >> 3) & 0x01)
#define KINST_REX_R(r)		(((r) >> 2) & 0x01)
#define KINST_REX_X(r)		(((r) >> 1) & 0x01)
#define KINST_REX_B(r)		((r) & 0x01)

/* Per-probe flags stored in the machine-dependent probe state. */
#define KINST_F_CALL		0x0001	/* instruction is a "call" */
#define KINST_F_DIRECT_CALL	0x0002	/* call target is an immediate displacement */
#define KINST_F_RIPREL		0x0004	/* instruction is position-dependent */
#define KINST_F_JMP		0x0008	/* instruction is a jump */
#define KINST_F_MOD_DIRECT	0x0010	/* operand is a register, not a memory address */
47 
/*
 * Per-CPU trampolines, used instead of the per-thread trampoline when the
 * interrupted thread was executing with interrupts disabled (PSL_I clear in
 * the trap frame).  If an interrupt is raised while a thread is executing its
 * trampoline, the interrupt thread cannot safely overwrite that trampoline
 * if it hits a kinst probe while executing the interrupt handler.
 *
 * One trampoline per CPU is allocated in kinst_md_init() and released in
 * kinst_md_deinit().
 */
DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);
55 
56 /*
57  * Map ModR/M register bits to a trapframe offset.
58  */
59 static int
60 kinst_regoff(int reg)
61 {
62 #define	_MATCH_REG(i, reg)			\
63 	case i:					\
64 		return (offsetof(struct trapframe, tf_ ## reg) / \
65 		    sizeof(register_t))
66 	switch (reg) {
67 	_MATCH_REG( 0, rax);
68 	_MATCH_REG( 1, rcx);
69 	_MATCH_REG( 2, rdx);
70 	_MATCH_REG( 3, rbx);
71 	_MATCH_REG( 4, rsp); /* SIB when mod != 3 */
72 	_MATCH_REG( 5, rbp);
73 	_MATCH_REG( 6, rsi);
74 	_MATCH_REG( 7, rdi);
75 	_MATCH_REG( 8, r8); /* REX.R is set */
76 	_MATCH_REG( 9, r9);
77 	_MATCH_REG(10, r10);
78 	_MATCH_REG(11, r11);
79 	_MATCH_REG(12, r12);
80 	_MATCH_REG(13, r13);
81 	_MATCH_REG(14, r14);
82 	_MATCH_REG(15, r15);
83 	}
84 #undef _MATCH_REG
85 	panic("%s: unhandled register index %d", __func__, reg);
86 }
87 
88 /*
89  * Obtain the specified register's value.
90  */
91 static uint64_t
92 kinst_regval(struct trapframe *frame, int reg)
93 {
94 	if (reg == -1)
95 		return (0);
96 	return (((register_t *)frame)[kinst_regoff(reg)]);
97 }
98 
99 static uint32_t
100 kinst_riprel_disp(struct kinst_probe *kp, void *dst)
101 {
102 	return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
103 	    (intptr_t)dst));
104 }
105 
106 static void
107 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
108 {
109 	uint8_t *instr;
110 	uint32_t disp;
111 	int ilen;
112 
113 	ilen = kp->kp_md.tinstlen;
114 
115 	kinst_memcpy(tramp, kp->kp_md.template, ilen);
116 	if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
117 		disp = kinst_riprel_disp(kp, tramp);
118 		kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
119 	}
120 
121 	/*
122 	 * The following position-independent jmp takes us back to the
123 	 * original code.  It is encoded as "jmp *0(%rip)" (six bytes),
124 	 * followed by the absolute address of the instruction following
125 	 * the one that was traced (eight bytes).
126 	 */
127 	tramp[ilen + 0] = 0xff;
128 	tramp[ilen + 1] = 0x25;
129 	tramp[ilen + 2] = 0x00;
130 	tramp[ilen + 3] = 0x00;
131 	tramp[ilen + 4] = 0x00;
132 	tramp[ilen + 5] = 0x00;
133 	instr = kp->kp_patchpoint + kp->kp_md.instlen;
134 	kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
135 }
136 
137 int
138 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
139 {
140 	solaris_cpu_t *cpu;
141 	uintptr_t *stack, retaddr;
142 	struct kinst_probe *kp;
143 	struct kinst_probe_md *kpmd;
144 	uint8_t *tramp;
145 
146 	stack = (uintptr_t *)frame->tf_rsp;
147 	cpu = &solaris_cpu[curcpu];
148 
149 	LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
150 		if ((uintptr_t)kp->kp_patchpoint == addr)
151 			break;
152 	}
153 	if (kp == NULL)
154 		return (0);
155 
156 	/*
157 	 * Report the address of the breakpoint for the benefit of consumers
158 	 * fetching register values with regs[].
159 	 */
160 	frame->tf_rip--;
161 
162 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
163 	cpu->cpu_dtrace_caller = stack[0];
164 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
165 	dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
166 	cpu->cpu_dtrace_caller = 0;
167 
168 	kpmd = &kp->kp_md;
169 	if ((kpmd->flags & KINST_F_CALL) != 0) {
170 		/*
171 		 * dtrace_invop_start() reserves space on the stack to
172 		 * store the return address of the call instruction.
173 		 */
174 		retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
175 		*(uintptr_t *)scratch = retaddr;
176 
177 		if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
178 			frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
179 			    kpmd->disp + kpmd->instlen);
180 		} else {
181 			register_t rval;
182 
183 			if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
184 				/* rip-relative */
185 				rval = frame->tf_rip + kpmd->instlen;
186 			} else {
187 				/* indirect */
188 				rval = kinst_regval(frame, kpmd->reg1) +
189 				    (kinst_regval(frame, kpmd->reg2) <<
190 				    kpmd->scale);
191 			}
192 
193 			if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
194 				frame->tf_rip = rval + kpmd->disp;
195 			} else {
196 				frame->tf_rip =
197 				    *(uintptr_t *)(rval + kpmd->disp);
198 			}
199 		}
200 		return (DTRACE_INVOP_CALL);
201 	} else {
202 		if ((frame->tf_rflags & PSL_I) == 0)
203 			tramp = DPCPU_GET(intr_tramp);
204 		else
205 			tramp = curthread->t_kinst_tramp;
206 		if (tramp == NULL) {
207 			/*
208 			 * A trampoline allocation failed, so this probe is
209 			 * effectively disabled.  Restore the original
210 			 * instruction.
211 			 *
212 			 * We can't safely print anything here, but the
213 			 * trampoline allocator should have left a breadcrumb in
214 			 * the dmesg.
215 			 */
216 			kinst_patch_tracepoint(kp, kp->kp_savedval);
217 			frame->tf_rip = (register_t)kp->kp_patchpoint;
218 		} else {
219 			kinst_trampoline_populate(kp, tramp);
220 			frame->tf_rip = (register_t)tramp;
221 		}
222 		return (DTRACE_INVOP_NOP);
223 	}
224 }
225 
226 void
227 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
228 {
229 	register_t reg;
230 	int oldwp;
231 
232 	reg = intr_disable();
233 	oldwp = disable_wp();
234 	*kp->kp_patchpoint = val;
235 	restore_wp(oldwp);
236 	intr_restore(reg);
237 }
238 
239 static void
240 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
241 {
242 	kp->kp_md.disp = (int64_t)(int8_t)byte;
243 }
244 
245 static void
246 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
247 {
248 	int32_t disp32;
249 
250 	memcpy(&disp32, bytes, sizeof(disp32));
251 	kp->kp_md.disp = (int64_t)disp32;
252 }
253 
254 /*
255  * Set up all of the state needed to faithfully execute a probed instruction.
256  *
257  * In the simple case, we copy the instruction unmodified to a per-thread
258  * trampoline, wherein it is followed by a jump back to the original code.
259  * - Instructions can have %rip as an operand:
260  *   - with %rip-relative addressing encoded in ModR/M, or
261  *   - implicitly as a part of the instruction definition (jmp, call).
262  * - Call instructions (which may be %rip-relative) need to push the correct
263  *   return address onto the stack.
264  *
265  * Call instructions are simple enough to be emulated in software, so we simply
266  * do not use the trampoline mechanism in that case.  kinst_invop() will compute
267  * the branch target using the address info computed here (register operands and
268  * displacement).
269  *
270  * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
271  * displacement; when populating the trampoline the displacement is adjusted to
272  * be relative to the trampoline address.  Trampolines are always allocated
273  * above KERNBASE for this reason.
274  *
275  * For other %rip-relative operands (just jumps) we take the same approach.
276  * Instructions which specify an 8-bit displacement must be rewritten to use a
277  * 32-bit displacement.
278  */
279 static int
280 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
281 {
282 	struct kinst_probe_md *kpmd;
283 	dis86_t d86;
284 	uint8_t *bytes, modrm, rex;
285 	int dispoff, i, ilen, opcidx;
286 
287 	kpmd = &kp->kp_md;
288 
289 	d86.d86_data = instr;
290 	d86.d86_get_byte = dtrace_dis_get_byte;
291 	d86.d86_check_func = NULL;
292 	if (dtrace_disx86(&d86, SIZE64) != 0) {
293 		KINST_LOG("failed to disassemble instruction at: %p", *instr);
294 		return (EINVAL);
295 	}
296 	bytes = d86.d86_bytes;
297 	kpmd->instlen = kpmd->tinstlen = d86.d86_len;
298 
299 	/*
300 	 * Skip over prefixes, save REX.
301 	 */
302 	rex = 0;
303 	for (i = 0; i < kpmd->instlen; i++) {
304 		switch (bytes[i]) {
305 		case 0xf0 ... 0xf3:
306 			/* group 1 */
307 			continue;
308 		case 0x26:
309 		case 0x2e:
310 		case 0x36:
311 		case 0x3e:
312 		case 0x64:
313 		case 0x65:
314 			/* group 2 */
315 			continue;
316 		case 0x66:
317 			/* group 3 */
318 			continue;
319 		case 0x67:
320 			/* group 4 */
321 			continue;
322 		case 0x40 ... 0x4f:
323 			/* REX */
324 			rex = bytes[i];
325 			continue;
326 		}
327 		break;
328 	}
329 	KASSERT(i < kpmd->instlen,
330 	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
331 	opcidx = i;
332 
333 	/*
334 	 * Identify instructions of interest by opcode: calls and jumps.
335 	 * Extract displacements.
336 	 */
337 	dispoff = -1;
338 	switch (bytes[opcidx]) {
339 	case 0x0f:
340 		switch (bytes[opcidx + 1]) {
341 		case 0x80 ... 0x8f:
342 			/* conditional jmp near */
343 			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
344 			dispoff = opcidx + 2;
345 			kinst_set_disp32(kp, &bytes[dispoff]);
346 			break;
347 		}
348 		break;
349 	case 0xe3:
350 		/*
351 		 * There is no straightforward way to translate this instruction
352 		 * to use a 32-bit displacement.  Fortunately, it is rarely
353 		 * used.
354 		 */
355 		return (EINVAL);
356 	case 0x70 ... 0x7f:
357 		/* conditional jmp short */
358 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
359 		dispoff = opcidx + 1;
360 		kinst_set_disp8(kp, bytes[dispoff]);
361 		break;
362 	case 0xe9:
363 		/* unconditional jmp near */
364 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
365 		dispoff = opcidx + 1;
366 		kinst_set_disp32(kp, &bytes[dispoff]);
367 		break;
368 	case 0xeb:
369 		/* unconditional jmp short */
370 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
371 		dispoff = opcidx + 1;
372 		kinst_set_disp8(kp, bytes[dispoff]);
373 		break;
374 	case 0xe8:
375 	case 0x9a:
376 		/* direct call */
377 		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
378 		dispoff = opcidx + 1;
379 		kinst_set_disp32(kp, &bytes[dispoff]);
380 		break;
381 	case 0xff:
382 		KASSERT(d86.d86_got_modrm,
383 		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
384 		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
385 		case 0x02:
386 		case 0x03:
387 			/* indirect call */
388 			kpmd->flags |= KINST_F_CALL;
389 			break;
390 		case 0x04:
391 		case 0x05:
392 			/* indirect jump */
393 			kpmd->flags |= KINST_F_JMP;
394 			break;
395 		}
396 	}
397 
398 	/*
399 	 * If there's a ModR/M byte, we need to check it to see if the operand
400 	 * is %rip-relative, and rewrite the displacement if so.  If not, we
401 	 * might still have to extract operand info if this is a call
402 	 * instruction.
403 	 */
404 	if (d86.d86_got_modrm) {
405 		uint8_t mod, rm, sib;
406 
407 		kpmd->reg1 = kpmd->reg2 = -1;
408 
409 		modrm = bytes[d86.d86_rmindex];
410 		mod = KINST_MODRM_MOD(modrm);
411 		rm = KINST_MODRM_RM(modrm);
412 		if (mod == 0 && rm == 5) {
413 			kpmd->flags |= KINST_F_RIPREL;
414 			dispoff = d86.d86_rmindex + 1;
415 			kinst_set_disp32(kp, &bytes[dispoff]);
416 		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
417 			bool havesib;
418 
419 			havesib = (mod != 3 && rm == 4);
420 			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
421 			if (mod == 1)
422 				kinst_set_disp8(kp, bytes[dispoff]);
423 			else if (mod == 2)
424 				kinst_set_disp32(kp, &bytes[dispoff]);
425 			else if (mod == 3)
426 				kpmd->flags |= KINST_F_MOD_DIRECT;
427 
428 			if (havesib) {
429 				sib = bytes[d86.d86_rmindex + 1];
430 				if (KINST_SIB_BASE(sib) != 5) {
431 					kpmd->reg1 = KINST_SIB_BASE(sib) |
432 					    (KINST_REX_B(rex) << 3);
433 				}
434 				kpmd->scale = KINST_SIB_SCALE(sib);
435 				kpmd->reg2 = KINST_SIB_INDEX(sib) |
436 				    (KINST_REX_X(rex) << 3);
437 			} else {
438 				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
439 			}
440 		}
441 	}
442 
443 	/*
444 	 * Calls are emulated in software; once operands are decoded we have
445 	 * nothing else to do.
446 	 */
447 	if ((kpmd->flags & KINST_F_CALL) != 0)
448 		return (0);
449 
450 	/*
451 	 * Allocate and populate an instruction trampoline template.
452 	 *
453 	 * Position-independent instructions can simply be copied, but
454 	 * position-dependent instructions require some surgery: jump
455 	 * instructions with an 8-bit displacement need to be converted to use a
456 	 * 32-bit displacement, and the adjusted displacement needs to be
457 	 * computed.
458 	 */
459 	ilen = kpmd->instlen;
460 	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
461 		if ((kpmd->flags & KINST_F_JMP) == 0 ||
462 		    bytes[opcidx] == 0x0f ||
463 		    bytes[opcidx] == 0xe9 ||
464 		    bytes[opcidx] == 0xff) {
465 			memcpy(kpmd->template, bytes, dispoff);
466 			memcpy(&kpmd->template[dispoff + 4],
467 			    &bytes[dispoff + 4], ilen - (dispoff + 4));
468 			kpmd->dispoff = dispoff;
469 		} else if (bytes[opcidx] == 0xeb) {
470 			memcpy(kpmd->template, bytes, opcidx);
471 			kpmd->template[opcidx] = 0xe9;
472 			kpmd->dispoff = opcidx + 1;
473 
474 			/* Instruction length changes from 2 to 5. */
475 			kpmd->tinstlen = 5;
476 			kpmd->disp -= 3;
477 		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
478 			memcpy(kpmd->template, bytes, opcidx);
479 			kpmd->template[opcidx] = 0x0f;
480 			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
481 			kpmd->dispoff = opcidx + 2;
482 
483 			/* Instruction length changes from 2 to 6. */
484 			kpmd->tinstlen = 6;
485 			kpmd->disp -= 4;
486 		} else {
487 			panic("unhandled opcode %#x", bytes[opcidx]);
488 		}
489 	} else {
490 		memcpy(kpmd->template, bytes, ilen);
491 	}
492 
493 	return (0);
494 }
495 
496 int
497 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
498     void *opaque)
499 {
500 	struct kinst_probe *kp;
501 	dtrace_kinst_probedesc_t *pd;
502 	const char *func;
503 	int error, instrsize, n, off;
504 	uint8_t *instr, *limit, *tmp;
505 	bool push_found, pop_found;
506 
507 	pd = opaque;
508 	func = symval->name;
509 	if (kinst_excluded(func))
510 		return (0);
511 	if (strcmp(func, pd->kpd_func) != 0)
512 		return (0);
513 
514 	instr = (uint8_t *)symval->value;
515 	limit = (uint8_t *)symval->value + symval->size;
516 	if (instr >= limit)
517 		return (0);
518 
519 	/*
520 	 * Refuse to instrument functions lacking the usual frame pointer
521 	 * manipulations since they might correspond to exception handlers.
522 	 */
523 	tmp = instr;
524 	push_found = pop_found = false;
525 	while (tmp < limit) {
526 		if (*tmp == KINST_PUSHL_RBP)
527 			push_found = true;
528 		else if (*tmp == KINST_POPL_RBP)
529 			pop_found = true;
530 		if (push_found && pop_found)
531 			break;
532 		tmp += dtrace_instr_size(tmp);
533 	}
534 	if (!push_found || !pop_found)
535 		return (0);
536 
537 	n = 0;
538 	while (instr < limit) {
539 		instrsize = dtrace_instr_size(instr);
540 		off = (int)(instr - (uint8_t *)symval->value);
541 		if (pd->kpd_off != -1 && off != pd->kpd_off) {
542 			instr += instrsize;
543 			continue;
544 		}
545 
546 		/*
547 		 * Check for instructions which may enable interrupts.  Such
548 		 * instructions are tricky to trace since it is unclear whether
549 		 * to use the per-thread or per-CPU trampolines.  Since they are
550 		 * rare, we don't bother to implement special handling for them.
551 		 *
552 		 * If the caller specified an offset, return an error, otherwise
553 		 * silently ignore the instruction so that it remains possible
554 		 * to enable all instructions in a function.
555 		 */
556 		if (instrsize == 1 &&
557 		    (instr[0] == KINST_POPF || instr[0] == KINST_STI)) {
558 			if (pd->kpd_off != -1)
559 				return (EINVAL);
560 			instr += instrsize;
561 			continue;
562 		}
563 
564 		/*
565 		 * Prevent separate dtrace(1) instances from creating copies of
566 		 * the same probe.
567 		 */
568 		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
569 			if (strcmp(kp->kp_func, func) == 0 &&
570 			    strtol(kp->kp_name, NULL, 10) == off)
571 				return (0);
572 		}
573 		if (++n > KINST_PROBETAB_MAX) {
574 			KINST_LOG("probe list full: %d entries", n);
575 			return (ENOMEM);
576 		}
577 		kp = malloc(sizeof(struct kinst_probe), M_KINST,
578 		    M_WAITOK | M_ZERO);
579 		kp->kp_func = func;
580 		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
581 		kp->kp_savedval = *instr;
582 		kp->kp_patchval = KINST_PATCHVAL;
583 		kp->kp_patchpoint = instr;
584 
585 		error = kinst_instr_dissect(kp, &instr);
586 		if (error != 0)
587 			return (error);
588 
589 		kinst_probe_create(kp, lf);
590 	}
591 
592 	return (0);
593 }
594 
595 int
596 kinst_md_init(void)
597 {
598 	uint8_t *tramp;
599 	int cpu;
600 
601 	CPU_FOREACH(cpu) {
602 		tramp = kinst_trampoline_alloc(M_WAITOK);
603 		if (tramp == NULL)
604 			return (ENOMEM);
605 		DPCPU_ID_SET(cpu, intr_tramp, tramp);
606 	}
607 
608 	return (0);
609 }
610 
611 void
612 kinst_md_deinit(void)
613 {
614 	uint8_t *tramp;
615 	int cpu;
616 
617 	CPU_FOREACH(cpu) {
618 		tramp = DPCPU_ID_GET(cpu, intr_tramp);
619 		if (tramp != NULL) {
620 			kinst_trampoline_dealloc(tramp);
621 			DPCPU_ID_SET(cpu, intr_tramp, NULL);
622 		}
623 	}
624 }
625 
/*
 * Report whether the named function must not be traced for
 * machine-dependent reasons.  No function is excluded here, so the result
 * is always false.
 */
bool
kinst_md_excluded(const char *name)
{
	(void)name;

	return (false);
}
634