xref: /freebsd/sys/cddl/dev/kinst/amd64/kinst_isa.c (revision 9f44a47fd07924afc035991af15d84e6585dea4f)
1 /*
2  * SPDX-License-Identifier: CDDL 1.0
3  *
4  * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
5  * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/pcpu.h>
10 
11 #include <machine/cpufunc.h>
12 #include <machine/md_var.h>
13 
14 #include <sys/dtrace.h>
15 #include <cddl/dev/dtrace/dtrace_cddl.h>
16 #include <dis_tables.h>
17 
18 #include "kinst.h"
19 
/*
 * Single-byte opcodes that kinst_make_probe() looks for: the expected first
 * byte of a traceable function, and two instructions that are never traced.
 */
#define KINST_PUSHL_RBP		0x55
#define KINST_STI		0xfb
#define KINST_POPF		0x9d

/* Extract the mod (bits 7:6), reg (bits 5:3) and r/m (bits 2:0) fields. */
#define KINST_MODRM_MOD(b)	(((b) & 0xc0) >> 6)
#define KINST_MODRM_REG(b)	(((b) & 0x38) >> 3)
#define KINST_MODRM_RM(b)	((b) & 0x07)

/* Extract the scale (bits 7:6), index (bits 5:3) and base (bits 2:0) fields. */
#define KINST_SIB_SCALE(s)	(((s) & 0xc0) >> 6)
#define KINST_SIB_INDEX(s)	(((s) & 0x38) >> 3)
#define KINST_SIB_BASE(s)	(((s) & 0x07) >> 0)

/* Extract the individual W, R, X and B bits of a REX prefix byte. */
#define KINST_REX_W(r)		(((r) & 0x08) >> 3)
#define KINST_REX_R(r)		(((r) & 0x04) >> 2)
#define KINST_REX_X(r)		(((r) & 0x02) >> 1)
#define KINST_REX_B(r)		(((r) & 0x01) >> 0)

/*
 * Flags stored in the machine-dependent probe state by
 * kinst_instr_dissect() and consumed when the probe fires.
 */
#define KINST_F_CALL		0x0001	/* instruction is a "call" */
#define KINST_F_DIRECT_CALL	0x0002	/* instruction is a direct call */
#define KINST_F_RIPREL		0x0004	/* instruction is position-dependent */
#define KINST_F_JMP		0x0008	/* instruction is a jmp (direct or indirect) */
#define KINST_F_MOD_DIRECT	0x0010	/* operand is not a memory address */
42 
/*
 * Per-CPU trampolines used when the interrupted thread is executing with
 * interrupts disabled.  If an interrupt is raised while executing a trampoline,
 * the interrupt thread cannot safely overwrite its trampoline if it hits a
 * kinst probe while executing the interrupt handler.
 *
 * The pointers are filled in by kinst_md_init() and released by
 * kinst_md_deinit(); kinst_invop() selects this trampoline when the trapped
 * context had PSL_I clear.
 */
DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);
50 
51 /*
52  * Map ModR/M register bits to a trapframe offset.
53  */
54 static int
55 kinst_regoff(int reg)
56 {
57 #define	_MATCH_REG(i, reg)			\
58 	case i:					\
59 		return (offsetof(struct trapframe, tf_ ## reg) / \
60 		    sizeof(register_t))
61 	switch (reg) {
62 	_MATCH_REG( 0, rax);
63 	_MATCH_REG( 1, rcx);
64 	_MATCH_REG( 2, rdx);
65 	_MATCH_REG( 3, rbx);
66 	_MATCH_REG( 4, rsp); /* SIB when mod != 3 */
67 	_MATCH_REG( 5, rbp);
68 	_MATCH_REG( 6, rsi);
69 	_MATCH_REG( 7, rdi);
70 	_MATCH_REG( 8, r8); /* REX.R is set */
71 	_MATCH_REG( 9, r9);
72 	_MATCH_REG(10, r10);
73 	_MATCH_REG(11, r11);
74 	_MATCH_REG(12, r12);
75 	_MATCH_REG(13, r13);
76 	_MATCH_REG(14, r14);
77 	_MATCH_REG(15, r15);
78 	}
79 #undef _MATCH_REG
80 	panic("%s: unhandled register index %d", __func__, reg);
81 }
82 
83 /*
84  * Obtain the specified register's value.
85  */
86 static uint64_t
87 kinst_regval(struct trapframe *frame, int reg)
88 {
89 	if (reg == -1)
90 		return (0);
91 	return (((register_t *)frame)[kinst_regoff(reg)]);
92 }
93 
94 static uint32_t
95 kinst_riprel_disp(struct kinst_probe *kp, void *dst)
96 {
97 	return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
98 	    (intptr_t)dst));
99 }
100 
101 static void
102 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
103 {
104 	uint8_t *instr;
105 	uint32_t disp;
106 	int ilen;
107 
108 	ilen = kp->kp_md.tinstlen;
109 
110 	kinst_memcpy(tramp, kp->kp_md.template, ilen);
111 	if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
112 		disp = kinst_riprel_disp(kp, tramp);
113 		kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
114 	}
115 
116 	/*
117 	 * The following position-independent jmp takes us back to the
118 	 * original code.  It is encoded as "jmp *0(%rip)" (six bytes),
119 	 * followed by the absolute address of the instruction following
120 	 * the one that was traced (eight bytes).
121 	 */
122 	tramp[ilen + 0] = 0xff;
123 	tramp[ilen + 1] = 0x25;
124 	tramp[ilen + 2] = 0x00;
125 	tramp[ilen + 3] = 0x00;
126 	tramp[ilen + 4] = 0x00;
127 	tramp[ilen + 5] = 0x00;
128 	instr = kp->kp_patchpoint + kp->kp_md.instlen;
129 	kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
130 }
131 
132 int
133 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
134 {
135 	solaris_cpu_t *cpu;
136 	uintptr_t *stack, retaddr;
137 	struct kinst_probe *kp;
138 	struct kinst_probe_md *kpmd;
139 	uint8_t *tramp;
140 
141 	stack = (uintptr_t *)frame->tf_rsp;
142 	cpu = &solaris_cpu[curcpu];
143 
144 	LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
145 		if ((uintptr_t)kp->kp_patchpoint == addr)
146 			break;
147 	}
148 	if (kp == NULL)
149 		return (0);
150 
151 	/*
152 	 * Report the address of the breakpoint for the benefit of consumers
153 	 * fetching register values with regs[].
154 	 */
155 	frame->tf_rip--;
156 
157 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
158 	cpu->cpu_dtrace_caller = stack[0];
159 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
160 	dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
161 	cpu->cpu_dtrace_caller = 0;
162 
163 	kpmd = &kp->kp_md;
164 	if ((kpmd->flags & KINST_F_CALL) != 0) {
165 		/*
166 		 * dtrace_invop_start() reserves space on the stack to
167 		 * store the return address of the call instruction.
168 		 */
169 		retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
170 		*(uintptr_t *)scratch = retaddr;
171 
172 		if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
173 			frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
174 			    kpmd->disp + kpmd->instlen);
175 		} else {
176 			register_t rval;
177 
178 			if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
179 				/* rip-relative */
180 				rval = frame->tf_rip + kpmd->instlen;
181 			} else {
182 				/* indirect */
183 				rval = kinst_regval(frame, kpmd->reg1) +
184 				    (kinst_regval(frame, kpmd->reg2) <<
185 				    kpmd->scale);
186 			}
187 
188 			if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
189 				frame->tf_rip = rval + kpmd->disp;
190 			} else {
191 				frame->tf_rip =
192 				    *(uintptr_t *)(rval + kpmd->disp);
193 			}
194 		}
195 		return (DTRACE_INVOP_CALL);
196 	} else {
197 		if ((frame->tf_rflags & PSL_I) == 0)
198 			tramp = DPCPU_GET(intr_tramp);
199 		else
200 			tramp = curthread->t_kinst;
201 		if (tramp == NULL) {
202 			/*
203 			 * A trampoline allocation failed, so this probe is
204 			 * effectively disabled.  Restore the original
205 			 * instruction.
206 			 *
207 			 * We can't safely print anything here, but the
208 			 * trampoline allocator should have left a breadcrumb in
209 			 * the dmesg.
210 			 */
211 			kinst_patch_tracepoint(kp, kp->kp_savedval);
212 			frame->tf_rip = (register_t)kp->kp_patchpoint;
213 		} else {
214 			kinst_trampoline_populate(kp, tramp);
215 			frame->tf_rip = (register_t)tramp;
216 		}
217 		return (DTRACE_INVOP_NOP);
218 	}
219 }
220 
221 void
222 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
223 {
224 	register_t reg;
225 	int oldwp;
226 
227 	reg = intr_disable();
228 	oldwp = disable_wp();
229 	*kp->kp_patchpoint = val;
230 	restore_wp(oldwp);
231 	intr_restore(reg);
232 }
233 
234 static void
235 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
236 {
237 	kp->kp_md.disp = (int64_t)(int8_t)byte;
238 }
239 
240 static void
241 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
242 {
243 	int32_t disp32;
244 
245 	memcpy(&disp32, bytes, sizeof(disp32));
246 	kp->kp_md.disp = (int64_t)disp32;
247 }
248 
249 /*
250  * Set up all of the state needed to faithfully execute a probed instruction.
251  *
252  * In the simple case, we copy the instruction unmodified to a per-thread
253  * trampoline, wherein it is followed by a jump back to the original code.
254  * - Instructions can have %rip as an operand:
255  *   - with %rip-relative addressing encoded in ModR/M, or
256  *   - implicitly as a part of the instruction definition (jmp, call).
257  * - Call instructions (which may be %rip-relative) need to push the correct
258  *   return address onto the stack.
259  *
260  * Call instructions are simple enough to be emulated in software, so we simply
261  * do not use the trampoline mechanism in that case.  kinst_invop() will compute
262  * the branch target using the address info computed here (register operands and
263  * displacement).
264  *
265  * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
266  * displacement; when populating the trampoline the displacement is adjusted to
267  * be relative to the trampoline address.  Trampolines are always allocated
268  * above KERNBASE for this reason.
269  *
270  * For other %rip-relative operands (just jumps) we take the same approach.
271  * Instructions which specify an 8-bit displacement must be rewritten to use a
272  * 32-bit displacement.
273  */
274 static int
275 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
276 {
277 	struct kinst_probe_md *kpmd;
278 	dis86_t d86;
279 	uint8_t *bytes, modrm, rex;
280 	int dispoff, i, ilen, opcidx;
281 
282 	kpmd = &kp->kp_md;
283 
284 	d86.d86_data = instr;
285 	d86.d86_get_byte = dtrace_dis_get_byte;
286 	d86.d86_check_func = NULL;
287 	if (dtrace_disx86(&d86, SIZE64) != 0) {
288 		KINST_LOG("failed to disassemble instruction at: %p", *instr);
289 		return (EINVAL);
290 	}
291 	bytes = d86.d86_bytes;
292 	kpmd->instlen = kpmd->tinstlen = d86.d86_len;
293 
294 	/*
295 	 * Skip over prefixes, save REX.
296 	 */
297 	rex = 0;
298 	for (i = 0; i < kpmd->instlen; i++) {
299 		switch (bytes[i]) {
300 		case 0xf0 ... 0xf3:
301 			/* group 1 */
302 			continue;
303 		case 0x26:
304 		case 0x2e:
305 		case 0x36:
306 		case 0x3e:
307 		case 0x64:
308 		case 0x65:
309 			/* group 2 */
310 			continue;
311 		case 0x66:
312 			/* group 3 */
313 			continue;
314 		case 0x67:
315 			/* group 4 */
316 			continue;
317 		case 0x40 ... 0x4f:
318 			/* REX */
319 			rex = bytes[i];
320 			continue;
321 		}
322 		break;
323 	}
324 	KASSERT(i < kpmd->instlen,
325 	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
326 	opcidx = i;
327 
328 	/*
329 	 * Identify instructions of interest by opcode: calls and jumps.
330 	 * Extract displacements.
331 	 */
332 	dispoff = -1;
333 	switch (bytes[opcidx]) {
334 	case 0x0f:
335 		switch (bytes[opcidx + 1]) {
336 		case 0x80 ... 0x8f:
337 			/* conditional jmp near */
338 			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
339 			dispoff = opcidx + 2;
340 			kinst_set_disp32(kp, &bytes[dispoff]);
341 			break;
342 		}
343 		break;
344 	case 0xe3:
345 		/*
346 		 * There is no straightforward way to translate this instruction
347 		 * to use a 32-bit displacement.  Fortunately, it is rarely
348 		 * used.
349 		 */
350 		return (EINVAL);
351 	case 0x70 ... 0x7f:
352 		/* conditional jmp short */
353 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
354 		dispoff = opcidx + 1;
355 		kinst_set_disp8(kp, bytes[dispoff]);
356 		break;
357 	case 0xe9:
358 		/* unconditional jmp near */
359 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
360 		dispoff = opcidx + 1;
361 		kinst_set_disp32(kp, &bytes[dispoff]);
362 		break;
363 	case 0xeb:
364 		/* unconditional jmp short */
365 		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
366 		dispoff = opcidx + 1;
367 		kinst_set_disp8(kp, bytes[dispoff]);
368 		break;
369 	case 0xe8:
370 	case 0x9a:
371 		/* direct call */
372 		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
373 		dispoff = opcidx + 1;
374 		kinst_set_disp32(kp, &bytes[dispoff]);
375 		break;
376 	case 0xff:
377 		KASSERT(d86.d86_got_modrm,
378 		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
379 		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
380 		case 0x02:
381 		case 0x03:
382 			/* indirect call */
383 			kpmd->flags |= KINST_F_CALL;
384 			break;
385 		case 0x04:
386 		case 0x05:
387 			/* indirect jump */
388 			kpmd->flags |= KINST_F_JMP;
389 			break;
390 		}
391 	}
392 
393 	/*
394 	 * If there's a ModR/M byte, we need to check it to see if the operand
395 	 * is %rip-relative, and rewrite the displacement if so.  If not, we
396 	 * might still have to extract operand info if this is a call
397 	 * instruction.
398 	 */
399 	if (d86.d86_got_modrm) {
400 		uint8_t mod, rm, sib;
401 
402 		kpmd->reg1 = kpmd->reg2 = -1;
403 
404 		modrm = bytes[d86.d86_rmindex];
405 		mod = KINST_MODRM_MOD(modrm);
406 		rm = KINST_MODRM_RM(modrm);
407 		if (mod == 0 && rm == 5) {
408 			kpmd->flags |= KINST_F_RIPREL;
409 			dispoff = d86.d86_rmindex + 1;
410 			kinst_set_disp32(kp, &bytes[dispoff]);
411 		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
412 			bool havesib;
413 
414 			havesib = (mod != 3 && rm == 4);
415 			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
416 			if (mod == 1)
417 				kinst_set_disp8(kp, bytes[dispoff]);
418 			else if (mod == 2)
419 				kinst_set_disp32(kp, &bytes[dispoff]);
420 			else if (mod == 3)
421 				kpmd->flags |= KINST_F_MOD_DIRECT;
422 
423 			if (havesib) {
424 				sib = bytes[d86.d86_rmindex + 1];
425 				if (KINST_SIB_BASE(sib) != 5) {
426 					kpmd->reg1 = KINST_SIB_BASE(sib) |
427 					    (KINST_REX_B(rex) << 3);
428 				}
429 				kpmd->scale = KINST_SIB_SCALE(sib);
430 				kpmd->reg2 = KINST_SIB_INDEX(sib) |
431 				    (KINST_REX_X(rex) << 3);
432 			} else {
433 				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
434 			}
435 		}
436 	}
437 
438 	/*
439 	 * Calls are emulated in software; once operands are decoded we have
440 	 * nothing else to do.
441 	 */
442 	if ((kpmd->flags & KINST_F_CALL) != 0)
443 		return (0);
444 
445 	/*
446 	 * Allocate and populate an instruction trampoline template.
447 	 *
448 	 * Position-independent instructions can simply be copied, but
449 	 * position-dependent instructions require some surgery: jump
450 	 * instructions with an 8-bit displacement need to be converted to use a
451 	 * 32-bit displacement, and the adjusted displacement needs to be
452 	 * computed.
453 	 */
454 	ilen = kpmd->instlen;
455 	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
456 		if ((kpmd->flags & KINST_F_JMP) == 0 ||
457 		    bytes[opcidx] == 0x0f ||
458 		    bytes[opcidx] == 0xe9 ||
459 		    bytes[opcidx] == 0xff) {
460 			memcpy(kpmd->template, bytes, dispoff);
461 			memcpy(&kpmd->template[dispoff + 4],
462 			    &bytes[dispoff + 4], ilen - (dispoff + 4));
463 			kpmd->dispoff = dispoff;
464 		} else if (bytes[opcidx] == 0xeb) {
465 			memcpy(kpmd->template, bytes, opcidx);
466 			kpmd->template[opcidx] = 0xe9;
467 			kpmd->dispoff = opcidx + 1;
468 
469 			/* Instruction length changes from 2 to 5. */
470 			kpmd->tinstlen = 5;
471 			kpmd->disp -= 3;
472 		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
473 			memcpy(kpmd->template, bytes, opcidx);
474 			kpmd->template[opcidx] = 0x0f;
475 			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
476 			kpmd->dispoff = opcidx + 2;
477 
478 			/* Instruction length changes from 2 to 6. */
479 			kpmd->tinstlen = 6;
480 			kpmd->disp -= 4;
481 		} else {
482 			panic("unhandled opcode %#x", bytes[opcidx]);
483 		}
484 	} else {
485 		memcpy(kpmd->template, bytes, ilen);
486 	}
487 
488 	return (0);
489 }
490 
491 int
492 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
493     void *opaque)
494 {
495 	struct kinst_probe *kp;
496 	dtrace_kinst_probedesc_t *pd;
497 	const char *func;
498 	int error, instrsize, n, off;
499 	uint8_t *instr, *limit;
500 
501 	pd = opaque;
502 	func = symval->name;
503 	if (kinst_excluded(func))
504 		return (0);
505 	if (strcmp(func, pd->kpd_func) != 0)
506 		return (0);
507 
508 	instr = (uint8_t *)symval->value;
509 	limit = (uint8_t *)symval->value + symval->size;
510 	if (instr >= limit)
511 		return (0);
512 
513 	/*
514 	 * Ignore functions not beginning with the usual function prologue.
515 	 * These might correspond to exception handlers with which we should not
516 	 * meddle.  This does however exclude functions which can be safely
517 	 * traced, such as cpu_switch().
518 	 */
519 	if (*instr != KINST_PUSHL_RBP)
520 		return (0);
521 
522 	n = 0;
523 	while (instr < limit) {
524 		instrsize = dtrace_instr_size(instr);
525 		off = (int)(instr - (uint8_t *)symval->value);
526 		if (pd->kpd_off != -1 && off != pd->kpd_off) {
527 			instr += instrsize;
528 			continue;
529 		}
530 
531 		/*
532 		 * Check for instructions which may enable interrupts.  Such
533 		 * instructions are tricky to trace since it is unclear whether
534 		 * to use the per-thread or per-CPU trampolines.  Since they are
535 		 * rare, we don't bother to implement special handling for them.
536 		 *
537 		 * If the caller specified an offset, return an error, otherwise
538 		 * silently ignore the instruction so that it remains possible
539 		 * to enable all instructions in a function.
540 		 */
541 		if (instrsize == 1 &&
542 		    (instr[0] == KINST_POPF || instr[0] == KINST_STI)) {
543 			if (pd->kpd_off != -1)
544 				return (EINVAL);
545 			instr += instrsize;
546 			continue;
547 		}
548 
549 		/*
550 		 * Prevent separate dtrace(1) instances from creating copies of
551 		 * the same probe.
552 		 */
553 		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
554 			if (strcmp(kp->kp_func, func) == 0 &&
555 			    strtol(kp->kp_name, NULL, 10) == off)
556 				return (0);
557 		}
558 		if (++n > KINST_PROBETAB_MAX) {
559 			KINST_LOG("probe list full: %d entries", n);
560 			return (ENOMEM);
561 		}
562 		kp = malloc(sizeof(struct kinst_probe), M_KINST,
563 		    M_WAITOK | M_ZERO);
564 		kp->kp_func = func;
565 		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
566 		kp->kp_savedval = *instr;
567 		kp->kp_patchval = KINST_PATCHVAL;
568 		kp->kp_patchpoint = instr;
569 
570 		error = kinst_instr_dissect(kp, &instr);
571 		if (error != 0)
572 			return (error);
573 
574 		kinst_probe_create(kp, lf);
575 	}
576 
577 	return (0);
578 }
579 
580 int
581 kinst_md_init(void)
582 {
583 	uint8_t *tramp;
584 	int cpu;
585 
586 	CPU_FOREACH(cpu) {
587 		tramp = kinst_trampoline_alloc(M_WAITOK);
588 		if (tramp == NULL)
589 			return (ENOMEM);
590 		DPCPU_ID_SET(cpu, intr_tramp, tramp);
591 	}
592 
593 	return (0);
594 }
595 
596 void
597 kinst_md_deinit(void)
598 {
599 	uint8_t *tramp;
600 	int cpu;
601 
602 	CPU_FOREACH(cpu) {
603 		tramp = DPCPU_ID_GET(cpu, intr_tramp);
604 		if (tramp != NULL) {
605 			kinst_trampoline_dealloc(tramp);
606 			DPCPU_ID_SET(cpu, intr_tramp, NULL);
607 		}
608 	}
609 }
610 
/*
 * Report whether the named function must be excluded from tracing for
 * machine-dependent reasons.  No function is excluded here, so the name
 * is ignored and the answer is always false.
 */
bool
kinst_md_excluded(const char *name)
{
	(void)name;

	return (false);
}
619