1 /*
2 * SPDX-License-Identifier: CDDL 1.0
3 *
4 * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org>
5 * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org>
6 * Copyright (c) 2023 The FreeBSD Foundation
7 *
8 * Portions of this software were developed by Christos Margiolis
9 * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
10 */
11
12 #include <sys/param.h>
13 #include <sys/pcpu.h>
14
15 #include <machine/cpufunc.h>
16 #include <machine/md_var.h>
17
18 #include <sys/dtrace.h>
19 #include <cddl/dev/dtrace/dtrace_cddl.h>
20 #include <dis_tables.h>
21
22 #include "kinst.h"
23
/* Single-byte opcodes that kinst_make_probe() looks for. */
#define	KINST_PUSHL_RBP		0x55
#define	KINST_STI		0xfb
#define	KINST_POPF		0x9d

/* ModR/M byte: mod in bits 7-6, reg in bits 5-3, r/m in bits 2-0. */
#define	KINST_MODRM_MOD(b)	(((b) >> 6) & 0x03)
#define	KINST_MODRM_REG(b)	(((b) >> 3) & 0x07)
#define	KINST_MODRM_RM(b)	((b) & 0x07)

/* SIB byte: scale in bits 7-6, index in bits 5-3, base in bits 2-0. */
#define	KINST_SIB_SCALE(s)	(((s) >> 6) & 0x03)
#define	KINST_SIB_INDEX(s)	(((s) >> 3) & 0x07)
#define	KINST_SIB_BASE(s)	((s) & 0x07)

/* REX prefix: W, R, X and B occupy bits 3 down to 0 respectively. */
#define	KINST_REX_W(r)		(((r) >> 3) & 0x01)
#define	KINST_REX_R(r)		(((r) >> 2) & 0x01)
#define	KINST_REX_X(r)		(((r) >> 1) & 0x01)
#define	KINST_REX_B(r)		((r) & 0x01)

/* Bits stored in the probe's machine-dependent flags word. */
#define	KINST_F_CALL		(1 << 0)	/* instruction is a "call" */
#define	KINST_F_DIRECT_CALL	(1 << 1)	/* instruction is a direct call */
#define	KINST_F_RIPREL		(1 << 2)	/* instruction is position-dependent */
#define	KINST_F_JMP		(1 << 3)	/* instruction is a %rip-relative jmp */
#define	KINST_F_MOD_DIRECT	(1 << 4)	/* operand is not a memory address */
46
/*
 * Per-CPU trampolines used when the interrupted thread is executing with
 * interrupts disabled. If an interrupt is raised while executing a trampoline,
 * the interrupt thread cannot safely overwrite its trampoline if it hits a
 * kinst probe while executing the interrupt handler.
 *
 * kinst_md_init() allocates one trampoline for each CPU and kinst_md_deinit()
 * releases them; kinst_invop() selects this trampoline instead of the
 * per-thread one when PSL_I is clear in the trapframe's saved %rflags.
 */
DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);
54
55 /*
56 * Map ModR/M register bits to a trapframe offset.
57 */
58 static int
kinst_regoff(int reg)59 kinst_regoff(int reg)
60 {
61 #define _MATCH_REG(i, reg) \
62 case i: \
63 return (offsetof(struct trapframe, tf_ ## reg) / \
64 sizeof(register_t))
65 switch (reg) {
66 _MATCH_REG( 0, rax);
67 _MATCH_REG( 1, rcx);
68 _MATCH_REG( 2, rdx);
69 _MATCH_REG( 3, rbx);
70 _MATCH_REG( 4, rsp); /* SIB when mod != 3 */
71 _MATCH_REG( 5, rbp);
72 _MATCH_REG( 6, rsi);
73 _MATCH_REG( 7, rdi);
74 _MATCH_REG( 8, r8); /* REX.R is set */
75 _MATCH_REG( 9, r9);
76 _MATCH_REG(10, r10);
77 _MATCH_REG(11, r11);
78 _MATCH_REG(12, r12);
79 _MATCH_REG(13, r13);
80 _MATCH_REG(14, r14);
81 _MATCH_REG(15, r15);
82 }
83 #undef _MATCH_REG
84 panic("%s: unhandled register index %d", __func__, reg);
85 }
86
87 /*
88 * Obtain the specified register's value.
89 */
90 static uint64_t
kinst_regval(struct trapframe * frame,int reg)91 kinst_regval(struct trapframe *frame, int reg)
92 {
93 if (reg == -1)
94 return (0);
95 return (((register_t *)frame)[kinst_regoff(reg)]);
96 }
97
98 static uint32_t
kinst_riprel_disp(struct kinst_probe * kp,void * dst)99 kinst_riprel_disp(struct kinst_probe *kp, void *dst)
100 {
101 return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
102 (intptr_t)dst));
103 }
104
105 static void
kinst_trampoline_populate(struct kinst_probe * kp,uint8_t * tramp)106 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
107 {
108 uint8_t *instr;
109 uint32_t disp;
110 int ilen;
111
112 ilen = kp->kp_md.tinstlen;
113
114 kinst_memcpy(tramp, kp->kp_md.template, ilen);
115 if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
116 disp = kinst_riprel_disp(kp, tramp);
117 kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
118 }
119
120 /*
121 * The following position-independent jmp takes us back to the
122 * original code. It is encoded as "jmp *0(%rip)" (six bytes),
123 * followed by the absolute address of the instruction following
124 * the one that was traced (eight bytes).
125 */
126 tramp[ilen + 0] = 0xff;
127 tramp[ilen + 1] = 0x25;
128 tramp[ilen + 2] = 0x00;
129 tramp[ilen + 3] = 0x00;
130 tramp[ilen + 4] = 0x00;
131 tramp[ilen + 5] = 0x00;
132 instr = kp->kp_patchpoint + kp->kp_md.instlen;
133 kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
134 }
135
136 int
kinst_invop(uintptr_t addr,struct trapframe * frame,uintptr_t scratch)137 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
138 {
139 solaris_cpu_t *cpu;
140 uintptr_t *stack, retaddr;
141 struct kinst_probe *kp;
142 struct kinst_probe_md *kpmd;
143 uint8_t *tramp;
144
145 stack = (uintptr_t *)frame->tf_rsp;
146 cpu = &solaris_cpu[curcpu];
147
148 LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
149 if ((uintptr_t)kp->kp_patchpoint == addr)
150 break;
151 }
152 if (kp == NULL)
153 return (0);
154
155 /*
156 * Report the address of the breakpoint for the benefit of consumers
157 * fetching register values with regs[].
158 */
159 frame->tf_rip--;
160
161 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
162 cpu->cpu_dtrace_caller = stack[0];
163 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
164 dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
165 cpu->cpu_dtrace_caller = 0;
166
167 kpmd = &kp->kp_md;
168 if ((kpmd->flags & KINST_F_CALL) != 0) {
169 /*
170 * dtrace_invop_start() reserves space on the stack to
171 * store the return address of the call instruction.
172 */
173 retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
174 *(uintptr_t *)scratch = retaddr;
175
176 if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
177 frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
178 kpmd->disp + kpmd->instlen);
179 } else {
180 register_t rval;
181
182 if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
183 /* rip-relative */
184 rval = frame->tf_rip + kpmd->instlen;
185 } else {
186 /* indirect */
187 rval = kinst_regval(frame, kpmd->reg1) +
188 (kinst_regval(frame, kpmd->reg2) <<
189 kpmd->scale);
190 }
191
192 if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
193 frame->tf_rip = rval + kpmd->disp;
194 } else {
195 frame->tf_rip =
196 *(uintptr_t *)(rval + kpmd->disp);
197 }
198 }
199 return (DTRACE_INVOP_CALL);
200 } else {
201 if ((frame->tf_rflags & PSL_I) == 0)
202 tramp = DPCPU_GET(intr_tramp);
203 else
204 tramp = curthread->t_kinst_tramp;
205 if (tramp == NULL) {
206 /*
207 * A trampoline allocation failed, so this probe is
208 * effectively disabled. Restore the original
209 * instruction.
210 *
211 * We can't safely print anything here, but the
212 * trampoline allocator should have left a breadcrumb in
213 * the dmesg.
214 */
215 kinst_patch_tracepoint(kp, kp->kp_savedval);
216 frame->tf_rip = (register_t)kp->kp_patchpoint;
217 } else {
218 kinst_trampoline_populate(kp, tramp);
219 frame->tf_rip = (register_t)tramp;
220 }
221 return (DTRACE_INVOP_NOP);
222 }
223 }
224
225 void
kinst_patch_tracepoint(struct kinst_probe * kp,kinst_patchval_t val)226 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
227 {
228 register_t reg;
229 int oldwp;
230
231 reg = intr_disable();
232 oldwp = disable_wp();
233 *kp->kp_patchpoint = val;
234 restore_wp(oldwp);
235 intr_restore(reg);
236 }
237
238 static void
kinst_set_disp8(struct kinst_probe * kp,uint8_t byte)239 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
240 {
241 kp->kp_md.disp = (int64_t)(int8_t)byte;
242 }
243
244 static void
kinst_set_disp32(struct kinst_probe * kp,uint8_t * bytes)245 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
246 {
247 int32_t disp32;
248
249 memcpy(&disp32, bytes, sizeof(disp32));
250 kp->kp_md.disp = (int64_t)disp32;
251 }
252
253 /*
254 * Set up all of the state needed to faithfully execute a probed instruction.
255 *
256 * In the simple case, we copy the instruction unmodified to a per-thread
257 * trampoline, wherein it is followed by a jump back to the original code.
258 * - Instructions can have %rip as an operand:
259 * - with %rip-relative addressing encoded in ModR/M, or
260 * - implicitly as a part of the instruction definition (jmp, call).
261 * - Call instructions (which may be %rip-relative) need to push the correct
262 * return address onto the stack.
263 *
264 * Call instructions are simple enough to be emulated in software, so we simply
265 * do not use the trampoline mechanism in that case. kinst_invop() will compute
266 * the branch target using the address info computed here (register operands and
267 * displacement).
268 *
269 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
270 * displacement; when populating the trampoline the displacement is adjusted to
271 * be relative to the trampoline address. Trampolines are always allocated
272 * above KERNBASE for this reason.
273 *
274 * For other %rip-relative operands (just jumps) we take the same approach.
275 * Instructions which specify an 8-bit displacement must be rewritten to use a
276 * 32-bit displacement.
277 */
278 static int
kinst_instr_dissect(struct kinst_probe * kp,uint8_t ** instr)279 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
280 {
281 struct kinst_probe_md *kpmd;
282 dis86_t d86;
283 uint8_t *bytes, modrm, rex;
284 int dispoff, i, ilen, opcidx;
285
286 kpmd = &kp->kp_md;
287
288 d86.d86_data = instr;
289 d86.d86_get_byte = dtrace_dis_get_byte;
290 d86.d86_check_func = NULL;
291 if (dtrace_disx86(&d86, SIZE64) != 0) {
292 KINST_LOG("failed to disassemble instruction at: %p", *instr);
293 return (EINVAL);
294 }
295 bytes = d86.d86_bytes;
296 kpmd->instlen = kpmd->tinstlen = d86.d86_len;
297
298 /*
299 * Skip over prefixes, save REX.
300 */
301 rex = 0;
302 for (i = 0; i < kpmd->instlen; i++) {
303 switch (bytes[i]) {
304 case 0xf0 ... 0xf3:
305 /* group 1 */
306 continue;
307 case 0x26:
308 case 0x2e:
309 case 0x36:
310 case 0x3e:
311 case 0x64:
312 case 0x65:
313 /* group 2 */
314 continue;
315 case 0x66:
316 /* group 3 */
317 continue;
318 case 0x67:
319 /* group 4 */
320 continue;
321 case 0x40 ... 0x4f:
322 /* REX */
323 rex = bytes[i];
324 continue;
325 }
326 break;
327 }
328 KASSERT(i < kpmd->instlen,
329 ("%s: failed to disassemble instruction at %p", __func__, bytes));
330 opcidx = i;
331
332 /*
333 * Identify instructions of interest by opcode: calls and jumps.
334 * Extract displacements.
335 */
336 dispoff = -1;
337 switch (bytes[opcidx]) {
338 case 0x0f:
339 switch (bytes[opcidx + 1]) {
340 case 0x80 ... 0x8f:
341 /* conditional jmp near */
342 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
343 dispoff = opcidx + 2;
344 kinst_set_disp32(kp, &bytes[dispoff]);
345 break;
346 }
347 break;
348 case 0xe3:
349 /*
350 * There is no straightforward way to translate this instruction
351 * to use a 32-bit displacement. Fortunately, it is rarely
352 * used.
353 */
354 return (EINVAL);
355 case 0x70 ... 0x7f:
356 /* conditional jmp short */
357 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
358 dispoff = opcidx + 1;
359 kinst_set_disp8(kp, bytes[dispoff]);
360 break;
361 case 0xe9:
362 /* unconditional jmp near */
363 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
364 dispoff = opcidx + 1;
365 kinst_set_disp32(kp, &bytes[dispoff]);
366 break;
367 case 0xeb:
368 /* unconditional jmp short */
369 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
370 dispoff = opcidx + 1;
371 kinst_set_disp8(kp, bytes[dispoff]);
372 break;
373 case 0xe8:
374 case 0x9a:
375 /* direct call */
376 kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
377 dispoff = opcidx + 1;
378 kinst_set_disp32(kp, &bytes[dispoff]);
379 break;
380 case 0xff:
381 KASSERT(d86.d86_got_modrm,
382 ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
383 switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
384 case 0x02:
385 case 0x03:
386 /* indirect call */
387 kpmd->flags |= KINST_F_CALL;
388 break;
389 case 0x04:
390 case 0x05:
391 /* indirect jump */
392 kpmd->flags |= KINST_F_JMP;
393 break;
394 }
395 }
396
397 /*
398 * If there's a ModR/M byte, we need to check it to see if the operand
399 * is %rip-relative, and rewrite the displacement if so. If not, we
400 * might still have to extract operand info if this is a call
401 * instruction.
402 */
403 if (d86.d86_got_modrm) {
404 uint8_t mod, rm, sib;
405
406 kpmd->reg1 = kpmd->reg2 = -1;
407
408 modrm = bytes[d86.d86_rmindex];
409 mod = KINST_MODRM_MOD(modrm);
410 rm = KINST_MODRM_RM(modrm);
411 if (mod == 0 && rm == 5) {
412 kpmd->flags |= KINST_F_RIPREL;
413 dispoff = d86.d86_rmindex + 1;
414 kinst_set_disp32(kp, &bytes[dispoff]);
415 } else if ((kpmd->flags & KINST_F_CALL) != 0) {
416 bool havesib;
417
418 havesib = (mod != 3 && rm == 4);
419 dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
420 if (mod == 1)
421 kinst_set_disp8(kp, bytes[dispoff]);
422 else if (mod == 2)
423 kinst_set_disp32(kp, &bytes[dispoff]);
424 else if (mod == 3)
425 kpmd->flags |= KINST_F_MOD_DIRECT;
426
427 if (havesib) {
428 sib = bytes[d86.d86_rmindex + 1];
429 if (KINST_SIB_BASE(sib) != 5) {
430 kpmd->reg1 = KINST_SIB_BASE(sib) |
431 (KINST_REX_B(rex) << 3);
432 }
433 kpmd->scale = KINST_SIB_SCALE(sib);
434 kpmd->reg2 = KINST_SIB_INDEX(sib) |
435 (KINST_REX_X(rex) << 3);
436 } else {
437 kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
438 }
439 }
440 }
441
442 /*
443 * Calls are emulated in software; once operands are decoded we have
444 * nothing else to do.
445 */
446 if ((kpmd->flags & KINST_F_CALL) != 0)
447 return (0);
448
449 /*
450 * Allocate and populate an instruction trampoline template.
451 *
452 * Position-independent instructions can simply be copied, but
453 * position-dependent instructions require some surgery: jump
454 * instructions with an 8-bit displacement need to be converted to use a
455 * 32-bit displacement, and the adjusted displacement needs to be
456 * computed.
457 */
458 ilen = kpmd->instlen;
459 if ((kpmd->flags & KINST_F_RIPREL) != 0) {
460 if ((kpmd->flags & KINST_F_JMP) == 0 ||
461 bytes[opcidx] == 0x0f ||
462 bytes[opcidx] == 0xe9 ||
463 bytes[opcidx] == 0xff) {
464 memcpy(kpmd->template, bytes, dispoff);
465 memcpy(&kpmd->template[dispoff + 4],
466 &bytes[dispoff + 4], ilen - (dispoff + 4));
467 kpmd->dispoff = dispoff;
468 } else if (bytes[opcidx] == 0xeb) {
469 memcpy(kpmd->template, bytes, opcidx);
470 kpmd->template[opcidx] = 0xe9;
471 kpmd->dispoff = opcidx + 1;
472
473 /* Instruction length changes from 2 to 5. */
474 kpmd->tinstlen = 5;
475 kpmd->disp -= 3;
476 } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) {
477 memcpy(kpmd->template, bytes, opcidx);
478 kpmd->template[opcidx] = 0x0f;
479 kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
480 kpmd->dispoff = opcidx + 2;
481
482 /* Instruction length changes from 2 to 6. */
483 kpmd->tinstlen = 6;
484 kpmd->disp -= 4;
485 } else {
486 panic("unhandled opcode %#x", bytes[opcidx]);
487 }
488 } else {
489 memcpy(kpmd->template, bytes, ilen);
490 }
491
492 return (0);
493 }
494
495 int
kinst_make_probe(linker_file_t lf,int symindx,linker_symval_t * symval,void * opaque)496 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
497 void *opaque)
498 {
499 struct kinst_probe *kp;
500 dtrace_kinst_probedesc_t *pd;
501 const char *func;
502 int error, instrsize, n, off;
503 uint8_t *instr, *limit, *tmp;
504 bool push_found;
505
506 pd = opaque;
507 func = symval->name;
508 if (kinst_excluded(func))
509 return (0);
510 if (strcmp(func, pd->kpd_func) != 0)
511 return (0);
512
513 instr = (uint8_t *)symval->value;
514 limit = (uint8_t *)symval->value + symval->size;
515 if (instr >= limit)
516 return (0);
517
518 /*
519 * Refuse to instrument functions lacking the usual frame pointer
520 * manipulations since they might correspond to exception handlers.
521 */
522 tmp = instr;
523 push_found = false;
524 while (tmp < limit) {
525 /*
526 * Checking for 'pop %rbp' as well makes the filtering too
527 * strict as it would skip functions that never return (e.g.,
528 * vnlru_proc()).
529 */
530 if (*tmp == KINST_PUSHL_RBP) {
531 push_found = true;
532 break;
533 }
534 tmp += dtrace_instr_size(tmp);
535 }
536 if (!push_found)
537 return (0);
538
539 n = 0;
540 while (instr < limit) {
541 instrsize = dtrace_instr_size(instr);
542 off = (int)(instr - (uint8_t *)symval->value);
543 if (pd->kpd_off != -1 && off != pd->kpd_off) {
544 instr += instrsize;
545 continue;
546 }
547
548 /*
549 * Check for instructions which may enable interrupts. Such
550 * instructions are tricky to trace since it is unclear whether
551 * to use the per-thread or per-CPU trampolines. Since they are
552 * rare, we don't bother to implement special handling for them.
553 *
554 * If the caller specified an offset, return an error, otherwise
555 * silently ignore the instruction so that it remains possible
556 * to enable all instructions in a function.
557 */
558 if (instrsize == 1 &&
559 (instr[0] == KINST_POPF || instr[0] == KINST_STI)) {
560 if (pd->kpd_off != -1)
561 return (EINVAL);
562 instr += instrsize;
563 continue;
564 }
565
566 /*
567 * Prevent separate dtrace(1) instances from creating copies of
568 * the same probe.
569 */
570 LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
571 if (strcmp(kp->kp_func, func) == 0 &&
572 strtol(kp->kp_name, NULL, 10) == off)
573 return (0);
574 }
575 if (++n > KINST_PROBETAB_MAX) {
576 KINST_LOG("probe list full: %d entries", n);
577 return (ENOMEM);
578 }
579 kp = malloc(sizeof(struct kinst_probe), M_KINST,
580 M_WAITOK | M_ZERO);
581 kp->kp_func = func;
582 snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
583 kp->kp_savedval = *instr;
584 kp->kp_patchval = KINST_PATCHVAL;
585 kp->kp_patchpoint = instr;
586
587 error = kinst_instr_dissect(kp, &instr);
588 if (error != 0)
589 return (error);
590
591 kinst_probe_create(kp, lf);
592 }
593
594 return (0);
595 }
596
597 int
kinst_md_init(void)598 kinst_md_init(void)
599 {
600 uint8_t *tramp;
601 int cpu;
602
603 CPU_FOREACH(cpu) {
604 tramp = kinst_trampoline_alloc(M_WAITOK);
605 if (tramp == NULL)
606 return (ENOMEM);
607 DPCPU_ID_SET(cpu, intr_tramp, tramp);
608 }
609
610 return (0);
611 }
612
613 void
kinst_md_deinit(void)614 kinst_md_deinit(void)
615 {
616 uint8_t *tramp;
617 int cpu;
618
619 CPU_FOREACH(cpu) {
620 tramp = DPCPU_ID_GET(cpu, intr_tramp);
621 if (tramp != NULL) {
622 kinst_trampoline_dealloc(tramp);
623 DPCPU_ID_SET(cpu, intr_tramp, NULL);
624 }
625 }
626 }
627
/*
 * Report whether the named function is unsafe to trace for
 * machine-dependent reasons.  No function is excluded here, so the
 * answer is always false.
 */
bool
kinst_md_excluded(const char *name)
{
	(void)name;

	return (false);
}
636