1 /* 2 * SPDX-License-Identifier: CDDL 1.0 3 * 4 * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org> 5 * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org> 6 * Copyright (c) 2023 The FreeBSD Foundation 7 * 8 * Portions of this software were developed by Christos Margiolis 9 * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation. 10 */ 11 12 #include <sys/param.h> 13 #include <sys/pcpu.h> 14 15 #include <machine/cpufunc.h> 16 #include <machine/md_var.h> 17 18 #include <sys/dtrace.h> 19 #include <cddl/dev/dtrace/dtrace_cddl.h> 20 #include <dis_tables.h> 21 22 #include "kinst.h" 23 24 #define KINST_PUSHL_RBP 0x55 25 #define KINST_POPL_RBP 0x5d 26 #define KINST_STI 0xfb 27 #define KINST_POPF 0x9d 28 29 #define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6) 30 #define KINST_MODRM_REG(b) (((b) & 0x38) >> 3) 31 #define KINST_MODRM_RM(b) ((b) & 0x07) 32 33 #define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6) 34 #define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3) 35 #define KINST_SIB_BASE(s) (((s) & 0x07) >> 0) 36 37 #define KINST_REX_W(r) (((r) & 0x08) >> 3) 38 #define KINST_REX_R(r) (((r) & 0x04) >> 2) 39 #define KINST_REX_X(r) (((r) & 0x02) >> 1) 40 #define KINST_REX_B(r) (((r) & 0x01) >> 0) 41 42 #define KINST_F_CALL 0x0001 /* instruction is a "call" */ 43 #define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */ 44 #define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */ 45 #define KINST_F_JMP 0x0008 /* instruction is a %rip-relative jmp */ 46 #define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */ 47 48 /* 49 * Per-CPU trampolines used when the interrupted thread is executing with 50 * interrupts disabled. If an interrupt is raised while executing a trampoline, 51 * the interrupt thread cannot safely overwrite its trampoline if it hits a 52 * kinst probe while executing the interrupt handler. 53 */ 54 DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp); 55 56 /* 57 * Map ModR/M register bits to a trapframe offset. 58 */ 59 static int 60 kinst_regoff(int reg) 61 { 62 #define _MATCH_REG(i, reg) \ 63 case i: \ 64 return (offsetof(struct trapframe, tf_ ## reg) / \ 65 sizeof(register_t)) 66 switch (reg) { 67 _MATCH_REG( 0, rax); 68 _MATCH_REG( 1, rcx); 69 _MATCH_REG( 2, rdx); 70 _MATCH_REG( 3, rbx); 71 _MATCH_REG( 4, rsp); /* SIB when mod != 3 */ 72 _MATCH_REG( 5, rbp); 73 _MATCH_REG( 6, rsi); 74 _MATCH_REG( 7, rdi); 75 _MATCH_REG( 8, r8); /* REX.R is set */ 76 _MATCH_REG( 9, r9); 77 _MATCH_REG(10, r10); 78 _MATCH_REG(11, r11); 79 _MATCH_REG(12, r12); 80 _MATCH_REG(13, r13); 81 _MATCH_REG(14, r14); 82 _MATCH_REG(15, r15); 83 } 84 #undef _MATCH_REG 85 panic("%s: unhandled register index %d", __func__, reg); 86 } 87 88 /* 89 * Obtain the specified register's value. 90 */ 91 static uint64_t 92 kinst_regval(struct trapframe *frame, int reg) 93 { 94 if (reg == -1) 95 return (0); 96 return (((register_t *)frame)[kinst_regoff(reg)]); 97 } 98 99 static uint32_t 100 kinst_riprel_disp(struct kinst_probe *kp, void *dst) 101 { 102 return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp - 103 (intptr_t)dst)); 104 } 105 106 static void 107 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp) 108 { 109 uint8_t *instr; 110 uint32_t disp; 111 int ilen; 112 113 ilen = kp->kp_md.tinstlen; 114 115 kinst_memcpy(tramp, kp->kp_md.template, ilen); 116 if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) { 117 disp = kinst_riprel_disp(kp, tramp); 118 kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t)); 119 } 120 121 /* 122 * The following position-independent jmp takes us back to the 123 * original code. It is encoded as "jmp *0(%rip)" (six bytes), 124 * followed by the absolute address of the instruction following 125 * the one that was traced (eight bytes). 126 */ 127 tramp[ilen + 0] = 0xff; 128 tramp[ilen + 1] = 0x25; 129 tramp[ilen + 2] = 0x00; 130 tramp[ilen + 3] = 0x00; 131 tramp[ilen + 4] = 0x00; 132 tramp[ilen + 5] = 0x00; 133 instr = kp->kp_patchpoint + kp->kp_md.instlen; 134 kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t)); 135 } 136 137 int 138 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch) 139 { 140 solaris_cpu_t *cpu; 141 uintptr_t *stack, retaddr; 142 struct kinst_probe *kp; 143 struct kinst_probe_md *kpmd; 144 uint8_t *tramp; 145 146 stack = (uintptr_t *)frame->tf_rsp; 147 cpu = &solaris_cpu[curcpu]; 148 149 LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) { 150 if ((uintptr_t)kp->kp_patchpoint == addr) 151 break; 152 } 153 if (kp == NULL) 154 return (0); 155 156 /* 157 * Report the address of the breakpoint for the benefit of consumers 158 * fetching register values with regs[]. 159 */ 160 frame->tf_rip--; 161 162 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 163 cpu->cpu_dtrace_caller = stack[0]; 164 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); 165 dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0); 166 cpu->cpu_dtrace_caller = 0; 167 168 kpmd = &kp->kp_md; 169 if ((kpmd->flags & KINST_F_CALL) != 0) { 170 /* 171 * dtrace_invop_start() reserves space on the stack to 172 * store the return address of the call instruction. 173 */ 174 retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen); 175 *(uintptr_t *)scratch = retaddr; 176 177 if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) { 178 frame->tf_rip = (uintptr_t)(kp->kp_patchpoint + 179 kpmd->disp + kpmd->instlen); 180 } else { 181 register_t rval; 182 183 if (kpmd->reg1 == -1 && kpmd->reg2 == -1) { 184 /* rip-relative */ 185 rval = frame->tf_rip + kpmd->instlen; 186 } else { 187 /* indirect */ 188 rval = kinst_regval(frame, kpmd->reg1) + 189 (kinst_regval(frame, kpmd->reg2) << 190 kpmd->scale); 191 } 192 193 if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) { 194 frame->tf_rip = rval + kpmd->disp; 195 } else { 196 frame->tf_rip = 197 *(uintptr_t *)(rval + kpmd->disp); 198 } 199 } 200 return (DTRACE_INVOP_CALL); 201 } else { 202 if ((frame->tf_rflags & PSL_I) == 0) 203 tramp = DPCPU_GET(intr_tramp); 204 else 205 tramp = curthread->t_kinst_tramp; 206 if (tramp == NULL) { 207 /* 208 * A trampoline allocation failed, so this probe is 209 * effectively disabled. Restore the original 210 * instruction. 211 * 212 * We can't safely print anything here, but the 213 * trampoline allocator should have left a breadcrumb in 214 * the dmesg. 215 */ 216 kinst_patch_tracepoint(kp, kp->kp_savedval); 217 frame->tf_rip = (register_t)kp->kp_patchpoint; 218 } else { 219 kinst_trampoline_populate(kp, tramp); 220 frame->tf_rip = (register_t)tramp; 221 } 222 return (DTRACE_INVOP_NOP); 223 } 224 } 225 226 void 227 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val) 228 { 229 register_t reg; 230 int oldwp; 231 232 reg = intr_disable(); 233 oldwp = disable_wp(); 234 *kp->kp_patchpoint = val; 235 restore_wp(oldwp); 236 intr_restore(reg); 237 } 238 239 static void 240 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte) 241 { 242 kp->kp_md.disp = (int64_t)(int8_t)byte; 243 } 244 245 static void 246 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes) 247 { 248 int32_t disp32; 249 250 memcpy(&disp32, bytes, sizeof(disp32)); 251 kp->kp_md.disp = (int64_t)disp32; 252 } 253 254 /* 255 * Set up all of the state needed to faithfully execute a probed instruction. 256 * 257 * In the simple case, we copy the instruction unmodified to a per-thread 258 * trampoline, wherein it is followed by a jump back to the original code. 259 * - Instructions can have %rip as an operand: 260 * - with %rip-relative addressing encoded in ModR/M, or 261 * - implicitly as a part of the instruction definition (jmp, call). 262 * - Call instructions (which may be %rip-relative) need to push the correct 263 * return address onto the stack. 264 * 265 * Call instructions are simple enough to be emulated in software, so we simply 266 * do not use the trampoline mechanism in that case. kinst_invop() will compute 267 * the branch target using the address info computed here (register operands and 268 * displacement). 269 * 270 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit 271 * displacement; when populating the trampoline the displacement is adjusted to 272 * be relative to the trampoline address. Trampolines are always allocated 273 * above KERNBASE for this reason. 274 * 275 * For other %rip-relative operands (just jumps) we take the same approach. 276 * Instructions which specify an 8-bit displacement must be rewritten to use a 277 * 32-bit displacement. 278 */ 279 static int 280 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr) 281 { 282 struct kinst_probe_md *kpmd; 283 dis86_t d86; 284 uint8_t *bytes, modrm, rex; 285 int dispoff, i, ilen, opcidx; 286 287 kpmd = &kp->kp_md; 288 289 d86.d86_data = instr; 290 d86.d86_get_byte = dtrace_dis_get_byte; 291 d86.d86_check_func = NULL; 292 if (dtrace_disx86(&d86, SIZE64) != 0) { 293 KINST_LOG("failed to disassemble instruction at: %p", *instr); 294 return (EINVAL); 295 } 296 bytes = d86.d86_bytes; 297 kpmd->instlen = kpmd->tinstlen = d86.d86_len; 298 299 /* 300 * Skip over prefixes, save REX. 301 */ 302 rex = 0; 303 for (i = 0; i < kpmd->instlen; i++) { 304 switch (bytes[i]) { 305 case 0xf0 ... 0xf3: 306 /* group 1 */ 307 continue; 308 case 0x26: 309 case 0x2e: 310 case 0x36: 311 case 0x3e: 312 case 0x64: 313 case 0x65: 314 /* group 2 */ 315 continue; 316 case 0x66: 317 /* group 3 */ 318 continue; 319 case 0x67: 320 /* group 4 */ 321 continue; 322 case 0x40 ... 0x4f: 323 /* REX */ 324 rex = bytes[i]; 325 continue; 326 } 327 break; 328 } 329 KASSERT(i < kpmd->instlen, 330 ("%s: failed to disassemble instruction at %p", __func__, bytes)); 331 opcidx = i; 332 333 /* 334 * Identify instructions of interest by opcode: calls and jumps. 335 * Extract displacements. 336 */ 337 dispoff = -1; 338 switch (bytes[opcidx]) { 339 case 0x0f: 340 switch (bytes[opcidx + 1]) { 341 case 0x80 ... 0x8f: 342 /* conditional jmp near */ 343 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 344 dispoff = opcidx + 2; 345 kinst_set_disp32(kp, &bytes[dispoff]); 346 break; 347 } 348 break; 349 case 0xe3: 350 /* 351 * There is no straightforward way to translate this instruction 352 * to use a 32-bit displacement. Fortunately, it is rarely 353 * used. 354 */ 355 return (EINVAL); 356 case 0x70 ... 0x7f: 357 /* conditional jmp short */ 358 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 359 dispoff = opcidx + 1; 360 kinst_set_disp8(kp, bytes[dispoff]); 361 break; 362 case 0xe9: 363 /* unconditional jmp near */ 364 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 365 dispoff = opcidx + 1; 366 kinst_set_disp32(kp, &bytes[dispoff]); 367 break; 368 case 0xeb: 369 /* unconditional jmp short */ 370 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 371 dispoff = opcidx + 1; 372 kinst_set_disp8(kp, bytes[dispoff]); 373 break; 374 case 0xe8: 375 case 0x9a: 376 /* direct call */ 377 kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL; 378 dispoff = opcidx + 1; 379 kinst_set_disp32(kp, &bytes[dispoff]); 380 break; 381 case 0xff: 382 KASSERT(d86.d86_got_modrm, 383 ("no ModR/M byte for instr at %p", *instr - kpmd->instlen)); 384 switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) { 385 case 0x02: 386 case 0x03: 387 /* indirect call */ 388 kpmd->flags |= KINST_F_CALL; 389 break; 390 case 0x04: 391 case 0x05: 392 /* indirect jump */ 393 kpmd->flags |= KINST_F_JMP; 394 break; 395 } 396 } 397 398 /* 399 * If there's a ModR/M byte, we need to check it to see if the operand 400 * is %rip-relative, and rewrite the displacement if so. If not, we 401 * might still have to extract operand info if this is a call 402 * instruction. 403 */ 404 if (d86.d86_got_modrm) { 405 uint8_t mod, rm, sib; 406 407 kpmd->reg1 = kpmd->reg2 = -1; 408 409 modrm = bytes[d86.d86_rmindex]; 410 mod = KINST_MODRM_MOD(modrm); 411 rm = KINST_MODRM_RM(modrm); 412 if (mod == 0 && rm == 5) { 413 kpmd->flags |= KINST_F_RIPREL; 414 dispoff = d86.d86_rmindex + 1; 415 kinst_set_disp32(kp, &bytes[dispoff]); 416 } else if ((kpmd->flags & KINST_F_CALL) != 0) { 417 bool havesib; 418 419 havesib = (mod != 3 && rm == 4); 420 dispoff = d86.d86_rmindex + (havesib ? 2 : 1); 421 if (mod == 1) 422 kinst_set_disp8(kp, bytes[dispoff]); 423 else if (mod == 2) 424 kinst_set_disp32(kp, &bytes[dispoff]); 425 else if (mod == 3) 426 kpmd->flags |= KINST_F_MOD_DIRECT; 427 428 if (havesib) { 429 sib = bytes[d86.d86_rmindex + 1]; 430 if (KINST_SIB_BASE(sib) != 5) { 431 kpmd->reg1 = KINST_SIB_BASE(sib) | 432 (KINST_REX_B(rex) << 3); 433 } 434 kpmd->scale = KINST_SIB_SCALE(sib); 435 kpmd->reg2 = KINST_SIB_INDEX(sib) | 436 (KINST_REX_X(rex) << 3); 437 } else { 438 kpmd->reg1 = rm | (KINST_REX_B(rex) << 3); 439 } 440 } 441 } 442 443 /* 444 * Calls are emulated in software; once operands are decoded we have 445 * nothing else to do. 446 */ 447 if ((kpmd->flags & KINST_F_CALL) != 0) 448 return (0); 449 450 /* 451 * Allocate and populate an instruction trampoline template. 452 * 453 * Position-independent instructions can simply be copied, but 454 * position-dependent instructions require some surgery: jump 455 * instructions with an 8-bit displacement need to be converted to use a 456 * 32-bit displacement, and the adjusted displacement needs to be 457 * computed. 458 */ 459 ilen = kpmd->instlen; 460 if ((kpmd->flags & KINST_F_RIPREL) != 0) { 461 if ((kpmd->flags & KINST_F_JMP) == 0 || 462 bytes[opcidx] == 0x0f || 463 bytes[opcidx] == 0xe9 || 464 bytes[opcidx] == 0xff) { 465 memcpy(kpmd->template, bytes, dispoff); 466 memcpy(&kpmd->template[dispoff + 4], 467 &bytes[dispoff + 4], ilen - (dispoff + 4)); 468 kpmd->dispoff = dispoff; 469 } else if (bytes[opcidx] == 0xeb) { 470 memcpy(kpmd->template, bytes, opcidx); 471 kpmd->template[opcidx] = 0xe9; 472 kpmd->dispoff = opcidx + 1; 473 474 /* Instruction length changes from 2 to 5. */ 475 kpmd->tinstlen = 5; 476 kpmd->disp -= 3; 477 } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) { 478 memcpy(kpmd->template, bytes, opcidx); 479 kpmd->template[opcidx] = 0x0f; 480 kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10; 481 kpmd->dispoff = opcidx + 2; 482 483 /* Instruction length changes from 2 to 6. */ 484 kpmd->tinstlen = 6; 485 kpmd->disp -= 4; 486 } else { 487 panic("unhandled opcode %#x", bytes[opcidx]); 488 } 489 } else { 490 memcpy(kpmd->template, bytes, ilen); 491 } 492 493 return (0); 494 } 495 496 int 497 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval, 498 void *opaque) 499 { 500 struct kinst_probe *kp; 501 dtrace_kinst_probedesc_t *pd; 502 const char *func; 503 int error, instrsize, n, off; 504 uint8_t *instr, *limit, *tmp; 505 bool push_found, pop_found; 506 507 pd = opaque; 508 func = symval->name; 509 if (kinst_excluded(func)) 510 return (0); 511 if (strcmp(func, pd->kpd_func) != 0) 512 return (0); 513 514 instr = (uint8_t *)symval->value; 515 limit = (uint8_t *)symval->value + symval->size; 516 if (instr >= limit) 517 return (0); 518 519 /* 520 * Refuse to instrument functions lacking the usual frame pointer 521 * manipulations since they might correspond to exception handlers. 522 */ 523 tmp = instr; 524 push_found = pop_found = false; 525 while (tmp < limit) { 526 if (*tmp == KINST_PUSHL_RBP) 527 push_found = true; 528 else if (*tmp == KINST_POPL_RBP) 529 pop_found = true; 530 if (push_found && pop_found) 531 break; 532 tmp += dtrace_instr_size(tmp); 533 } 534 if (!push_found || !pop_found) 535 return (0); 536 537 n = 0; 538 while (instr < limit) { 539 instrsize = dtrace_instr_size(instr); 540 off = (int)(instr - (uint8_t *)symval->value); 541 if (pd->kpd_off != -1 && off != pd->kpd_off) { 542 instr += instrsize; 543 continue; 544 } 545 546 /* 547 * Check for instructions which may enable interrupts. Such 548 * instructions are tricky to trace since it is unclear whether 549 * to use the per-thread or per-CPU trampolines. Since they are 550 * rare, we don't bother to implement special handling for them. 551 * 552 * If the caller specified an offset, return an error, otherwise 553 * silently ignore the instruction so that it remains possible 554 * to enable all instructions in a function. 555 */ 556 if (instrsize == 1 && 557 (instr[0] == KINST_POPF || instr[0] == KINST_STI)) { 558 if (pd->kpd_off != -1) 559 return (EINVAL); 560 instr += instrsize; 561 continue; 562 } 563 564 /* 565 * Prevent separate dtrace(1) instances from creating copies of 566 * the same probe. 567 */ 568 LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) { 569 if (strcmp(kp->kp_func, func) == 0 && 570 strtol(kp->kp_name, NULL, 10) == off) 571 return (0); 572 } 573 if (++n > KINST_PROBETAB_MAX) { 574 KINST_LOG("probe list full: %d entries", n); 575 return (ENOMEM); 576 } 577 kp = malloc(sizeof(struct kinst_probe), M_KINST, 578 M_WAITOK | M_ZERO); 579 kp->kp_func = func; 580 snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off); 581 kp->kp_savedval = *instr; 582 kp->kp_patchval = KINST_PATCHVAL; 583 kp->kp_patchpoint = instr; 584 585 error = kinst_instr_dissect(kp, &instr); 586 if (error != 0) 587 return (error); 588 589 kinst_probe_create(kp, lf); 590 } 591 592 return (0); 593 } 594 595 int 596 kinst_md_init(void) 597 { 598 uint8_t *tramp; 599 int cpu; 600 601 CPU_FOREACH(cpu) { 602 tramp = kinst_trampoline_alloc(M_WAITOK); 603 if (tramp == NULL) 604 return (ENOMEM); 605 DPCPU_ID_SET(cpu, intr_tramp, tramp); 606 } 607 608 return (0); 609 } 610 611 void 612 kinst_md_deinit(void) 613 { 614 uint8_t *tramp; 615 int cpu; 616 617 CPU_FOREACH(cpu) { 618 tramp = DPCPU_ID_GET(cpu, intr_tramp); 619 if (tramp != NULL) { 620 kinst_trampoline_dealloc(tramp); 621 DPCPU_ID_SET(cpu, intr_tramp, NULL); 622 } 623 } 624 } 625 626 /* 627 * Exclude machine-dependent functions that are not safe-to-trace. 628 */ 629 bool 630 kinst_md_excluded(const char *name) 631 { 632 return (false); 633 } 634