/*
 * SPDX-License-Identifier: CDDL 1.0
 *
 * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org>
 * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Christos Margiolis
 * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/pcpu.h>

#include <machine/cpufunc.h>
#include <machine/md_var.h>

#include <sys/dtrace.h>
#include <cddl/dev/dtrace/dtrace_cddl.h>
#include <dis_tables.h>

#include "kinst.h"

/* Single-byte opcodes of interest when scanning candidate functions. */
#define	KINST_PUSHL_RBP		0x55	/* push %rbp */
#define	KINST_STI		0xfb	/* sti: may enable interrupts */
#define	KINST_POPF		0x9d	/* popf: may enable interrupts */

/* Field extractors for the ModR/M byte: mod[7:6], reg[5:3], r/m[2:0]. */
#define	KINST_MODRM_MOD(b)	(((b) & 0xc0) >> 6)
#define	KINST_MODRM_REG(b)	(((b) & 0x38) >> 3)
#define	KINST_MODRM_RM(b)	((b) & 0x07)

/* Field extractors for the SIB byte: scale[7:6], index[5:3], base[2:0]. */
#define	KINST_SIB_SCALE(s)	(((s) & 0xc0) >> 6)
#define	KINST_SIB_INDEX(s)	(((s) & 0x38) >> 3)
#define	KINST_SIB_BASE(s)	(((s) & 0x07) >> 0)

/* Field extractors for the four low bits of a REX prefix. */
#define	KINST_REX_W(r)		(((r) & 0x08) >> 3)
#define	KINST_REX_R(r)		(((r) & 0x04) >> 2)
#define	KINST_REX_X(r)		(((r) & 0x02) >> 1)
#define	KINST_REX_B(r)		(((r) & 0x01) >> 0)

/* Flags describing a probed instruction; stored in kp_md.flags. */
#define	KINST_F_CALL		0x0001	/* instruction is a "call" */
#define	KINST_F_DIRECT_CALL	0x0002	/* instruction is a direct call */
#define	KINST_F_RIPREL		0x0004	/* instruction is position-dependent */
#define	KINST_F_JMP		0x0008	/* instruction is a %rip-relative jmp */
#define	KINST_F_MOD_DIRECT	0x0010	/* operand is not a memory address */

/*
 * Per-CPU trampolines used when the interrupted thread is executing with
 * interrupts disabled. If an interrupt is raised while executing a trampoline,
 * the interrupt thread cannot safely overwrite its trampoline if it hits a
 * kinst probe while executing the interrupt handler.
 */
DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);

/*
 * Map ModR/M register bits to a trapframe offset.
 * Returns the index into a trapframe, viewed as an array of register_t,
 * of the x86-64 register numbered 0-15; panics on any other index.
 */
static int
kinst_regoff(int reg)
{
#define	_MATCH_REG(i, reg)						\
	case i:								\
		return (offsetof(struct trapframe, tf_ ## reg) /	\
		    sizeof(register_t))
	switch (reg) {
	_MATCH_REG( 0, rax);
	_MATCH_REG( 1, rcx);
	_MATCH_REG( 2, rdx);
	_MATCH_REG( 3, rbx);
	_MATCH_REG( 4, rsp); /* SIB when mod != 3 */
	_MATCH_REG( 5, rbp);
	_MATCH_REG( 6, rsi);
	_MATCH_REG( 7, rdi);
	_MATCH_REG( 8, r8); /* REX.R is set */
	_MATCH_REG( 9, r9);
	_MATCH_REG(10, r10);
	_MATCH_REG(11, r11);
	_MATCH_REG(12, r12);
	_MATCH_REG(13, r13);
	_MATCH_REG(14, r14);
	_MATCH_REG(15, r15);
	}
#undef _MATCH_REG
	panic("%s: unhandled register index %d", __func__, reg);
}

/*
 * Obtain the specified register's value from the trapframe.  An index of -1
 * means "no register" and yields 0, so missing base/index operands simply
 * drop out of address arithmetic.
 */
static uint64_t
kinst_regval(struct trapframe *frame, int reg)
{
	if (reg == -1)
		return (0);
	return (((register_t *)frame)[kinst_regoff(reg)]);
}

/*
 * Compute the 32-bit displacement to encode in the instruction copy placed at
 * "dst" (the trampoline) so that its %rip-relative operand still resolves to
 * the original target, patchpoint + kp_md.disp.  Any instruction-length change
 * from rewriting was already folded into kp_md.disp when the template was
 * built, so a plain difference suffices here.
 */
static uint32_t
kinst_riprel_disp(struct kinst_probe *kp, void *dst)
{
	return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
	    (intptr_t)dst));
}

/*
 * Copy the probe's instruction template into "tramp", fixing up any
 * %rip-relative displacement for the trampoline's address, and append a jump
 * back to the instruction following the patch point.
 */
static void
kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
{
	uint8_t *instr;
	uint32_t disp;
	int ilen;

	ilen = kp->kp_md.tinstlen;

	kinst_memcpy(tramp, kp->kp_md.template, ilen);
	if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
		disp = kinst_riprel_disp(kp, tramp);
		kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
	}

	/*
	 * The following position-independent jmp takes us back to the
	 * original code.  It is encoded as "jmp *0(%rip)" (six bytes),
	 * followed by the absolute address of the instruction following
	 * the one that was traced (eight bytes).
	 */
	tramp[ilen + 0] = 0xff;
	tramp[ilen + 1] = 0x25;
	tramp[ilen + 2] = 0x00;
	tramp[ilen + 3] = 0x00;
	tramp[ilen + 4] = 0x00;
	tramp[ilen + 5] = 0x00;
	instr = kp->kp_patchpoint + kp->kp_md.instlen;
	kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
}

/*
 * Breakpoint handler for kinst probes.  Looks up the probe whose patch point
 * matches the faulting address, fires it via dtrace_probe(), and then arranges
 * for execution to resume: call instructions are emulated in software below,
 * everything else is run from a trampoline copy of the original instruction.
 * Returns 0 when no kinst probe matches so other invop consumers can claim
 * the trap.
 */
int
kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
{
	solaris_cpu_t *cpu;
	uintptr_t *stack, retaddr;
	struct kinst_probe *kp;
	struct kinst_probe_md *kpmd;
	uint8_t *tramp;

	stack = (uintptr_t *)frame->tf_rsp;
	cpu = &solaris_cpu[curcpu];

	LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
		if ((uintptr_t)kp->kp_patchpoint == addr)
			break;
	}
	if (kp == NULL)
		return (0);

	/*
	 * Report the address of the breakpoint for the benefit of consumers
	 * fetching register values with regs[].
	 */
	frame->tf_rip--;

	/* Record the caller (top of stack) while probe context is active. */
	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	cpu->cpu_dtrace_caller = stack[0];
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
	dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
	cpu->cpu_dtrace_caller = 0;

	kpmd = &kp->kp_md;
	if ((kpmd->flags & KINST_F_CALL) != 0) {
		/*
		 * dtrace_invop_start() reserves space on the stack to
		 * store the return address of the call instruction.
		 */
		retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
		*(uintptr_t *)scratch = retaddr;

		if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
			/* Direct call: target = next instruction + disp. */
			frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
			    kpmd->disp + kpmd->instlen);
		} else {
			register_t rval;

			if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
				/* rip-relative */
				rval = frame->tf_rip + kpmd->instlen;
			} else {
				/* indirect: base + (index << scale) */
				rval = kinst_regval(frame, kpmd->reg1) +
				    (kinst_regval(frame, kpmd->reg2) <<
				    kpmd->scale);
			}

			if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
				/* Register operand: target is the value. */
				frame->tf_rip = rval + kpmd->disp;
			} else {
				/* Memory operand: load the target pointer. */
				frame->tf_rip =
				    *(uintptr_t *)(rval + kpmd->disp);
			}
		}
		return (DTRACE_INVOP_CALL);
	} else {
		/*
		 * With interrupts disabled we must use this CPU's private
		 * trampoline instead of the per-thread one; see the comment
		 * above intr_tramp.
		 */
		if ((frame->tf_rflags & PSL_I) == 0)
			tramp = DPCPU_GET(intr_tramp);
		else
			tramp = curthread->t_kinst_tramp;
		if (tramp == NULL) {
			/*
			 * A trampoline allocation failed, so this probe is
			 * effectively disabled.  Restore the original
			 * instruction.
			 *
			 * We can't safely print anything here, but the
			 * trampoline allocator should have left a breadcrumb in
			 * the dmesg.
			 */
			kinst_patch_tracepoint(kp, kp->kp_savedval);
			frame->tf_rip = (register_t)kp->kp_patchpoint;
		} else {
			kinst_trampoline_populate(kp, tramp);
			frame->tf_rip = (register_t)tramp;
		}
		return (DTRACE_INVOP_NOP);
	}
}

/*
 * Overwrite the probe's patch point with "val" (either the breakpoint byte or
 * the saved original byte).  Interrupts are disabled and kernel text write
 * protection is lifted for the duration of the store.
 */
void
kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
{
	register_t reg;
	int oldwp;

	reg = intr_disable();
	oldwp = disable_wp();
	*kp->kp_patchpoint = val;
	restore_wp(oldwp);
	intr_restore(reg);
}

/* Record a sign-extended 8-bit displacement in kp_md.disp. */
static void
kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
{
	kp->kp_md.disp = (int64_t)(int8_t)byte;
}

/*
 * Record a sign-extended 32-bit displacement in kp_md.disp.  memcpy avoids
 * a potentially misaligned load from the instruction byte stream.
 */
static void
kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
{
	int32_t disp32;

	memcpy(&disp32, bytes, sizeof(disp32));
	kp->kp_md.disp = (int64_t)disp32;
}

/*
 * Set up all of the state needed to faithfully execute a probed instruction.
 *
 * In the simple case, we copy the instruction unmodified to a per-thread
 * trampoline, wherein it is followed by a jump back to the original code.
 * - Instructions can have %rip as an operand:
 *   - with %rip-relative addressing encoded in ModR/M, or
 *   - implicitly as a part of the instruction definition (jmp, call).
 * - Call instructions (which may be %rip-relative) need to push the correct
 *   return address onto the stack.
 *
 * Call instructions are simple enough to be emulated in software, so we simply
 * do not use the trampoline mechanism in that case.  kinst_invop() will compute
 * the branch target using the address info computed here (register operands and
 * displacement).
 *
 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
 * displacement; when populating the trampoline the displacement is adjusted to
 * be relative to the trampoline address.  Trampolines are always allocated
 * above KERNBASE for this reason.
 *
 * For other %rip-relative operands (just jumps) we take the same approach.
 * Instructions which specify an 8-bit displacement must be rewritten to use a
 * 32-bit displacement.
 */
static int
kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
{
	struct kinst_probe_md *kpmd;
	dis86_t d86;
	uint8_t *bytes, modrm, rex;
	int dispoff, i, ilen, opcidx;

	kpmd = &kp->kp_md;

	/* Disassemble; dtrace_disx86() advances *instr past the instruction. */
	d86.d86_data = instr;
	d86.d86_get_byte = dtrace_dis_get_byte;
	d86.d86_check_func = NULL;
	if (dtrace_disx86(&d86, SIZE64) != 0) {
		KINST_LOG("failed to disassemble instruction at: %p", *instr);
		return (EINVAL);
	}
	bytes = d86.d86_bytes;
	kpmd->instlen = kpmd->tinstlen = d86.d86_len;

	/*
	 * Skip over prefixes, save REX.
	 */
	rex = 0;
	for (i = 0; i < kpmd->instlen; i++) {
		switch (bytes[i]) {
		case 0xf0 ... 0xf3:
			/* group 1 */
			continue;
		case 0x26:
		case 0x2e:
		case 0x36:
		case 0x3e:
		case 0x64:
		case 0x65:
			/* group 2 */
			continue;
		case 0x66:
			/* group 3 */
			continue;
		case 0x67:
			/* group 4 */
			continue;
		case 0x40 ... 0x4f:
			/* REX */
			rex = bytes[i];
			continue;
		}
		break;
	}
	KASSERT(i < kpmd->instlen,
	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
	opcidx = i;

	/*
	 * Identify instructions of interest by opcode: calls and jumps.
	 * Extract displacements.
	 */
	dispoff = -1;
	switch (bytes[opcidx]) {
	case 0x0f:
		switch (bytes[opcidx + 1]) {
		case 0x80 ... 0x8f:
			/* conditional jmp near */
			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
			dispoff = opcidx + 2;
			kinst_set_disp32(kp, &bytes[dispoff]);
			break;
		}
		break;
	case 0xe3:
		/*
		 * jcxz/jrcxz: there is no straightforward way to translate this
		 * instruction to use a 32-bit displacement.  Fortunately, it is
		 * rarely used.
		 */
		return (EINVAL);
	case 0x70 ...
 0x7f:
		/* conditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe9:
		/* unconditional jmp near */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xeb:
		/* unconditional jmp short */
		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
		dispoff = opcidx + 1;
		kinst_set_disp8(kp, bytes[dispoff]);
		break;
	case 0xe8:
	case 0x9a:
		/* direct call */
		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
		dispoff = opcidx + 1;
		kinst_set_disp32(kp, &bytes[dispoff]);
		break;
	case 0xff:
		KASSERT(d86.d86_got_modrm,
		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
		/* Opcode 0xff: the ModR/M reg field selects the operation. */
		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
		case 0x02:
		case 0x03:
			/* indirect call */
			kpmd->flags |= KINST_F_CALL;
			break;
		case 0x04:
		case 0x05:
			/* indirect jump */
			kpmd->flags |= KINST_F_JMP;
			break;
		}
	}

	/*
	 * If there's a ModR/M byte, we need to check it to see if the operand
	 * is %rip-relative, and rewrite the displacement if so.  If not, we
	 * might still have to extract operand info if this is a call
	 * instruction.
	 */
	if (d86.d86_got_modrm) {
		uint8_t mod, rm, sib;

		kpmd->reg1 = kpmd->reg2 = -1;

		modrm = bytes[d86.d86_rmindex];
		mod = KINST_MODRM_MOD(modrm);
		rm = KINST_MODRM_RM(modrm);
		if (mod == 0 && rm == 5) {
			/* mod=0, r/m=5 encodes disp32(%rip) in 64-bit mode. */
			kpmd->flags |= KINST_F_RIPREL;
			dispoff = d86.d86_rmindex + 1;
			kinst_set_disp32(kp, &bytes[dispoff]);
		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
			bool havesib;

			/* A SIB byte follows ModR/M when mod != 3, r/m = 4. */
			havesib = (mod != 3 && rm == 4);
			dispoff = d86.d86_rmindex + (havesib ?
 2 : 1);	/* displacement follows ModR/M (and SIB, if present) */
			if (mod == 1)
				kinst_set_disp8(kp, bytes[dispoff]);
			else if (mod == 2)
				kinst_set_disp32(kp, &bytes[dispoff]);
			else if (mod == 3)
				kpmd->flags |= KINST_F_MOD_DIRECT;

			if (havesib) {
				sib = bytes[d86.d86_rmindex + 1];
				/*
				 * NOTE(review): per the x86 SIB encoding,
				 * base == 5 means "no base register" only when
				 * mod == 0; with mod == 1/2 the base is
				 * %rbp/%r13 and would be skipped here — verify
				 * whether such call encodings can reach this
				 * path.
				 */
				if (KINST_SIB_BASE(sib) != 5) {
					kpmd->reg1 = KINST_SIB_BASE(sib) |
					    (KINST_REX_B(rex) << 3);
				}
				kpmd->scale = KINST_SIB_SCALE(sib);
				kpmd->reg2 = KINST_SIB_INDEX(sib) |
				    (KINST_REX_X(rex) << 3);
			} else {
				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
			}
		}
	}

	/*
	 * Calls are emulated in software; once operands are decoded we have
	 * nothing else to do.
	 */
	if ((kpmd->flags & KINST_F_CALL) != 0)
		return (0);

	/*
	 * Allocate and populate an instruction trampoline template.
	 *
	 * Position-independent instructions can simply be copied, but
	 * position-dependent instructions require some surgery: jump
	 * instructions with an 8-bit displacement need to be converted to use a
	 * 32-bit displacement, and the adjusted displacement needs to be
	 * computed.
	 */
	ilen = kpmd->instlen;
	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
		if ((kpmd->flags & KINST_F_JMP) == 0 ||
		    bytes[opcidx] == 0x0f ||
		    bytes[opcidx] == 0xe9 ||
		    bytes[opcidx] == 0xff) {
			/*
			 * Already a 32-bit displacement: copy the bytes around
			 * the displacement field, which gets patched per
			 * trampoline in kinst_trampoline_populate().
			 */
			memcpy(kpmd->template, bytes, dispoff);
			memcpy(&kpmd->template[dispoff + 4],
			    &bytes[dispoff + 4], ilen - (dispoff + 4));
			kpmd->dispoff = dispoff;
		} else if (bytes[opcidx] == 0xeb) {
			/* Rewrite "jmp short" (0xeb) as "jmp near" (0xe9). */
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0xe9;
			kpmd->dispoff = opcidx + 1;

			/* Instruction length changes from 2 to 5.
			 * The displacement is relative to the end of the
			 * instruction, which is now 3 bytes further along, so
			 * compensate.
			 */
			kpmd->tinstlen = 5;
			kpmd->disp -= 3;
		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) {
			/*
			 * Rewrite "jcc short" (0x7x) as "jcc near"
			 * (0x0f 0x8x): adding 0x10 maps the short-form opcode
			 * to the near-form second opcode byte.
			 */
			memcpy(kpmd->template, bytes, opcidx);
			kpmd->template[opcidx] = 0x0f;
			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
			kpmd->dispoff = opcidx + 2;

			/* Instruction length changes from 2 to 6. */
			kpmd->tinstlen = 6;
			kpmd->disp -= 4;
		} else {
			panic("unhandled opcode %#x", bytes[opcidx]);
		}
	} else {
		/* Position-independent: copy verbatim. */
		memcpy(kpmd->template, bytes, ilen);
	}

	return (0);
}

/*
 * Linker-symbol iterator callback: create kinst probes for the function named
 * by the probe description in "opaque".  One probe is created per instruction,
 * or just one when the description specifies an offset.  Returns 0 to continue
 * iteration (including when this symbol simply doesn't match).
 */
int
kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
    void *opaque)
{
	struct kinst_probe *kp;
	dtrace_kinst_probedesc_t *pd;
	const char *func;
	int error, instrsize, n, off;
	uint8_t *instr, *limit, *tmp;
	bool push_found;

	pd = opaque;
	func = symval->name;
	if (kinst_excluded(func))
		return (0);
	if (strcmp(func, pd->kpd_func) != 0)
		return (0);

	instr = (uint8_t *)symval->value;
	limit = (uint8_t *)symval->value + symval->size;
	if (instr >= limit)
		return (0);

	/*
	 * Refuse to instrument functions lacking the usual frame pointer
	 * manipulations since they might correspond to exception handlers.
	 */
	tmp = instr;
	push_found = false;
	while (tmp < limit) {
		/*
		 * Checking for 'pop %rbp' as well makes the filtering too
		 * strict as it would skip functions that never return (e.g.,
		 * vnlru_proc()).
		 */
		if (*tmp == KINST_PUSHL_RBP) {
			push_found = true;
			break;
		}
		tmp += dtrace_instr_size(tmp);
	}
	if (!push_found)
		return (0);

	n = 0;
	while (instr < limit) {
		instrsize = dtrace_instr_size(instr);
		off = (int)(instr - (uint8_t *)symval->value);
		if (pd->kpd_off != -1 && off != pd->kpd_off) {
			instr += instrsize;
			continue;
		}

		/*
		 * Check for instructions which may enable interrupts.
Such 550 * instructions are tricky to trace since it is unclear whether 551 * to use the per-thread or per-CPU trampolines. Since they are 552 * rare, we don't bother to implement special handling for them. 553 * 554 * If the caller specified an offset, return an error, otherwise 555 * silently ignore the instruction so that it remains possible 556 * to enable all instructions in a function. 557 */ 558 if (instrsize == 1 && 559 (instr[0] == KINST_POPF || instr[0] == KINST_STI)) { 560 if (pd->kpd_off != -1) 561 return (EINVAL); 562 instr += instrsize; 563 continue; 564 } 565 566 /* 567 * Prevent separate dtrace(1) instances from creating copies of 568 * the same probe. 569 */ 570 LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) { 571 if (strcmp(kp->kp_func, func) == 0 && 572 strtol(kp->kp_name, NULL, 10) == off) 573 return (0); 574 } 575 if (++n > KINST_PROBETAB_MAX) { 576 KINST_LOG("probe list full: %d entries", n); 577 return (ENOMEM); 578 } 579 kp = malloc(sizeof(struct kinst_probe), M_KINST, 580 M_WAITOK | M_ZERO); 581 kp->kp_func = func; 582 snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off); 583 kp->kp_savedval = *instr; 584 kp->kp_patchval = KINST_PATCHVAL; 585 kp->kp_patchpoint = instr; 586 587 error = kinst_instr_dissect(kp, &instr); 588 if (error != 0) 589 return (error); 590 591 kinst_probe_create(kp, lf); 592 } 593 594 return (0); 595 } 596 597 int 598 kinst_md_init(void) 599 { 600 uint8_t *tramp; 601 int cpu; 602 603 CPU_FOREACH(cpu) { 604 tramp = kinst_trampoline_alloc(M_WAITOK); 605 if (tramp == NULL) 606 return (ENOMEM); 607 DPCPU_ID_SET(cpu, intr_tramp, tramp); 608 } 609 610 return (0); 611 } 612 613 void 614 kinst_md_deinit(void) 615 { 616 uint8_t *tramp; 617 int cpu; 618 619 CPU_FOREACH(cpu) { 620 tramp = DPCPU_ID_GET(cpu, intr_tramp); 621 if (tramp != NULL) { 622 kinst_trampoline_dealloc(tramp); 623 DPCPU_ID_SET(cpu, intr_tramp, NULL); 624 } 625 } 626 } 627 628 /* 629 * Exclude machine-dependent functions that are not safe-to-trace. 
 */
bool
kinst_md_excluded(const char *name)
{
	/* There are currently no machine-dependent exclusions on amd64. */
	return (false);
}