1 /* 2 * SPDX-License-Identifier: CDDL 1.0 3 * 4 * Copyright 2022 Christos Margiolis <christos@FreeBSD.org> 5 * Copyright 2022 Mark Johnston <markj@FreeBSD.org> 6 */ 7 8 #include <sys/param.h> 9 #include <sys/pcpu.h> 10 11 #include <machine/cpufunc.h> 12 #include <machine/md_var.h> 13 14 #include <sys/dtrace.h> 15 #include <cddl/dev/dtrace/dtrace_cddl.h> 16 #include <dis_tables.h> 17 18 #include "kinst.h" 19 20 #define KINST_PUSHL_RBP 0x55 21 #define KINST_STI 0xfb 22 #define KINST_POPF 0x9d 23 24 #define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6) 25 #define KINST_MODRM_REG(b) (((b) & 0x38) >> 3) 26 #define KINST_MODRM_RM(b) ((b) & 0x07) 27 28 #define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6) 29 #define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3) 30 #define KINST_SIB_BASE(s) (((s) & 0x07) >> 0) 31 32 #define KINST_REX_W(r) (((r) & 0x08) >> 3) 33 #define KINST_REX_R(r) (((r) & 0x04) >> 2) 34 #define KINST_REX_X(r) (((r) & 0x02) >> 1) 35 #define KINST_REX_B(r) (((r) & 0x01) >> 0) 36 37 #define KINST_F_CALL 0x0001 /* instruction is a "call" */ 38 #define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */ 39 #define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */ 40 #define KINST_F_JMP 0x0008 /* instruction is a %rip-relative jmp */ 41 #define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */ 42 43 /* 44 * Per-CPU trampolines used when the interrupted thread is executing with 45 * interrupts disabled. If an interrupt is raised while executing a trampoline, 46 * the interrupt thread cannot safely overwrite its trampoline if it hits a 47 * kinst probe while executing the interrupt handler. 48 */ 49 DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp); 50 51 /* 52 * Map ModR/M register bits to a trapframe offset. 53 */ 54 static int 55 kinst_regoff(int reg) 56 { 57 #define _MATCH_REG(i, reg) \ 58 case i: \ 59 return (offsetof(struct trapframe, tf_ ## reg) / \ 60 sizeof(register_t)) 61 switch (reg) { 62 _MATCH_REG( 0, rax); 63 _MATCH_REG( 1, rcx); 64 _MATCH_REG( 2, rdx); 65 _MATCH_REG( 3, rbx); 66 _MATCH_REG( 4, rsp); /* SIB when mod != 3 */ 67 _MATCH_REG( 5, rbp); 68 _MATCH_REG( 6, rsi); 69 _MATCH_REG( 7, rdi); 70 _MATCH_REG( 8, r8); /* REX.R is set */ 71 _MATCH_REG( 9, r9); 72 _MATCH_REG(10, r10); 73 _MATCH_REG(11, r11); 74 _MATCH_REG(12, r12); 75 _MATCH_REG(13, r13); 76 _MATCH_REG(14, r14); 77 _MATCH_REG(15, r15); 78 } 79 #undef _MATCH_REG 80 panic("%s: unhandled register index %d", __func__, reg); 81 } 82 83 /* 84 * Obtain the specified register's value. 85 */ 86 static uint64_t 87 kinst_regval(struct trapframe *frame, int reg) 88 { 89 if (reg == -1) 90 return (0); 91 return (((register_t *)frame)[kinst_regoff(reg)]); 92 } 93 94 static uint32_t 95 kinst_riprel_disp(struct kinst_probe *kp, void *dst) 96 { 97 return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp - 98 (intptr_t)dst)); 99 } 100 101 static void 102 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp) 103 { 104 uint8_t *instr; 105 uint32_t disp; 106 int ilen; 107 108 ilen = kp->kp_md.tinstlen; 109 110 kinst_memcpy(tramp, kp->kp_md.template, ilen); 111 if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) { 112 disp = kinst_riprel_disp(kp, tramp); 113 kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t)); 114 } 115 116 /* 117 * The following position-independent jmp takes us back to the 118 * original code. It is encoded as "jmp *0(%rip)" (six bytes), 119 * followed by the absolute address of the instruction following 120 * the one that was traced (eight bytes). 121 */ 122 tramp[ilen + 0] = 0xff; 123 tramp[ilen + 1] = 0x25; 124 tramp[ilen + 2] = 0x00; 125 tramp[ilen + 3] = 0x00; 126 tramp[ilen + 4] = 0x00; 127 tramp[ilen + 5] = 0x00; 128 instr = kp->kp_patchpoint + kp->kp_md.instlen; 129 kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t)); 130 } 131 132 int 133 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch) 134 { 135 solaris_cpu_t *cpu; 136 uintptr_t *stack, retaddr; 137 struct kinst_probe *kp; 138 struct kinst_probe_md *kpmd; 139 uint8_t *tramp; 140 141 stack = (uintptr_t *)frame->tf_rsp; 142 cpu = &solaris_cpu[curcpu]; 143 144 LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) { 145 if ((uintptr_t)kp->kp_patchpoint == addr) 146 break; 147 } 148 if (kp == NULL) 149 return (0); 150 151 /* 152 * Report the address of the breakpoint for the benefit of consumers 153 * fetching register values with regs[]. 154 */ 155 frame->tf_rip--; 156 157 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 158 cpu->cpu_dtrace_caller = stack[0]; 159 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); 160 dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0); 161 cpu->cpu_dtrace_caller = 0; 162 163 kpmd = &kp->kp_md; 164 if ((kpmd->flags & KINST_F_CALL) != 0) { 165 /* 166 * dtrace_invop_start() reserves space on the stack to 167 * store the return address of the call instruction. 168 */ 169 retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen); 170 *(uintptr_t *)scratch = retaddr; 171 172 if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) { 173 frame->tf_rip = (uintptr_t)(kp->kp_patchpoint + 174 kpmd->disp + kpmd->instlen); 175 } else { 176 register_t rval; 177 178 if (kpmd->reg1 == -1 && kpmd->reg2 == -1) { 179 /* rip-relative */ 180 rval = frame->tf_rip + kpmd->instlen; 181 } else { 182 /* indirect */ 183 rval = kinst_regval(frame, kpmd->reg1) + 184 (kinst_regval(frame, kpmd->reg2) << 185 kpmd->scale); 186 } 187 188 if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) { 189 frame->tf_rip = rval + kpmd->disp; 190 } else { 191 frame->tf_rip = 192 *(uintptr_t *)(rval + kpmd->disp); 193 } 194 } 195 return (DTRACE_INVOP_CALL); 196 } else { 197 if ((frame->tf_rflags & PSL_I) == 0) 198 tramp = DPCPU_GET(intr_tramp); 199 else 200 tramp = curthread->t_kinst_tramp; 201 if (tramp == NULL) { 202 /* 203 * A trampoline allocation failed, so this probe is 204 * effectively disabled. Restore the original 205 * instruction. 206 * 207 * We can't safely print anything here, but the 208 * trampoline allocator should have left a breadcrumb in 209 * the dmesg. 210 */ 211 kinst_patch_tracepoint(kp, kp->kp_savedval); 212 frame->tf_rip = (register_t)kp->kp_patchpoint; 213 } else { 214 kinst_trampoline_populate(kp, tramp); 215 frame->tf_rip = (register_t)tramp; 216 } 217 return (DTRACE_INVOP_NOP); 218 } 219 } 220 221 void 222 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val) 223 { 224 register_t reg; 225 int oldwp; 226 227 reg = intr_disable(); 228 oldwp = disable_wp(); 229 *kp->kp_patchpoint = val; 230 restore_wp(oldwp); 231 intr_restore(reg); 232 } 233 234 static void 235 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte) 236 { 237 kp->kp_md.disp = (int64_t)(int8_t)byte; 238 } 239 240 static void 241 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes) 242 { 243 int32_t disp32; 244 245 memcpy(&disp32, bytes, sizeof(disp32)); 246 kp->kp_md.disp = (int64_t)disp32; 247 } 248 249 /* 250 * Set up all of the state needed to faithfully execute a probed instruction. 251 * 252 * In the simple case, we copy the instruction unmodified to a per-thread 253 * trampoline, wherein it is followed by a jump back to the original code. 254 * - Instructions can have %rip as an operand: 255 * - with %rip-relative addressing encoded in ModR/M, or 256 * - implicitly as a part of the instruction definition (jmp, call). 257 * - Call instructions (which may be %rip-relative) need to push the correct 258 * return address onto the stack. 259 * 260 * Call instructions are simple enough to be emulated in software, so we simply 261 * do not use the trampoline mechanism in that case. kinst_invop() will compute 262 * the branch target using the address info computed here (register operands and 263 * displacement). 264 * 265 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit 266 * displacement; when populating the trampoline the displacement is adjusted to 267 * be relative to the trampoline address. Trampolines are always allocated 268 * above KERNBASE for this reason. 269 * 270 * For other %rip-relative operands (just jumps) we take the same approach. 271 * Instructions which specify an 8-bit displacement must be rewritten to use a 272 * 32-bit displacement. 273 */ 274 static int 275 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr) 276 { 277 struct kinst_probe_md *kpmd; 278 dis86_t d86; 279 uint8_t *bytes, modrm, rex; 280 int dispoff, i, ilen, opcidx; 281 282 kpmd = &kp->kp_md; 283 284 d86.d86_data = instr; 285 d86.d86_get_byte = dtrace_dis_get_byte; 286 d86.d86_check_func = NULL; 287 if (dtrace_disx86(&d86, SIZE64) != 0) { 288 KINST_LOG("failed to disassemble instruction at: %p", *instr); 289 return (EINVAL); 290 } 291 bytes = d86.d86_bytes; 292 kpmd->instlen = kpmd->tinstlen = d86.d86_len; 293 294 /* 295 * Skip over prefixes, save REX. 296 */ 297 rex = 0; 298 for (i = 0; i < kpmd->instlen; i++) { 299 switch (bytes[i]) { 300 case 0xf0 ... 0xf3: 301 /* group 1 */ 302 continue; 303 case 0x26: 304 case 0x2e: 305 case 0x36: 306 case 0x3e: 307 case 0x64: 308 case 0x65: 309 /* group 2 */ 310 continue; 311 case 0x66: 312 /* group 3 */ 313 continue; 314 case 0x67: 315 /* group 4 */ 316 continue; 317 case 0x40 ... 0x4f: 318 /* REX */ 319 rex = bytes[i]; 320 continue; 321 } 322 break; 323 } 324 KASSERT(i < kpmd->instlen, 325 ("%s: failed to disassemble instruction at %p", __func__, bytes)); 326 opcidx = i; 327 328 /* 329 * Identify instructions of interest by opcode: calls and jumps. 330 * Extract displacements. 331 */ 332 dispoff = -1; 333 switch (bytes[opcidx]) { 334 case 0x0f: 335 switch (bytes[opcidx + 1]) { 336 case 0x80 ... 0x8f: 337 /* conditional jmp near */ 338 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 339 dispoff = opcidx + 2; 340 kinst_set_disp32(kp, &bytes[dispoff]); 341 break; 342 } 343 break; 344 case 0xe3: 345 /* 346 * There is no straightforward way to translate this instruction 347 * to use a 32-bit displacement. Fortunately, it is rarely 348 * used. 349 */ 350 return (EINVAL); 351 case 0x70 ... 0x7f: 352 /* conditional jmp short */ 353 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 354 dispoff = opcidx + 1; 355 kinst_set_disp8(kp, bytes[dispoff]); 356 break; 357 case 0xe9: 358 /* unconditional jmp near */ 359 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 360 dispoff = opcidx + 1; 361 kinst_set_disp32(kp, &bytes[dispoff]); 362 break; 363 case 0xeb: 364 /* unconditional jmp short */ 365 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 366 dispoff = opcidx + 1; 367 kinst_set_disp8(kp, bytes[dispoff]); 368 break; 369 case 0xe8: 370 case 0x9a: 371 /* direct call */ 372 kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL; 373 dispoff = opcidx + 1; 374 kinst_set_disp32(kp, &bytes[dispoff]); 375 break; 376 case 0xff: 377 KASSERT(d86.d86_got_modrm, 378 ("no ModR/M byte for instr at %p", *instr - kpmd->instlen)); 379 switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) { 380 case 0x02: 381 case 0x03: 382 /* indirect call */ 383 kpmd->flags |= KINST_F_CALL; 384 break; 385 case 0x04: 386 case 0x05: 387 /* indirect jump */ 388 kpmd->flags |= KINST_F_JMP; 389 break; 390 } 391 } 392 393 /* 394 * If there's a ModR/M byte, we need to check it to see if the operand 395 * is %rip-relative, and rewrite the displacement if so. If not, we 396 * might still have to extract operand info if this is a call 397 * instruction. 398 */ 399 if (d86.d86_got_modrm) { 400 uint8_t mod, rm, sib; 401 402 kpmd->reg1 = kpmd->reg2 = -1; 403 404 modrm = bytes[d86.d86_rmindex]; 405 mod = KINST_MODRM_MOD(modrm); 406 rm = KINST_MODRM_RM(modrm); 407 if (mod == 0 && rm == 5) { 408 kpmd->flags |= KINST_F_RIPREL; 409 dispoff = d86.d86_rmindex + 1; 410 kinst_set_disp32(kp, &bytes[dispoff]); 411 } else if ((kpmd->flags & KINST_F_CALL) != 0) { 412 bool havesib; 413 414 havesib = (mod != 3 && rm == 4); 415 dispoff = d86.d86_rmindex + (havesib ? 2 : 1); 416 if (mod == 1) 417 kinst_set_disp8(kp, bytes[dispoff]); 418 else if (mod == 2) 419 kinst_set_disp32(kp, &bytes[dispoff]); 420 else if (mod == 3) 421 kpmd->flags |= KINST_F_MOD_DIRECT; 422 423 if (havesib) { 424 sib = bytes[d86.d86_rmindex + 1]; 425 if (KINST_SIB_BASE(sib) != 5) { 426 kpmd->reg1 = KINST_SIB_BASE(sib) | 427 (KINST_REX_B(rex) << 3); 428 } 429 kpmd->scale = KINST_SIB_SCALE(sib); 430 kpmd->reg2 = KINST_SIB_INDEX(sib) | 431 (KINST_REX_X(rex) << 3); 432 } else { 433 kpmd->reg1 = rm | (KINST_REX_B(rex) << 3); 434 } 435 } 436 } 437 438 /* 439 * Calls are emulated in software; once operands are decoded we have 440 * nothing else to do. 441 */ 442 if ((kpmd->flags & KINST_F_CALL) != 0) 443 return (0); 444 445 /* 446 * Allocate and populate an instruction trampoline template. 447 * 448 * Position-independent instructions can simply be copied, but 449 * position-dependent instructions require some surgery: jump 450 * instructions with an 8-bit displacement need to be converted to use a 451 * 32-bit displacement, and the adjusted displacement needs to be 452 * computed. 453 */ 454 ilen = kpmd->instlen; 455 if ((kpmd->flags & KINST_F_RIPREL) != 0) { 456 if ((kpmd->flags & KINST_F_JMP) == 0 || 457 bytes[opcidx] == 0x0f || 458 bytes[opcidx] == 0xe9 || 459 bytes[opcidx] == 0xff) { 460 memcpy(kpmd->template, bytes, dispoff); 461 memcpy(&kpmd->template[dispoff + 4], 462 &bytes[dispoff + 4], ilen - (dispoff + 4)); 463 kpmd->dispoff = dispoff; 464 } else if (bytes[opcidx] == 0xeb) { 465 memcpy(kpmd->template, bytes, opcidx); 466 kpmd->template[opcidx] = 0xe9; 467 kpmd->dispoff = opcidx + 1; 468 469 /* Instruction length changes from 2 to 5. */ 470 kpmd->tinstlen = 5; 471 kpmd->disp -= 3; 472 } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) { 473 memcpy(kpmd->template, bytes, opcidx); 474 kpmd->template[opcidx] = 0x0f; 475 kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10; 476 kpmd->dispoff = opcidx + 2; 477 478 /* Instruction length changes from 2 to 6. */ 479 kpmd->tinstlen = 6; 480 kpmd->disp -= 4; 481 } else { 482 panic("unhandled opcode %#x", bytes[opcidx]); 483 } 484 } else { 485 memcpy(kpmd->template, bytes, ilen); 486 } 487 488 return (0); 489 } 490 491 int 492 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval, 493 void *opaque) 494 { 495 struct kinst_probe *kp; 496 dtrace_kinst_probedesc_t *pd; 497 const char *func; 498 int error, instrsize, n, off; 499 uint8_t *instr, *limit; 500 501 pd = opaque; 502 func = symval->name; 503 if (kinst_excluded(func)) 504 return (0); 505 if (strcmp(func, pd->kpd_func) != 0) 506 return (0); 507 508 instr = (uint8_t *)symval->value; 509 limit = (uint8_t *)symval->value + symval->size; 510 if (instr >= limit) 511 return (0); 512 513 /* 514 * Ignore functions not beginning with the usual function prologue. 515 * These might correspond to exception handlers with which we should not 516 * meddle. This does however exclude functions which can be safely 517 * traced, such as cpu_switch(). 518 */ 519 if (*instr != KINST_PUSHL_RBP) 520 return (0); 521 522 n = 0; 523 while (instr < limit) { 524 instrsize = dtrace_instr_size(instr); 525 off = (int)(instr - (uint8_t *)symval->value); 526 if (pd->kpd_off != -1 && off != pd->kpd_off) { 527 instr += instrsize; 528 continue; 529 } 530 531 /* 532 * Check for instructions which may enable interrupts. Such 533 * instructions are tricky to trace since it is unclear whether 534 * to use the per-thread or per-CPU trampolines. Since they are 535 * rare, we don't bother to implement special handling for them. 536 * 537 * If the caller specified an offset, return an error, otherwise 538 * silently ignore the instruction so that it remains possible 539 * to enable all instructions in a function. 540 */ 541 if (instrsize == 1 && 542 (instr[0] == KINST_POPF || instr[0] == KINST_STI)) { 543 if (pd->kpd_off != -1) 544 return (EINVAL); 545 instr += instrsize; 546 continue; 547 } 548 549 /* 550 * Prevent separate dtrace(1) instances from creating copies of 551 * the same probe. 552 */ 553 LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) { 554 if (strcmp(kp->kp_func, func) == 0 && 555 strtol(kp->kp_name, NULL, 10) == off) 556 return (0); 557 } 558 if (++n > KINST_PROBETAB_MAX) { 559 KINST_LOG("probe list full: %d entries", n); 560 return (ENOMEM); 561 } 562 kp = malloc(sizeof(struct kinst_probe), M_KINST, 563 M_WAITOK | M_ZERO); 564 kp->kp_func = func; 565 snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off); 566 kp->kp_savedval = *instr; 567 kp->kp_patchval = KINST_PATCHVAL; 568 kp->kp_patchpoint = instr; 569 570 error = kinst_instr_dissect(kp, &instr); 571 if (error != 0) 572 return (error); 573 574 kinst_probe_create(kp, lf); 575 } 576 577 return (0); 578 } 579 580 int 581 kinst_md_init(void) 582 { 583 uint8_t *tramp; 584 int cpu; 585 586 CPU_FOREACH(cpu) { 587 tramp = kinst_trampoline_alloc(M_WAITOK); 588 if (tramp == NULL) 589 return (ENOMEM); 590 DPCPU_ID_SET(cpu, intr_tramp, tramp); 591 } 592 593 return (0); 594 } 595 596 void 597 kinst_md_deinit(void) 598 { 599 uint8_t *tramp; 600 int cpu; 601 602 CPU_FOREACH(cpu) { 603 tramp = DPCPU_ID_GET(cpu, intr_tramp); 604 if (tramp != NULL) { 605 kinst_trampoline_dealloc(tramp); 606 DPCPU_ID_SET(cpu, intr_tramp, NULL); 607 } 608 } 609 } 610 611 /* 612 * Exclude machine-dependent functions that are not safe-to-trace. 613 */ 614 bool 615 kinst_md_excluded(const char *name) 616 { 617 return (false); 618 } 619