1 /* 2 * SPDX-License-Identifier: CDDL 1.0 3 * 4 * Copyright 2022 Christos Margiolis <christos@FreeBSD.org> 5 * Copyright 2022 Mark Johnston <markj@FreeBSD.org> 6 */ 7 8 #include <sys/param.h> 9 #include <sys/pcpu.h> 10 11 #include <machine/cpufunc.h> 12 #include <machine/md_var.h> 13 14 #include <sys/dtrace.h> 15 #include <cddl/dev/dtrace/dtrace_cddl.h> 16 #include <dis_tables.h> 17 18 #include "kinst.h" 19 20 #define KINST_PUSHL_RBP 0x55 21 #define KINST_STI 0xfb 22 #define KINST_POPF 0x9d 23 24 #define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6) 25 #define KINST_MODRM_REG(b) (((b) & 0x38) >> 3) 26 #define KINST_MODRM_RM(b) ((b) & 0x07) 27 28 #define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6) 29 #define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3) 30 #define KINST_SIB_BASE(s) (((s) & 0x07) >> 0) 31 32 #define KINST_REX_W(r) (((r) & 0x08) >> 3) 33 #define KINST_REX_R(r) (((r) & 0x04) >> 2) 34 #define KINST_REX_X(r) (((r) & 0x02) >> 1) 35 #define KINST_REX_B(r) (((r) & 0x01) >> 0) 36 37 #define KINST_F_CALL 0x0001 /* instruction is a "call" */ 38 #define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */ 39 #define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */ 40 #define KINST_F_JMP 0x0008 /* instruction is a %rip-relative jmp */ 41 #define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */ 42 43 /* 44 * Per-CPU trampolines used when the interrupted thread is executing with 45 * interrupts disabled. If an interrupt is raised while executing a trampoline, 46 * the interrupt thread cannot safely overwrite its trampoline if it hits a 47 * kinst probe while executing the interrupt handler. 48 */ 49 DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp); 50 51 /* 52 * Map ModR/M register bits to a trapframe offset. 53 */ 54 static int 55 kinst_regoff(int reg) 56 { 57 #define _MATCH_REG(i, reg) \ 58 case i: \ 59 return (offsetof(struct trapframe, tf_ ## reg) / \ 60 sizeof(register_t)) 61 switch (reg) { 62 _MATCH_REG( 0, rax); 63 _MATCH_REG( 1, rcx); 64 _MATCH_REG( 2, rdx); 65 _MATCH_REG( 3, rbx); 66 _MATCH_REG( 4, rsp); /* SIB when mod != 3 */ 67 _MATCH_REG( 5, rbp); 68 _MATCH_REG( 6, rsi); 69 _MATCH_REG( 7, rdi); 70 _MATCH_REG( 8, r8); /* REX.R is set */ 71 _MATCH_REG( 9, r9); 72 _MATCH_REG(10, r10); 73 _MATCH_REG(11, r11); 74 _MATCH_REG(12, r12); 75 _MATCH_REG(13, r13); 76 _MATCH_REG(14, r14); 77 _MATCH_REG(15, r15); 78 } 79 #undef _MATCH_REG 80 panic("%s: unhandled register index %d", __func__, reg); 81 } 82 83 /* 84 * Obtain the specified register's value. 85 */ 86 static uint64_t 87 kinst_regval(struct trapframe *frame, int reg) 88 { 89 if (reg == -1) 90 return (0); 91 return (((register_t *)frame)[kinst_regoff(reg)]); 92 } 93 94 static uint32_t 95 kinst_riprel_disp(struct kinst_probe *kp, void *dst) 96 { 97 return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp - 98 (intptr_t)dst)); 99 } 100 101 static void 102 kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp) 103 { 104 uint8_t *instr; 105 uint32_t disp; 106 int ilen; 107 108 ilen = kp->kp_md.tinstlen; 109 110 memcpy(tramp, kp->kp_md.template, ilen); 111 if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) { 112 disp = kinst_riprel_disp(kp, tramp); 113 memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t)); 114 } 115 116 /* 117 * The following position-independent jmp takes us back to the 118 * original code. It is encoded as "jmp *0(%rip)" (six bytes), 119 * followed by the absolute address of the instruction following 120 * the one that was traced (eight bytes). 121 */ 122 tramp[ilen + 0] = 0xff; 123 tramp[ilen + 1] = 0x25; 124 tramp[ilen + 2] = 0x00; 125 tramp[ilen + 3] = 0x00; 126 tramp[ilen + 4] = 0x00; 127 tramp[ilen + 5] = 0x00; 128 instr = kp->kp_patchpoint + kp->kp_md.instlen; 129 memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t)); 130 } 131 132 int 133 kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch) 134 { 135 solaris_cpu_t *cpu; 136 uintptr_t *stack, retaddr; 137 struct kinst_probe *kp; 138 struct kinst_probe_md *kpmd; 139 uint8_t *tramp; 140 141 stack = (uintptr_t *)frame->tf_rsp; 142 cpu = &solaris_cpu[curcpu]; 143 144 LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) { 145 if ((uintptr_t)kp->kp_patchpoint == addr) 146 break; 147 } 148 if (kp == NULL) 149 return (0); 150 151 /* 152 * Report the address of the breakpoint for the benefit of consumers 153 * fetching register values with regs[]. 154 */ 155 frame->tf_rip--; 156 157 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 158 cpu->cpu_dtrace_caller = stack[0]; 159 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); 160 dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0); 161 cpu->cpu_dtrace_caller = 0; 162 163 kpmd = &kp->kp_md; 164 if ((kpmd->flags & KINST_F_CALL) != 0) { 165 /* 166 * dtrace_invop_start() reserves space on the stack to 167 * store the return address of the call instruction. 168 */ 169 retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen); 170 *(uintptr_t *)scratch = retaddr; 171 172 if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) { 173 frame->tf_rip = (uintptr_t)(kp->kp_patchpoint + 174 kpmd->disp + kpmd->instlen); 175 } else { 176 register_t rval; 177 178 if (kpmd->reg1 == -1 && kpmd->reg2 == -1) { 179 /* rip-relative */ 180 rval = frame->tf_rip + kpmd->instlen; 181 } else { 182 /* indirect */ 183 rval = kinst_regval(frame, kpmd->reg1) + 184 (kinst_regval(frame, kpmd->reg2) << 185 kpmd->scale); 186 } 187 188 if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) { 189 frame->tf_rip = rval + kpmd->disp; 190 } else { 191 frame->tf_rip = 192 *(uintptr_t *)(rval + kpmd->disp); 193 } 194 } 195 return (DTRACE_INVOP_CALL); 196 } else { 197 if ((frame->tf_rflags & PSL_I) == 0) 198 tramp = DPCPU_GET(intr_tramp); 199 else 200 tramp = curthread->t_kinst; 201 if (tramp == NULL) { 202 /* 203 * A trampoline allocation failed, so this probe is 204 * effectively disabled. Restore the original 205 * instruction. 206 * 207 * We can't safely print anything here, but the 208 * trampoline allocator should have left a breadcrumb in 209 * the dmesg. 210 */ 211 kinst_patch_tracepoint(kp, kp->kp_savedval); 212 frame->tf_rip = (register_t)kp->kp_patchpoint; 213 } else { 214 kinst_trampoline_populate(kp, tramp); 215 frame->tf_rip = (register_t)tramp; 216 } 217 return (DTRACE_INVOP_NOP); 218 } 219 } 220 221 void 222 kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val) 223 { 224 register_t reg; 225 int oldwp; 226 227 reg = intr_disable(); 228 oldwp = disable_wp(); 229 *kp->kp_patchpoint = val; 230 restore_wp(oldwp); 231 intr_restore(reg); 232 } 233 234 static void 235 kinst_set_disp8(struct kinst_probe *kp, uint8_t byte) 236 { 237 kp->kp_md.disp = (int64_t)(int8_t)byte; 238 } 239 240 static void 241 kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes) 242 { 243 int32_t disp32; 244 245 memcpy(&disp32, bytes, sizeof(disp32)); 246 kp->kp_md.disp = (int64_t)disp32; 247 } 248 249 static int 250 kinst_dis_get_byte(void *p) 251 { 252 int ret; 253 uint8_t **instr = p; 254 255 ret = **instr; 256 (*instr)++; 257 258 return (ret); 259 } 260 261 /* 262 * Set up all of the state needed to faithfully execute a probed instruction. 263 * 264 * In the simple case, we copy the instruction unmodified to a per-thread 265 * trampoline, wherein it is followed by a jump back to the original code. 266 * - Instructions can have %rip as an operand: 267 * - with %rip-relative addressing encoded in ModR/M, or 268 * - implicitly as a part of the instruction definition (jmp, call). 269 * - Call instructions (which may be %rip-relative) need to push the correct 270 * return address onto the stack. 271 * 272 * Call instructions are simple enough to be emulated in software, so we simply 273 * do not use the trampoline mechanism in that case. kinst_invop() will compute 274 * the branch target using the address info computed here (register operands and 275 * displacement). 276 * 277 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit 278 * displacement; when populating the trampoline the displacement is adjusted to 279 * be relative to the trampoline address. Trampolines are always allocated 280 * above KERNBASE for this reason. 281 * 282 * For other %rip-relative operands (just jumps) we take the same approach. 283 * Instructions which specify an 8-bit displacement must be rewritten to use a 284 * 32-bit displacement. 285 */ 286 static int 287 kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr) 288 { 289 struct kinst_probe_md *kpmd; 290 dis86_t d86; 291 uint8_t *bytes, modrm, rex; 292 int dispoff, i, ilen, opcidx; 293 294 kpmd = &kp->kp_md; 295 296 d86.d86_data = instr; 297 d86.d86_get_byte = kinst_dis_get_byte; 298 d86.d86_check_func = NULL; 299 if (dtrace_disx86(&d86, SIZE64) != 0) { 300 KINST_LOG("failed to disassemble instruction at: %p", *instr); 301 return (EINVAL); 302 } 303 bytes = d86.d86_bytes; 304 kpmd->instlen = kpmd->tinstlen = d86.d86_len; 305 306 /* 307 * Skip over prefixes, save REX. 308 */ 309 rex = 0; 310 for (i = 0; i < kpmd->instlen; i++) { 311 switch (bytes[i]) { 312 case 0xf0 ... 0xf3: 313 /* group 1 */ 314 continue; 315 case 0x26: 316 case 0x2e: 317 case 0x36: 318 case 0x3e: 319 case 0x64: 320 case 0x65: 321 /* group 2 */ 322 continue; 323 case 0x66: 324 /* group 3 */ 325 continue; 326 case 0x67: 327 /* group 4 */ 328 continue; 329 case 0x40 ... 0x4f: 330 /* REX */ 331 rex = bytes[i]; 332 continue; 333 } 334 break; 335 } 336 KASSERT(i < kpmd->instlen, 337 ("%s: failed to disassemble instruction at %p", __func__, bytes)); 338 opcidx = i; 339 340 /* 341 * Identify instructions of interest by opcode: calls and jumps. 342 * Extract displacements. 343 */ 344 dispoff = -1; 345 switch (bytes[opcidx]) { 346 case 0x0f: 347 switch (bytes[opcidx + 1]) { 348 case 0x80 ... 0x8f: 349 /* conditional jmp near */ 350 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 351 dispoff = opcidx + 2; 352 kinst_set_disp32(kp, &bytes[dispoff]); 353 break; 354 } 355 break; 356 case 0xe3: 357 /* 358 * There is no straightforward way to translate this instruction 359 * to use a 32-bit displacement. Fortunately, it is rarely 360 * used. 361 */ 362 return (EINVAL); 363 case 0x70 ... 0x7f: 364 /* conditional jmp short */ 365 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 366 dispoff = opcidx + 1; 367 kinst_set_disp8(kp, bytes[dispoff]); 368 break; 369 case 0xe9: 370 /* unconditional jmp near */ 371 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 372 dispoff = opcidx + 1; 373 kinst_set_disp32(kp, &bytes[dispoff]); 374 break; 375 case 0xeb: 376 /* unconditional jmp short */ 377 kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; 378 dispoff = opcidx + 1; 379 kinst_set_disp8(kp, bytes[dispoff]); 380 break; 381 case 0xe8: 382 case 0x9a: 383 /* direct call */ 384 kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL; 385 dispoff = opcidx + 1; 386 kinst_set_disp32(kp, &bytes[dispoff]); 387 break; 388 case 0xff: 389 KASSERT(d86.d86_got_modrm, 390 ("no ModR/M byte for instr at %p", *instr - kpmd->instlen)); 391 switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) { 392 case 0x02: 393 case 0x03: 394 /* indirect call */ 395 kpmd->flags |= KINST_F_CALL; 396 break; 397 case 0x04: 398 case 0x05: 399 /* indirect jump */ 400 kpmd->flags |= KINST_F_JMP; 401 break; 402 } 403 } 404 405 /* 406 * If there's a ModR/M byte, we need to check it to see if the operand 407 * is %rip-relative, and rewrite the displacement if so. If not, we 408 * might still have to extract operand info if this is a call 409 * instruction. 410 */ 411 if (d86.d86_got_modrm) { 412 uint8_t mod, rm, sib; 413 414 kpmd->reg1 = kpmd->reg2 = -1; 415 416 modrm = bytes[d86.d86_rmindex]; 417 mod = KINST_MODRM_MOD(modrm); 418 rm = KINST_MODRM_RM(modrm); 419 if (mod == 0 && rm == 5) { 420 kpmd->flags |= KINST_F_RIPREL; 421 dispoff = d86.d86_rmindex + 1; 422 kinst_set_disp32(kp, &bytes[dispoff]); 423 } else if ((kpmd->flags & KINST_F_CALL) != 0) { 424 bool havesib; 425 426 havesib = (mod != 3 && rm == 4); 427 dispoff = d86.d86_rmindex + (havesib ? 2 : 1); 428 if (mod == 1) 429 kinst_set_disp8(kp, bytes[dispoff]); 430 else if (mod == 2) 431 kinst_set_disp32(kp, &bytes[dispoff]); 432 else if (mod == 3) 433 kpmd->flags |= KINST_F_MOD_DIRECT; 434 435 if (havesib) { 436 sib = bytes[d86.d86_rmindex + 1]; 437 if (KINST_SIB_BASE(sib) != 5) { 438 kpmd->reg1 = KINST_SIB_BASE(sib) | 439 (KINST_REX_B(rex) << 3); 440 } 441 kpmd->scale = KINST_SIB_SCALE(sib); 442 kpmd->reg2 = KINST_SIB_INDEX(sib) | 443 (KINST_REX_X(rex) << 3); 444 } else { 445 kpmd->reg1 = rm | (KINST_REX_B(rex) << 3); 446 } 447 } 448 } 449 450 /* 451 * Calls are emulated in software; once operands are decoded we have 452 * nothing else to do. 453 */ 454 if ((kpmd->flags & KINST_F_CALL) != 0) 455 return (0); 456 457 /* 458 * Allocate and populate an instruction trampoline template. 459 * 460 * Position-independent instructions can simply be copied, but 461 * position-dependent instructions require some surgery: jump 462 * instructions with an 8-bit displacement need to be converted to use a 463 * 32-bit displacement, and the adjusted displacement needs to be 464 * computed. 465 */ 466 ilen = kpmd->instlen; 467 if ((kpmd->flags & KINST_F_RIPREL) != 0) { 468 if ((kpmd->flags & KINST_F_JMP) == 0 || 469 bytes[opcidx] == 0x0f || 470 bytes[opcidx] == 0xe9 || 471 bytes[opcidx] == 0xff) { 472 memcpy(kpmd->template, bytes, dispoff); 473 memcpy(&kpmd->template[dispoff + 4], 474 &bytes[dispoff + 4], ilen - (dispoff + 4)); 475 kpmd->dispoff = dispoff; 476 } else if (bytes[opcidx] == 0xeb) { 477 memcpy(kpmd->template, bytes, opcidx); 478 kpmd->template[opcidx] = 0xe9; 479 kpmd->dispoff = opcidx + 1; 480 481 /* Instruction length changes from 2 to 5. */ 482 kpmd->tinstlen = 5; 483 kpmd->disp -= 3; 484 } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) { 485 memcpy(kpmd->template, bytes, opcidx); 486 kpmd->template[opcidx] = 0x0f; 487 kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10; 488 kpmd->dispoff = opcidx + 2; 489 490 /* Instruction length changes from 2 to 6. */ 491 kpmd->tinstlen = 6; 492 kpmd->disp -= 4; 493 } else { 494 panic("unhandled opcode %#x", bytes[opcidx]); 495 } 496 } else { 497 memcpy(kpmd->template, bytes, ilen); 498 } 499 500 return (0); 501 } 502 503 int 504 kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval, 505 void *opaque) 506 { 507 struct kinst_probe *kp; 508 dtrace_kinst_probedesc_t *pd; 509 const char *func; 510 int error, instrsize, n, off; 511 uint8_t *instr, *limit; 512 513 pd = opaque; 514 func = symval->name; 515 if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0) 516 return (0); 517 518 instr = (uint8_t *)symval->value; 519 limit = (uint8_t *)symval->value + symval->size; 520 if (instr >= limit) 521 return (0); 522 523 /* 524 * Ignore functions not beginning with the usual function prologue. 525 * These might correspond to exception handlers with which we should not 526 * meddle. This does however exclude functions which can be safely 527 * traced, such as cpu_switch(). 528 */ 529 if (*instr != KINST_PUSHL_RBP) 530 return (0); 531 532 n = 0; 533 while (instr < limit) { 534 instrsize = dtrace_instr_size(instr); 535 off = (int)(instr - (uint8_t *)symval->value); 536 if (pd->kpd_off != -1 && off != pd->kpd_off) { 537 instr += instrsize; 538 continue; 539 } 540 541 /* 542 * Check for instructions which may enable interrupts. Such 543 * instructions are tricky to trace since it is unclear whether 544 * to use the per-thread or per-CPU trampolines. Since they are 545 * rare, we don't bother to implement special handling for them. 546 * 547 * If the caller specified an offset, return an error, otherwise 548 * silently ignore the instruction so that it remains possible 549 * to enable all instructions in a function. 550 */ 551 if (instrsize == 1 && 552 (instr[0] == KINST_POPF || instr[0] == KINST_STI)) { 553 if (pd->kpd_off != -1) 554 return (EINVAL); 555 instr += instrsize; 556 continue; 557 } 558 559 /* 560 * Prevent separate dtrace(1) instances from creating copies of 561 * the same probe. 562 */ 563 LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) { 564 if (strcmp(kp->kp_func, func) == 0 && 565 strtol(kp->kp_name, NULL, 10) == off) 566 return (0); 567 } 568 if (++n > KINST_PROBETAB_MAX) { 569 KINST_LOG("probe list full: %d entries", n); 570 return (ENOMEM); 571 } 572 kp = malloc(sizeof(struct kinst_probe), M_KINST, 573 M_WAITOK | M_ZERO); 574 kp->kp_func = func; 575 snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off); 576 kp->kp_savedval = *instr; 577 kp->kp_patchval = KINST_PATCHVAL; 578 kp->kp_patchpoint = instr; 579 580 error = kinst_instr_dissect(kp, &instr); 581 if (error != 0) 582 return (error); 583 584 kinst_probe_create(kp, lf); 585 } 586 587 return (0); 588 } 589 590 int 591 kinst_md_init(void) 592 { 593 uint8_t *tramp; 594 int cpu; 595 596 CPU_FOREACH(cpu) { 597 tramp = kinst_trampoline_alloc(M_WAITOK); 598 if (tramp == NULL) 599 return (ENOMEM); 600 DPCPU_ID_SET(cpu, intr_tramp, tramp); 601 } 602 603 return (0); 604 } 605 606 void 607 kinst_md_deinit(void) 608 { 609 uint8_t *tramp; 610 int cpu; 611 612 CPU_FOREACH(cpu) { 613 tramp = DPCPU_ID_GET(cpu, intr_tramp); 614 if (tramp != NULL) { 615 kinst_trampoline_dealloc(DPCPU_ID_GET(cpu, intr_tramp)); 616 DPCPU_ID_SET(cpu, intr_tramp, NULL); 617 } 618 } 619 } 620