/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD$
 *
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/smp.h>
#include <sys/dtrace_impl.h>
#include <sys/dtrace_bsd.h>
#include <machine/clock.h>
#include <machine/frame.h>
#include <vm/pmap.h>

extern uintptr_t dtrace_in_probe_addr;
extern int dtrace_in_probe;

extern void dtrace_getnanotime(struct timespec *tsp);

int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t);

typedef struct dtrace_invop_hdlr {
        int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t);
        struct dtrace_invop_hdlr *dtih_next;
} dtrace_invop_hdlr_t;

dtrace_invop_hdlr_t *dtrace_invop_hdlr;

int
dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
{
        dtrace_invop_hdlr_t *hdlr;
        int rval;

        for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next)
                if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0)
                        return (rval);

        return (0);
}

void
dtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
        dtrace_invop_hdlr_t *hdlr;

        hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP);
        hdlr->dtih_func = func;
        hdlr->dtih_next = dtrace_invop_hdlr;
        dtrace_invop_hdlr = hdlr;
}

void
dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
        dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL;

        for (;;) {
                if (hdlr == NULL)
                        panic("attempt to remove non-existent invop handler");

                if (hdlr->dtih_func == func)
                        break;

                prev = hdlr;
                hdlr = hdlr->dtih_next;
        }

        if (prev == NULL) {
                ASSERT(dtrace_invop_hdlr == hdlr);
                dtrace_invop_hdlr = hdlr->dtih_next;
        } else {
                ASSERT(dtrace_invop_hdlr != hdlr);
                prev->dtih_next = hdlr->dtih_next;
        }

        kmem_free(hdlr, 0);
}
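/*
 * Illustrative usage (a sketch, not code from this file): a provider that
 * rewrites instructions, such as fbt, would register its handler at load
 * time and remove it again at unload:
 *
 *      dtrace_invop_add(fbt_invop);
 *      ...
 *      dtrace_invop_remove(fbt_invop);
 *
 * dtrace_invop() above tries each registered handler in turn and stops at
 * the first one that claims the trap by returning non-zero, so a handler
 * must return 0 for instructions it does not recognize.
 */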
/*ARGSUSED*/
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
        (*func)(0, (uintptr_t) addr_PTmap);
}

void
dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
{
        cpuset_t cpus;

        if (cpu == DTRACE_CPUALL)
                cpus = all_cpus;
        else
                CPU_SETOF(cpu, &cpus);

        smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func,
            smp_no_rendevous_barrier, arg);
}
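/*
 * dtrace_sync() below needs no payload: broadcasting even an empty
 * function through smp_rendezvous_cpus() guarantees that, by the time the
 * rendezvous returns, every CPU has passed through the rendezvous point,
 * so any probe processing that was in flight when dtrace_sync() was
 * called has completed.
 */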
static void
dtrace_sync_func(void)
{
}

void
dtrace_sync(void)
{
        dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
}

#ifdef notyet
int (*dtrace_fasttrap_probe_ptr)(struct regs *);
int (*dtrace_pid_probe_ptr)(struct regs *);
int (*dtrace_return_probe_ptr)(struct regs *);

void
dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
{
        krwlock_t *rwp;
        proc_t *p = curproc;
        extern void trap(struct regs *, caddr_t, processorid_t);

        if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) {
                if (curthread->t_cred != p->p_cred) {
                        cred_t *oldcred = curthread->t_cred;
                        /*
                         * DTrace accesses t_cred in probe context.  t_cred
                         * must always be either NULL, or point to a valid,
                         * allocated cred structure.
                         */
                        curthread->t_cred = crgetcred();
                        crfree(oldcred);
                }
        }

        if (rp->r_trapno == T_DTRACE_RET) {
                uint8_t step = curthread->t_dtrace_step;
                uint8_t ret = curthread->t_dtrace_ret;
                uintptr_t npc = curthread->t_dtrace_npc;

                if (curthread->t_dtrace_ast) {
                        aston(curthread);
                        curthread->t_sig_check = 1;
                }

                /*
                 * Clear all user tracing flags.
                 */
                curthread->t_dtrace_ft = 0;

                /*
                 * If we weren't expecting to take a return probe trap, kill
                 * the process as though it had just executed an unassigned
                 * trap instruction.
                 */
                if (step == 0) {
                        tsignal(curthread, SIGILL);
                        return;
                }

                /*
                 * If we hit this trap unrelated to a return probe, we're
                 * just here to reset the AST flag since we deferred a signal
                 * until after we logically single-stepped the instruction we
                 * copied out.
                 */
                if (ret == 0) {
                        rp->r_pc = npc;
                        return;
                }

                /*
                 * We need to wait until after we've called the
                 * dtrace_return_probe_ptr function pointer to set %pc.
                 */
                rwp = &CPU->cpu_ft_lock;
                rw_enter(rwp, RW_READER);
                if (dtrace_return_probe_ptr != NULL)
                        (void) (*dtrace_return_probe_ptr)(rp);
                rw_exit(rwp);
                rp->r_pc = npc;

        } else if (rp->r_trapno == T_DTRACE_PROBE) {
                rwp = &CPU->cpu_ft_lock;
                rw_enter(rwp, RW_READER);
                if (dtrace_fasttrap_probe_ptr != NULL)
                        (void) (*dtrace_fasttrap_probe_ptr)(rp);
                rw_exit(rwp);

        } else if (rp->r_trapno == T_BPTFLT) {
                uint8_t instr;
                rwp = &CPU->cpu_ft_lock;

                /*
                 * The DTrace fasttrap provider uses the breakpoint trap
                 * (int 3).  We let DTrace take the first crack at handling
                 * this trap; if it's not a probe that DTrace knows about,
                 * we call into the trap() routine to handle it like a
                 * breakpoint placed by a conventional debugger.
                 */
                rw_enter(rwp, RW_READER);
                if (dtrace_pid_probe_ptr != NULL &&
                    (*dtrace_pid_probe_ptr)(rp) == 0) {
                        rw_exit(rwp);
                        return;
                }
                rw_exit(rwp);

                /*
                 * If the instruction that caused the breakpoint trap doesn't
                 * look like an int 3 anymore, it may be that this tracepoint
                 * was removed just after the user thread executed it.  In
                 * that case, return to userland to retry the instruction.
                 */
                if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
                    instr != FASTTRAP_INSTR) {
                        rp->r_pc--;
                        return;
                }

                trap(rp, addr, cpuid);

        } else {
                trap(rp, addr, cpuid);
        }
}

void
dtrace_safe_synchronous_signal(void)
{
        kthread_t *t = curthread;
        struct regs *rp = lwptoregs(ttolwp(t));
        size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

        ASSERT(t->t_dtrace_on);

        /*
         * If we're not in the range of scratch addresses, we're not actually
         * tracing user instructions so turn off the flags.  If the
         * instruction we copied out caused a synchronous trap, reset the pc
         * back to its original value and turn off the flags.
         */
        if (rp->r_pc < t->t_dtrace_scrpc ||
            rp->r_pc > t->t_dtrace_astpc + isz) {
                t->t_dtrace_ft = 0;
        } else if (rp->r_pc == t->t_dtrace_scrpc ||
            rp->r_pc == t->t_dtrace_astpc) {
                rp->r_pc = t->t_dtrace_pc;
                t->t_dtrace_ft = 0;
        }
}

int
dtrace_safe_defer_signal(void)
{
        kthread_t *t = curthread;
        struct regs *rp = lwptoregs(ttolwp(t));
        size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

        ASSERT(t->t_dtrace_on);

        /*
         * If we're not in the range of scratch addresses, we're not actually
         * tracing user instructions so turn off the flags.
         */
        if (rp->r_pc < t->t_dtrace_scrpc ||
            rp->r_pc > t->t_dtrace_astpc + isz) {
                t->t_dtrace_ft = 0;
                return (0);
        }

        /*
         * If we have executed the original instruction, but have performed
         * neither the jmp back to t->t_dtrace_npc nor the cleanup of any
         * registers used to emulate %rip-relative instructions in 64-bit
         * mode, we'll save ourselves some effort by doing that here and
         * taking the signal right away.  We detect this condition by seeing
         * if the program counter is in the range [scrpc + isz, astpc).
         */
        if (rp->r_pc >= t->t_dtrace_scrpc + isz &&
            rp->r_pc < t->t_dtrace_astpc) {
#ifdef __amd64
                /*
                 * If there is a scratch register and we're on the
                 * instruction immediately after the modified instruction,
                 * restore the value of that scratch register.
                 */
                if (t->t_dtrace_reg != 0 &&
                    rp->r_pc == t->t_dtrace_scrpc + isz) {
                        switch (t->t_dtrace_reg) {
                        case REG_RAX:
                                rp->r_rax = t->t_dtrace_regv;
                                break;
                        case REG_RCX:
                                rp->r_rcx = t->t_dtrace_regv;
                                break;
                        case REG_R8:
                                rp->r_r8 = t->t_dtrace_regv;
                                break;
                        case REG_R9:
                                rp->r_r9 = t->t_dtrace_regv;
                                break;
                        }
                }
#endif
                rp->r_pc = t->t_dtrace_npc;
                t->t_dtrace_ft = 0;
                return (0);
        }

        /*
         * Otherwise, make sure we'll return to the kernel after executing
         * the copied out instruction and defer the signal.
         */
        if (!t->t_dtrace_step) {
                ASSERT(rp->r_pc < t->t_dtrace_astpc);
                rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc;
                t->t_dtrace_step = 1;
        }

        t->t_dtrace_ast = 1;

        return (1);
}
#endif
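/*
 * High-resolution timing support.  dtrace_gethrtime() converts the CPU's
 * timestamp counter (TSC) to nanoseconds since boot.  The per-CPU TSC skew
 * relative to a reference CPU is measured once at initialization (see
 * dtrace_gethrtime_init() below) and subtracted from every subsequent read.
 */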
static int64_t tgt_cpu_tsc;
static int64_t hst_cpu_tsc;
static int64_t tsc_skew[MAXCPU];
static uint64_t nsec_scale;

/* See below for the explanation of this macro. */
#define SCALE_SHIFT     28

static void
dtrace_gethrtime_init_cpu(void *arg)
{
        uintptr_t cpu = (uintptr_t) arg;

        if (cpu == curcpu)
                tgt_cpu_tsc = rdtsc();
        else
                hst_cpu_tsc = rdtsc();
}

static void
dtrace_gethrtime_init(void *arg)
{
        struct pcpu *pc;
        uint64_t tsc_f;
        cpuset_t map;
        int i;

        /*
         * Get the TSC frequency as currently known.  It should be constant
         * if the TSC is invariant.  Otherwise the tick-to-time conversion
         * will be inaccurate, but it will still preserve the monotonic
         * property of the TSC.
         */
        tsc_f = atomic_load_acq_64(&tsc_freq);

        /*
         * The following line checks that nsec_scale calculated below
         * doesn't overflow a 32-bit unsigned integer, so that it can be
         * multiplied by another 32-bit integer without overflowing 64 bits.
         * Thus the minimum supported TSC frequency is 62.5MHz.
         */
        KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)),
            ("TSC frequency is too low"));

        /*
         * We scale up the NANOSEC/tsc_f ratio to preserve as much precision
         * as possible.
         * The 2^28 factor was chosen quite arbitrarily from practical
         * considerations:
         * - it supports TSC frequencies as low as 62.5MHz (see above);
         * - it provides quite good precision (e < 0.01%) up to THz
         *   (terahertz) values.
         */
        nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;

        /* The current CPU is the reference one. */
        sched_pin();
        tsc_skew[curcpu] = 0;
        CPU_FOREACH(i) {
                if (i == curcpu)
                        continue;

                pc = pcpu_find(i);
                CPU_SETOF(PCPU_GET(cpuid), &map);
                CPU_SET(pc->pc_cpuid, &map);

                smp_rendezvous_cpus(map, NULL,
                    dtrace_gethrtime_init_cpu,
                    smp_no_rendevous_barrier, (void *)(uintptr_t) i);

                tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
        }
        sched_unpin();
}

SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init,
    NULL);

/*
 * DTrace needs a high resolution time function which can be called from a
 * probe context and is guaranteed not to be instrumented with probes
 * itself.
 *
 * Returns nanoseconds since boot.
 */
uint64_t
dtrace_gethrtime()
{
        uint64_t tsc;
        uint32_t lo;
        uint32_t hi;

        /*
         * We split the TSC value into lower and higher 32-bit halves and
         * scale each separately with nsec_scale, then scale them down by
         * 2^28 (see the nsec_scale calculation above), taking into account
         * the 32-bit shift of the higher half, and finally add the parts.
         */
        tsc = rdtsc() - tsc_skew[curcpu];
        lo = tsc;
        hi = tsc >> 32;
        return (((lo * nsec_scale) >> SCALE_SHIFT) +
            ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
}
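/*
 * Worked example (illustrative): with tsc_f = 1GHz, nsec_scale =
 * (10^9 << 28) / 10^9 = 2^28.  Then (lo * 2^28) >> 28 == lo and
 * (hi * 2^28) << (32 - 28) == hi << 32, so the sum above reconstructs
 * the full 64-bit TSC value -- exactly one nanosecond per tick, as
 * expected at 1GHz.
 */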
483 * 484 * Check if DTrace has enabled 'no-fault' mode: 485 * 486 */ 487 if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { 488 /* 489 * There are only a couple of trap types that are expected. 490 * All the rest will be handled in the usual way. 491 */ 492 switch (type) { 493 /* Privilieged instruction fault. */ 494 case T_PRIVINFLT: 495 break; 496 /* General protection fault. */ 497 case T_PROTFLT: 498 /* Flag an illegal operation. */ 499 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; 500 501 /* 502 * Offset the instruction pointer to the instruction 503 * following the one causing the fault. 504 */ 505 frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 506 return (1); 507 /* Page fault. */ 508 case T_PAGEFLT: 509 /* Flag a bad address. */ 510 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; 511 cpu_core[curcpu].cpuc_dtrace_illval = frame->tf_addr; 512 513 /* 514 * Offset the instruction pointer to the instruction 515 * following the one causing the fault. 516 */ 517 frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 518 return (1); 519 default: 520 /* Handle all other traps in the usual way. */ 521 break; 522 } 523 } 524 525 /* Handle the trap in the usual way. */ 526 return (0); 527 } 528