xref: /freebsd/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c (revision 21b492ed51aa6ff8008a8aa83333b1de30288a15)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Portions Copyright 2010 The FreeBSD Foundation
22  *
23  * $FreeBSD$
24  */
25 
26 /*
27  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #include <sys/fasttrap_isa.h>
32 #include <sys/fasttrap_impl.h>
33 #include <sys/dtrace.h>
34 #include <sys/dtrace_impl.h>
35 #include <sys/cmn_err.h>
36 #include <sys/types.h>
37 #include <sys/dtrace_bsd.h>
38 #include <sys/proc.h>
39 #include <sys/reg.h>
40 #include <sys/rmlock.h>
41 #include <cddl/dev/dtrace/dtrace_cddl.h>
42 #include <cddl/dev/dtrace/x86/regset.h>
43 #include <machine/segments.h>
44 #include <machine/pcb.h>
45 #include <machine/trap.h>
46 #include <sys/sysmacros.h>
47 #include <sys/ptrace.h>
48 
/*
 * The code in this file refers to registers through the 64-bit member names
 * (rp->r_rax, rp->r_rip, rp->r_rsp, ...). When building for i386, map those
 * names onto the corresponding 32-bit member names so that the same source
 * serves both architectures.
 */
#ifdef __i386__
#define	r_rax	r_eax
#define	r_rbx	r_ebx
#define	r_rip	r_eip
#define	r_rflags r_eflags
#define	r_rsp	r_esp
#define	r_rbp	r_ebp
#endif
57 
58 /*
59  * Lossless User-Land Tracing on x86
60  * ---------------------------------
61  *
62  * The execution of most instructions is not dependent on the address; for
63  * these instructions it is sufficient to copy them into the user process's
64  * address space and execute them. To effectively single-step an instruction
65  * in user-land, we copy out the following sequence of instructions to scratch
66  * space in the user thread's ulwp_t structure.
67  *
68  * We then set the program counter (%eip or %rip) to point to this scratch
69  * space. Once execution resumes, the original instruction is executed and
70  * then control flow is redirected to what was originally the subsequent
71  * instruction. If the kernel attempts to deliver a signal while single-
72  * stepping, the signal is deferred and the program counter is moved into the
73  * second sequence of instructions. The second sequence ends in a trap into
74  * the kernel where the deferred signal is then properly handled and delivered.
75  *
76  * For instructions whose execution is position dependent, we perform simple
77  * emulation. These instructions are limited to control transfer
78  * instructions in 32-bit mode, but in 64-bit mode there's the added wrinkle
79  * of %rip-relative addressing that means that almost any instruction can be
80  * position dependent. For all the details on how we emulate generic
81  * instructions, including %rip-relative instructions, see the code in
82  * fasttrap_pid_probe() below where we handle instructions of type
83  * FASTTRAP_T_COMMON (under the header: Generic Instruction Tracing).
84  */
85 
/*
 * ModR/M byte: mod lives in bits 7-6, reg in bits 5-3 and r/m in bits 2-0.
 * The first three macros pull a field out of an encoded byte; FASTTRAP_MODRM
 * assembles a byte from its three fields.
 */
#define	FASTTRAP_MODRM_MOD(modrm)	(((modrm) & 0xc0) >> 6)
#define	FASTTRAP_MODRM_REG(modrm)	(((modrm) & 0x38) >> 3)
#define	FASTTRAP_MODRM_RM(modrm)	((modrm) & 0x07)
#define	FASTTRAP_MODRM(mod, reg, rm)	((rm) | ((reg) << 3) | ((mod) << 6))

/*
 * SIB byte: scale in bits 7-6, index in bits 5-3 and base in bits 2-0.
 */
#define	FASTTRAP_SIB_SCALE(sib)		(((sib) & 0xc0) >> 6)
#define	FASTTRAP_SIB_INDEX(sib)		(((sib) & 0x38) >> 3)
#define	FASTTRAP_SIB_BASE(sib)		((sib) & 0x07)

/*
 * REX prefix: 0100WRXB. Each accessor yields the single flag bit as 0 or 1;
 * FASTTRAP_REX builds a prefix byte from the four flags.
 */
#define	FASTTRAP_REX_W(rex)		(((rex) & 0x08) >> 3)
#define	FASTTRAP_REX_R(rex)		(((rex) & 0x04) >> 2)
#define	FASTTRAP_REX_X(rex)		(((rex) & 0x02) >> 1)
#define	FASTTRAP_REX_B(rex)		((rex) & 0x01)
#define	FASTTRAP_REX(w, r, x, b)	\
	((b) | ((x) << 1) | ((r) << 2) | ((w) << 3) | 0x40)
101 
/*
 * Single-byte op-codes.
 */
#define	FASTTRAP_PUSHL_EBP	0x55

/*
 * Conditional jumps with an 8-bit displacement. These values are also what
 * fasttrap_tracepoint_init() stores in ftt_code for every conditional jump,
 * including the two-byte forms listed further down.
 */
#define	FASTTRAP_JO		0x70
#define	FASTTRAP_JNO		0x71
#define	FASTTRAP_JB		0x72
#define	FASTTRAP_JAE		0x73
#define	FASTTRAP_JE		0x74
#define	FASTTRAP_JNE		0x75
#define	FASTTRAP_JBE		0x76
#define	FASTTRAP_JA		0x77
#define	FASTTRAP_JS		0x78
#define	FASTTRAP_JNS		0x79
#define	FASTTRAP_JP		0x7a
#define	FASTTRAP_JNP		0x7b
#define	FASTTRAP_JL		0x7c
#define	FASTTRAP_JGE		0x7d
#define	FASTTRAP_JLE		0x7e
#define	FASTTRAP_JG		0x7f

#define	FASTTRAP_NOP		0x90

#define	FASTTRAP_MOV_EAX	0xb8
#define	FASTTRAP_MOV_ECX	0xb9

/* Near returns: RET16 carries a 16-bit immediate operand, RET has none. */
#define	FASTTRAP_RET16		0xc2
#define	FASTTRAP_RET		0xc3

#define	FASTTRAP_LOOPNZ		0xe0
#define	FASTTRAP_LOOPZ		0xe1
#define	FASTTRAP_LOOP		0xe2
#define	FASTTRAP_JCXZ		0xe3

/* Relative call and jumps (32-bit displacement, except JMP8). */
#define	FASTTRAP_CALL		0xe8
#define	FASTTRAP_JMP32		0xe9
#define	FASTTRAP_JMP8		0xeb

#define	FASTTRAP_INT3		0xcc
#define	FASTTRAP_INT		0xcd

/*
 * First byte of a two-byte op-code, and the "group 5" op-code whose ModR/M
 * reg field selects the operation (indirect call and jmp among them).
 */
#define	FASTTRAP_2_BYTE_OP	0x0f
#define	FASTTRAP_GROUP5_OP	0xff

/*
 * Two-byte op-codes (second byte only).
 *
 * These are the conditional jumps with a 32-bit displacement. The low nibble
 * matches the low nibble of the corresponding single-byte op-code above, so
 * (op & 0x0f) | FASTTRAP_JO converts one of these into its short form.
 */
#define	FASTTRAP_0F_JO		0x80
#define	FASTTRAP_0F_JNO		0x81
#define	FASTTRAP_0F_JB		0x82
#define	FASTTRAP_0F_JAE		0x83
#define	FASTTRAP_0F_JE		0x84
#define	FASTTRAP_0F_JNE		0x85
#define	FASTTRAP_0F_JBE		0x86
#define	FASTTRAP_0F_JA		0x87
#define	FASTTRAP_0F_JS		0x88
#define	FASTTRAP_0F_JNS		0x89
#define	FASTTRAP_0F_JP		0x8a
#define	FASTTRAP_0F_JNP		0x8b
#define	FASTTRAP_0F_JL		0x8c
#define	FASTTRAP_0F_JGE		0x8d
#define	FASTTRAP_0F_JLE		0x8e
#define	FASTTRAP_0F_JG		0x8f

/*
 * Bit masks for the individual status flags in the flags register.
 */
#define	FASTTRAP_EFLAGS_OF	0x800
#define	FASTTRAP_EFLAGS_DF	0x400
#define	FASTTRAP_EFLAGS_SF	0x080
#define	FASTTRAP_EFLAGS_ZF	0x040
#define	FASTTRAP_EFLAGS_AF	0x010
#define	FASTTRAP_EFLAGS_PF	0x004
#define	FASTTRAP_EFLAGS_CF	0x001

/*
 * Instruction prefixes.
 *
 * fasttrap_tracepoint_init() skips over any run of these bytes to locate the
 * op-code, and records which segment override (if any) was present.
 */
#define	FASTTRAP_PREFIX_OPERAND	0x66
#define	FASTTRAP_PREFIX_ADDRESS	0x67
#define	FASTTRAP_PREFIX_CS	0x2E
#define	FASTTRAP_PREFIX_DS	0x3E
#define	FASTTRAP_PREFIX_ES	0x26
#define	FASTTRAP_PREFIX_FS	0x64
#define	FASTTRAP_PREFIX_GS	0x65
#define	FASTTRAP_PREFIX_SS	0x36
#define	FASTTRAP_PREFIX_LOCK	0xF0
#define	FASTTRAP_PREFIX_REP	0xF3
#define	FASTTRAP_PREFIX_REPNE	0xF2

/*
 * Stored in a tracepoint's ftt_base or ftt_index when the addressing mode
 * does not involve a base or index register.
 */
#define	FASTTRAP_NOREG	0xff
191 
/*
 * Map between instruction register encodings and the kernel constants which
 * correspond to indices into struct regs.
 *
 * The table is indexed by the 3-bit register number taken from a ModR/M or
 * SIB byte; on amd64 the matching REX bit is OR-ed in as bit 3, which is why
 * that variant has 16 entries.
 */
#ifdef __amd64
static const uint8_t regmap[16] = {
	REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP, REG_RSI, REG_RDI,
	REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_R13, REG_R14, REG_R15,
};
#else
static const uint8_t regmap[8] = {
	EAX, ECX, EDX, EBX, UESP, EBP, ESI, EDI
};
#endif

/* Forward declaration; the definition is not in this part of the file. */
static ulong_t fasttrap_getreg(struct reg *, uint_t);
208 
209 static uint64_t
210 fasttrap_anarg(struct reg *rp, int function_entry, int argno)
211 {
212 	uint64_t value = 0;
213 	int shift = function_entry ? 1 : 0;
214 
215 #ifdef __amd64
216 	if (curproc->p_model == DATAMODEL_LP64) {
217 		uintptr_t *stack;
218 
219 		/*
220 		 * In 64-bit mode, the first six arguments are stored in
221 		 * registers.
222 		 */
223 		if (argno < 6)
224 			switch (argno) {
225 			case 0:
226 				return (rp->r_rdi);
227 			case 1:
228 				return (rp->r_rsi);
229 			case 2:
230 				return (rp->r_rdx);
231 			case 3:
232 				return (rp->r_rcx);
233 			case 4:
234 				return (rp->r_r8);
235 			case 5:
236 				return (rp->r_r9);
237 			}
238 
239 		stack = (uintptr_t *)rp->r_rsp;
240 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
241 		value = dtrace_fulword(&stack[argno - 6 + shift]);
242 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
243 	} else {
244 #endif
245 		uint32_t *stack = (uint32_t *)rp->r_rsp;
246 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
247 		value = dtrace_fuword32(&stack[argno + shift]);
248 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
249 #ifdef __amd64
250 	}
251 #endif
252 
253 	return (value);
254 }
255 
/*
 * Decode the instruction at address pc in process p and fill in the
 * tracepoint tp with everything needed to emulate or single-step it later:
 * its size, any segment override, a copy of its bytes, and a classification
 * (ftt_type) with the type-specific fields (ftt_code, ftt_dest, ftt_base,
 * ftt_index, ftt_scale, and on amd64 ftt_ripmode / ftt_modrm).
 *
 * The type argument is not used here.
 *
 * Returns 0 on success, or -1 if the instruction cannot be read, cannot be
 * decoded, is truncated, carries two segment prefixes, or is an int3 / int
 * instruction (which are never instrumented).
 */
/*ARGSUSED*/
int
fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
    fasttrap_probe_type_t type)
{
	/*
	 * NOTE(review): only the first len bytes are filled from the process;
	 * the 10 bytes of slack presumably keep the fixed-offset operand reads
	 * below (instr[start + ...]) inside the array -- confirm.
	 */
	uint8_t instr[FASTTRAP_MAX_INSTR_SIZE + 10];
	size_t len = FASTTRAP_MAX_INSTR_SIZE;
	/* Number of bytes available before the end of the page containing pc. */
	size_t first = MIN(len, PAGESIZE - (pc & PAGEOFFSET));
	/* Offset of the op-code within instr once prefixes are skipped. */
	uint_t start = 0;
	int rmindex, size;
	uint8_t seg, rex = 0;

	/*
	 * Read the instruction at the given address out of the process's
	 * address space. We don't have to worry about a debugger
	 * changing this instruction before we overwrite it with our trap
	 * instruction since P_PR_LOCK is set. Since instructions can span
	 * pages, we potentially read the instruction in two parts. If the
	 * second part fails, we just zero out that part of the instruction.
	 */
	if (uread(p, &instr[0], first, pc) != 0)
		return (-1);
	if (len > first &&
	    uread(p, &instr[first], len - first, pc + first) != 0) {
		bzero(&instr[first], len - first);
		len = first;
	}

	/*
	 * If the disassembly fails, then we have a malformed instruction.
	 */
	if ((size = dtrace_instr_size_isa(instr, p->p_model, &rmindex)) <= 0)
		return (-1);

	/*
	 * Make sure the disassembler isn't completely broken.
	 */
	ASSERT(-1 <= rmindex && rmindex < size);

	/*
	 * If the computed size is greater than the number of bytes read,
	 * then it was a malformed instruction possibly because it fell on a
	 * page boundary and the subsequent page was missing or because of
	 * some malicious user.
	 */
	if (size > len)
		return (-1);

	tp->ftt_size = (uint8_t)size;
	tp->ftt_segment = FASTTRAP_SEG_NONE;

	/*
	 * Find the start of the instruction's opcode by processing any
	 * legacy prefixes.
	 *
	 * The segment-override cases fall through one another, incrementing
	 * seg once per case, so that seg ends up as 1 for CS, 2 for DS, 3 for
	 * ES, 4 for FS, 5 for GS and 6 for SS; it stays 0 for the
	 * non-segment prefixes.
	 */
	for (;;) {
		seg = 0;
		switch (instr[start]) {
		case FASTTRAP_PREFIX_SS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_GS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_FS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_ES:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_DS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_CS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_OPERAND:
		case FASTTRAP_PREFIX_ADDRESS:
		case FASTTRAP_PREFIX_LOCK:
		case FASTTRAP_PREFIX_REP:
		case FASTTRAP_PREFIX_REPNE:
			if (seg != 0) {
				/*
				 * It's illegal for an instruction to specify
				 * two segment prefixes -- give up on this
				 * illegal instruction.
				 */
				if (tp->ftt_segment != FASTTRAP_SEG_NONE)
					return (-1);

				tp->ftt_segment = seg;
			}
			start++;
			continue;
		}
		/* Not a prefix byte: instr[start] is the op-code (or REX). */
		break;
	}

#ifdef __amd64
	/*
	 * Identify the REX prefix on 64-bit processes.
	 */
	if (p->p_model == DATAMODEL_LP64 && (instr[start] & 0xf0) == 0x40)
		rex = instr[start++];
#endif

	/*
	 * Now that we're pretty sure that the instruction is okay, copy the
	 * valid part to the tracepoint.
	 */
	bcopy(instr, tp->ftt_instr, FASTTRAP_MAX_INSTR_SIZE);

	/*
	 * Classify the instruction. Anything not recognized below stays
	 * FASTTRAP_T_COMMON.
	 */
	tp->ftt_type = FASTTRAP_T_COMMON;
	if (instr[start] == FASTTRAP_2_BYTE_OP) {
		switch (instr[start + 1]) {
		case FASTTRAP_0F_JO:
		case FASTTRAP_0F_JNO:
		case FASTTRAP_0F_JB:
		case FASTTRAP_0F_JAE:
		case FASTTRAP_0F_JE:
		case FASTTRAP_0F_JNE:
		case FASTTRAP_0F_JBE:
		case FASTTRAP_0F_JA:
		case FASTTRAP_0F_JS:
		case FASTTRAP_0F_JNS:
		case FASTTRAP_0F_JP:
		case FASTTRAP_0F_JNP:
		case FASTTRAP_0F_JL:
		case FASTTRAP_0F_JGE:
		case FASTTRAP_0F_JLE:
		case FASTTRAP_0F_JG:
			/*
			 * Record the condition as the equivalent single-byte
			 * op-code (0x7N) and compute the absolute target from
			 * the 32-bit displacement.
			 */
			tp->ftt_type = FASTTRAP_T_JCC;
			tp->ftt_code = (instr[start + 1] & 0x0f) | FASTTRAP_JO;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 2];
			break;
		}
	} else if (instr[start] == FASTTRAP_GROUP5_OP) {
		uint_t mod = FASTTRAP_MODRM_MOD(instr[start + 1]);
		uint_t reg = FASTTRAP_MODRM_REG(instr[start + 1]);
		uint_t rm = FASTTRAP_MODRM_RM(instr[start + 1]);

		/*
		 * Within group 5 only reg == 2 (treated as a call) and
		 * reg == 4 (treated as a jmp) are handled specially.
		 */
		if (reg == 2 || reg == 4) {
			/*
			 * i is the offset (from start) of the displacement
			 * and sz its size in bytes.
			 */
			uint_t i, sz;

			if (reg == 2)
				tp->ftt_type = FASTTRAP_T_CALL;
			else
				tp->ftt_type = FASTTRAP_T_JMP;

			/*
			 * ftt_code distinguishes mod == 3 (2) from every
			 * other addressing mode (1).
			 */
			if (mod == 3)
				tp->ftt_code = 2;
			else
				tp->ftt_code = 1;

			ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);

			/*
			 * See AMD x86-64 Architecture Programmer's Manual
			 * Volume 3, Section 1.2.7, Table 1-12, and
			 * Appendix A.3.1, Table A-15.
			 */
			if (mod != 3 && rm == 4) {
				/* A SIB byte follows the ModR/M byte. */
				uint8_t sib = instr[start + 2];
				uint_t index = FASTTRAP_SIB_INDEX(sib);
				uint_t base = FASTTRAP_SIB_BASE(sib);

				tp->ftt_scale = FASTTRAP_SIB_SCALE(sib);

				tp->ftt_index = (index == 4) ?
				    FASTTRAP_NOREG :
				    regmap[index | (FASTTRAP_REX_X(rex) << 3)];
				tp->ftt_base = (mod == 0 && base == 5) ?
				    FASTTRAP_NOREG :
				    regmap[base | (FASTTRAP_REX_B(rex) << 3)];

				i = 3;
				sz = mod == 1 ? 1 : 4;
			} else {
				/*
				 * In 64-bit mode, mod == 0 and r/m == 5
				 * denotes %rip-relative addressing; in 32-bit
				 * mode, the base register isn't used. In both
				 * modes, there is a 32-bit operand.
				 */
				if (mod == 0 && rm == 5) {
#ifdef __amd64
					if (p->p_model == DATAMODEL_LP64)
						tp->ftt_base = REG_RIP;
					else
#endif
						tp->ftt_base = FASTTRAP_NOREG;
					sz = 4;
				} else  {
					uint8_t base = rm |
					    (FASTTRAP_REX_B(rex) << 3);

					tp->ftt_base = regmap[base];
					sz = mod == 1 ? 1 : mod == 2 ? 4 : 0;
				}
				tp->ftt_index = FASTTRAP_NOREG;
				i = 2;
			}

			/*
			 * For these indirect forms ftt_dest holds the
			 * sign-extended displacement (0 if there is none),
			 * not an absolute target address.
			 */
			if (sz == 1) {
				tp->ftt_dest = *(int8_t *)&instr[start + i];
			} else if (sz == 4) {
				/* LINTED - alignment */
				tp->ftt_dest = *(int32_t *)&instr[start + i];
			} else {
				tp->ftt_dest = 0;
			}
		}
	} else {
		switch (instr[start]) {
		case FASTTRAP_RET:
			tp->ftt_type = FASTTRAP_T_RET;
			break;

		case FASTTRAP_RET16:
			/* ftt_dest holds the 16-bit immediate operand. */
			tp->ftt_type = FASTTRAP_T_RET16;
			/* LINTED - alignment */
			tp->ftt_dest = *(uint16_t *)&instr[start + 1];
			break;

		case FASTTRAP_JO:
		case FASTTRAP_JNO:
		case FASTTRAP_JB:
		case FASTTRAP_JAE:
		case FASTTRAP_JE:
		case FASTTRAP_JNE:
		case FASTTRAP_JBE:
		case FASTTRAP_JA:
		case FASTTRAP_JS:
		case FASTTRAP_JNS:
		case FASTTRAP_JP:
		case FASTTRAP_JNP:
		case FASTTRAP_JL:
		case FASTTRAP_JGE:
		case FASTTRAP_JLE:
		case FASTTRAP_JG:
			tp->ftt_type = FASTTRAP_T_JCC;
			tp->ftt_code = instr[start];
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_LOOPNZ:
		case FASTTRAP_LOOPZ:
		case FASTTRAP_LOOP:
			tp->ftt_type = FASTTRAP_T_LOOP;
			tp->ftt_code = instr[start];
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_JCXZ:
			tp->ftt_type = FASTTRAP_T_JCXZ;
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_CALL:
			/* ftt_code 0 marks the direct (relative) form. */
			tp->ftt_type = FASTTRAP_T_CALL;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 1];
			tp->ftt_code = 0;
			break;

		case FASTTRAP_JMP32:
			tp->ftt_type = FASTTRAP_T_JMP;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 1];
			break;
		case FASTTRAP_JMP8:
			tp->ftt_type = FASTTRAP_T_JMP;
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_PUSHL_EBP:
			/* Only the unprefixed form gets the dedicated type. */
			if (start == 0)
				tp->ftt_type = FASTTRAP_T_PUSHL_EBP;
			break;

		case FASTTRAP_NOP:
#ifdef __amd64
			ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);

			/*
			 * On amd64 we have to be careful not to confuse a nop
			 * (actually xchgl %eax, %eax) with an instruction using
			 * the same opcode, but that does something different
			 * (e.g. xchgl %r8d, %eax or xchgq %r8, %rax).
			 */
			if (FASTTRAP_REX_B(rex) == 0)
#endif
				tp->ftt_type = FASTTRAP_T_NOP;
			break;

		case FASTTRAP_INT3:
			/*
			 * The pid provider shares the int3 trap with debugger
			 * breakpoints so we can't instrument them.
			 */
			ASSERT(instr[start] == FASTTRAP_INSTR);
			return (-1);

		case FASTTRAP_INT:
			/*
			 * Interrupts seem like they could be traced with
			 * no negative implications, but it's possible that
			 * a thread could be redirected by the trap handling
			 * code which would eventually return to the
			 * instruction after the interrupt. If the interrupt
			 * were in our scratch space, the subsequent
			 * instruction might be overwritten before we return.
			 * Accordingly we refuse to instrument any interrupt.
			 */
			return (-1);
		}
	}

#ifdef __amd64
	if (p->p_model == DATAMODEL_LP64 && tp->ftt_type == FASTTRAP_T_COMMON) {
		/*
		 * If the process is 64-bit and the instruction type is still
		 * FASTTRAP_T_COMMON -- meaning we're going to copy it out and
		 * execute it -- we need to watch for %rip-relative
		 * addressing mode. See the portion of fasttrap_pid_probe()
		 * below where we handle tracepoints with type
		 * FASTTRAP_T_COMMON for how we emulate instructions that
		 * employ %rip-relative addressing.
		 */
		if (rmindex != -1) {
			uint_t mod = FASTTRAP_MODRM_MOD(instr[rmindex]);
			uint_t reg = FASTTRAP_MODRM_REG(instr[rmindex]);
			uint_t rm = FASTTRAP_MODRM_RM(instr[rmindex]);

			ASSERT(rmindex > start);

			if (mod == 0 && rm == 5) {
				/*
				 * We need to be sure to avoid other
				 * registers used by this instruction. While
				 * the reg field may determine the op code
				 * rather than denoting a register, assuming
				 * that it denotes a register is always safe.
				 * We leave the REX field intact and use
				 * whatever value's there for simplicity.
				 */
				if (reg != 0) {
					tp->ftt_ripmode = FASTTRAP_RIP_1 |
					    (FASTTRAP_RIP_X *
					    FASTTRAP_REX_B(rex));
					rm = 0;
				} else {
					tp->ftt_ripmode = FASTTRAP_RIP_2 |
					    (FASTTRAP_RIP_X *
					    FASTTRAP_REX_B(rex));
					rm = 1;
				}

				/*
				 * Keep the original ModR/M byte and rewrite
				 * the saved copy of the instruction to use
				 * mod == 2 (register plus 32-bit displacement)
				 * with the substitute register chosen above.
				 */
				tp->ftt_modrm = tp->ftt_instr[rmindex];
				tp->ftt_instr[rmindex] =
				    FASTTRAP_MODRM(2, reg, rm);
			}
		}
	}
#endif

	return (0);
}
632 
633 int
634 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
635 {
636 	fasttrap_instr_t instr = FASTTRAP_INSTR;
637 
638 	if (uwrite(p, &instr, 1, tp->ftt_pc) != 0)
639 		return (-1);
640 
641 	return (0);
642 }
643 
644 int
645 fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
646 {
647 	uint8_t instr;
648 
649 	/*
650 	 * Distinguish between read or write failures and a changed
651 	 * instruction.
652 	 */
653 	if (uread(p, &instr, 1, tp->ftt_pc) != 0)
654 		return (0);
655 	if (instr != FASTTRAP_INSTR)
656 		return (0);
657 	if (uwrite(p, &tp->ftt_instr[0], 1, tp->ftt_pc) != 0)
658 		return (-1);
659 
660 	return (0);
661 }
662 
#ifdef __amd64
/*
 * Read a pointer-sized word from user address uaddr via fasttrap_fulword().
 * A result of -1 is mapped to 0, so callers never see that value; every
 * other result is returned unchanged.
 */
static uintptr_t
fasttrap_fulword_noerr(const void *uaddr)
{
	uintptr_t word = fasttrap_fulword(uaddr);

	return (word == (uintptr_t)-1 ? 0 : word);
}
#endif
675 
/*
 * Read a 32-bit word from user address uaddr via fasttrap_fuword32().
 * A result of -1 (all bits set) is mapped to 0; every other result is
 * returned unchanged.
 */
static uint32_t
fasttrap_fuword32_noerr(const void *uaddr)
{
	uint32_t word = fasttrap_fuword32(uaddr);

	return (word == (uint32_t)-1 ? 0 : word);
}
686 
687 static void
688 fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
689     uintptr_t new_pc)
690 {
691 	fasttrap_tracepoint_t *tp;
692 	fasttrap_bucket_t *bucket;
693 	fasttrap_id_t *id;
694 	struct rm_priotracker tracker;
695 
696 	rm_rlock(&fasttrap_tp_lock, &tracker);
697 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
698 
699 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
700 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
701 		    tp->ftt_proc->ftpc_acount != 0)
702 			break;
703 	}
704 
705 	/*
706 	 * Don't sweat it if we can't find the tracepoint again; unlike
707 	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
708 	 * is not essential to the correct execution of the process.
709 	 */
710 	if (tp == NULL) {
711 		rm_runlock(&fasttrap_tp_lock, &tracker);
712 		return;
713 	}
714 
715 	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
716 		/*
717 		 * If there's a branch that could act as a return site, we
718 		 * need to trace it, and check here if the program counter is
719 		 * external to the function.
720 		 */
721 		if (tp->ftt_type != FASTTRAP_T_RET &&
722 		    tp->ftt_type != FASTTRAP_T_RET16 &&
723 		    new_pc - id->fti_probe->ftp_faddr <
724 		    id->fti_probe->ftp_fsize)
725 			continue;
726 
727 		dtrace_probe(id->fti_probe->ftp_id,
728 		    pc - id->fti_probe->ftp_faddr,
729 		    rp->r_rax, rp->r_rbx, 0, 0);
730 	}
731 
732 	rm_runlock(&fasttrap_tp_lock, &tracker);
733 }
734 
735 static void
736 fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr)
737 {
738 	ksiginfo_t ksi;
739 
740 	ksiginfo_init(&ksi);
741 	ksi.ksi_signo = SIGSEGV;
742 	ksi.ksi_code = SEGV_MAPERR;
743 	ksi.ksi_addr = (caddr_t)addr;
744 	PROC_LOCK(p);
745 	(void)tdksignal(t, SIGSEGV, &ksi);
746 	PROC_UNLOCK(p);
747 }
748 
749 #ifdef __amd64
750 static void
751 fasttrap_usdt_args64(fasttrap_probe_t *probe, struct reg *rp, int argc,
752     uintptr_t *argv)
753 {
754 	int i, x, cap = MIN(argc, probe->ftp_nargs);
755 	uintptr_t *stack = (uintptr_t *)rp->r_rsp;
756 
757 	for (i = 0; i < cap; i++) {
758 		x = probe->ftp_argmap[i];
759 
760 		if (x < 6)
761 			argv[i] = (&rp->r_rdi)[x];
762 		else
763 			argv[i] = fasttrap_fulword_noerr(&stack[x]);
764 	}
765 
766 	for (; i < argc; i++) {
767 		argv[i] = 0;
768 	}
769 }
770 #endif
771 
772 static void
773 fasttrap_usdt_args32(fasttrap_probe_t *probe, struct reg *rp, int argc,
774     uint32_t *argv)
775 {
776 	int i, x, cap = MIN(argc, probe->ftp_nargs);
777 	uint32_t *stack = (uint32_t *)rp->r_rsp;
778 
779 	for (i = 0; i < cap; i++) {
780 		x = probe->ftp_argmap[i];
781 
782 		argv[i] = fasttrap_fuword32_noerr(&stack[x]);
783 	}
784 
785 	for (; i < argc; i++) {
786 		argv[i] = 0;
787 	}
788 }
789 
790 static int
791 fasttrap_do_seg(fasttrap_tracepoint_t *tp, struct reg *rp, uintptr_t *addr)
792 {
793 	proc_t *p = curproc;
794 #ifdef __i386__
795 	struct segment_descriptor *desc;
796 #else
797 	struct user_segment_descriptor *desc;
798 #endif
799 	uint16_t sel = 0, ndx, type;
800 	uintptr_t limit;
801 
802 	switch (tp->ftt_segment) {
803 	case FASTTRAP_SEG_CS:
804 		sel = rp->r_cs;
805 		break;
806 	case FASTTRAP_SEG_DS:
807 		sel = rp->r_ds;
808 		break;
809 	case FASTTRAP_SEG_ES:
810 		sel = rp->r_es;
811 		break;
812 	case FASTTRAP_SEG_FS:
813 		sel = rp->r_fs;
814 		break;
815 	case FASTTRAP_SEG_GS:
816 		sel = rp->r_gs;
817 		break;
818 	case FASTTRAP_SEG_SS:
819 		sel = rp->r_ss;
820 		break;
821 	}
822 
823 	/*
824 	 * Make sure the given segment register specifies a user priority
825 	 * selector rather than a kernel selector.
826 	 */
827 	if (ISPL(sel) != SEL_UPL)
828 		return (-1);
829 
830 	ndx = IDXSEL(sel);
831 
832 	/*
833 	 * Check the bounds and grab the descriptor out of the specified
834 	 * descriptor table.
835 	 */
836 	if (ISLDT(sel)) {
837 #ifdef __i386__
838 		if (ndx > p->p_md.md_ldt->ldt_len)
839 			return (-1);
840 
841 		desc = (struct segment_descriptor *)
842 		    p->p_md.md_ldt[ndx].ldt_base;
843 #else
844 		if (ndx > max_ldt_segment)
845 			return (-1);
846 
847 		desc = (struct user_segment_descriptor *)
848 		    p->p_md.md_ldt[ndx].ldt_base;
849 #endif
850 
851 	} else {
852 		if (ndx >= NGDT)
853 			return (-1);
854 
855 #ifdef __i386__
856 		desc = &gdt[ndx].sd;
857 #else
858 		desc = PCPU_PTR(gdt)[ndx];
859 #endif
860 	}
861 
862 	/*
863 	 * The descriptor must have user privilege level and it must be
864 	 * present in memory.
865 	 */
866 	if (desc->sd_dpl != SEL_UPL || desc->sd_p != 1)
867 		return (-1);
868 
869 	type = desc->sd_type;
870 
871 	/*
872 	 * If the S bit in the type field is not set, this descriptor can
873 	 * only be used in system context.
874 	 */
875 	if ((type & 0x10) != 0x10)
876 		return (-1);
877 
878 	limit = USD_GETLIMIT(desc) * (desc->sd_gran ? PAGESIZE : 1);
879 
880 	if (tp->ftt_segment == FASTTRAP_SEG_CS) {
881 		/*
882 		 * The code/data bit and readable bit must both be set.
883 		 */
884 		if ((type & 0xa) != 0xa)
885 			return (-1);
886 
887 		if (*addr > limit)
888 			return (-1);
889 	} else {
890 		/*
891 		 * The code/data bit must be clear.
892 		 */
893 		if ((type & 0x8) != 0)
894 			return (-1);
895 
896 		/*
897 		 * If the expand-down bit is clear, we just check the limit as
898 		 * it would naturally be applied. Otherwise, we need to check
899 		 * that the address is the range [limit + 1 .. 0xffff] or
900 		 * [limit + 1 ... 0xffffffff] depending on if the default
901 		 * operand size bit is set.
902 		 */
903 		if ((type & 0x4) == 0) {
904 			if (*addr > limit)
905 				return (-1);
906 		} else if (desc->sd_def32) {
907 			if (*addr < limit + 1 || 0xffff < *addr)
908 				return (-1);
909 		} else {
910 			if (*addr < limit + 1 || 0xffffffff < *addr)
911 				return (-1);
912 		}
913 	}
914 
915 	*addr += USD_GETBASE(desc);
916 
917 	return (0);
918 }
919 
920 int
921 fasttrap_pid_probe(struct trapframe *tf)
922 {
923 	struct reg reg, *rp;
924 	proc_t *p = curproc, *pp;
925 	struct rm_priotracker tracker;
926 	uint64_t gen;
927 	uintptr_t pc;
928 	uintptr_t new_pc = 0;
929 	fasttrap_bucket_t *bucket;
930 	fasttrap_tracepoint_t *tp, tp_local;
931 	pid_t pid;
932 	dtrace_icookie_t cookie;
933 	uint_t is_enabled = 0;
934 
935 	fill_frame_regs(tf, &reg);
936 	rp = &reg;
937 
938 	pc = rp->r_rip - 1;
939 
940 	/*
941 	 * It's possible that a user (in a veritable orgy of bad planning)
942 	 * could redirect this thread's flow of control before it reached the
943 	 * return probe fasttrap. In this case we need to kill the process
944 	 * since it's in a unrecoverable state.
945 	 */
946 	if (curthread->t_dtrace_step) {
947 		ASSERT(curthread->t_dtrace_on);
948 		fasttrap_sigtrap(p, curthread, pc);
949 		return (0);
950 	}
951 
952 	/*
953 	 * Clear all user tracing flags.
954 	 */
955 	curthread->t_dtrace_ft = 0;
956 	curthread->t_dtrace_pc = 0;
957 	curthread->t_dtrace_npc = 0;
958 	curthread->t_dtrace_scrpc = 0;
959 	curthread->t_dtrace_astpc = 0;
960 #ifdef __amd64
961 	curthread->t_dtrace_regv = 0;
962 #endif
963 
964 	/*
965 	 * Treat a child created by a call to vfork(2) as if it were its
966 	 * parent. We know that there's only one thread of control in such a
967 	 * process: this one.
968 	 */
969 	pp = p;
970 	sx_slock(&proctree_lock);
971 	while (pp->p_vmspace == pp->p_pptr->p_vmspace)
972 		pp = pp->p_pptr;
973 	pid = pp->p_pid;
974 	if (pp != p) {
975 		PROC_LOCK(pp);
976 		if ((pp->p_flag & P_WEXIT) != 0) {
977 			/*
978 			 * This can happen if the child was created with
979 			 * rfork(2).  Userspace tracing cannot work reliably in
980 			 * such a scenario, but we can at least try.
981 			 */
982 			PROC_UNLOCK(pp);
983 			sx_sunlock(&proctree_lock);
984 			return (-1);
985 		}
986 		_PHOLD_LITE(pp);
987 		PROC_UNLOCK(pp);
988 	}
989 	sx_sunlock(&proctree_lock);
990 
991 	rm_rlock(&fasttrap_tp_lock, &tracker);
992 
993 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
994 
995 	/*
996 	 * Lookup the tracepoint that the process just hit.
997 	 */
998 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
999 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
1000 		    tp->ftt_proc->ftpc_acount != 0)
1001 			break;
1002 	}
1003 
1004 	/*
1005 	 * If we couldn't find a matching tracepoint, either a tracepoint has
1006 	 * been inserted without using the pid<pid> ioctl interface (see
1007 	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
1008 	 */
1009 	if (tp == NULL) {
1010 		rm_runlock(&fasttrap_tp_lock, &tracker);
1011 		gen = atomic_load_acq_64(&pp->p_fasttrap_tp_gen);
1012 		if (pp != p)
1013 			PRELE(pp);
1014 		if (curthread->t_fasttrap_tp_gen != gen) {
1015 			/*
1016 			 * At least one tracepoint associated with this PID has
1017 			 * been removed from the table since #BP was raised.
1018 			 * Speculate that we hit a tracepoint that has since
1019 			 * been removed, and retry the instruction.
1020 			 */
1021 			curthread->t_fasttrap_tp_gen = gen;
1022 #ifdef __amd64
1023 			tf->tf_rip = pc;
1024 #else
1025 			tf->tf_eip = pc;
1026 #endif
1027 			return (0);
1028 		}
1029 		return (-1);
1030 	}
1031 	if (pp != p)
1032 		PRELE(pp);
1033 
1034 	/*
1035 	 * Set the program counter to the address of the traced instruction
1036 	 * so that it looks right in ustack() output.
1037 	 */
1038 	rp->r_rip = pc;
1039 
1040 	if (tp->ftt_ids != NULL) {
1041 		fasttrap_id_t *id;
1042 
1043 #ifdef __amd64
1044 		if (p->p_model == DATAMODEL_LP64) {
1045 			for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
1046 				fasttrap_probe_t *probe = id->fti_probe;
1047 
1048 				if (id->fti_ptype == DTFTP_ENTRY) {
1049 					/*
1050 					 * We note that this was an entry
1051 					 * probe to help ustack() find the
1052 					 * first caller.
1053 					 */
1054 					cookie = dtrace_interrupt_disable();
1055 					DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
1056 					dtrace_probe(probe->ftp_id, rp->r_rdi,
1057 					    rp->r_rsi, rp->r_rdx, rp->r_rcx,
1058 					    rp->r_r8);
1059 					DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
1060 					dtrace_interrupt_enable(cookie);
1061 				} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
1062 					/*
1063 					 * Note that in this case, we don't
1064 					 * call dtrace_probe() since it's only
1065 					 * an artificial probe meant to change
1066 					 * the flow of control so that it
1067 					 * encounters the true probe.
1068 					 */
1069 					is_enabled = 1;
1070 				} else if (probe->ftp_argmap == NULL) {
1071 					dtrace_probe(probe->ftp_id, rp->r_rdi,
1072 					    rp->r_rsi, rp->r_rdx, rp->r_rcx,
1073 					    rp->r_r8);
1074 				} else {
1075 					uintptr_t t[5];
1076 
1077 					fasttrap_usdt_args64(probe, rp,
1078 					    sizeof (t) / sizeof (t[0]), t);
1079 
1080 					dtrace_probe(probe->ftp_id, t[0], t[1],
1081 					    t[2], t[3], t[4]);
1082 				}
1083 			}
1084 		} else {
1085 #endif
1086 			uintptr_t s0, s1, s2, s3, s4, s5;
1087 			uint32_t *stack = (uint32_t *)rp->r_rsp;
1088 
1089 			/*
1090 			 * In 32-bit mode, all arguments are passed on the
1091 			 * stack. If this is a function entry probe, we need
1092 			 * to skip the first entry on the stack as it
1093 			 * represents the return address rather than a
1094 			 * parameter to the function.
1095 			 */
1096 			s0 = fasttrap_fuword32_noerr(&stack[0]);
1097 			s1 = fasttrap_fuword32_noerr(&stack[1]);
1098 			s2 = fasttrap_fuword32_noerr(&stack[2]);
1099 			s3 = fasttrap_fuword32_noerr(&stack[3]);
1100 			s4 = fasttrap_fuword32_noerr(&stack[4]);
1101 			s5 = fasttrap_fuword32_noerr(&stack[5]);
1102 
1103 			for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
1104 				fasttrap_probe_t *probe = id->fti_probe;
1105 
1106 				if (id->fti_ptype == DTFTP_ENTRY) {
1107 					/*
1108 					 * We note that this was an entry
1109 					 * probe to help ustack() find the
1110 					 * first caller.
1111 					 */
1112 					cookie = dtrace_interrupt_disable();
1113 					DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
1114 					dtrace_probe(probe->ftp_id, s1, s2,
1115 					    s3, s4, s5);
1116 					DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
1117 					dtrace_interrupt_enable(cookie);
1118 				} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
1119 					/*
1120 					 * Note that in this case, we don't
1121 					 * call dtrace_probe() since it's only
1122 					 * an artificial probe meant to change
1123 					 * the flow of control so that it
1124 					 * encounters the true probe.
1125 					 */
1126 					is_enabled = 1;
1127 				} else if (probe->ftp_argmap == NULL) {
1128 					dtrace_probe(probe->ftp_id, s0, s1,
1129 					    s2, s3, s4);
1130 				} else {
1131 					uint32_t t[5];
1132 
1133 					fasttrap_usdt_args32(probe, rp,
1134 					    sizeof (t) / sizeof (t[0]), t);
1135 
1136 					dtrace_probe(probe->ftp_id, t[0], t[1],
1137 					    t[2], t[3], t[4]);
1138 				}
1139 			}
1140 #ifdef __amd64
1141 		}
1142 #endif
1143 	}
1144 
1145 	/*
1146 	 * We're about to do a bunch of work so we cache a local copy of
1147 	 * the tracepoint to emulate the instruction, and then find the
1148 	 * tracepoint again later if we need to light up any return probes.
1149 	 */
1150 	tp_local = *tp;
1151 	rm_runlock(&fasttrap_tp_lock, &tracker);
1152 	tp = &tp_local;
1153 
1154 	/*
1155 	 * Set the program counter to appear as though the traced instruction
1156 	 * had completely executed. This ensures that fasttrap_getreg() will
1157 	 * report the expected value for REG_RIP.
1158 	 */
1159 	rp->r_rip = pc + tp->ftt_size;
1160 
1161 	/*
1162 	 * If there's an is-enabled probe connected to this tracepoint it
1163 	 * means that there was a 'xorl %eax, %eax' or 'xorq %rax, %rax'
1164 	 * instruction that was placed there by DTrace when the binary was
1165 	 * linked. As this probe is, in fact, enabled, we need to stuff 1
1166 	 * into %eax or %rax. Accordingly, we can bypass all the instruction
1167 	 * emulation logic since we know the inevitable result. It's possible
1168 	 * that a user could construct a scenario where the 'is-enabled'
1169 	 * probe was on some other instruction, but that would be a rather
1170 	 * exotic way to shoot oneself in the foot.
1171 	 */
1172 	if (is_enabled) {
1173 		rp->r_rax = 1;
1174 		new_pc = rp->r_rip;
1175 		goto done;
1176 	}
1177 
1178 	/*
1179 	 * We emulate certain types of instructions to ensure correctness
1180 	 * (in the case of position dependent instructions) or optimize
1181 	 * common cases. The rest we have the thread execute back in user-
1182 	 * land.
1183 	 */
1184 	switch (tp->ftt_type) {
1185 	case FASTTRAP_T_RET:
1186 	case FASTTRAP_T_RET16:
1187 	{
1188 		uintptr_t dst = 0;
1189 		uintptr_t addr = 0;
1190 		int ret = 0;
1191 
1192 		/*
1193 		 * We have to emulate _every_ facet of the behavior of a ret
1194 		 * instruction including what happens if the load from %esp
1195 		 * fails; in that case, we send a SIGSEGV.
1196 		 */
1197 #ifdef __amd64
1198 		if (p->p_model == DATAMODEL_NATIVE) {
1199 			ret = dst = fasttrap_fulword((void *)rp->r_rsp);
1200 			addr = rp->r_rsp + sizeof (uintptr_t);
1201 		} else {
1202 #endif
1203 			uint32_t dst32;
1204 			ret = dst32 = fasttrap_fuword32((void *)rp->r_rsp);
1205 			dst = dst32;
1206 			addr = rp->r_rsp + sizeof (uint32_t);
1207 #ifdef __amd64
1208 		}
1209 #endif
1210 
1211 		if (ret == -1) {
1212 			fasttrap_sigsegv(p, curthread, rp->r_rsp);
1213 			new_pc = pc;
1214 			break;
1215 		}
1216 
1217 		if (tp->ftt_type == FASTTRAP_T_RET16)
1218 			addr += tp->ftt_dest;
1219 
1220 		rp->r_rsp = addr;
1221 		new_pc = dst;
1222 		break;
1223 	}
1224 
1225 	case FASTTRAP_T_JCC:
1226 	{
1227 		uint_t taken = 0;
1228 
1229 		switch (tp->ftt_code) {
1230 		case FASTTRAP_JO:
1231 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) != 0;
1232 			break;
1233 		case FASTTRAP_JNO:
1234 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0;
1235 			break;
1236 		case FASTTRAP_JB:
1237 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0;
1238 			break;
1239 		case FASTTRAP_JAE:
1240 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0;
1241 			break;
1242 		case FASTTRAP_JE:
1243 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
1244 			break;
1245 		case FASTTRAP_JNE:
1246 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
1247 			break;
1248 		case FASTTRAP_JBE:
1249 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0 ||
1250 			    (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
1251 			break;
1252 		case FASTTRAP_JA:
1253 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0 &&
1254 			    (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
1255 			break;
1256 		case FASTTRAP_JS:
1257 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) != 0;
1258 			break;
1259 		case FASTTRAP_JNS:
1260 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0;
1261 			break;
1262 		case FASTTRAP_JP:
1263 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) != 0;
1264 			break;
1265 		case FASTTRAP_JNP:
1266 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) == 0;
1267 			break;
1268 		case FASTTRAP_JL:
1269 			taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
1270 			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1271 			break;
1272 		case FASTTRAP_JGE:
1273 			taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
1274 			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1275 			break;
1276 		case FASTTRAP_JLE:
1277 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 ||
1278 			    ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
1279 			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1280 			break;
1281 		case FASTTRAP_JG:
1282 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
1283 			    ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
1284 			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1285 			break;
1286 
1287 		}
1288 
1289 		if (taken)
1290 			new_pc = tp->ftt_dest;
1291 		else
1292 			new_pc = pc + tp->ftt_size;
1293 		break;
1294 	}
1295 
1296 	case FASTTRAP_T_LOOP:
1297 	{
1298 		uint_t taken = 0;
1299 #ifdef __amd64
1300 		greg_t cx = rp->r_rcx--;
1301 #else
1302 		greg_t cx = rp->r_ecx--;
1303 #endif
1304 
1305 		switch (tp->ftt_code) {
1306 		case FASTTRAP_LOOPNZ:
1307 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
1308 			    cx != 0;
1309 			break;
1310 		case FASTTRAP_LOOPZ:
1311 			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 &&
1312 			    cx != 0;
1313 			break;
1314 		case FASTTRAP_LOOP:
1315 			taken = (cx != 0);
1316 			break;
1317 		}
1318 
1319 		if (taken)
1320 			new_pc = tp->ftt_dest;
1321 		else
1322 			new_pc = pc + tp->ftt_size;
1323 		break;
1324 	}
1325 
1326 	case FASTTRAP_T_JCXZ:
1327 	{
1328 #ifdef __amd64
1329 		greg_t cx = rp->r_rcx;
1330 #else
1331 		greg_t cx = rp->r_ecx;
1332 #endif
1333 
1334 		if (cx == 0)
1335 			new_pc = tp->ftt_dest;
1336 		else
1337 			new_pc = pc + tp->ftt_size;
1338 		break;
1339 	}
1340 
1341 	case FASTTRAP_T_PUSHL_EBP:
1342 	{
1343 		int ret = 0;
1344 
1345 #ifdef __amd64
1346 		if (p->p_model == DATAMODEL_NATIVE) {
1347 			rp->r_rsp -= sizeof (uintptr_t);
1348 			ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp);
1349 		} else {
1350 #endif
1351 			rp->r_rsp -= sizeof (uint32_t);
1352 			ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp);
1353 #ifdef __amd64
1354 		}
1355 #endif
1356 
1357 		if (ret == -1) {
1358 			fasttrap_sigsegv(p, curthread, rp->r_rsp);
1359 			new_pc = pc;
1360 			break;
1361 		}
1362 
1363 		new_pc = pc + tp->ftt_size;
1364 		break;
1365 	}
1366 
1367 	case FASTTRAP_T_NOP:
1368 		new_pc = pc + tp->ftt_size;
1369 		break;
1370 
1371 	case FASTTRAP_T_JMP:
1372 	case FASTTRAP_T_CALL:
1373 		if (tp->ftt_code == 0) {
1374 			new_pc = tp->ftt_dest;
1375 		} else {
1376 			uintptr_t value, addr = tp->ftt_dest;
1377 
1378 			if (tp->ftt_base != FASTTRAP_NOREG)
1379 				addr += fasttrap_getreg(rp, tp->ftt_base);
1380 			if (tp->ftt_index != FASTTRAP_NOREG)
1381 				addr += fasttrap_getreg(rp, tp->ftt_index) <<
1382 				    tp->ftt_scale;
1383 
1384 			if (tp->ftt_code == 1) {
1385 				/*
1386 				 * If there's a segment prefix for this
1387 				 * instruction, we'll need to check permissions
1388 				 * and bounds on the given selector, and adjust
1389 				 * the address accordingly.
1390 				 */
1391 				if (tp->ftt_segment != FASTTRAP_SEG_NONE &&
1392 				    fasttrap_do_seg(tp, rp, &addr) != 0) {
1393 					fasttrap_sigsegv(p, curthread, addr);
1394 					new_pc = pc;
1395 					break;
1396 				}
1397 
1398 #ifdef __amd64
1399 				if (p->p_model == DATAMODEL_NATIVE) {
1400 #endif
1401 					if ((value = fasttrap_fulword((void *)addr))
1402 					     == -1) {
1403 						fasttrap_sigsegv(p, curthread,
1404 						    addr);
1405 						new_pc = pc;
1406 						break;
1407 					}
1408 					new_pc = value;
1409 #ifdef __amd64
1410 				} else {
1411 					uint32_t value32;
1412 					addr = (uintptr_t)(uint32_t)addr;
1413 					if ((value32 = fasttrap_fuword32((void *)addr))
1414 					    == -1) {
1415 						fasttrap_sigsegv(p, curthread,
1416 						    addr);
1417 						new_pc = pc;
1418 						break;
1419 					}
1420 					new_pc = value32;
1421 				}
1422 #endif
1423 			} else {
1424 				new_pc = addr;
1425 			}
1426 		}
1427 
1428 		/*
1429 		 * If this is a call instruction, we need to push the return
1430 		 * address onto the stack. If this fails, we send the process
1431 		 * a SIGSEGV and reset the pc to emulate what would happen if
1432 		 * this instruction weren't traced.
1433 		 */
1434 		if (tp->ftt_type == FASTTRAP_T_CALL) {
1435 			int ret = 0;
1436 			uintptr_t addr = 0, pcps;
1437 #ifdef __amd64
1438 			if (p->p_model == DATAMODEL_NATIVE) {
1439 				addr = rp->r_rsp - sizeof (uintptr_t);
1440 				pcps = pc + tp->ftt_size;
1441 				ret = fasttrap_sulword((void *)addr, pcps);
1442 			} else {
1443 #endif
1444 				addr = rp->r_rsp - sizeof (uint32_t);
1445 				pcps = (uint32_t)(pc + tp->ftt_size);
1446 				ret = fasttrap_suword32((void *)addr, pcps);
1447 #ifdef __amd64
1448 			}
1449 #endif
1450 
1451 			if (ret == -1) {
1452 				fasttrap_sigsegv(p, curthread, addr);
1453 				new_pc = pc;
1454 				break;
1455 			}
1456 
1457 			rp->r_rsp = addr;
1458 		}
1459 
1460 		break;
1461 
1462 	case FASTTRAP_T_COMMON:
1463 	{
1464 		uintptr_t addr;
1465 #if defined(__amd64)
1466 		uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22];
1467 #else
1468 		uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
1469 #endif
1470 		uint_t i = 0;
1471 		fasttrap_scrspace_t *scrspace;
1472 		scrspace = fasttrap_scraddr(curthread, tp->ftt_proc);
1473 		if (scrspace == NULL) {
1474 			/*
1475 			 * We failed to allocate scratch space for this thread.
1476 			 * Try to write the original instruction back out and
1477 			 * reset the pc.
1478 			 */
1479 			if (fasttrap_copyout(tp->ftt_instr, (void *)pc,
1480 			    tp->ftt_size))
1481 				fasttrap_sigtrap(p, curthread, pc);
1482 			new_pc = pc;
1483 			break;
1484 		}
1485 		addr = scrspace->ftss_addr;
1486 
1487 		/*
1488 		 * Generic Instruction Tracing
1489 		 * ---------------------------
1490 		 *
1491 		 * This is the layout of the scratch space in the user-land
1492 		 * thread structure for our generated instructions.
1493 		 *
1494 		 *	32-bit mode			bytes
1495 		 *	------------------------	-----
1496 		 * a:	<original instruction>		<= 15
1497 		 *	jmp	<pc + tp->ftt_size>	    5
1498 		 * b:	<original instruction>		<= 15
1499 		 *	int	T_DTRACE_RET		    2
1500 		 *					-----
1501 		 *					<= 37
1502 		 *
1503 		 *	64-bit mode			bytes
1504 		 *	------------------------	-----
1505 		 * a:	<original instruction>		<= 15
1506 		 *	jmp	0(%rip)			    6
1507 		 *	<pc + tp->ftt_size>		    8
1508 		 * b:	<original instruction>		<= 15
1509 		 * 	int	T_DTRACE_RET		    2
1510 		 * 					-----
1511 		 * 					<= 46
1512 		 *
1513 		 * The %pc is set to a, and curthread->t_dtrace_astpc is set
1514 		 * to b. If we encounter a signal on the way out of the
1515 		 * kernel, trap() will set %pc to curthread->t_dtrace_astpc
1516 		 * so that we execute the original instruction and re-enter
1517 		 * the kernel rather than redirecting to the next instruction.
1518 		 *
1519 		 * If there are return probes (so we know that we're going to
1520 		 * need to reenter the kernel after executing the original
1521 		 * instruction), the scratch space will just contain the
1522 		 * original instruction followed by an interrupt -- the same
1523 		 * data as at b.
1524 		 *
1525 		 * %rip-relative Addressing
1526 		 * ------------------------
1527 		 *
1528 		 * There's a further complication in 64-bit mode due to %rip-
1529 		 * relative addressing. While this is clearly a beneficial
1530 		 * architectural decision for position independent code, it's
1531 		 * hard not to see it as a personal attack against the pid
1532 		 * provider since before there was a relatively small set of
1533 		 * instructions to emulate; with %rip-relative addressing,
1534 		 * almost every instruction can potentially depend on the
1535 		 * address at which it's executed. Rather than emulating
1536 		 * the broad spectrum of instructions that can now be
1537 		 * position dependent, we emulate jumps and others as in
1538 		 * 32-bit mode, and take a different tack for instructions
1539 		 * using %rip-relative addressing.
1540 		 *
1541 		 * For every instruction that uses the ModRM byte, the
1542 		 * in-kernel disassembler reports its location. We use the
1543 		 * ModRM byte to identify that an instruction uses
1544 		 * %rip-relative addressing and to see what other registers
1545 		 * the instruction uses. To emulate those instructions,
1546 		 * we modify the instruction to be %rax-relative rather than
1547 		 * %rip-relative (or %rcx-relative if the instruction uses
1548 		 * %rax; or %r8- or %r9-relative if the REX.B is present so
1549 		 * we don't have to rewrite the REX prefix). We then load
1550 		 * the value that %rip would have been into the scratch
1551 		 * register and generate an instruction to reset the scratch
1552 		 * register back to its original value. The instruction
1553 		 * sequence looks like this:
1554 		 *
1555 		 *	64-mode %rip-relative		bytes
1556 		 *	------------------------	-----
1557 		 * a:	<modified instruction>		<= 15
1558 		 *	movq	$<value>, %<scratch>	    6
1559 		 *	jmp	0(%rip)			    6
1560 		 *	<pc + tp->ftt_size>		    8
1561 		 * b:	<modified instruction>  	<= 15
1562 		 * 	int	T_DTRACE_RET		    2
1563 		 * 					-----
1564 		 *					   52
1565 		 *
1566 		 * We set curthread->t_dtrace_regv so that upon receiving
1567 		 * a signal we can reset the value of the scratch register.
1568 		 */
1569 
1570 		ASSERT(tp->ftt_size <= FASTTRAP_MAX_INSTR_SIZE);
1571 
1572 		curthread->t_dtrace_scrpc = addr;
1573 		bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
1574 		i += tp->ftt_size;
1575 
1576 #ifdef __amd64
1577 		if (tp->ftt_ripmode != 0) {
1578 			greg_t *reg = NULL;
1579 
1580 			ASSERT(p->p_model == DATAMODEL_LP64);
1581 			ASSERT(tp->ftt_ripmode &
1582 			    (FASTTRAP_RIP_1 | FASTTRAP_RIP_2));
1583 
1584 			/*
1585 			 * If this was a %rip-relative instruction, we change
1586 			 * it to be either a %rax- or %rcx-relative
1587 			 * instruction (depending on whether those registers
1588 			 * are used as another operand; or %r8- or %r9-
1589 			 * relative depending on the value of REX.B). We then
1590 			 * set that register and generate a movq instruction
1591 			 * to reset the value.
1592 			 */
1593 			if (tp->ftt_ripmode & FASTTRAP_RIP_X)
1594 				scratch[i++] = FASTTRAP_REX(1, 0, 0, 1);
1595 			else
1596 				scratch[i++] = FASTTRAP_REX(1, 0, 0, 0);
1597 
1598 			if (tp->ftt_ripmode & FASTTRAP_RIP_1)
1599 				scratch[i++] = FASTTRAP_MOV_EAX;
1600 			else
1601 				scratch[i++] = FASTTRAP_MOV_ECX;
1602 
1603 			switch (tp->ftt_ripmode) {
1604 			case FASTTRAP_RIP_1:
1605 				reg = &rp->r_rax;
1606 				curthread->t_dtrace_reg = REG_RAX;
1607 				break;
1608 			case FASTTRAP_RIP_2:
1609 				reg = &rp->r_rcx;
1610 				curthread->t_dtrace_reg = REG_RCX;
1611 				break;
1612 			case FASTTRAP_RIP_1 | FASTTRAP_RIP_X:
1613 				reg = &rp->r_r8;
1614 				curthread->t_dtrace_reg = REG_R8;
1615 				break;
1616 			case FASTTRAP_RIP_2 | FASTTRAP_RIP_X:
1617 				reg = &rp->r_r9;
1618 				curthread->t_dtrace_reg = REG_R9;
1619 				break;
1620 			}
1621 
1622 			/* LINTED - alignment */
1623 			*(uint64_t *)&scratch[i] = *reg;
1624 			curthread->t_dtrace_regv = *reg;
1625 			*reg = pc + tp->ftt_size;
1626 			i += sizeof (uint64_t);
1627 		}
1628 #endif
1629 
1630 		/*
1631 		 * Generate the branch instruction to what would have
1632 		 * normally been the subsequent instruction. In 32-bit mode,
1633 		 * this is just a relative branch; in 64-bit mode this is a
1634 		 * %rip-relative branch that loads the 64-bit pc value
1635 		 * immediately after the jmp instruction.
1636 		 */
1637 #ifdef __amd64
1638 		if (p->p_model == DATAMODEL_LP64) {
1639 			scratch[i++] = FASTTRAP_GROUP5_OP;
1640 			scratch[i++] = FASTTRAP_MODRM(0, 4, 5);
1641 			/* LINTED - alignment */
1642 			*(uint32_t *)&scratch[i] = 0;
1643 			i += sizeof (uint32_t);
1644 			/* LINTED - alignment */
1645 			*(uint64_t *)&scratch[i] = pc + tp->ftt_size;
1646 			i += sizeof (uint64_t);
1647 		} else {
1648 #endif
1649 			/*
1650 			 * Set up the jmp to the next instruction; note that
1651 			 * the size of the traced instruction cancels out.
1652 			 */
1653 			scratch[i++] = FASTTRAP_JMP32;
1654 			/* LINTED - alignment */
1655 			*(uint32_t *)&scratch[i] = pc - addr - 5;
1656 			i += sizeof (uint32_t);
1657 #ifdef __amd64
1658 		}
1659 #endif
1660 
1661 		curthread->t_dtrace_astpc = addr + i;
1662 		bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
1663 		i += tp->ftt_size;
1664 		scratch[i++] = FASTTRAP_INT;
1665 		scratch[i++] = T_DTRACE_RET;
1666 
1667 		ASSERT(i <= sizeof (scratch));
1668 
1669 		if (fasttrap_copyout(scratch, (char *)addr, i)) {
1670 			fasttrap_sigtrap(p, curthread, pc);
1671 			new_pc = pc;
1672 			break;
1673 		}
1674 		if (tp->ftt_retids != NULL) {
1675 			curthread->t_dtrace_step = 1;
1676 			curthread->t_dtrace_ret = 1;
1677 			new_pc = curthread->t_dtrace_astpc;
1678 		} else {
1679 			new_pc = curthread->t_dtrace_scrpc;
1680 		}
1681 
1682 		curthread->t_dtrace_pc = pc;
1683 		curthread->t_dtrace_npc = pc + tp->ftt_size;
1684 		curthread->t_dtrace_on = 1;
1685 		break;
1686 	}
1687 
1688 	default:
1689 		panic("fasttrap: mishandled an instruction");
1690 	}
1691 
1692 done:
1693 	/*
1694 	 * If there were no return probes when we first found the tracepoint,
1695 	 * we should feel no obligation to honor any return probes that were
1696 	 * subsequently enabled -- they'll just have to wait until the next
1697 	 * time around.
1698 	 */
1699 	if (tp->ftt_retids != NULL) {
1700 		/*
1701 		 * We need to wait until the results of the instruction are
1702 		 * apparent before invoking any return probes. If this
1703 		 * instruction was emulated we can just call
1704 		 * fasttrap_return_common(); if it needs to be executed, we
1705 		 * need to wait until the user thread returns to the kernel.
1706 		 */
1707 		if (tp->ftt_type != FASTTRAP_T_COMMON) {
1708 			/*
1709 			 * Set the program counter to the address of the traced
1710 			 * instruction so that it looks right in ustack()
1711 			 * output. We had previously set it to the end of the
1712 			 * instruction to simplify %rip-relative addressing.
1713 			 */
1714 			rp->r_rip = pc;
1715 
1716 			fasttrap_return_common(rp, pc, pid, new_pc);
1717 		} else {
1718 			ASSERT(curthread->t_dtrace_ret != 0);
1719 			ASSERT(curthread->t_dtrace_pc == pc);
1720 			ASSERT(curthread->t_dtrace_scrpc != 0);
1721 			ASSERT(new_pc == curthread->t_dtrace_astpc);
1722 		}
1723 	}
1724 
1725 	rp->r_rip = new_pc;
1726 
1727 	PROC_LOCK(p);
1728 	proc_write_regs(curthread, rp);
1729 	PROC_UNLOCK(p);
1730 
1731 	return (0);
1732 }
1733 
1734 int
1735 fasttrap_return_probe(struct trapframe *tf)
1736 {
1737 	struct reg reg, *rp;
1738 	proc_t *p = curproc;
1739 	uintptr_t pc = curthread->t_dtrace_pc;
1740 	uintptr_t npc = curthread->t_dtrace_npc;
1741 
1742 	fill_frame_regs(tf, &reg);
1743 	rp = &reg;
1744 
1745 	curthread->t_dtrace_pc = 0;
1746 	curthread->t_dtrace_npc = 0;
1747 	curthread->t_dtrace_scrpc = 0;
1748 	curthread->t_dtrace_astpc = 0;
1749 
1750 #ifdef illumos
1751 	/*
1752 	 * Treat a child created by a call to vfork(2) as if it were its
1753 	 * parent. We know that there's only one thread of control in such a
1754 	 * process: this one.
1755 	 */
1756 	while (p->p_flag & SVFORK) {
1757 		p = p->p_parent;
1758 	}
1759 #endif
1760 
1761 	/*
1762 	 * We set rp->r_rip to the address of the traced instruction so
1763 	 * that it appears to dtrace_probe() that we're on the original
1764 	 * instruction.
1765 	 */
1766 	rp->r_rip = pc;
1767 
1768 	fasttrap_return_common(rp, pc, p->p_pid, npc);
1769 
1770 	return (0);
1771 }
1772 
1773 /*ARGSUSED*/
1774 uint64_t
1775 fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1776     int aframes)
1777 {
1778 	struct reg r;
1779 
1780 	fill_regs(curthread, &r);
1781 
1782 	return (fasttrap_anarg(&r, 1, argno));
1783 }
1784 
1785 /*ARGSUSED*/
1786 uint64_t
1787 fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1788     int aframes)
1789 {
1790 	struct reg r;
1791 
1792 	fill_regs(curthread, &r);
1793 
1794 	return (fasttrap_anarg(&r, 0, argno));
1795 }
1796 
1797 static ulong_t
1798 fasttrap_getreg(struct reg *rp, uint_t reg)
1799 {
1800 #ifdef __amd64
1801 	switch (reg) {
1802 	case REG_R15:		return (rp->r_r15);
1803 	case REG_R14:		return (rp->r_r14);
1804 	case REG_R13:		return (rp->r_r13);
1805 	case REG_R12:		return (rp->r_r12);
1806 	case REG_R11:		return (rp->r_r11);
1807 	case REG_R10:		return (rp->r_r10);
1808 	case REG_R9:		return (rp->r_r9);
1809 	case REG_R8:		return (rp->r_r8);
1810 	case REG_RDI:		return (rp->r_rdi);
1811 	case REG_RSI:		return (rp->r_rsi);
1812 	case REG_RBP:		return (rp->r_rbp);
1813 	case REG_RBX:		return (rp->r_rbx);
1814 	case REG_RDX:		return (rp->r_rdx);
1815 	case REG_RCX:		return (rp->r_rcx);
1816 	case REG_RAX:		return (rp->r_rax);
1817 	case REG_TRAPNO:	return (rp->r_trapno);
1818 	case REG_ERR:		return (rp->r_err);
1819 	case REG_RIP:		return (rp->r_rip);
1820 	case REG_CS:		return (rp->r_cs);
1821 	case REG_RFL:		return (rp->r_rflags);
1822 	case REG_RSP:		return (rp->r_rsp);
1823 	case REG_SS:		return (rp->r_ss);
1824 	case REG_FS:		return (rp->r_fs);
1825 	case REG_GS:		return (rp->r_gs);
1826 	case REG_DS:		return (rp->r_ds);
1827 	case REG_ES:		return (rp->r_es);
1828 	case REG_FSBASE:	return (rdmsr(MSR_FSBASE));
1829 	case REG_GSBASE:	return (rdmsr(MSR_GSBASE));
1830 	}
1831 
1832 	panic("dtrace: illegal register constant");
1833 	/*NOTREACHED*/
1834 #else
1835 #define _NGREG 19
1836 	if (reg >= _NGREG)
1837 		panic("dtrace: illegal register constant");
1838 
1839 	return (((greg_t *)&rp->r_gs)[reg]);
1840 #endif
1841 }
1842