xref: /illumos-gate/usr/src/uts/sparc/dtrace/fasttrap_isa.c (revision fb1354ed4c9fee45e038d38a155ea6fb11ee17bb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/fasttrap_isa.h>
31 #include <sys/fasttrap_impl.h>
32 #include <sys/dtrace.h>
33 #include <sys/dtrace_impl.h>
34 #include <sys/cmn_err.h>
35 #include <sys/frame.h>
36 #include <sys/stack.h>
37 #include <sys/sysmacros.h>
38 #include <sys/trap.h>
39 
40 #include <v9/sys/machpcb.h>
41 #include <v9/sys/privregs.h>
42 
43 /*
44  * Lossless User-Land Tracing on SPARC
45  * -----------------------------------
46  *
47  * The Basic Idea
48  *
49  * The most important design constraint is, of course, correct execution of
50  * the user thread above all else. The next most important goal is rapid
51  * execution. We combine execution of instructions in user-land with
52  * emulation of certain instructions in the kernel to aim for complete
53  * correctness and maximal performance.
54  *
55  * We take advantage of the split PC/NPC architecture to speed up logical
56  * single-stepping; when we copy an instruction out to the scratch space in
57  * the ulwp_t structure (held in the %g7 register on SPARC), we can
58  * effectively single step by setting the PC to our scratch space and leaving
59  * the NPC alone. This executes the replaced instruction and then continues
60  * on without having to reenter the kernel as with single-stepping. The
61  * obvious caveat is for instructions whose execution is PC dependent --
62  * branches, call and link instructions (call and jmpl), and the rdpc
63  * instruction. These instructions cannot be executed in the manner described
64  * so they must be emulated in the kernel.
65  *
66  * Emulation for this small set of instructions is fairly simple; the most
67  * difficult part being emulating branch conditions.
68  *
69  *
70  * A Cache Heavy Portfolio
71  *
72  * It's important to note at this time that copying an instruction out to the
73  * ulwp_t scratch space in user-land is rather complicated. SPARC has
74  * separate data and instruction caches so any writes to the D$ (using a
75  * store instruction for example) aren't necessarily reflected in the I$.
76  * The flush instruction can be used to synchronize the two and must be used
77  * for any self-modifying code, but the flush instruction only applies to the
78  * primary address space (the absence of a flusha analogue to the flush
79  * instruction that accepts an ASI argument is an obvious omission from SPARC
80  * v9 where the notion of the alternate address space was introduced on
81  * SPARC). To correctly copy out the instruction we must use a block store
82  * that doesn't allocate in the D$ and ensures synchronization with the I$;
83  * see dtrace_blksuword32() for the implementation  (this function uses
84  * ASI_BLK_COMMIT_S to write a block through the secondary ASI in the manner
85  * described). Refer to the UltraSPARC I/II manual for details on the
86  * ASI_BLK_COMMIT_S ASI.
87  *
88  *
89  * Return Subtleties
90  *
91  * When we're firing a return probe we need to expose the value returned by
92  * the function being traced. Since the function can set the return value
93  * in its last instruction, we need to fire the return probe only _after_
94  * the effects of the instruction are apparent. For instructions that we
95  * emulate, we can call dtrace_probe() after we've performed the emulation;
96  * for instructions that we execute after we return to user-land, we set
97  * %pc to the instruction we copied out (as described above) and set %npc
98  * to a trap instruction stashed in the ulwp_t structure. After the traced
99  * instruction is executed, the trap instruction returns control to the
100  * kernel where we can fire the return probe.
101  *
102  * This need for a second trap in cases where we execute the traced
103  * instruction makes it all the more important to emulate the most common
104  * instructions to avoid the second trip in and out of the kernel.
105  *
106  *
107  * Making it Fast
108  *
109  * Since copying out an instruction is neither simple nor inexpensive for the
110  * CPU, we should attempt to avoid doing it in as many cases as possible.
111  * Since function entry and return are usually the most interesting probe
112  * sites, we attempt to tune the performance of the fasttrap provider around
113  * instructions typically in those places.
114  *
115  * Looking at a bunch of functions in libraries and executables reveals that
116  * most functions begin with either a save or a sethi (to setup a larger
117  * argument to the save) and end with a restore or an or (in the case of leaf
118  * functions). To try to improve performance, we emulate all of these
119  * instructions in the kernel.
120  *
121  * The save and restore instructions are a little tricky since they perform
122  * register window manipulation. Rather than trying to tinker with the
123  * register windows from the kernel, we emulate the implicit add that takes
124  * place as part of those instructions and set the %pc to point to a simple
125  * save or restore we've hidden in the ulwp_t structure. If we're in a return
126  * probe so want to make it seem as though the tracepoint has been completely
127  * executed we need to remember that we've pulled this trick with restore and
128  * pull registers from the previous window (the one that we'll switch to once
129  * the simple restore instruction is executed) rather than the current one.
130  * This is why in the case of emulating a restore we set the DTrace CPU flag
131  * CPU_DTRACE_FAKERESTORE before calling dtrace_probe() for the return probes
132  * (see fasttrap_return_common()).
133  */
134 
/*
 * Field extractors for a 32-bit instruction word. Each macro shifts the
 * named field down to bit 0 and masks it to the field's width.
 */
#define	OP(x)		((x) >> 30)
#define	OP2(x)		(((x) >> 22) & 0x07)
#define	OP3(x)		(((x) >> 19) & 0x3f)
#define	RCOND(x)	(((x) >> 25) & 0x07)
#define	COND(x)		(((x) >> 25) & 0x0f)
#define	A(x)		(((x) >> 29) & 0x01)
#define	I(x)		(((x) >> 13) & 0x01)
#define	RD(x)		(((x) >> 25) & 0x1f)
#define	RS1(x)		(((x) >> 14) & 0x1f)
#define	RS2(x)		(((x) >> 0) & 0x1f)
#define	CC(x)		(((x) >> 20) & 0x03)
/* The 16-bit displacement is split: bits 21:20 supply its top two bits. */
#define	DISP16(x)	((((x) >> 6) & 0xc000) | ((x) & 0x3fff))
#define	DISP22(x)	((x) & 0x3fffff)
#define	DISP19(x)	((x) & 0x7ffff)
#define	DISP30(x)	((x) & 0x3fffffff)
#define	SW_TRAP(x)	((x) & 0x7f)

/*
 * Values of the op3 field for the instructions named in each macro.
 */
#define	OP3_OR		0x02
#define	OP3_RD		0x28
#define	OP3_JMPL	0x38
#define	OP3_RETURN	0x39
#define	OP3_TCC		0x3a
#define	OP3_SAVE	0x3c
#define	OP3_RESTORE	0x3d

/*
 * More op3 values. Note that some of these share a numeric value with the
 * group above (e.g. OP3_CASA and OP3_SAVE), so the two groups are presumably
 * distinguished by the instruction's op field -- confirm against the decoder.
 */
#define	OP3_PREFETCH	0x2d
#define	OP3_CASA	0x3c
#define	OP3_PREFETCHA	0x3d
#define	OP3_CASXA	0x3e

/*
 * Values of the op2 field for the instructions named in each macro.
 */
#define	OP2_ILLTRAP	0x0
#define	OP2_BPcc	0x1
#define	OP2_Bicc	0x2
#define	OP2_BPr		0x3
#define	OP2_SETHI	0x4
#define	OP2_FBPfcc	0x5
#define	OP2_FBfcc	0x6

/*
 * Register numbers as they appear in the rd/rs1/rs2 instruction fields and
 * as passed to fasttrap_getreg()/fasttrap_putreg().
 */
#define	R_G0		0
#define	R_O0		8
#define	R_SP		14
#define	R_I0		24
#define	R_I1		25
#define	R_I2		26
#define	R_I3		27

/*
 * Byte offsets, relative to the address held in %g7, of the helper
 * instructions that fasttrap_pid_probe() redirects %pc/%npc to.
 *
 * Check the comment in fasttrap.h when changing these offsets or adding
 * new instructions.
 */
#define	FASTTRAP_OFF_SAVE	64
#define	FASTTRAP_OFF_RESTORE	68
#define	FASTTRAP_OFF_FTRET	72
#define	FASTTRAP_OFF_RETURN	76

#define	BREAKPOINT_INSTR	0x91d02001	/* ta 1 */

/*
 * Tunable to let users turn off the fancy save instruction optimization.
 * If a program is non-ABI compliant, there's a possibility that the save
 * instruction optimization could cause an error.
 */
int fasttrap_optimize_save = 1;
198 
199 static uint64_t
200 fasttrap_anarg(struct regs *rp, int argno)
201 {
202 	uint64_t value;
203 
204 	if (argno < 6)
205 		return ((&rp->r_o0)[argno]);
206 
207 	if (curproc->p_model == DATAMODEL_NATIVE) {
208 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
209 
210 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
211 		value = dtrace_fulword(&fr->fr_argd[argno]);
212 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
213 		    CPU_DTRACE_BADALIGN);
214 	} else {
215 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
216 
217 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
218 		value = dtrace_fuword32(&fr->fr_argd[argno]);
219 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
220 		    CPU_DTRACE_BADALIGN);
221 	}
222 
223 	return (value);
224 }
225 
226 static ulong_t fasttrap_getreg(struct regs *, uint_t);
227 static void fasttrap_putreg(struct regs *, uint_t, ulong_t);
228 
229 int
230 fasttrap_probe(struct regs *rp)
231 {
232 	dtrace_probe(fasttrap_probe_id,
233 	    rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3, rp->r_o4);
234 
235 	rp->r_pc = rp->r_npc;
236 	rp->r_npc = rp->r_pc + 4;
237 
238 	return (0);
239 }
240 
241 static void
242 fasttrap_usdt_args(fasttrap_probe_t *probe, struct regs *rp, int argc,
243     uintptr_t *argv)
244 {
245 	int i, x, cap = MIN(argc, probe->ftp_nargs);
246 
247 	if (curproc->p_model == DATAMODEL_NATIVE) {
248 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
249 		uintptr_t v;
250 
251 		for (i = 0; i < cap; i++) {
252 			x = probe->ftp_argmap[i];
253 
254 			if (x < 6)
255 				argv[i] = (&rp->r_o0)[x];
256 			else if (fasttrap_fulword(&fr->fr_argd[x], &v) != 0)
257 				argv[i] = 0;
258 		}
259 
260 	} else {
261 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
262 		uint32_t v;
263 
264 		for (i = 0; i < cap; i++) {
265 			x = probe->ftp_argmap[i];
266 
267 			if (x < 6)
268 				argv[i] = (&rp->r_o0)[x];
269 			else if (fasttrap_fuword32(&fr->fr_argd[x], &v) != 0)
270 				argv[i] = 0;
271 		}
272 	}
273 
274 	for (; i < argc; i++) {
275 		argv[i] = 0;
276 	}
277 }
278 
279 static void
280 fasttrap_return_common(struct regs *rp, uintptr_t pc, pid_t pid,
281     uint_t fake_restore)
282 {
283 	fasttrap_tracepoint_t *tp;
284 	fasttrap_bucket_t *bucket;
285 	fasttrap_id_t *id;
286 	kmutex_t *pid_mtx;
287 	dtrace_icookie_t cookie;
288 
289 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
290 	mutex_enter(pid_mtx);
291 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
292 
293 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
294 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
295 		    !tp->ftt_proc->ftpc_defunct)
296 			break;
297 	}
298 
299 	/*
300 	 * Don't sweat it if we can't find the tracepoint again; unlike
301 	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
302 	 * is not essential to the correct execution of the process.
303 	 */
304 	if (tp == NULL || tp->ftt_retids == NULL) {
305 		mutex_exit(pid_mtx);
306 		return;
307 	}
308 
309 	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
310 		fasttrap_probe_t *probe = id->fti_probe;
311 
312 		if (probe->ftp_type == DTFTP_POST_OFFSETS) {
313 			if (probe->ftp_argmap == NULL) {
314 				dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1,
315 				    rp->r_o2, rp->r_o3, rp->r_o4);
316 			} else {
317 				uintptr_t t[5];
318 
319 				fasttrap_usdt_args(probe, rp,
320 				    sizeof (t) / sizeof (t[0]), t);
321 
322 				dtrace_probe(probe->ftp_id, t[0], t[1],
323 				    t[2], t[3], t[4]);
324 			}
325 			continue;
326 		}
327 
328 		/*
329 		 * If this is only a possible return point, we must
330 		 * be looking at a potential tail call in leaf context.
331 		 * If the %npc is still within this function, then we
332 		 * must have misidentified a jmpl as a tail-call when it
333 		 * is, in fact, part of a jump table. It would be nice to
334 		 * remove this tracepoint, but this is neither the time
335 		 * nor the place.
336 		 */
337 		if ((tp->ftt_flags & FASTTRAP_F_RETMAYBE) &&
338 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
339 			continue;
340 
341 		/*
342 		 * It's possible for a function to branch to the delay slot
343 		 * of an instruction that we've identified as a return site.
344 		 * We can dectect this spurious return probe activation by
345 		 * observing that in this case %npc will be %pc + 4 and %npc
346 		 * will be inside the current function (unless the user is
347 		 * doing _crazy_ instruction picking in which case there's
348 		 * very little we can do). The second check is important
349 		 * in case the last instructions of a function make a tail-
350 		 * call to the function located immediately subsequent.
351 		 */
352 		if (rp->r_npc == rp->r_pc + 4 &&
353 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
354 			continue;
355 
356 		/*
357 		 * The first argument is the offset of return tracepoint
358 		 * in the function; the remaining arguments are the return
359 		 * values.
360 		 *
361 		 * If fake_restore is set, we need to pull the return values
362 		 * out of the %i's rather than the %o's -- a little trickier.
363 		 */
364 		if (!fake_restore) {
365 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
366 			    rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3);
367 		} else {
368 			uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
369 			uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
370 			uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
371 			uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
372 
373 			cookie = dtrace_interrupt_disable();
374 			DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
375 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
376 			    arg0, arg1, arg2, arg3);
377 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
378 			dtrace_interrupt_enable(cookie);
379 		}
380 	}
381 
382 	mutex_exit(pid_mtx);
383 }
384 
385 int
386 fasttrap_pid_probe(struct regs *rp)
387 {
388 	proc_t *p = curproc;
389 	fasttrap_tracepoint_t *tp, tp_local;
390 	fasttrap_id_t *id;
391 	pid_t pid;
392 	uintptr_t pc = rp->r_pc;
393 	uintptr_t npc = rp->r_npc;
394 	uintptr_t orig_pc = pc;
395 	fasttrap_bucket_t *bucket;
396 	kmutex_t *pid_mtx;
397 	uint_t fake_restore = 0;
398 	dtrace_icookie_t cookie;
399 
400 	/*
401 	 * It's possible that a user (in a veritable orgy of bad planning)
402 	 * could redirect this thread's flow of control before it reached the
403 	 * return probe fasttrap. In this case we need to kill the process
404 	 * since it's in a unrecoverable state.
405 	 */
406 	if (curthread->t_dtrace_step) {
407 		ASSERT(curthread->t_dtrace_on);
408 		fasttrap_sigtrap(p, curthread, pc);
409 		return (0);
410 	}
411 
412 	/*
413 	 * Clear all user tracing flags.
414 	 */
415 	curthread->t_dtrace_ft = 0;
416 	curthread->t_dtrace_pc = 0;
417 	curthread->t_dtrace_npc = 0;
418 	curthread->t_dtrace_scrpc = 0;
419 	curthread->t_dtrace_astpc = 0;
420 
421 	/*
422 	 * Treat a child created by a call to vfork(2) as if it were its
423 	 * parent. We know that there's only one thread of control in such a
424 	 * process: this one.
425 	 */
426 	while (p->p_flag & SVFORK) {
427 		p = p->p_parent;
428 	}
429 
430 	pid = p->p_pid;
431 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
432 	mutex_enter(pid_mtx);
433 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
434 
435 	/*
436 	 * Lookup the tracepoint that the process just hit.
437 	 */
438 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
439 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
440 		    !tp->ftt_proc->ftpc_defunct)
441 			break;
442 	}
443 
444 	/*
445 	 * If we couldn't find a matching tracepoint, either a tracepoint has
446 	 * been inserted without using the pid<pid> ioctl interface (see
447 	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
448 	 */
449 	if (tp == NULL) {
450 		mutex_exit(pid_mtx);
451 		return (-1);
452 	}
453 
454 	for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
455 		fasttrap_probe_t *probe = id->fti_probe;
456 		int isentry;
457 		/*
458 		 * We note that this was an entry probe to help ustack() find
459 		 * the first caller.
460 		 */
461 		if ((isentry = (probe->ftp_type == DTFTP_ENTRY)) != 0) {
462 			cookie = dtrace_interrupt_disable();
463 			DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
464 		}
465 		dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1, rp->r_o2,
466 		    rp->r_o3, rp->r_o4);
467 		if (isentry) {
468 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
469 			dtrace_interrupt_enable(cookie);
470 		}
471 	}
472 
473 	/*
474 	 * We're about to do a bunch of work so we cache a local copy of
475 	 * the tracepoint to emulate the instruction, and then find the
476 	 * tracepoint again later if we need to light up any return probes.
477 	 */
478 	tp_local = *tp;
479 	mutex_exit(pid_mtx);
480 	tp = &tp_local;
481 
482 	/*
483 	 * We emulate certain types of instructions do ensure correctness
484 	 * (in the case of position dependent instructions) or optimize
485 	 * common cases. The rest we have the thread execute back in user-
486 	 * land.
487 	 */
488 	switch (tp->ftt_type) {
489 	case FASTTRAP_T_SAVE:
490 	{
491 		int32_t imm;
492 
493 		/*
494 		 * This an optimization to let us handle function entry
495 		 * probes more efficiently. Many functions begin with a save
496 		 * instruction that follows the pattern:
497 		 *	save	%sp, <imm>, %sp
498 		 *
499 		 * Meanwhile, we've stashed the instruction:
500 		 *	save	%g1, %g0, %sp
501 		 *
502 		 * off of %g7, so all we have to do is stick the right value
503 		 * into %g1 and reset %pc to point to the instruction we've
504 		 * cleverly hidden (%npc should not be touched).
505 		 */
506 
507 		imm = tp->ftt_instr << 19;
508 		imm >>= 19;
509 		rp->r_g1 = rp->r_sp + imm;
510 		pc = rp->r_g7 + FASTTRAP_OFF_SAVE;
511 		break;
512 	}
513 
514 	case FASTTRAP_T_RESTORE:
515 	{
516 		ulong_t value;
517 		uint_t rd;
518 
519 		/*
520 		 * This is an optimization to let us handle function
521 		 * return probes more efficiently. Most non-leaf functions
522 		 * end with the sequence:
523 		 *	ret
524 		 *	restore	<reg>, <reg_or_imm>, %oX
525 		 *
526 		 * We've stashed the instruction:
527 		 *	restore	%g0, %g0, %g0
528 		 *
529 		 * off of %g7 so we just need to place the correct value
530 		 * in the right %i register (since after our fake-o
531 		 * restore, the %i's will become the %o's) and set the %pc
532 		 * to point to our hidden restore. We also set fake_restore to
533 		 * let fasttrap_return_common() know that it will find the
534 		 * return values in the %i's rather than the %o's.
535 		 */
536 
537 		if (I(tp->ftt_instr)) {
538 			int32_t imm;
539 
540 			imm = tp->ftt_instr << 19;
541 			imm >>= 19;
542 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
543 		} else {
544 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
545 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
546 		}
547 
548 		/*
549 		 * Convert %o's to %i's; leave %g's as they are.
550 		 */
551 		rd = RD(tp->ftt_instr);
552 		fasttrap_putreg(rp, ((rd & 0x18) == 0x8) ? rd + 16 : rd, value);
553 
554 		pc = rp->r_g7 + FASTTRAP_OFF_RESTORE;
555 		fake_restore = 1;
556 		break;
557 	}
558 
559 	case FASTTRAP_T_RETURN:
560 	{
561 		uintptr_t target;
562 
563 		/*
564 		 * A return instruction is like a jmpl (without the link
565 		 * part) that executes an implicit restore. We've stashed
566 		 * the instruction:
567 		 *	return %o0
568 		 *
569 		 * off of %g7 so we just need to place the target in %o0
570 		 * and set the %pc to point to the stashed return instruction.
571 		 * We use %o0 since that register disappears after the return
572 		 * executes, erasing any evidence of this tampering.
573 		 */
574 		if (I(tp->ftt_instr)) {
575 			int32_t imm;
576 
577 			imm = tp->ftt_instr << 19;
578 			imm >>= 19;
579 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
580 		} else {
581 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
582 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
583 		}
584 
585 		fasttrap_putreg(rp, R_O0, target);
586 
587 		pc = rp->r_g7 + FASTTRAP_OFF_RETURN;
588 		fake_restore = 1;
589 		break;
590 	}
591 
592 	case FASTTRAP_T_OR:
593 	{
594 		ulong_t value;
595 
596 		if (I(tp->ftt_instr)) {
597 			int32_t imm;
598 
599 			imm = tp->ftt_instr << 19;
600 			imm >>= 19;
601 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) | imm;
602 		} else {
603 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) |
604 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
605 		}
606 
607 		fasttrap_putreg(rp, RD(tp->ftt_instr), value);
608 		pc = rp->r_npc;
609 		npc = pc + 4;
610 		break;
611 	}
612 
613 	case FASTTRAP_T_SETHI:
614 		if (RD(tp->ftt_instr) != R_G0) {
615 			uint32_t imm32 = tp->ftt_instr << 10;
616 			fasttrap_putreg(rp, RD(tp->ftt_instr), (ulong_t)imm32);
617 		}
618 		pc = rp->r_npc;
619 		npc = pc + 4;
620 		break;
621 
622 	case FASTTRAP_T_CCR:
623 	{
624 		uint_t c, v, z, n, taken;
625 		uint_t ccr = rp->r_tstate >> TSTATE_CCR_SHIFT;
626 
627 		if (tp->ftt_cc != 0)
628 			ccr >>= 4;
629 
630 		c = (ccr >> 0) & 1;
631 		v = (ccr >> 1) & 1;
632 		z = (ccr >> 2) & 1;
633 		n = (ccr >> 3) & 1;
634 
635 		switch (tp->ftt_code) {
636 		case 0x0:	/* BN */
637 			taken = 0;		break;
638 		case 0x1:	/* BE */
639 			taken = z;		break;
640 		case 0x2:	/* BLE */
641 			taken = z | (n ^ v);	break;
642 		case 0x3:	/* BL */
643 			taken = n ^ v;		break;
644 		case 0x4:	/* BLEU */
645 			taken = c | z;		break;
646 		case 0x5:	/* BCS (BLU) */
647 			taken = c;		break;
648 		case 0x6:	/* BNEG */
649 			taken = n;		break;
650 		case 0x7:	/* BVS */
651 			taken = v;		break;
652 		case 0x8:	/* BA */
653 			/*
654 			 * We handle the BA case differently since the annul
655 			 * bit means something slightly different.
656 			 */
657 			panic("fasttrap: mishandled a branch");
658 			taken = 1;		break;
659 		case 0x9:	/* BNE */
660 			taken = ~z;		break;
661 		case 0xa:	/* BG */
662 			taken = ~(z | (n ^ v));	break;
663 		case 0xb:	/* BGE */
664 			taken = ~(n ^ v);	break;
665 		case 0xc:	/* BGU */
666 			taken = ~(c | z);	break;
667 		case 0xd:	/* BCC (BGEU) */
668 			taken = ~c;		break;
669 		case 0xe:	/* BPOS */
670 			taken = ~n;		break;
671 		case 0xf:	/* BVC */
672 			taken = ~v;		break;
673 		}
674 
675 		if (taken & 1) {
676 			pc = rp->r_npc;
677 			npc = tp->ftt_dest;
678 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
679 			/*
680 			 * Untaken annulled branches don't execute the
681 			 * instruction in the delay slot.
682 			 */
683 			pc = rp->r_npc + 4;
684 			npc = pc + 4;
685 		} else {
686 			pc = rp->r_npc;
687 			npc = pc + 4;
688 		}
689 		break;
690 	}
691 
692 	case FASTTRAP_T_FCC:
693 	{
694 		uint_t fcc;
695 		uint_t taken;
696 		uint64_t fsr;
697 
698 		dtrace_getfsr(&fsr);
699 
700 		if (tp->ftt_cc == 0) {
701 			fcc = (fsr >> 10) & 0x3;
702 		} else {
703 			uint_t shift;
704 			ASSERT(tp->ftt_cc <= 3);
705 			shift = 30 + tp->ftt_cc * 2;
706 			fcc = (fsr >> shift) & 0x3;
707 		}
708 
709 		switch (tp->ftt_code) {
710 		case 0x0:	/* FBN */
711 			taken = (1 << fcc) & (0|0|0|0);	break;
712 		case 0x1:	/* FBNE */
713 			taken = (1 << fcc) & (8|4|2|0);	break;
714 		case 0x2:	/* FBLG */
715 			taken = (1 << fcc) & (0|4|2|0);	break;
716 		case 0x3:	/* FBUL */
717 			taken = (1 << fcc) & (8|0|2|0);	break;
718 		case 0x4:	/* FBL */
719 			taken = (1 << fcc) & (0|0|2|0);	break;
720 		case 0x5:	/* FBUG */
721 			taken = (1 << fcc) & (8|4|0|0);	break;
722 		case 0x6:	/* FBG */
723 			taken = (1 << fcc) & (0|4|0|0);	break;
724 		case 0x7:	/* FBU */
725 			taken = (1 << fcc) & (8|0|0|0);	break;
726 		case 0x8:	/* FBA */
727 			/*
728 			 * We handle the FBA case differently since the annul
729 			 * bit means something slightly different.
730 			 */
731 			panic("fasttrap: mishandled a branch");
732 			taken = (1 << fcc) & (8|4|2|1);	break;
733 		case 0x9:	/* FBE */
734 			taken = (1 << fcc) & (0|0|0|1);	break;
735 		case 0xa:	/* FBUE */
736 			taken = (1 << fcc) & (8|0|0|1);	break;
737 		case 0xb:	/* FBGE */
738 			taken = (1 << fcc) & (0|4|0|1);	break;
739 		case 0xc:	/* FBUGE */
740 			taken = (1 << fcc) & (8|4|0|1);	break;
741 		case 0xd:	/* FBLE */
742 			taken = (1 << fcc) & (0|0|2|1);	break;
743 		case 0xe:	/* FBULE */
744 			taken = (1 << fcc) & (8|0|2|1);	break;
745 		case 0xf:	/* FBO */
746 			taken = (1 << fcc) & (0|4|2|1);	break;
747 		}
748 
749 		if (taken) {
750 			pc = rp->r_npc;
751 			npc = tp->ftt_dest;
752 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
753 			/*
754 			 * Untaken annulled branches don't execute the
755 			 * instruction in the delay slot.
756 			 */
757 			pc = rp->r_npc + 4;
758 			npc = pc + 4;
759 		} else {
760 			pc = rp->r_npc;
761 			npc = pc + 4;
762 		}
763 		break;
764 	}
765 
766 	case FASTTRAP_T_REG:
767 	{
768 		uint64_t value;
769 		uint_t taken;
770 		uint_t reg = RS1(tp->ftt_instr);
771 
772 		/*
773 		 * An ILP32 process shouldn't be using a branch predicated on
774 		 * an %i or an %l since it would violate the ABI. It's a
775 		 * violation of the ABI because we can't ensure deterministic
776 		 * behavior. We should have identified this case when we
777 		 * enabled the probe.
778 		 */
779 		ASSERT(p->p_model == DATAMODEL_LP64 || reg < 16);
780 
781 		value = fasttrap_getreg(rp, reg);
782 
783 		switch (tp->ftt_code) {
784 		case 0x1:	/* BRZ */
785 			taken = (value == 0);	break;
786 		case 0x2:	/* BRLEZ */
787 			taken = (value <= 0);	break;
788 		case 0x3:	/* BRLZ */
789 			taken = (value < 0);	break;
790 		case 0x5:	/* BRNZ */
791 			taken = (value != 0);	break;
792 		case 0x6:	/* BRGZ */
793 			taken = (value > 0);	break;
794 		case 0x7:	/* BRGEZ */
795 			taken = (value <= 0);	break;
796 		default:
797 		case 0x0:
798 		case 0x4:
799 			panic("fasttrap: mishandled a branch");
800 		}
801 
802 		if (taken) {
803 			pc = rp->r_npc;
804 			npc = tp->ftt_dest;
805 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
806 			/*
807 			 * Untaken annulled branches don't execute the
808 			 * instruction in the delay slot.
809 			 */
810 			pc = rp->r_npc + 4;
811 			npc = pc + 4;
812 		} else {
813 			pc = rp->r_npc;
814 			npc = pc + 4;
815 		}
816 		break;
817 	}
818 
819 	case FASTTRAP_T_ALWAYS:
820 		/*
821 		 * BAs, BA,As...
822 		 */
823 
824 		if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
825 			/*
826 			 * Annulled branch always instructions never execute
827 			 * the instruction in the delay slot.
828 			 */
829 			pc = tp->ftt_dest;
830 			npc = tp->ftt_dest + 4;
831 		} else {
832 			pc = rp->r_npc;
833 			npc = tp->ftt_dest;
834 		}
835 		break;
836 
837 	case FASTTRAP_T_RDPC:
838 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
839 		pc = rp->r_npc;
840 		npc = pc + 4;
841 		break;
842 
843 	case FASTTRAP_T_CALL:
844 		/*
845 		 * It's a call _and_ link remember...
846 		 */
847 		rp->r_o7 = rp->r_pc;
848 		pc = rp->r_npc;
849 		npc = tp->ftt_dest;
850 		break;
851 
852 	case FASTTRAP_T_JMPL:
853 		pc = rp->r_npc;
854 
855 		if (I(tp->ftt_instr)) {
856 			uint_t rs1 = RS1(tp->ftt_instr);
857 			int32_t imm;
858 
859 			imm = tp->ftt_instr << 19;
860 			imm >>= 19;
861 			npc = fasttrap_getreg(rp, rs1) + imm;
862 		} else {
863 			uint_t rs1 = RS1(tp->ftt_instr);
864 			uint_t rs2 = RS2(tp->ftt_instr);
865 
866 			npc = fasttrap_getreg(rp, rs1) +
867 			    fasttrap_getreg(rp, rs2);
868 		}
869 
870 		/*
871 		 * Do the link part of the jump-and-link instruction.
872 		 */
873 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
874 
875 		break;
876 
877 	case FASTTRAP_T_COMMON:
878 	{
879 		curthread->t_dtrace_scrpc = rp->r_g7;
880 		curthread->t_dtrace_astpc = rp->r_g7 + FASTTRAP_OFF_FTRET;
881 
882 		/*
883 		 * Copy the instruction to a reserved location in the
884 		 * user-land thread structure, then set the PC to that
885 		 * location and leave the NPC alone. We take pains to ensure
886 		 * consistency in the instruction stream (See SPARC
887 		 * Architecture Manual Version 9, sections 8.4.7, A.20, and
888 		 * H.1.6; UltraSPARC I/II User's Manual, sections 3.1.1.1,
889 		 * and 13.6.4) by using the ASI ASI_BLK_COMMIT_S to copy the
890 		 * instruction into the user's address space without
891 		 * bypassing the I$. There's no AS_USER version of this ASI
892 		 * (as exist for other ASIs) so we use the lofault
893 		 * mechanism to catch faults.
894 		 */
895 		if (dtrace_blksuword32(rp->r_g7, &tp->ftt_instr, 1) == -1) {
896 			/*
897 			 * If the copyout fails, then the process's state
898 			 * is not consistent (the effects of the traced
899 			 * instruction will never be seen). This process
900 			 * cannot be allowed to continue execution.
901 			 */
902 			fasttrap_sigtrap(curproc, curthread, pc);
903 			return (0);
904 		}
905 
906 		curthread->t_dtrace_pc = pc;
907 		curthread->t_dtrace_npc = npc;
908 		curthread->t_dtrace_on = 1;
909 
910 		pc = curthread->t_dtrace_scrpc;
911 
912 		if (tp->ftt_retids != NULL) {
913 			curthread->t_dtrace_step = 1;
914 			curthread->t_dtrace_ret = 1;
915 			npc = curthread->t_dtrace_astpc;
916 		}
917 		break;
918 	}
919 
920 	default:
921 		panic("fasttrap: mishandled an instruction");
922 	}
923 
924 	/*
925 	 * This bit me in the ass a couple of times, so lets toss this
926 	 * in as a cursory sanity check.
927 	 */
928 	ASSERT(pc != rp->r_g7 + 4);
929 	ASSERT(pc != rp->r_g7 + 8);
930 
931 	/*
932 	 * If there were no return probes when we first found the tracepoint,
933 	 * we should feel no obligation to honor any return probes that were
934 	 * subsequently enabled -- they'll just have to wait until the next
935 	 * time around.
936 	 */
937 	if (tp->ftt_retids != NULL) {
938 		/*
939 		 * We need to wait until the results of the instruction are
940 		 * apparent before invoking any return probes. If this
941 		 * instruction was emulated we can just call
942 		 * fasttrap_return_common(); if it needs to be executed, we
943 		 * need to wait until we return to the kernel.
944 		 */
945 		if (tp->ftt_type != FASTTRAP_T_COMMON) {
946 			fasttrap_return_common(rp, orig_pc, pid, fake_restore);
947 		} else {
948 			ASSERT(curthread->t_dtrace_ret != 0);
949 			ASSERT(curthread->t_dtrace_pc == orig_pc);
950 			ASSERT(curthread->t_dtrace_scrpc == rp->r_g7);
951 			ASSERT(npc == curthread->t_dtrace_astpc);
952 		}
953 	}
954 
955 	ASSERT(pc != 0);
956 	rp->r_pc = pc;
957 	rp->r_npc = npc;
958 
959 	return (0);
960 }
961 
962 int
963 fasttrap_return_probe(struct regs *rp)
964 {
965 	proc_t *p = ttoproc(curthread);
966 	pid_t pid;
967 	uintptr_t pc = curthread->t_dtrace_pc;
968 	uintptr_t npc = curthread->t_dtrace_npc;
969 
970 	curthread->t_dtrace_pc = 0;
971 	curthread->t_dtrace_npc = 0;
972 	curthread->t_dtrace_scrpc = 0;
973 	curthread->t_dtrace_astpc = 0;
974 
975 	/*
976 	 * Treat a child created by a call to vfork(2) as if it were its
977 	 * parent. We know there's only one thread of control in such a
978 	 * process: this one.
979 	 */
980 	while (p->p_flag & SVFORK) {
981 		p = p->p_parent;
982 	}
983 
984 	/*
985 	 * We set the %pc and %npc to their values when the traced
986 	 * instruction was initially executed so that it appears to
987 	 * dtrace_probe() that we're on the original instruction, and so that
988 	 * the user can't easily detect our complex web of lies.
989 	 * dtrace_return_probe() (our caller) will correctly set %pc and %npc
990 	 * after we return.
991 	 */
992 	rp->r_pc = pc;
993 	rp->r_npc = npc;
994 
995 	pid = p->p_pid;
996 	fasttrap_return_common(rp, pc, pid, 0);
997 
998 	return (0);
999 }
1000 
1001 int
1002 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
1003 {
1004 	fasttrap_instr_t instr = FASTTRAP_INSTR;
1005 
1006 	if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
1007 		return (-1);
1008 
1009 	return (0);
1010 }
1011 
1012 int
1013 fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
1014 {
1015 	fasttrap_instr_t instr;
1016 
1017 	/*
1018 	 * Distinguish between read or write failures and a changed
1019 	 * instruction.
1020 	 */
1021 	if (uread(p, &instr, 4, tp->ftt_pc) != 0)
1022 		return (0);
1023 	if (instr != FASTTRAP_INSTR && instr != BREAKPOINT_INSTR)
1024 		return (0);
1025 	if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
1026 		return (-1);
1027 
1028 	return (0);
1029 }
1030 
1031 int
1032 fasttrap_tracepoint_init(proc_t *p, fasttrap_probe_t *probe,
1033     fasttrap_tracepoint_t *tp, uintptr_t pc)
1034 {
1035 	uint32_t instr;
1036 	int32_t disp;
1037 
1038 	/*
1039 	 * Read the instruction at the given address out of the process's
1040 	 * address space. We don't have to worry about a debugger
1041 	 * changing this instruction before we overwrite it with our trap
1042 	 * instruction since P_PR_LOCK is set.
1043 	 */
1044 	if (uread(p, &instr, 4, pc) != 0)
1045 		return (-1);
1046 
1047 	/*
1048 	 * Decode the instruction to fill in the probe flags. We can have
1049 	 * the process execute most instructions on its own using a pc/npc
1050 	 * trick, but pc-relative control transfer present a problem since
1051 	 * we're relocating the instruction. We emulate these instructions
1052 	 * in the kernel. We assume a default type and over-write that as
1053 	 * needed.
1054 	 *
1055 	 * pc-relative instructions must be emulated for correctness;
1056 	 * other instructions (which represent a large set of commonly traced
1057 	 * instructions) are emulated or otherwise optimized for performance.
1058 	 */
1059 	tp->ftt_type = FASTTRAP_T_COMMON;
1060 	if (OP(instr) == 1) {
1061 		/*
1062 		 * Call instructions.
1063 		 */
1064 		tp->ftt_type = FASTTRAP_T_CALL;
1065 		disp = DISP30(instr) << 2;
1066 		tp->ftt_dest = pc + (intptr_t)disp;
1067 
1068 	} else if (OP(instr) == 0) {
1069 		/*
1070 		 * Branch instructions.
1071 		 *
1072 		 * Unconditional branches need careful attention when they're
1073 		 * annulled: annulled unconditional branches never execute
1074 		 * the instruction in the delay slot.
1075 		 */
1076 		switch (OP2(instr)) {
1077 		case OP2_ILLTRAP:
1078 		case 0x7:
1079 			/*
1080 			 * The compiler may place an illtrap after a call to
1081 			 * a function that returns a structure. In the case of
1082 			 * a returned structure, the compiler places an illtrap
1083 			 * whose const22 field is the size of the returned
1084 			 * structure immediately following the delay slot of
1085 			 * the call. To stay out of the way, we refuse to
1086 			 * place tracepoints on top of illtrap instructions.
1087 			 *
1088 			 * This is one of the dumbest architectural decisions
1089 			 * I've ever had to work around.
1090 			 *
1091 			 * We also identify the only illegal op2 value (See
1092 			 * SPARC Architecture Manual Version 9, E.2 table 31).
1093 			 */
1094 			return (-1);
1095 
1096 		case OP2_BPcc:
1097 			if (COND(instr) == 8) {
1098 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1099 			} else {
1100 				/*
1101 				 * Check for an illegal instruction.
1102 				 */
1103 				if (CC(instr) & 1)
1104 					return (-1);
1105 				tp->ftt_type = FASTTRAP_T_CCR;
1106 				tp->ftt_cc = CC(instr);
1107 				tp->ftt_code = COND(instr);
1108 			}
1109 
1110 			if (A(instr) != 0)
1111 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1112 
1113 			disp = DISP19(instr);
1114 			disp <<= 13;
1115 			disp >>= 11;
1116 			tp->ftt_dest = pc + (intptr_t)disp;
1117 			break;
1118 
1119 		case OP2_Bicc:
1120 			if (COND(instr) == 8) {
1121 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1122 			} else {
1123 				tp->ftt_type = FASTTRAP_T_CCR;
1124 				tp->ftt_cc = 0;
1125 				tp->ftt_code = COND(instr);
1126 			}
1127 
1128 			if (A(instr) != 0)
1129 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1130 
1131 			disp = DISP22(instr);
1132 			disp <<= 10;
1133 			disp >>= 8;
1134 			tp->ftt_dest = pc + (intptr_t)disp;
1135 			break;
1136 
1137 		case OP2_BPr:
1138 			/*
1139 			 * Check for an illegal instruction.
1140 			 */
1141 			if ((RCOND(instr) & 3) == 0)
1142 				return (-1);
1143 
1144 			/*
1145 			 * It's a violation of the v8plus ABI to use a
1146 			 * register-predicated branch in a 32-bit app if
1147 			 * the register used is an %l or an %i (%gs and %os
1148 			 * are legit because they're not saved to the stack
1149 			 * in 32-bit words when we take a trap).
1150 			 */
1151 			if (p->p_model == DATAMODEL_ILP32 && RS1(instr) >= 16)
1152 				return (-1);
1153 
1154 			tp->ftt_type = FASTTRAP_T_REG;
1155 			if (A(instr) != 0)
1156 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1157 			disp = DISP16(instr);
1158 			disp <<= 16;
1159 			disp >>= 14;
1160 			tp->ftt_dest = pc + (intptr_t)disp;
1161 			tp->ftt_code = RCOND(instr);
1162 			break;
1163 
1164 		case OP2_SETHI:
1165 			tp->ftt_type = FASTTRAP_T_SETHI;
1166 			break;
1167 
1168 		case OP2_FBPfcc:
1169 			if (COND(instr) == 8) {
1170 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1171 			} else {
1172 				tp->ftt_type = FASTTRAP_T_FCC;
1173 				tp->ftt_cc = CC(instr);
1174 				tp->ftt_code = COND(instr);
1175 			}
1176 
1177 			if (A(instr) != 0)
1178 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1179 
1180 			disp = DISP19(instr);
1181 			disp <<= 13;
1182 			disp >>= 11;
1183 			tp->ftt_dest = pc + (intptr_t)disp;
1184 			break;
1185 
1186 		case OP2_FBfcc:
1187 			if (COND(instr) == 8) {
1188 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1189 			} else {
1190 				tp->ftt_type = FASTTRAP_T_FCC;
1191 				tp->ftt_cc = 0;
1192 				tp->ftt_code = COND(instr);
1193 			}
1194 
1195 			if (A(instr) != 0)
1196 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1197 
1198 			disp = DISP22(instr);
1199 			disp <<= 10;
1200 			disp >>= 8;
1201 			tp->ftt_dest = pc + (intptr_t)disp;
1202 			break;
1203 		}
1204 
1205 	} else if (OP(instr) == 2) {
1206 		switch (OP3(instr)) {
1207 		case OP3_RETURN:
1208 			tp->ftt_type = FASTTRAP_T_RETURN;
1209 			break;
1210 
1211 		case OP3_JMPL:
1212 			tp->ftt_type = FASTTRAP_T_JMPL;
1213 			break;
1214 
1215 		case OP3_RD:
1216 			if (RS1(instr) == 5)
1217 				tp->ftt_type = FASTTRAP_T_RDPC;
1218 			break;
1219 
1220 		case OP3_SAVE:
1221 			/*
1222 			 * We optimize for save instructions at function
1223 			 * entry; see the comment in fasttrap_pid_probe()
1224 			 * (near FASTTRAP_T_SAVE) for details.
1225 			 */
1226 			if (fasttrap_optimize_save != 0 &&
1227 			    probe->ftp_type == DTFTP_ENTRY &&
1228 			    I(instr) == 1 && RD(instr) == R_SP)
1229 				tp->ftt_type = FASTTRAP_T_SAVE;
1230 			break;
1231 
1232 		case OP3_RESTORE:
1233 			/*
1234 			 * We optimize restore instructions at function
1235 			 * return; see the comment in fasttrap_pid_probe()
1236 			 * (near FASTTRAP_T_RESTORE) for details.
1237 			 *
1238 			 * rd must be an %o or %g register.
1239 			 */
1240 			if ((RD(instr) & 0x10) == 0)
1241 				tp->ftt_type = FASTTRAP_T_RESTORE;
1242 			break;
1243 
1244 		case OP3_OR:
1245 			/*
1246 			 * A large proportion of instructions in the delay
1247 			 * slot of retl instructions are or's so we emulate
1248 			 * these downstairs as an optimization.
1249 			 */
1250 			tp->ftt_type = FASTTRAP_T_OR;
1251 			break;
1252 
1253 		case OP3_TCC:
1254 			/*
1255 			 * Breakpoint instructions are effectively position-
1256 			 * dependent since the debugger uses the %pc value
1257 			 * to lookup which breakpoint was executed. As a
1258 			 * result, we can't actually instrument breakpoints.
1259 			 */
1260 			if (SW_TRAP(instr) == ST_BREAKPOINT)
1261 				return (-1);
1262 			break;
1263 
1264 		case 0x19:
1265 		case 0x1d:
1266 		case 0x29:
1267 		case 0x33:
1268 		case 0x3f:
1269 			/*
1270 			 * Identify illegal instructions (See SPARC
1271 			 * Architecture Manual Version 9, E.2 table 32).
1272 			 */
1273 			return (-1);
1274 		}
1275 	} else if (OP(instr) == 3) {
1276 		uint32_t op3 = OP3(instr);
1277 
1278 		/*
1279 		 * Identify illegal instructions (See SPARC Architecture
1280 		 * Manual Version 9, E.2 table 33).
1281 		 */
1282 		if ((op3 & 0x28) == 0x28) {
1283 			if (op3 != OP3_PREFETCH && op3 != OP3_CASA &&
1284 			    op3 != OP3_PREFETCHA && op3 != OP3_CASXA)
1285 				return (-1);
1286 		} else {
1287 			if ((op3 & 0x0f) == 0x0c || (op3 & 0x3b) == 0x31)
1288 				return (-1);
1289 		}
1290 	}
1291 
1292 	tp->ftt_instr = instr;
1293 
1294 	/*
1295 	 * We don't know how this tracepoint is going to be used, but in case
1296 	 * it's used as part of a function return probe, we need to indicate
1297 	 * whether it's always a return site or only potentially a return
1298 	 * site. If it's part of a return probe, it's always going to be a
1299 	 * return from that function if it's a restore instruction or if
1300 	 * the previous instruction was a return. If we could reliably
1301 	 * distinguish jump tables from return sites, this wouldn't be
1302 	 * necessary.
1303 	 */
1304 	if (tp->ftt_type != FASTTRAP_T_RESTORE &&
1305 	    (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
1306 	    !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
1307 		tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
1308 
1309 	return (0);
1310 }
1311 
1312 /*ARGSUSED*/
1313 uint64_t
1314 fasttrap_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1315 {
1316 	return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
1317 }
1318 
1319 /*ARGSUSED*/
1320 uint64_t
1321 fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1322     int aframes)
1323 {
1324 	return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
1325 }
1326 
1327 static uint64_t fasttrap_getreg_fast_cnt;
1328 static uint64_t fasttrap_getreg_mpcb_cnt;
1329 static uint64_t fasttrap_getreg_slow_cnt;
1330 
1331 static ulong_t
1332 fasttrap_getreg(struct regs *rp, uint_t reg)
1333 {
1334 	ulong_t value;
1335 	dtrace_icookie_t cookie;
1336 	struct machpcb *mpcb;
1337 	extern ulong_t dtrace_getreg_win(uint_t, uint_t);
1338 
1339 	/*
1340 	 * We have the %os and %gs in our struct regs, but if we need to
1341 	 * snag a %l or %i we need to go scrounging around in the process's
1342 	 * address space.
1343 	 */
1344 	if (reg == 0)
1345 		return (0);
1346 
1347 	if (reg < 16)
1348 		return ((&rp->r_g1)[reg - 1]);
1349 
1350 	/*
1351 	 * Before we look at the user's stack, we'll check the register
1352 	 * windows to see if the information we want is in there.
1353 	 */
1354 	cookie = dtrace_interrupt_disable();
1355 	if (dtrace_getotherwin() > 0) {
1356 		value = dtrace_getreg_win(reg, 1);
1357 		dtrace_interrupt_enable(cookie);
1358 
1359 		atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
1360 
1361 		return (value);
1362 	}
1363 	dtrace_interrupt_enable(cookie);
1364 
1365 	/*
1366 	 * First check the machpcb structure to see if we've already read
1367 	 * in the register window we're looking for; if we haven't, (and
1368 	 * we probably haven't) try to copy in the value of the register.
1369 	 */
1370 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
1371 
1372 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1373 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
1374 
1375 		if (mpcb->mpcb_wbcnt > 0) {
1376 			struct rwindow *rwin = (void *)mpcb->mpcb_wbuf;
1377 			int i = mpcb->mpcb_wbcnt;
1378 			do {
1379 				i--;
1380 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1381 					continue;
1382 
1383 				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
1384 				return (rwin[i].rw_local[reg - 16]);
1385 			} while (i > 0);
1386 		}
1387 
1388 		if (fasttrap_fulword(&fr->fr_local[reg - 16], &value) != 0)
1389 			goto err;
1390 	} else {
1391 		struct frame32 *fr =
1392 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
1393 		uint32_t *v32 = (uint32_t *)&value;
1394 
1395 		if (mpcb->mpcb_wbcnt > 0) {
1396 			struct rwindow32 *rwin = (void *)mpcb->mpcb_wbuf;
1397 			int i = mpcb->mpcb_wbcnt;
1398 			do {
1399 				i--;
1400 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1401 					continue;
1402 
1403 				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
1404 				return (rwin[i].rw_local[reg - 16]);
1405 			} while (i > 0);
1406 		}
1407 
1408 		if (fasttrap_fuword32(&fr->fr_local[reg - 16], &v32[1]) != 0)
1409 			goto err;
1410 
1411 		v32[0] = 0;
1412 	}
1413 
1414 	atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
1415 	return (value);
1416 
1417 err:
1418 	/*
1419 	 * If the copy in failed, the process will be in a irrecoverable
1420 	 * state, and we have no choice but to kill it.
1421 	 */
1422 	psignal(ttoproc(curthread), SIGILL);
1423 	return (0);
1424 }
1425 
1426 static uint64_t fasttrap_putreg_fast_cnt;
1427 static uint64_t fasttrap_putreg_mpcb_cnt;
1428 static uint64_t fasttrap_putreg_slow_cnt;
1429 
1430 static void
1431 fasttrap_putreg(struct regs *rp, uint_t reg, ulong_t value)
1432 {
1433 	dtrace_icookie_t cookie;
1434 	struct machpcb *mpcb;
1435 	extern void dtrace_putreg_win(uint_t, ulong_t);
1436 
1437 	if (reg == 0)
1438 		return;
1439 
1440 	if (reg < 16) {
1441 		(&rp->r_g1)[reg - 1] = value;
1442 		return;
1443 	}
1444 
1445 	/*
1446 	 * If the user process is still using some register windows, we
1447 	 * can just place the value in the correct window.
1448 	 */
1449 	cookie = dtrace_interrupt_disable();
1450 	if (dtrace_getotherwin() > 0) {
1451 		dtrace_putreg_win(reg, value);
1452 		dtrace_interrupt_enable(cookie);
1453 		atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
1454 		return;
1455 	}
1456 	dtrace_interrupt_enable(cookie);
1457 
1458 	/*
1459 	 * First see if there's a copy of the register window in the
1460 	 * machpcb structure that we can modify; if there isn't try to
1461 	 * copy out the value. If that fails, we try to create a new
1462 	 * register window in the machpcb structure. While this isn't
1463 	 * _precisely_ the intended use of the machpcb structure, it
1464 	 * can't cause any problems since we know at this point in the
1465 	 * code that all of the user's data have been flushed out of the
1466 	 * register file (since %otherwin is 0).
1467 	 */
1468 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
1469 
1470 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1471 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
1472 		struct rwindow *rwin = (struct rwindow *)mpcb->mpcb_wbuf;
1473 
1474 		if (mpcb->mpcb_wbcnt > 0) {
1475 			int i = mpcb->mpcb_wbcnt;
1476 			do {
1477 				i--;
1478 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1479 					continue;
1480 
1481 				rwin[i].rw_local[reg - 16] = value;
1482 				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1483 				return;
1484 			} while (i > 0);
1485 		}
1486 
1487 		if (fasttrap_sulword(&fr->fr_local[reg - 16], value) != 0) {
1488 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
1489 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
1490 				goto err;
1491 
1492 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
1493 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
1494 			mpcb->mpcb_wbcnt++;
1495 			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1496 			return;
1497 		}
1498 	} else {
1499 		struct frame32 *fr =
1500 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
1501 		struct rwindow32 *rwin = (struct rwindow32 *)mpcb->mpcb_wbuf;
1502 		uint32_t v32 = (uint32_t)value;
1503 
1504 		if (mpcb->mpcb_wbcnt > 0) {
1505 			int i = mpcb->mpcb_wbcnt;
1506 			do {
1507 				i--;
1508 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1509 					continue;
1510 
1511 				rwin[i].rw_local[reg - 16] = v32;
1512 				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1513 				return;
1514 			} while (i > 0);
1515 		}
1516 
1517 		if (fasttrap_suword32(&fr->fr_local[reg - 16], v32) != 0) {
1518 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
1519 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
1520 				goto err;
1521 
1522 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
1523 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
1524 			mpcb->mpcb_wbcnt++;
1525 			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1526 			return;
1527 		}
1528 	}
1529 
1530 	atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
1531 	return;
1532 
1533 err:
1534 	/*
1535 	 * If we couldn't record this register's value, the process is in an
1536 	 * irrecoverable state and we have no choice but to euthanize it.
1537 	 */
1538 	psignal(ttoproc(curthread), SIGILL);
1539 }
1540