xref: /titanic_50/usr/src/uts/sparc/dtrace/fasttrap_isa.c (revision 8d4e547db823a866b8f73efc0acdc423e2963caf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/fasttrap_isa.h>
30 #include <sys/fasttrap_impl.h>
31 #include <sys/dtrace.h>
32 #include <sys/dtrace_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/frame.h>
35 #include <sys/stack.h>
36 #include <sys/sysmacros.h>
37 #include <sys/trap.h>
38 
39 #include <v9/sys/machpcb.h>
40 #include <v9/sys/privregs.h>
41 
42 /*
43  * Lossless User-Land Tracing on SPARC
44  * -----------------------------------
45  *
46  * The Basic Idea
47  *
48  * The most important design constraint is, of course, correct execution of
49  * the user thread above all else. The next most important goal is rapid
50  * execution. We combine execution of instructions in user-land with
51  * emulation of certain instructions in the kernel to aim for complete
52  * correctness and maximal performance.
53  *
54  * We take advantage of the split PC/NPC architecture to speed up logical
55  * single-stepping; when we copy an instruction out to the scratch space in
56  * the ulwp_t structure (held in the %g7 register on SPARC), we can
57  * effectively single step by setting the PC to our scratch space and leaving
58  * the NPC alone. This executes the replaced instruction and then continues
59  * on without having to reenter the kernel as with single-stepping. The
60  * obvious caveat is for instructions whose execution is PC dependent --
61  * branches, call and link instructions (call and jmpl), and the rdpc
62  * instruction. These instructions cannot be executed in the manner described
63  * so they must be emulated in the kernel.
64  *
65  * Emulation for this small set of instructions is fairly simple; the most
66  * difficult part being emulating branch conditions.
67  *
68  *
69  * A Cache Heavy Portfolio
70  *
71  * It's important to note at this time that copying an instruction out to the
72  * ulwp_t scratch space in user-land is rather complicated. SPARC has
73  * separate data and instruction caches so any writes to the D$ (using a
74  * store instruction for example) aren't necessarily reflected in the I$.
75  * The flush instruction can be used to synchronize the two and must be used
76  * for any self-modifying code, but the flush instruction only applies to the
77  * primary address space (the absence of a flusha analogue to the flush
78  * instruction that accepts an ASI argument is an obvious omission from SPARC
79  * v9 where the notion of the alternate address space was introduced on
80  * SPARC). To correctly copy out the instruction we must use a block store
81  * that doesn't allocate in the D$ and ensures synchronization with the I$;
82  * see dtrace_blksuword32() for the implementation  (this function uses
83  * ASI_BLK_COMMIT_S to write a block through the secondary ASI in the manner
84  * described). Refer to the UltraSPARC I/II manual for details on the
85  * ASI_BLK_COMMIT_S ASI.
86  *
87  *
88  * Return Subtleties
89  *
90  * When we're firing a return probe we need to expose the value returned by
91  * the function being traced. Since the function can set the return value
92  * in its last instruction, we need to fire the return probe only _after_
93  * the effects of the instruction are apparent. For instructions that we
94  * emulate, we can call dtrace_probe() after we've performed the emulation;
95  * for instructions that we execute after we return to user-land, we set
96  * %pc to the instruction we copied out (as described above) and set %npc
97  * to a trap instruction stashed in the ulwp_t structure. After the traced
98  * instruction is executed, the trap instruction returns control to the
99  * kernel where we can fire the return probe.
100  *
101  * This need for a second trap in cases where we execute the traced
102  * instruction makes it all the more important to emulate the most common
103  * instructions to avoid the second trip in and out of the kernel.
104  *
105  *
106  * Making it Fast
107  *
108  * Since copying out an instruction is neither simple nor inexpensive for the
109  * CPU, we should attempt to avoid doing it in as many cases as possible.
110  * Since function entry and return are usually the most interesting probe
111  * sites, we attempt to tune the performance of the fasttrap provider around
112  * instructions typically in those places.
113  *
114  * Looking at a bunch of functions in libraries and executables reveals that
115  * most functions begin with either a save or a sethi (to setup a larger
116  * argument to the save) and end with a restore or an or (in the case of leaf
117  * functions). To try to improve performance, we emulate all of these
118  * instructions in the kernel.
119  *
120  * The save and restore instructions are a little tricky since they perform
121  * register window manipulation. Rather than trying to tinker with the
122  * register windows from the kernel, we emulate the implicit add that takes
123  * place as part of those instructions and set the %pc to point to a simple
124  * save or restore we've hidden in the ulwp_t structure. If we're in a return
125  * probe so want to make it seem as though the tracepoint has been completely
126  * executed we need to remember that we've pulled this trick with restore and
127  * pull registers from the previous window (the one that we'll switch to once
128  * the simple restore instruction is executed) rather than the current one. This
129  * is why in the case of emulating a restore we set the DTrace CPU flag
130  * CPU_DTRACE_FAKERESTORE before calling dtrace_probe() for the return probes
131  * (see fasttrap_return_common()).
132  */
133 
134 #define	OP(x)		((x) >> 30)	/* bits 31:30; unmasked, so x must be an unsigned 32-bit word */
135 #define	OP2(x)		(((x) >> 22) & 0x07)	/* bits 24:22 */
136 #define	OP3(x)		(((x) >> 19) & 0x3f)	/* bits 24:19 */
137 #define	RCOND(x)	(((x) >> 25) & 0x07)	/* bits 27:25 */
138 #define	COND(x)		(((x) >> 25) & 0x0f)	/* bits 28:25 */
139 #define	A(x)		(((x) >> 29) & 0x01)	/* bit 29 */
140 #define	I(x)		(((x) >> 13) & 0x01)	/* bit 13; non-zero selects the immediate operand form */
141 #define	RD(x)		(((x) >> 25) & 0x1f)	/* bits 29:25 */
142 #define	RS1(x)		(((x) >> 14) & 0x1f)	/* bits 18:14 */
143 #define	RS2(x)		(((x) >> 0) & 0x1f)	/* bits 4:0 */
144 #define	CC(x)		(((x) >> 20) & 0x03)	/* bits 21:20 */
145 #define	DISP16(x)	((((x) >> 6) & 0xc000) | ((x) & 0x3fff))	/* bits 21:20 moved to 15:14, joined with bits 13:0 */
146 #define	DISP22(x)	((x) & 0x3fffff)	/* low 22 bits; not sign-extended here */
147 #define	DISP19(x)	((x) & 0x7ffff)	/* low 19 bits; not sign-extended here */
148 #define	DISP30(x)	((x) & 0x3fffffff)	/* low 30 bits; not sign-extended here */
149 #define	SW_TRAP(x)	((x) & 0x7f)	/* low 7 bits */
150 
151 #define	OP3_OR		0x02	/* OP3_*: values named for the 6-bit field that OP3() extracts */
152 #define	OP3_RD		0x28
153 #define	OP3_JMPL	0x38
154 #define	OP3_RETURN	0x39
155 #define	OP3_TCC		0x3a
156 #define	OP3_SAVE	0x3c	/* same numeric value as OP3_CASA */
157 #define	OP3_RESTORE	0x3d	/* same numeric value as OP3_PREFETCHA */
158 
159 #define	OP3_PREFETCH	0x2d	/* NOTE(review): this group looks like memory-access op3 values -- confirm */
160 #define	OP3_CASA	0x3c	/* collides with OP3_SAVE; presumably told apart by OP() -- confirm at the use site */
161 #define	OP3_PREFETCHA	0x3d	/* collides with OP3_RESTORE; same caveat */
162 #define	OP3_CASXA	0x3e
163 
164 #define	OP2_ILLTRAP	0x0	/* OP2_*: values named for the 3-bit field that OP2() extracts */
165 #define	OP2_BPcc	0x1
166 #define	OP2_Bicc	0x2
167 #define	OP2_BPr		0x3
168 #define	OP2_SETHI	0x4
169 #define	OP2_FBPfcc	0x5
170 #define	OP2_FBfcc	0x6
171 
172 #define	R_G0		0	/* R_*: register numbers as passed to fasttrap_getreg()/fasttrap_putreg() */
173 #define	R_O0		8
174 #define	R_SP		14
175 #define	R_I0		24	/* R_O0 + 16; fasttrap_pid_probe() maps an %o number to its %i by adding 16 */
176 #define	R_I1		25
177 #define	R_I2		26
178 #define	R_I3		27
179 
180 /*
181  * Check the comment in fasttrap.h when changing these offsets or adding
182  * new instructions.
183  */
184 #define	FASTTRAP_OFF_SAVE	64	/* added to rp->r_g7 to form the pc for FASTTRAP_T_SAVE */
185 #define	FASTTRAP_OFF_RESTORE	68	/* added to rp->r_g7 to form the pc for FASTTRAP_T_RESTORE */
186 #define	FASTTRAP_OFF_FTRET	72	/* added to rp->r_g7 to form t_dtrace_astpc for FASTTRAP_T_COMMON */
187 #define	FASTTRAP_OFF_RETURN	76	/* added to rp->r_g7 to form the pc for FASTTRAP_T_RETURN */
188 
189 #define	BREAKPOINT_INSTR	0x91d02001	/* ta 1; fasttrap_tracepoint_remove() accepts it in place of FASTTRAP_INSTR */
190 
191 /*
192  * Tunable to let users turn off the fancy save instruction optimization.
193  * If a program is non-ABI compliant, there's a possibility that the save
194  * instruction optimization could cause an error.
195  */
196 int fasttrap_optimize_save = 1;	/* defaults to 1; not referenced in the visible part of this file */
197 
198 static uint64_t
199 fasttrap_anarg(struct regs *rp, int argno)
200 {
201 	uint64_t value;
202 
203 	if (argno < 6)
204 		return ((&rp->r_o0)[argno]);
205 
206 	if (curproc->p_model == DATAMODEL_NATIVE) {
207 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
208 
209 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
210 		value = dtrace_fulword(&fr->fr_argd[argno]);
211 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
212 		    CPU_DTRACE_BADALIGN);
213 	} else {
214 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
215 
216 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
217 		value = dtrace_fuword32(&fr->fr_argd[argno]);
218 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
219 		    CPU_DTRACE_BADALIGN);
220 	}
221 
222 	return (value);
223 }
224 
225 static ulong_t fasttrap_getreg(struct regs *, uint_t);	/* forward declaration; body is not in this part of the file */
226 static void fasttrap_putreg(struct regs *, uint_t, ulong_t);	/* forward declaration; body is not in this part of the file */
227 
228 static void
229 fasttrap_usdt_args(fasttrap_probe_t *probe, struct regs *rp, int argc,
230     uintptr_t *argv)
231 {
232 	int i, x, cap = MIN(argc, probe->ftp_nargs);
233 
234 	if (curproc->p_model == DATAMODEL_NATIVE) {
235 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
236 		uintptr_t v;
237 
238 		for (i = 0; i < cap; i++) {
239 			x = probe->ftp_argmap[i];
240 
241 			if (x < 6)
242 				argv[i] = (&rp->r_o0)[x];
243 			else if (fasttrap_fulword(&fr->fr_argd[x], &v) != 0)
244 				argv[i] = 0;
245 		}
246 
247 	} else {
248 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
249 		uint32_t v;
250 
251 		for (i = 0; i < cap; i++) {
252 			x = probe->ftp_argmap[i];
253 
254 			if (x < 6)
255 				argv[i] = (&rp->r_o0)[x];
256 			else if (fasttrap_fuword32(&fr->fr_argd[x], &v) != 0)
257 				argv[i] = 0;
258 		}
259 	}
260 
261 	for (; i < argc; i++) {
262 		argv[i] = 0;
263 	}
264 }
265 
266 static void
267 fasttrap_return_common(struct regs *rp, uintptr_t pc, pid_t pid,
268     uint_t fake_restore)
269 {
270 	fasttrap_tracepoint_t *tp;
271 	fasttrap_bucket_t *bucket;
272 	fasttrap_id_t *id;
273 	kmutex_t *pid_mtx;
274 	dtrace_icookie_t cookie;
275 
276 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
277 	mutex_enter(pid_mtx);
278 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
279 
280 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
281 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
282 		    !tp->ftt_proc->ftpc_defunct)
283 			break;
284 	}
285 
286 	/*
287 	 * Don't sweat it if we can't find the tracepoint again; unlike
288 	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
289 	 * is not essential to the correct execution of the process.
290 	 */
291 	if (tp == NULL || tp->ftt_retids == NULL) {
292 		mutex_exit(pid_mtx);
293 		return;
294 	}
295 
296 	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
297 		fasttrap_probe_t *probe = id->fti_probe;
298 
299 		if (id->fti_ptype == DTFTP_POST_OFFSETS) {
300 			if (probe->ftp_argmap == NULL) {
301 				dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1,
302 				    rp->r_o2, rp->r_o3, rp->r_o4);
303 			} else {
304 				uintptr_t t[5];
305 
306 				fasttrap_usdt_args(probe, rp,
307 				    sizeof (t) / sizeof (t[0]), t);
308 
309 				dtrace_probe(probe->ftp_id, t[0], t[1],
310 				    t[2], t[3], t[4]);
311 			}
312 			continue;
313 		}
314 
315 		/*
316 		 * If this is only a possible return point, we must
317 		 * be looking at a potential tail call in leaf context.
318 		 * If the %npc is still within this function, then we
319 		 * must have misidentified a jmpl as a tail-call when it
320 		 * is, in fact, part of a jump table. It would be nice to
321 		 * remove this tracepoint, but this is neither the time
322 		 * nor the place.
323 		 */
324 		if ((tp->ftt_flags & FASTTRAP_F_RETMAYBE) &&
325 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
326 			continue;
327 
328 		/*
329 		 * It's possible for a function to branch to the delay slot
330 		 * of an instruction that we've identified as a return site.
331 		 * We can dectect this spurious return probe activation by
332 		 * observing that in this case %npc will be %pc + 4 and %npc
333 		 * will be inside the current function (unless the user is
334 		 * doing _crazy_ instruction picking in which case there's
335 		 * very little we can do). The second check is important
336 		 * in case the last instructions of a function make a tail-
337 		 * call to the function located immediately subsequent.
338 		 */
339 		if (rp->r_npc == rp->r_pc + 4 &&
340 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
341 			continue;
342 
343 		/*
344 		 * The first argument is the offset of return tracepoint
345 		 * in the function; the remaining arguments are the return
346 		 * values.
347 		 *
348 		 * If fake_restore is set, we need to pull the return values
349 		 * out of the %i's rather than the %o's -- a little trickier.
350 		 */
351 		if (!fake_restore) {
352 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
353 			    rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3);
354 		} else {
355 			uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
356 			uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
357 			uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
358 			uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
359 
360 			cookie = dtrace_interrupt_disable();
361 			DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
362 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
363 			    arg0, arg1, arg2, arg3);
364 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
365 			dtrace_interrupt_enable(cookie);
366 		}
367 	}
368 
369 	mutex_exit(pid_mtx);
370 }
371 
372 int
373 fasttrap_pid_probe(struct regs *rp)
374 {
375 	proc_t *p = curproc;
376 	fasttrap_tracepoint_t *tp, tp_local;
377 	fasttrap_id_t *id;
378 	pid_t pid;
379 	uintptr_t pc = rp->r_pc;
380 	uintptr_t npc = rp->r_npc;
381 	uintptr_t orig_pc = pc;
382 	fasttrap_bucket_t *bucket;
383 	kmutex_t *pid_mtx;
384 	uint_t fake_restore = 0, is_enabled = 0;
385 	dtrace_icookie_t cookie;
386 
387 	/*
388 	 * It's possible that a user (in a veritable orgy of bad planning)
389 	 * could redirect this thread's flow of control before it reached the
390 	 * return probe fasttrap. In this case we need to kill the process
391 	 * since it's in a unrecoverable state.
392 	 */
393 	if (curthread->t_dtrace_step) {
394 		ASSERT(curthread->t_dtrace_on);
395 		fasttrap_sigtrap(p, curthread, pc);
396 		return (0);
397 	}
398 
399 	/*
400 	 * Clear all user tracing flags.
401 	 */
402 	curthread->t_dtrace_ft = 0;
403 	curthread->t_dtrace_pc = 0;
404 	curthread->t_dtrace_npc = 0;
405 	curthread->t_dtrace_scrpc = 0;
406 	curthread->t_dtrace_astpc = 0;
407 
408 	/*
409 	 * Treat a child created by a call to vfork(2) as if it were its
410 	 * parent. We know that there's only one thread of control in such a
411 	 * process: this one.
412 	 */
413 	while (p->p_flag & SVFORK) {
414 		p = p->p_parent;
415 	}
416 
417 	pid = p->p_pid;
418 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
419 	mutex_enter(pid_mtx);
420 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
421 
422 	/*
423 	 * Lookup the tracepoint that the process just hit.
424 	 */
425 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
426 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
427 		    !tp->ftt_proc->ftpc_defunct)
428 			break;
429 	}
430 
431 	/*
432 	 * If we couldn't find a matching tracepoint, either a tracepoint has
433 	 * been inserted without using the pid<pid> ioctl interface (see
434 	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
435 	 */
436 	if (tp == NULL) {
437 		mutex_exit(pid_mtx);
438 		return (-1);
439 	}
440 
441 	for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
442 		fasttrap_probe_t *probe = id->fti_probe;
443 		int isentry = (id->fti_ptype == DTFTP_ENTRY);
444 
445 		if (id->fti_ptype == DTFTP_IS_ENABLED) {
446 			is_enabled = 1;
447 			continue;
448 		}
449 
450 		/*
451 		 * We note that this was an entry probe to help ustack() find
452 		 * the first caller.
453 		 */
454 		if (isentry) {
455 			cookie = dtrace_interrupt_disable();
456 			DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
457 		}
458 		dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1, rp->r_o2,
459 		    rp->r_o3, rp->r_o4);
460 		if (isentry) {
461 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
462 			dtrace_interrupt_enable(cookie);
463 		}
464 	}
465 
466 	/*
467 	 * We're about to do a bunch of work so we cache a local copy of
468 	 * the tracepoint to emulate the instruction, and then find the
469 	 * tracepoint again later if we need to light up any return probes.
470 	 */
471 	tp_local = *tp;
472 	mutex_exit(pid_mtx);
473 	tp = &tp_local;
474 
475 	/*
476 	 * If there's an is-enabled probe conntected to this tracepoint it
477 	 * means that there was a 'mov %g0, %o0' instruction that was placed
478 	 * there by DTrace when the binary was linked. As this probe is, in
479 	 * fact, enabled, we need to stuff 1 into %o0. Accordingly, we can
480 	 * bypass all the instruction emulation logic since we know the
481 	 * inevitable result. It's possible that a user could construct a
482 	 * scenario where the 'is-enabled' probe was on some other
483 	 * instruction, but that would be a rather exotic way to shoot oneself
484 	 * in the foot.
485 	 */
486 	if (is_enabled) {
487 		rp->r_o0 = 1;
488 		pc = rp->r_npc;
489 		npc = pc + 4;
490 		goto done;
491 	}
492 
493 	/*
494 	 * We emulate certain types of instructions to ensure correctness
495 	 * (in the case of position dependent instructions) or optimize
496 	 * common cases. The rest we have the thread execute back in user-
497 	 * land.
498 	 */
499 	switch (tp->ftt_type) {
500 	case FASTTRAP_T_SAVE:
501 	{
502 		int32_t imm;
503 
504 		/*
505 		 * This an optimization to let us handle function entry
506 		 * probes more efficiently. Many functions begin with a save
507 		 * instruction that follows the pattern:
508 		 *	save	%sp, <imm>, %sp
509 		 *
510 		 * Meanwhile, we've stashed the instruction:
511 		 *	save	%g1, %g0, %sp
512 		 *
513 		 * off of %g7, so all we have to do is stick the right value
514 		 * into %g1 and reset %pc to point to the instruction we've
515 		 * cleverly hidden (%npc should not be touched).
516 		 */
517 
518 		imm = tp->ftt_instr << 19;
519 		imm >>= 19;
520 		rp->r_g1 = rp->r_sp + imm;
521 		pc = rp->r_g7 + FASTTRAP_OFF_SAVE;
522 		break;
523 	}
524 
525 	case FASTTRAP_T_RESTORE:
526 	{
527 		ulong_t value;
528 		uint_t rd;
529 
530 		/*
531 		 * This is an optimization to let us handle function
532 		 * return probes more efficiently. Most non-leaf functions
533 		 * end with the sequence:
534 		 *	ret
535 		 *	restore	<reg>, <reg_or_imm>, %oX
536 		 *
537 		 * We've stashed the instruction:
538 		 *	restore	%g0, %g0, %g0
539 		 *
540 		 * off of %g7 so we just need to place the correct value
541 		 * in the right %i register (since after our fake-o
542 		 * restore, the %i's will become the %o's) and set the %pc
543 		 * to point to our hidden restore. We also set fake_restore to
544 		 * let fasttrap_return_common() know that it will find the
545 		 * return values in the %i's rather than the %o's.
546 		 */
547 
548 		if (I(tp->ftt_instr)) {
549 			int32_t imm;
550 
551 			imm = tp->ftt_instr << 19;
552 			imm >>= 19;
553 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
554 		} else {
555 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
556 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
557 		}
558 
559 		/*
560 		 * Convert %o's to %i's; leave %g's as they are.
561 		 */
562 		rd = RD(tp->ftt_instr);
563 		fasttrap_putreg(rp, ((rd & 0x18) == 0x8) ? rd + 16 : rd, value);
564 
565 		pc = rp->r_g7 + FASTTRAP_OFF_RESTORE;
566 		fake_restore = 1;
567 		break;
568 	}
569 
570 	case FASTTRAP_T_RETURN:
571 	{
572 		uintptr_t target;
573 
574 		/*
575 		 * A return instruction is like a jmpl (without the link
576 		 * part) that executes an implicit restore. We've stashed
577 		 * the instruction:
578 		 *	return %o0
579 		 *
580 		 * off of %g7 so we just need to place the target in %o0
581 		 * and set the %pc to point to the stashed return instruction.
582 		 * We use %o0 since that register disappears after the return
583 		 * executes, erasing any evidence of this tampering.
584 		 */
585 		if (I(tp->ftt_instr)) {
586 			int32_t imm;
587 
588 			imm = tp->ftt_instr << 19;
589 			imm >>= 19;
590 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
591 		} else {
592 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
593 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
594 		}
595 
596 		fasttrap_putreg(rp, R_O0, target);
597 
598 		pc = rp->r_g7 + FASTTRAP_OFF_RETURN;
599 		fake_restore = 1;
600 		break;
601 	}
602 
603 	case FASTTRAP_T_OR:
604 	{
605 		ulong_t value;
606 
607 		if (I(tp->ftt_instr)) {
608 			int32_t imm;
609 
610 			imm = tp->ftt_instr << 19;
611 			imm >>= 19;
612 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) | imm;
613 		} else {
614 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) |
615 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
616 		}
617 
618 		fasttrap_putreg(rp, RD(tp->ftt_instr), value);
619 		pc = rp->r_npc;
620 		npc = pc + 4;
621 		break;
622 	}
623 
624 	case FASTTRAP_T_SETHI:
625 		if (RD(tp->ftt_instr) != R_G0) {
626 			uint32_t imm32 = tp->ftt_instr << 10;
627 			fasttrap_putreg(rp, RD(tp->ftt_instr), (ulong_t)imm32);
628 		}
629 		pc = rp->r_npc;
630 		npc = pc + 4;
631 		break;
632 
633 	case FASTTRAP_T_CCR:
634 	{
635 		uint_t c, v, z, n, taken;
636 		uint_t ccr = rp->r_tstate >> TSTATE_CCR_SHIFT;
637 
638 		if (tp->ftt_cc != 0)
639 			ccr >>= 4;
640 
641 		c = (ccr >> 0) & 1;
642 		v = (ccr >> 1) & 1;
643 		z = (ccr >> 2) & 1;
644 		n = (ccr >> 3) & 1;
645 
646 		switch (tp->ftt_code) {
647 		case 0x0:	/* BN */
648 			taken = 0;		break;
649 		case 0x1:	/* BE */
650 			taken = z;		break;
651 		case 0x2:	/* BLE */
652 			taken = z | (n ^ v);	break;
653 		case 0x3:	/* BL */
654 			taken = n ^ v;		break;
655 		case 0x4:	/* BLEU */
656 			taken = c | z;		break;
657 		case 0x5:	/* BCS (BLU) */
658 			taken = c;		break;
659 		case 0x6:	/* BNEG */
660 			taken = n;		break;
661 		case 0x7:	/* BVS */
662 			taken = v;		break;
663 		case 0x8:	/* BA */
664 			/*
665 			 * We handle the BA case differently since the annul
666 			 * bit means something slightly different.
667 			 */
668 			panic("fasttrap: mishandled a branch");
669 			taken = 1;		break;
670 		case 0x9:	/* BNE */
671 			taken = ~z;		break;
672 		case 0xa:	/* BG */
673 			taken = ~(z | (n ^ v));	break;
674 		case 0xb:	/* BGE */
675 			taken = ~(n ^ v);	break;
676 		case 0xc:	/* BGU */
677 			taken = ~(c | z);	break;
678 		case 0xd:	/* BCC (BGEU) */
679 			taken = ~c;		break;
680 		case 0xe:	/* BPOS */
681 			taken = ~n;		break;
682 		case 0xf:	/* BVC */
683 			taken = ~v;		break;
684 		}
685 
686 		if (taken & 1) {
687 			pc = rp->r_npc;
688 			npc = tp->ftt_dest;
689 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
690 			/*
691 			 * Untaken annulled branches don't execute the
692 			 * instruction in the delay slot.
693 			 */
694 			pc = rp->r_npc + 4;
695 			npc = pc + 4;
696 		} else {
697 			pc = rp->r_npc;
698 			npc = pc + 4;
699 		}
700 		break;
701 	}
702 
703 	case FASTTRAP_T_FCC:
704 	{
705 		uint_t fcc;
706 		uint_t taken;
707 		uint64_t fsr;
708 
709 		dtrace_getfsr(&fsr);
710 
711 		if (tp->ftt_cc == 0) {
712 			fcc = (fsr >> 10) & 0x3;
713 		} else {
714 			uint_t shift;
715 			ASSERT(tp->ftt_cc <= 3);
716 			shift = 30 + tp->ftt_cc * 2;
717 			fcc = (fsr >> shift) & 0x3;
718 		}
719 
720 		switch (tp->ftt_code) {
721 		case 0x0:	/* FBN */
722 			taken = (1 << fcc) & (0|0|0|0);	break;
723 		case 0x1:	/* FBNE */
724 			taken = (1 << fcc) & (8|4|2|0);	break;
725 		case 0x2:	/* FBLG */
726 			taken = (1 << fcc) & (0|4|2|0);	break;
727 		case 0x3:	/* FBUL */
728 			taken = (1 << fcc) & (8|0|2|0);	break;
729 		case 0x4:	/* FBL */
730 			taken = (1 << fcc) & (0|0|2|0);	break;
731 		case 0x5:	/* FBUG */
732 			taken = (1 << fcc) & (8|4|0|0);	break;
733 		case 0x6:	/* FBG */
734 			taken = (1 << fcc) & (0|4|0|0);	break;
735 		case 0x7:	/* FBU */
736 			taken = (1 << fcc) & (8|0|0|0);	break;
737 		case 0x8:	/* FBA */
738 			/*
739 			 * We handle the FBA case differently since the annul
740 			 * bit means something slightly different.
741 			 */
742 			panic("fasttrap: mishandled a branch");
743 			taken = (1 << fcc) & (8|4|2|1);	break;
744 		case 0x9:	/* FBE */
745 			taken = (1 << fcc) & (0|0|0|1);	break;
746 		case 0xa:	/* FBUE */
747 			taken = (1 << fcc) & (8|0|0|1);	break;
748 		case 0xb:	/* FBGE */
749 			taken = (1 << fcc) & (0|4|0|1);	break;
750 		case 0xc:	/* FBUGE */
751 			taken = (1 << fcc) & (8|4|0|1);	break;
752 		case 0xd:	/* FBLE */
753 			taken = (1 << fcc) & (0|0|2|1);	break;
754 		case 0xe:	/* FBULE */
755 			taken = (1 << fcc) & (8|0|2|1);	break;
756 		case 0xf:	/* FBO */
757 			taken = (1 << fcc) & (0|4|2|1);	break;
758 		}
759 
760 		if (taken) {
761 			pc = rp->r_npc;
762 			npc = tp->ftt_dest;
763 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
764 			/*
765 			 * Untaken annulled branches don't execute the
766 			 * instruction in the delay slot.
767 			 */
768 			pc = rp->r_npc + 4;
769 			npc = pc + 4;
770 		} else {
771 			pc = rp->r_npc;
772 			npc = pc + 4;
773 		}
774 		break;
775 	}
776 
777 	case FASTTRAP_T_REG:
778 	{
779 		uint64_t value;
780 		uint_t taken;
781 		uint_t reg = RS1(tp->ftt_instr);
782 
783 		/*
784 		 * An ILP32 process shouldn't be using a branch predicated on
785 		 * an %i or an %l since it would violate the ABI. It's a
786 		 * violation of the ABI because we can't ensure deterministic
787 		 * behavior. We should have identified this case when we
788 		 * enabled the probe.
789 		 */
790 		ASSERT(p->p_model == DATAMODEL_LP64 || reg < 16);
791 
792 		value = fasttrap_getreg(rp, reg);
793 
794 		switch (tp->ftt_code) {
795 		case 0x1:	/* BRZ */
796 			taken = (value == 0);	break;
797 		case 0x2:	/* BRLEZ */
798 			taken = (value <= 0);	break;
799 		case 0x3:	/* BRLZ */
800 			taken = (value < 0);	break;
801 		case 0x5:	/* BRNZ */
802 			taken = (value != 0);	break;
803 		case 0x6:	/* BRGZ */
804 			taken = (value > 0);	break;
805 		case 0x7:	/* BRGEZ */
806 			taken = (value <= 0);	break;
807 		default:
808 		case 0x0:
809 		case 0x4:
810 			panic("fasttrap: mishandled a branch");
811 		}
812 
813 		if (taken) {
814 			pc = rp->r_npc;
815 			npc = tp->ftt_dest;
816 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
817 			/*
818 			 * Untaken annulled branches don't execute the
819 			 * instruction in the delay slot.
820 			 */
821 			pc = rp->r_npc + 4;
822 			npc = pc + 4;
823 		} else {
824 			pc = rp->r_npc;
825 			npc = pc + 4;
826 		}
827 		break;
828 	}
829 
830 	case FASTTRAP_T_ALWAYS:
831 		/*
832 		 * BAs, BA,As...
833 		 */
834 
835 		if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
836 			/*
837 			 * Annulled branch always instructions never execute
838 			 * the instruction in the delay slot.
839 			 */
840 			pc = tp->ftt_dest;
841 			npc = tp->ftt_dest + 4;
842 		} else {
843 			pc = rp->r_npc;
844 			npc = tp->ftt_dest;
845 		}
846 		break;
847 
848 	case FASTTRAP_T_RDPC:
849 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
850 		pc = rp->r_npc;
851 		npc = pc + 4;
852 		break;
853 
854 	case FASTTRAP_T_CALL:
855 		/*
856 		 * It's a call _and_ link remember...
857 		 */
858 		rp->r_o7 = rp->r_pc;
859 		pc = rp->r_npc;
860 		npc = tp->ftt_dest;
861 		break;
862 
863 	case FASTTRAP_T_JMPL:
864 		pc = rp->r_npc;
865 
866 		if (I(tp->ftt_instr)) {
867 			uint_t rs1 = RS1(tp->ftt_instr);
868 			int32_t imm;
869 
870 			imm = tp->ftt_instr << 19;
871 			imm >>= 19;
872 			npc = fasttrap_getreg(rp, rs1) + imm;
873 		} else {
874 			uint_t rs1 = RS1(tp->ftt_instr);
875 			uint_t rs2 = RS2(tp->ftt_instr);
876 
877 			npc = fasttrap_getreg(rp, rs1) +
878 			    fasttrap_getreg(rp, rs2);
879 		}
880 
881 		/*
882 		 * Do the link part of the jump-and-link instruction.
883 		 */
884 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
885 
886 		break;
887 
888 	case FASTTRAP_T_COMMON:
889 	{
890 		curthread->t_dtrace_scrpc = rp->r_g7;
891 		curthread->t_dtrace_astpc = rp->r_g7 + FASTTRAP_OFF_FTRET;
892 
893 		/*
894 		 * Copy the instruction to a reserved location in the
895 		 * user-land thread structure, then set the PC to that
896 		 * location and leave the NPC alone. We take pains to ensure
897 		 * consistency in the instruction stream (See SPARC
898 		 * Architecture Manual Version 9, sections 8.4.7, A.20, and
899 		 * H.1.6; UltraSPARC I/II User's Manual, sections 3.1.1.1,
900 		 * and 13.6.4) by using the ASI ASI_BLK_COMMIT_S to copy the
901 		 * instruction into the user's address space without
902 		 * bypassing the I$. There's no AS_USER version of this ASI
903 		 * (as exist for other ASIs) so we use the lofault
904 		 * mechanism to catch faults.
905 		 */
906 		if (dtrace_blksuword32(rp->r_g7, &tp->ftt_instr, 1) == -1) {
907 			/*
908 			 * If the copyout fails, then the process's state
909 			 * is not consistent (the effects of the traced
910 			 * instruction will never be seen). This process
911 			 * cannot be allowed to continue execution.
912 			 */
913 			fasttrap_sigtrap(curproc, curthread, pc);
914 			return (0);
915 		}
916 
917 		curthread->t_dtrace_pc = pc;
918 		curthread->t_dtrace_npc = npc;
919 		curthread->t_dtrace_on = 1;
920 
921 		pc = curthread->t_dtrace_scrpc;
922 
923 		if (tp->ftt_retids != NULL) {
924 			curthread->t_dtrace_step = 1;
925 			curthread->t_dtrace_ret = 1;
926 			npc = curthread->t_dtrace_astpc;
927 		}
928 		break;
929 	}
930 
931 	default:
932 		panic("fasttrap: mishandled an instruction");
933 	}
934 
935 	/*
936 	 * This bit me in the ass a couple of times, so lets toss this
937 	 * in as a cursory sanity check.
938 	 */
939 	ASSERT(pc != rp->r_g7 + 4);
940 	ASSERT(pc != rp->r_g7 + 8);
941 
942 done:
943 	/*
944 	 * If there were no return probes when we first found the tracepoint,
945 	 * we should feel no obligation to honor any return probes that were
946 	 * subsequently enabled -- they'll just have to wait until the next
947 	 * time around.
948 	 */
949 	if (tp->ftt_retids != NULL) {
950 		/*
951 		 * We need to wait until the results of the instruction are
952 		 * apparent before invoking any return probes. If this
953 		 * instruction was emulated we can just call
954 		 * fasttrap_return_common(); if it needs to be executed, we
955 		 * need to wait until we return to the kernel.
956 		 */
957 		if (tp->ftt_type != FASTTRAP_T_COMMON) {
958 			fasttrap_return_common(rp, orig_pc, pid, fake_restore);
959 		} else {
960 			ASSERT(curthread->t_dtrace_ret != 0);
961 			ASSERT(curthread->t_dtrace_pc == orig_pc);
962 			ASSERT(curthread->t_dtrace_scrpc == rp->r_g7);
963 			ASSERT(npc == curthread->t_dtrace_astpc);
964 		}
965 	}
966 
967 	ASSERT(pc != 0);
968 	rp->r_pc = pc;
969 	rp->r_npc = npc;
970 
971 	return (0);
972 }
973 
974 int
975 fasttrap_return_probe(struct regs *rp)
976 {
977 	proc_t *p = ttoproc(curthread);
978 	pid_t pid;
979 	uintptr_t pc = curthread->t_dtrace_pc;
980 	uintptr_t npc = curthread->t_dtrace_npc;
981 
982 	curthread->t_dtrace_pc = 0;
983 	curthread->t_dtrace_npc = 0;
984 	curthread->t_dtrace_scrpc = 0;
985 	curthread->t_dtrace_astpc = 0;
986 
987 	/*
988 	 * Treat a child created by a call to vfork(2) as if it were its
989 	 * parent. We know there's only one thread of control in such a
990 	 * process: this one.
991 	 */
992 	while (p->p_flag & SVFORK) {
993 		p = p->p_parent;
994 	}
995 
996 	/*
997 	 * We set the %pc and %npc to their values when the traced
998 	 * instruction was initially executed so that it appears to
999 	 * dtrace_probe() that we're on the original instruction, and so that
1000 	 * the user can't easily detect our complex web of lies.
1001 	 * dtrace_return_probe() (our caller) will correctly set %pc and %npc
1002 	 * after we return.
1003 	 */
1004 	rp->r_pc = pc;
1005 	rp->r_npc = npc;
1006 
1007 	pid = p->p_pid;
1008 	fasttrap_return_common(rp, pc, pid, 0);
1009 
1010 	return (0);
1011 }
1012 
1013 int
1014 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
1015 {
1016 	fasttrap_instr_t instr = FASTTRAP_INSTR;
1017 
1018 	if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
1019 		return (-1);
1020 
1021 	return (0);
1022 }
1023 
1024 int
1025 fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
1026 {
1027 	fasttrap_instr_t instr;
1028 
1029 	/*
1030 	 * Distinguish between read or write failures and a changed
1031 	 * instruction.
1032 	 */
1033 	if (uread(p, &instr, 4, tp->ftt_pc) != 0)
1034 		return (0);
1035 	if (instr != FASTTRAP_INSTR && instr != BREAKPOINT_INSTR)
1036 		return (0);
1037 	if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
1038 		return (-1);
1039 
1040 	return (0);
1041 }
1042 
1043 int
1044 fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
1045     fasttrap_probe_type_t type)
1046 {
1047 	uint32_t instr;
1048 	int32_t disp;
1049 
1050 	/*
1051 	 * Read the instruction at the given address out of the process's
1052 	 * address space. We don't have to worry about a debugger
1053 	 * changing this instruction before we overwrite it with our trap
1054 	 * instruction since P_PR_LOCK is set.
1055 	 */
1056 	if (uread(p, &instr, 4, pc) != 0)
1057 		return (-1);
1058 
1059 	/*
1060 	 * Decode the instruction to fill in the probe flags. We can have
1061 	 * the process execute most instructions on its own using a pc/npc
1062 	 * trick, but pc-relative control transfer present a problem since
1063 	 * we're relocating the instruction. We emulate these instructions
1064 	 * in the kernel. We assume a default type and over-write that as
1065 	 * needed.
1066 	 *
1067 	 * pc-relative instructions must be emulated for correctness;
1068 	 * other instructions (which represent a large set of commonly traced
1069 	 * instructions) are emulated or otherwise optimized for performance.
1070 	 */
1071 	tp->ftt_type = FASTTRAP_T_COMMON;
1072 	if (OP(instr) == 1) {
1073 		/*
1074 		 * Call instructions.
1075 		 */
1076 		tp->ftt_type = FASTTRAP_T_CALL;
1077 		disp = DISP30(instr) << 2;
1078 		tp->ftt_dest = pc + (intptr_t)disp;
1079 
1080 	} else if (OP(instr) == 0) {
1081 		/*
1082 		 * Branch instructions.
1083 		 *
1084 		 * Unconditional branches need careful attention when they're
1085 		 * annulled: annulled unconditional branches never execute
1086 		 * the instruction in the delay slot.
1087 		 */
1088 		switch (OP2(instr)) {
1089 		case OP2_ILLTRAP:
1090 		case 0x7:
1091 			/*
1092 			 * The compiler may place an illtrap after a call to
1093 			 * a function that returns a structure. In the case of
1094 			 * a returned structure, the compiler places an illtrap
1095 			 * whose const22 field is the size of the returned
1096 			 * structure immediately following the delay slot of
1097 			 * the call. To stay out of the way, we refuse to
1098 			 * place tracepoints on top of illtrap instructions.
1099 			 *
1100 			 * This is one of the dumbest architectural decisions
1101 			 * I've ever had to work around.
1102 			 *
1103 			 * We also identify the only illegal op2 value (See
1104 			 * SPARC Architecture Manual Version 9, E.2 table 31).
1105 			 */
1106 			return (-1);
1107 
1108 		case OP2_BPcc:
1109 			if (COND(instr) == 8) {
1110 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1111 			} else {
1112 				/*
1113 				 * Check for an illegal instruction.
1114 				 */
1115 				if (CC(instr) & 1)
1116 					return (-1);
1117 				tp->ftt_type = FASTTRAP_T_CCR;
1118 				tp->ftt_cc = CC(instr);
1119 				tp->ftt_code = COND(instr);
1120 			}
1121 
1122 			if (A(instr) != 0)
1123 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1124 
1125 			disp = DISP19(instr);
1126 			disp <<= 13;
1127 			disp >>= 11;
1128 			tp->ftt_dest = pc + (intptr_t)disp;
1129 			break;
1130 
1131 		case OP2_Bicc:
1132 			if (COND(instr) == 8) {
1133 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1134 			} else {
1135 				tp->ftt_type = FASTTRAP_T_CCR;
1136 				tp->ftt_cc = 0;
1137 				tp->ftt_code = COND(instr);
1138 			}
1139 
1140 			if (A(instr) != 0)
1141 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1142 
1143 			disp = DISP22(instr);
1144 			disp <<= 10;
1145 			disp >>= 8;
1146 			tp->ftt_dest = pc + (intptr_t)disp;
1147 			break;
1148 
1149 		case OP2_BPr:
1150 			/*
1151 			 * Check for an illegal instruction.
1152 			 */
1153 			if ((RCOND(instr) & 3) == 0)
1154 				return (-1);
1155 
1156 			/*
1157 			 * It's a violation of the v8plus ABI to use a
1158 			 * register-predicated branch in a 32-bit app if
1159 			 * the register used is an %l or an %i (%gs and %os
1160 			 * are legit because they're not saved to the stack
1161 			 * in 32-bit words when we take a trap).
1162 			 */
1163 			if (p->p_model == DATAMODEL_ILP32 && RS1(instr) >= 16)
1164 				return (-1);
1165 
1166 			tp->ftt_type = FASTTRAP_T_REG;
1167 			if (A(instr) != 0)
1168 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1169 			disp = DISP16(instr);
1170 			disp <<= 16;
1171 			disp >>= 14;
1172 			tp->ftt_dest = pc + (intptr_t)disp;
1173 			tp->ftt_code = RCOND(instr);
1174 			break;
1175 
1176 		case OP2_SETHI:
1177 			tp->ftt_type = FASTTRAP_T_SETHI;
1178 			break;
1179 
1180 		case OP2_FBPfcc:
1181 			if (COND(instr) == 8) {
1182 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1183 			} else {
1184 				tp->ftt_type = FASTTRAP_T_FCC;
1185 				tp->ftt_cc = CC(instr);
1186 				tp->ftt_code = COND(instr);
1187 			}
1188 
1189 			if (A(instr) != 0)
1190 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1191 
1192 			disp = DISP19(instr);
1193 			disp <<= 13;
1194 			disp >>= 11;
1195 			tp->ftt_dest = pc + (intptr_t)disp;
1196 			break;
1197 
1198 		case OP2_FBfcc:
1199 			if (COND(instr) == 8) {
1200 				tp->ftt_type = FASTTRAP_T_ALWAYS;
1201 			} else {
1202 				tp->ftt_type = FASTTRAP_T_FCC;
1203 				tp->ftt_cc = 0;
1204 				tp->ftt_code = COND(instr);
1205 			}
1206 
1207 			if (A(instr) != 0)
1208 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
1209 
1210 			disp = DISP22(instr);
1211 			disp <<= 10;
1212 			disp >>= 8;
1213 			tp->ftt_dest = pc + (intptr_t)disp;
1214 			break;
1215 		}
1216 
1217 	} else if (OP(instr) == 2) {
1218 		switch (OP3(instr)) {
1219 		case OP3_RETURN:
1220 			tp->ftt_type = FASTTRAP_T_RETURN;
1221 			break;
1222 
1223 		case OP3_JMPL:
1224 			tp->ftt_type = FASTTRAP_T_JMPL;
1225 			break;
1226 
1227 		case OP3_RD:
1228 			if (RS1(instr) == 5)
1229 				tp->ftt_type = FASTTRAP_T_RDPC;
1230 			break;
1231 
1232 		case OP3_SAVE:
1233 			/*
1234 			 * We optimize for save instructions at function
1235 			 * entry; see the comment in fasttrap_pid_probe()
1236 			 * (near FASTTRAP_T_SAVE) for details.
1237 			 */
1238 			if (fasttrap_optimize_save != 0 &&
1239 			    type == DTFTP_ENTRY &&
1240 			    I(instr) == 1 && RD(instr) == R_SP)
1241 				tp->ftt_type = FASTTRAP_T_SAVE;
1242 			break;
1243 
1244 		case OP3_RESTORE:
1245 			/*
1246 			 * We optimize restore instructions at function
1247 			 * return; see the comment in fasttrap_pid_probe()
1248 			 * (near FASTTRAP_T_RESTORE) for details.
1249 			 *
1250 			 * rd must be an %o or %g register.
1251 			 */
1252 			if ((RD(instr) & 0x10) == 0)
1253 				tp->ftt_type = FASTTRAP_T_RESTORE;
1254 			break;
1255 
1256 		case OP3_OR:
1257 			/*
1258 			 * A large proportion of instructions in the delay
1259 			 * slot of retl instructions are or's so we emulate
1260 			 * these downstairs as an optimization.
1261 			 */
1262 			tp->ftt_type = FASTTRAP_T_OR;
1263 			break;
1264 
1265 		case OP3_TCC:
1266 			/*
1267 			 * Breakpoint instructions are effectively position-
1268 			 * dependent since the debugger uses the %pc value
1269 			 * to lookup which breakpoint was executed. As a
1270 			 * result, we can't actually instrument breakpoints.
1271 			 */
1272 			if (SW_TRAP(instr) == ST_BREAKPOINT)
1273 				return (-1);
1274 			break;
1275 
1276 		case 0x19:
1277 		case 0x1d:
1278 		case 0x29:
1279 		case 0x33:
1280 		case 0x3f:
1281 			/*
1282 			 * Identify illegal instructions (See SPARC
1283 			 * Architecture Manual Version 9, E.2 table 32).
1284 			 */
1285 			return (-1);
1286 		}
1287 	} else if (OP(instr) == 3) {
1288 		uint32_t op3 = OP3(instr);
1289 
1290 		/*
1291 		 * Identify illegal instructions (See SPARC Architecture
1292 		 * Manual Version 9, E.2 table 33).
1293 		 */
1294 		if ((op3 & 0x28) == 0x28) {
1295 			if (op3 != OP3_PREFETCH && op3 != OP3_CASA &&
1296 			    op3 != OP3_PREFETCHA && op3 != OP3_CASXA)
1297 				return (-1);
1298 		} else {
1299 			if ((op3 & 0x0f) == 0x0c || (op3 & 0x3b) == 0x31)
1300 				return (-1);
1301 		}
1302 	}
1303 
1304 	tp->ftt_instr = instr;
1305 
1306 	/*
1307 	 * We don't know how this tracepoint is going to be used, but in case
1308 	 * it's used as part of a function return probe, we need to indicate
1309 	 * whether it's always a return site or only potentially a return
1310 	 * site. If it's part of a return probe, it's always going to be a
1311 	 * return from that function if it's a restore instruction or if
1312 	 * the previous instruction was a return. If we could reliably
1313 	 * distinguish jump tables from return sites, this wouldn't be
1314 	 * necessary.
1315 	 */
1316 	if (tp->ftt_type != FASTTRAP_T_RESTORE &&
1317 	    (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
1318 	    !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
1319 		tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
1320 
1321 	return (0);
1322 }
1323 
1324 /*ARGSUSED*/
1325 uint64_t
1326 fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1327     int aframes)
1328 {
1329 	return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
1330 }
1331 
1332 /*ARGSUSED*/
1333 uint64_t
1334 fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1335     int aframes)
1336 {
1337 	return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
1338 }
1339 
1340 static uint64_t fasttrap_getreg_fast_cnt;
1341 static uint64_t fasttrap_getreg_mpcb_cnt;
1342 static uint64_t fasttrap_getreg_slow_cnt;
1343 
1344 static ulong_t
1345 fasttrap_getreg(struct regs *rp, uint_t reg)
1346 {
1347 	ulong_t value;
1348 	dtrace_icookie_t cookie;
1349 	struct machpcb *mpcb;
1350 	extern ulong_t dtrace_getreg_win(uint_t, uint_t);
1351 
1352 	/*
1353 	 * We have the %os and %gs in our struct regs, but if we need to
1354 	 * snag a %l or %i we need to go scrounging around in the process's
1355 	 * address space.
1356 	 */
1357 	if (reg == 0)
1358 		return (0);
1359 
1360 	if (reg < 16)
1361 		return ((&rp->r_g1)[reg - 1]);
1362 
1363 	/*
1364 	 * Before we look at the user's stack, we'll check the register
1365 	 * windows to see if the information we want is in there.
1366 	 */
1367 	cookie = dtrace_interrupt_disable();
1368 	if (dtrace_getotherwin() > 0) {
1369 		value = dtrace_getreg_win(reg, 1);
1370 		dtrace_interrupt_enable(cookie);
1371 
1372 		atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
1373 
1374 		return (value);
1375 	}
1376 	dtrace_interrupt_enable(cookie);
1377 
1378 	/*
1379 	 * First check the machpcb structure to see if we've already read
1380 	 * in the register window we're looking for; if we haven't, (and
1381 	 * we probably haven't) try to copy in the value of the register.
1382 	 */
1383 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
1384 
1385 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1386 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
1387 
1388 		if (mpcb->mpcb_wbcnt > 0) {
1389 			struct rwindow *rwin = (void *)mpcb->mpcb_wbuf;
1390 			int i = mpcb->mpcb_wbcnt;
1391 			do {
1392 				i--;
1393 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1394 					continue;
1395 
1396 				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
1397 				return (rwin[i].rw_local[reg - 16]);
1398 			} while (i > 0);
1399 		}
1400 
1401 		if (fasttrap_fulword(&fr->fr_local[reg - 16], &value) != 0)
1402 			goto err;
1403 	} else {
1404 		struct frame32 *fr =
1405 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
1406 		uint32_t *v32 = (uint32_t *)&value;
1407 
1408 		if (mpcb->mpcb_wbcnt > 0) {
1409 			struct rwindow32 *rwin = (void *)mpcb->mpcb_wbuf;
1410 			int i = mpcb->mpcb_wbcnt;
1411 			do {
1412 				i--;
1413 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1414 					continue;
1415 
1416 				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
1417 				return (rwin[i].rw_local[reg - 16]);
1418 			} while (i > 0);
1419 		}
1420 
1421 		if (fasttrap_fuword32(&fr->fr_local[reg - 16], &v32[1]) != 0)
1422 			goto err;
1423 
1424 		v32[0] = 0;
1425 	}
1426 
1427 	atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
1428 	return (value);
1429 
1430 err:
1431 	/*
1432 	 * If the copy in failed, the process will be in a irrecoverable
1433 	 * state, and we have no choice but to kill it.
1434 	 */
1435 	psignal(ttoproc(curthread), SIGILL);
1436 	return (0);
1437 }
1438 
1439 static uint64_t fasttrap_putreg_fast_cnt;
1440 static uint64_t fasttrap_putreg_mpcb_cnt;
1441 static uint64_t fasttrap_putreg_slow_cnt;
1442 
1443 static void
1444 fasttrap_putreg(struct regs *rp, uint_t reg, ulong_t value)
1445 {
1446 	dtrace_icookie_t cookie;
1447 	struct machpcb *mpcb;
1448 	extern void dtrace_putreg_win(uint_t, ulong_t);
1449 
1450 	if (reg == 0)
1451 		return;
1452 
1453 	if (reg < 16) {
1454 		(&rp->r_g1)[reg - 1] = value;
1455 		return;
1456 	}
1457 
1458 	/*
1459 	 * If the user process is still using some register windows, we
1460 	 * can just place the value in the correct window.
1461 	 */
1462 	cookie = dtrace_interrupt_disable();
1463 	if (dtrace_getotherwin() > 0) {
1464 		dtrace_putreg_win(reg, value);
1465 		dtrace_interrupt_enable(cookie);
1466 		atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
1467 		return;
1468 	}
1469 	dtrace_interrupt_enable(cookie);
1470 
1471 	/*
1472 	 * First see if there's a copy of the register window in the
1473 	 * machpcb structure that we can modify; if there isn't try to
1474 	 * copy out the value. If that fails, we try to create a new
1475 	 * register window in the machpcb structure. While this isn't
1476 	 * _precisely_ the intended use of the machpcb structure, it
1477 	 * can't cause any problems since we know at this point in the
1478 	 * code that all of the user's data have been flushed out of the
1479 	 * register file (since %otherwin is 0).
1480 	 */
1481 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
1482 
1483 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1484 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
1485 		struct rwindow *rwin = (struct rwindow *)mpcb->mpcb_wbuf;
1486 
1487 		if (mpcb->mpcb_wbcnt > 0) {
1488 			int i = mpcb->mpcb_wbcnt;
1489 			do {
1490 				i--;
1491 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1492 					continue;
1493 
1494 				rwin[i].rw_local[reg - 16] = value;
1495 				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1496 				return;
1497 			} while (i > 0);
1498 		}
1499 
1500 		if (fasttrap_sulword(&fr->fr_local[reg - 16], value) != 0) {
1501 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
1502 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
1503 				goto err;
1504 
1505 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
1506 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
1507 			mpcb->mpcb_wbcnt++;
1508 			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1509 			return;
1510 		}
1511 	} else {
1512 		struct frame32 *fr =
1513 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
1514 		struct rwindow32 *rwin = (struct rwindow32 *)mpcb->mpcb_wbuf;
1515 		uint32_t v32 = (uint32_t)value;
1516 
1517 		if (mpcb->mpcb_wbcnt > 0) {
1518 			int i = mpcb->mpcb_wbcnt;
1519 			do {
1520 				i--;
1521 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
1522 					continue;
1523 
1524 				rwin[i].rw_local[reg - 16] = v32;
1525 				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1526 				return;
1527 			} while (i > 0);
1528 		}
1529 
1530 		if (fasttrap_suword32(&fr->fr_local[reg - 16], v32) != 0) {
1531 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
1532 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
1533 				goto err;
1534 
1535 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
1536 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
1537 			mpcb->mpcb_wbcnt++;
1538 			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
1539 			return;
1540 		}
1541 	}
1542 
1543 	atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
1544 	return;
1545 
1546 err:
1547 	/*
1548 	 * If we couldn't record this register's value, the process is in an
1549 	 * irrecoverable state and we have no choice but to euthanize it.
1550 	 */
1551 	psignal(ttoproc(curthread), SIGILL);
1552 }
1553