xref: /titanic_41/usr/src/uts/sparc/v9/os/simulator.c (revision 08045defdf65ee890fef6e20510a093a17feb8fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* common code with bug fixes from original version in trap.c */
27 
28 #include <sys/param.h>
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/archsystm.h>
32 #include <sys/vmsystm.h>
33 #include <sys/fpu/fpusystm.h>
34 #include <sys/fpu/fpu_simulator.h>
35 #include <sys/inline.h>
36 #include <sys/debug.h>
37 #include <sys/privregs.h>
38 #include <sys/machpcb.h>
39 #include <sys/simulate.h>
40 #include <sys/proc.h>
41 #include <sys/cmn_err.h>
42 #include <sys/stack.h>
43 #include <sys/watchpoint.h>
44 #include <sys/trap.h>
45 #include <sys/machtrap.h>
46 #include <sys/mman.h>
47 #include <sys/asi.h>
48 #include <sys/copyops.h>
49 #include <vm/as.h>
50 #include <vm/page.h>
51 #include <sys/model.h>
52 #include <vm/seg_vn.h>
53 #include <sys/byteorder.h>
54 
/*
 * Instruction-field predicates used by the simulators below.  Every macro
 * argument is parenthesized so that a compound expression (for example
 * "a | b") is evaluated as a unit instead of being re-associated with the
 * operators inside the macro body.
 */
#define	IS_IBIT_SET(x)	((x) & 0x2000)	/* bit 13 of the instruction word */
#define	IS_VIS1(op, op3)	((op) == 2 && (op3) == 0x36)
#define	IS_FLOAT_QUAD_OP(op, op3)	((op) == 2 && ((op3) == 0x34 ||	\
		(op3) == 0x35))
#define	IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi)		\
		((op) == 3 && ((op3) == IOP_V8_LDDFA ||		\
		(op3) == IOP_V8_STDFA) && (asi) > ASI_SNFL)

/* When non-zero, do_unaligned() printf()s a trace of every access it fixes. */
static int aligndebug = 0;
64 
65 /*
66  * For the sake of those who must be compatible with unaligned
67  * architectures, users can link their programs to use a
68  * corrective trap handler that will fix unaligned references
69  * a special trap #6 (T_FIX_ALIGN) enables this 'feature'.
70  * Returns 1 for success, 0 for failure.
71  */
72 
73 int
74 do_unaligned(struct regs *rp, caddr_t *badaddr)
75 {
76 	uint_t	inst, op3, asi = 0;
77 	uint_t	rd, rs1, rs2;
78 	int	sz, nf = 0, ltlend = 0;
79 	int	floatflg;
80 	int	fsrflg;
81 	int	immflg;
82 	int	lddstdflg;
83 	caddr_t	addr;
84 	uint64_t val;
85 	union {
86 		uint64_t	l[2];
87 		uint32_t	i[4];
88 		uint16_t	s[8];
89 		uint8_t		c[16];
90 	} data;
91 
92 	ASSERT(USERMODE(rp->r_tstate));
93 	inst = fetch_user_instr((caddr_t)rp->r_pc);
94 
95 	op3 = (inst >> 19) & 0x3f;
96 	rd = (inst >> 25) & 0x1f;
97 	rs1 = (inst >> 14) & 0x1f;
98 	rs2 = inst & 0x1f;
99 	floatflg = (inst >> 24) & 1;
100 	immflg = (inst >> 13) & 1;
101 	lddstdflg = fsrflg = 0;
102 
103 	/* if not load or store do nothing */
104 	if ((inst >> 30) != 3)
105 		return (0);
106 
107 	/* if ldstub or swap, do nothing */
108 	if ((inst & 0xc1680000) == 0xc0680000)
109 		return (0);
110 
111 	/* if cas/casx, do nothing */
112 	if ((inst & 0xc1e00000) == 0xc1e00000)
113 		return (0);
114 
115 	if (floatflg) {
116 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
117 		case 0: sz = 4;
118 			break;			/* ldf{a}/stf{a} */
119 		case 1: fsrflg = 1;
120 			if (rd == 0)
121 				sz = 4;		/* ldfsr/stfsr */
122 			else  if (rd == 1)
123 				sz = 8;		/* ldxfsr/stxfsr */
124 			else
125 				return (SIMU_ILLEGAL);
126 			break;
127 		case 2: sz = 16;
128 			break;		/* ldqf{a}/stqf{a} */
129 		case 3: sz = 8;
130 			break;		/* lddf{a}/stdf{a} */
131 		}
132 		/*
133 		 * Fix to access extra double register encoding plus
134 		 * compensate to access the correct fpu_dreg.
135 		 */
136 		if ((sz > 4) && (fsrflg == 0)) {
137 			if ((rd & 1) == 1)
138 				rd = (rd & 0x1e) | 0x20;
139 			rd = rd >> 1;
140 			if ((sz == 16) && ((rd & 0x1) != 0))
141 				return (SIMU_ILLEGAL);
142 		}
143 	} else {
144 		int sz_bits = (inst >> 19) & 0xf;
145 		switch (sz_bits) {		/* map size bits to a number */
146 		case 0:				/* lduw{a} */
147 		case 4:				/* stw{a} */
148 		case 8:				/* ldsw{a} */
149 		case 0xf:			/* swap */
150 			sz = 4; break;
151 		case 1:				/* ldub{a} */
152 		case 5:				/* stb{a} */
153 		case 9:				/* ldsb{a} */
154 		case 0xd:			/* ldstub */
155 			sz = 1; break;
156 		case 2:				/* lduh{a} */
157 		case 6:				/* sth{a} */
158 		case 0xa:			/* ldsh{a} */
159 			sz = 2; break;
160 		case 3:				/* ldd{a} */
161 		case 7:				/* std{a} */
162 			lddstdflg = 1;
163 			sz = 8; break;
164 		case 0xb:			/* ldx{a} */
165 		case 0xe:			/* stx{a} */
166 			sz = 8; break;
167 		}
168 	}
169 
170 
171 	/* only support primary and secondary asi's */
172 	if ((op3 >> 4) & 1) {
173 		if (immflg) {
174 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
175 			    TSTATE_ASI_MASK;
176 		} else {
177 			asi = (inst >> 5) & 0xff;
178 		}
179 		switch (asi) {
180 		case ASI_P:
181 		case ASI_S:
182 			break;
183 		case ASI_PNF:
184 		case ASI_SNF:
185 			nf = 1;
186 			break;
187 		case ASI_PL:
188 		case ASI_SL:
189 			ltlend = 1;
190 			break;
191 		case ASI_PNFL:
192 		case ASI_SNFL:
193 			ltlend = 1;
194 			nf = 1;
195 			break;
196 		default:
197 			return (0);
198 		}
199 		/*
200 		 * Non-faulting stores generate a data_access_exception trap,
201 		 * according to the Spitfire manual, which should be signaled
202 		 * as an illegal instruction trap, because it can't be fixed.
203 		 */
204 		if ((nf) && ((op3 == IOP_V8_STQFA) || (op3 == IOP_V8_STDFA)))
205 			return (SIMU_ILLEGAL);
206 	}
207 
208 	if (aligndebug) {
209 		printf("unaligned access at %p, instruction: 0x%x\n",
210 		    (void *)rp->r_pc, inst);
211 		printf("type %s", (((inst >> 21) & 1) ? "st" : "ld"));
212 		if (((inst >> 21) & 1) == 0)
213 			printf(" %s", (((inst >> 22) & 1) ?
214 			    "signed" : "unsigned"));
215 		printf(" asi 0x%x size %d immflg %d\n", asi, sz, immflg);
216 		printf("rd = %d, op3 = 0x%x, rs1 = %d, rs2 = %d, imm13=0x%x\n",
217 		    rd, op3, rs1, rs2, (inst & 0x1fff));
218 	}
219 
220 	(void) flush_user_windows_to_stack(NULL);
221 	if (getreg(rp, rs1, &val, badaddr))
222 		return (SIMU_FAULT);
223 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
224 	if (aligndebug)
225 		printf("addr 1 = %p\n", (void *)addr);
226 
227 	/* check immediate bit and use immediate field or reg (rs2) */
228 	if (immflg) {
229 		int imm;
230 		imm  = inst & 0x1fff;		/* mask out immediate field */
231 		imm <<= 19;			/* sign extend it */
232 		imm >>= 19;
233 		addr += imm;			/* compute address */
234 	} else {
235 		if (getreg(rp, rs2, &val, badaddr))
236 			return (SIMU_FAULT);
237 		addr += val;
238 	}
239 
240 	/*
241 	 * If this is a 32-bit program, chop the address accordingly.  The
242 	 * intermediate uintptr_t casts prevent warnings under a certain
243 	 * compiler, and the temporary 32 bit storage is intended to force
244 	 * proper code generation and break up what would otherwise be a
245 	 * quadruple cast.
246 	 */
247 	if (curproc->p_model == DATAMODEL_ILP32) {
248 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
249 		addr = (caddr_t)(uintptr_t)addr32;
250 	}
251 
252 	if (aligndebug)
253 		printf("addr 2 = %p\n", (void *)addr);
254 
255 	if (addr >= curproc->p_as->a_userlimit) {
256 		*badaddr = addr;
257 		goto badret;
258 	}
259 
260 	/* a single bit differentiates ld and st */
261 	if ((inst >> 21) & 1) {			/* store */
262 		if (floatflg) {
263 			klwp_id_t lwp = ttolwp(curthread);
264 			kfpu_t *fp = lwptofpu(lwp);
265 			/* Ensure fp has been enabled */
266 			if (fpu_exists) {
267 				if (!(_fp_read_fprs() & FPRS_FEF))
268 					fp_enable();
269 			} else {
270 				if (!fp->fpu_en)
271 					fp_enable();
272 			}
273 			/* if fpu_exists read fpu reg */
274 			if (fpu_exists) {
275 				if (fsrflg) {
276 					_fp_read_pfsr(&data.l[0]);
277 				} else {
278 					if (sz == 4) {
279 						data.i[0] = 0;
280 						_fp_read_pfreg(
281 						    (unsigned *)&data.i[1], rd);
282 					}
283 					if (sz >= 8)
284 						_fp_read_pdreg(
285 						    &data.l[0], rd);
286 					if (sz == 16)
287 						_fp_read_pdreg(
288 						    &data.l[1], rd+1);
289 				}
290 			} else {
291 				if (fsrflg) {
292 					/* Clear reserved bits, set version=7 */
293 					fp->fpu_fsr &= ~0x30301000;
294 					fp->fpu_fsr |= 0xE0000;
295 					data.l[0] = fp->fpu_fsr;
296 				} else {
297 					if (sz == 4) {
298 						data.i[0] = 0;
299 						data.i[1] =
300 						    (unsigned)fp->
301 						    fpu_fr.fpu_regs[rd];
302 					}
303 					if (sz >= 8)
304 						data.l[0] =
305 						    fp->fpu_fr.fpu_dregs[rd];
306 					if (sz == 16)
307 						data.l[1] =
308 						    fp->fpu_fr.fpu_dregs[rd+1];
309 				}
310 			}
311 		} else {
312 			if (lddstdflg) {		/* combine the data */
313 				if (getreg(rp, rd, &data.l[0], badaddr))
314 					return (SIMU_FAULT);
315 				if (getreg(rp, rd+1, &data.l[1], badaddr))
316 					return (SIMU_FAULT);
317 				if (ltlend) {
318 					/*
319 					 * For STD, each 32-bit word is byte-
320 					 * swapped individually.  For
321 					 * simplicity we don't want to do that
322 					 * below, so we swap the words now to
323 					 * get the desired result in the end.
324 					 */
325 					data.i[0] = data.i[3];
326 				} else {
327 					data.i[0] = data.i[1];
328 					data.i[1] = data.i[3];
329 				}
330 			} else {
331 				if (getreg(rp, rd, &data.l[0], badaddr))
332 					return (SIMU_FAULT);
333 			}
334 		}
335 
336 		if (aligndebug) {
337 			if (sz == 16) {
338 				printf("data %x %x %x %x\n",
339 				    data.i[0], data.i[1], data.i[2], data.c[3]);
340 			} else {
341 				printf("data %x %x %x %x %x %x %x %x\n",
342 				    data.c[0], data.c[1], data.c[2], data.c[3],
343 				    data.c[4], data.c[5], data.c[6], data.c[7]);
344 			}
345 		}
346 
347 		if (ltlend) {
348 			if (sz == 1) {
349 				if (xcopyout_little(&data.c[7], addr,
350 				    (size_t)sz) != 0)
351 					goto badret;
352 			} else if (sz == 2) {
353 				if (xcopyout_little(&data.s[3], addr,
354 				    (size_t)sz) != 0)
355 					goto badret;
356 			} else if (sz == 4) {
357 				if (xcopyout_little(&data.i[1], addr,
358 				    (size_t)sz) != 0)
359 					goto badret;
360 			} else {
361 				if (xcopyout_little(&data.l[0], addr,
362 				    (size_t)sz) != 0)
363 					goto badret;
364 			}
365 		} else {
366 			if (sz == 1) {
367 				if (copyout(&data.c[7], addr, (size_t)sz) == -1)
368 					goto badret;
369 			} else if (sz == 2) {
370 				if (copyout(&data.s[3], addr, (size_t)sz) == -1)
371 					goto badret;
372 			} else if (sz == 4) {
373 				if (copyout(&data.i[1], addr, (size_t)sz) == -1)
374 					goto badret;
375 			} else {
376 				if (copyout(&data.l[0], addr, (size_t)sz) == -1)
377 					goto badret;
378 			}
379 		}
380 	} else {				/* load */
381 		if (sz == 1) {
382 			if (ltlend) {
383 				if (xcopyin_little(addr, &data.c[7],
384 				    (size_t)sz) != 0) {
385 					if (nf)
386 						data.c[7] = 0;
387 					else
388 						goto badret;
389 				}
390 			} else {
391 				if (copyin(addr, &data.c[7],
392 				    (size_t)sz) == -1) {
393 					if (nf)
394 						data.c[7] = 0;
395 					else
396 						goto badret;
397 				}
398 			}
399 			/* if signed and the sign bit is set extend it */
400 			if (((inst >> 22) & 1) && ((data.c[7] >> 7) & 1)) {
401 				data.i[0] = (uint_t)-1;	/* extend sign bit */
402 				data.s[2] = (ushort_t)-1;
403 				data.c[6] = (uchar_t)-1;
404 			} else {
405 				data.i[0] = 0;	/* clear upper 32+24 bits */
406 				data.s[2] = 0;
407 				data.c[6] = 0;
408 			}
409 		} else if (sz == 2) {
410 			if (ltlend) {
411 				if (xcopyin_little(addr, &data.s[3],
412 				    (size_t)sz) != 0) {
413 					if (nf)
414 						data.s[3] = 0;
415 					else
416 						goto badret;
417 				}
418 			} else {
419 				if (copyin(addr, &data.s[3],
420 				    (size_t)sz) == -1) {
421 					if (nf)
422 						data.s[3] = 0;
423 					else
424 						goto badret;
425 				}
426 			}
427 			/* if signed and the sign bit is set extend it */
428 			if (((inst >> 22) & 1) && ((data.s[3] >> 15) & 1)) {
429 				data.i[0] = (uint_t)-1;	/* extend sign bit */
430 				data.s[2] = (ushort_t)-1;
431 			} else {
432 				data.i[0] = 0;	/* clear upper 32+16 bits */
433 				data.s[2] = 0;
434 			}
435 		} else if (sz == 4) {
436 			if (ltlend) {
437 				if (xcopyin_little(addr, &data.i[1],
438 				    (size_t)sz) != 0) {
439 					if (!nf)
440 						goto badret;
441 					data.i[1] = 0;
442 				}
443 			} else {
444 				if (copyin(addr, &data.i[1],
445 				    (size_t)sz) == -1) {
446 					if (!nf)
447 						goto badret;
448 					data.i[1] = 0;
449 				}
450 			}
451 			/* if signed and the sign bit is set extend it */
452 			if (((inst >> 22) & 1) && ((data.i[1] >> 31) & 1)) {
453 				data.i[0] = (uint_t)-1;	/* extend sign bit */
454 			} else {
455 				data.i[0] = 0;	/* clear upper 32 bits */
456 			}
457 		} else {
458 			if (ltlend) {
459 				if (xcopyin_little(addr, &data.l[0],
460 				    (size_t)sz) != 0) {
461 					if (!nf)
462 						goto badret;
463 					data.l[0] = 0;
464 				}
465 			} else {
466 				if (copyin(addr, &data.l[0],
467 				    (size_t)sz) == -1) {
468 					if (!nf)
469 						goto badret;
470 					data.l[0] = 0;
471 				}
472 			}
473 		}
474 
475 		if (aligndebug) {
476 			if (sz == 16) {
477 				printf("data %x %x %x %x\n",
478 				    data.i[0], data.i[1], data.i[2], data.c[3]);
479 			} else {
480 				printf("data %x %x %x %x %x %x %x %x\n",
481 				    data.c[0], data.c[1], data.c[2], data.c[3],
482 				    data.c[4], data.c[5], data.c[6], data.c[7]);
483 			}
484 		}
485 
486 		if (floatflg) {		/* if fpu_exists write fpu reg */
487 			klwp_id_t lwp = ttolwp(curthread);
488 			kfpu_t *fp = lwptofpu(lwp);
489 			/* Ensure fp has been enabled */
490 			if (fpu_exists) {
491 				if (!(_fp_read_fprs() & FPRS_FEF))
492 					fp_enable();
493 			} else {
494 				if (!fp->fpu_en)
495 					fp_enable();
496 			}
497 			/* if fpu_exists read fpu reg */
498 			if (fpu_exists) {
499 				if (fsrflg) {
500 					_fp_write_pfsr(&data.l[0]);
501 				} else {
502 					if (sz == 4)
503 						_fp_write_pfreg(
504 						    (unsigned *)&data.i[1], rd);
505 					if (sz >= 8)
506 						_fp_write_pdreg(
507 						    &data.l[0], rd);
508 					if (sz == 16)
509 						_fp_write_pdreg(
510 						    &data.l[1], rd+1);
511 				}
512 			} else {
513 				if (fsrflg) {
514 					fp->fpu_fsr = data.l[0];
515 				} else {
516 					if (sz == 4)
517 						fp->fpu_fr.fpu_regs[rd] =
518 						    (unsigned)data.i[1];
519 					if (sz >= 8)
520 						fp->fpu_fr.fpu_dregs[rd] =
521 						    data.l[0];
522 					if (sz == 16)
523 						fp->fpu_fr.fpu_dregs[rd+1] =
524 						    data.l[1];
525 				}
526 			}
527 		} else {
528 			if (lddstdflg) {		/* split the data */
529 				if (ltlend) {
530 					/*
531 					 * For LDD, each 32-bit word is byte-
532 					 * swapped individually.  We didn't
533 					 * do that above, but this will give
534 					 * us the desired result.
535 					 */
536 					data.i[3] = data.i[0];
537 				} else {
538 					data.i[3] = data.i[1];
539 					data.i[1] = data.i[0];
540 				}
541 				data.i[0] = 0;
542 				data.i[2] = 0;
543 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
544 					goto badret;
545 				if (putreg(&data.l[1], rp, rd+1, badaddr) == -1)
546 					goto badret;
547 			} else {
548 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
549 					goto badret;
550 			}
551 		}
552 	}
553 	return (SIMU_SUCCESS);
554 badret:
555 	return (SIMU_FAULT);
556 }
557 
558 
559 int
560 simulate_lddstd(struct regs *rp, caddr_t *badaddr)
561 {
562 	uint_t	inst, op3, asi = 0;
563 	uint_t	rd, rs1, rs2;
564 	int	nf = 0, ltlend = 0, usermode;
565 	int	immflg;
566 	uint64_t reven;
567 	uint64_t rodd;
568 	caddr_t	addr;
569 	uint64_t val;
570 	uint64_t data;
571 
572 	usermode = USERMODE(rp->r_tstate);
573 
574 	if (usermode)
575 		inst = fetch_user_instr((caddr_t)rp->r_pc);
576 	else
577 		inst = *(uint_t *)rp->r_pc;
578 
579 	op3 = (inst >> 19) & 0x3f;
580 	rd = (inst >> 25) & 0x1f;
581 	rs1 = (inst >> 14) & 0x1f;
582 	rs2 = inst & 0x1f;
583 	immflg = (inst >> 13) & 1;
584 
585 	if (USERMODE(rp->r_tstate))
586 		(void) flush_user_windows_to_stack(NULL);
587 	else
588 		flush_windows();
589 
590 	if ((op3 >> 4) & 1) {		/* is this LDDA/STDA? */
591 		if (immflg) {
592 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
593 			    TSTATE_ASI_MASK;
594 		} else {
595 			asi = (inst >> 5) & 0xff;
596 		}
597 		switch (asi) {
598 		case ASI_P:
599 		case ASI_S:
600 			break;
601 		case ASI_PNF:
602 		case ASI_SNF:
603 			nf = 1;
604 			break;
605 		case ASI_PL:
606 		case ASI_SL:
607 			ltlend = 1;
608 			break;
609 		case ASI_PNFL:
610 		case ASI_SNFL:
611 			ltlend = 1;
612 			nf = 1;
613 			break;
614 		case ASI_AIUP:
615 		case ASI_AIUS:
616 			usermode = 1;
617 			break;
618 		case ASI_AIUPL:
619 		case ASI_AIUSL:
620 			usermode = 1;
621 			ltlend = 1;
622 			break;
623 		default:
624 			return (SIMU_ILLEGAL);
625 		}
626 	}
627 
628 	if (getreg(rp, rs1, &val, badaddr))
629 		return (SIMU_FAULT);
630 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
631 
632 	/* check immediate bit and use immediate field or reg (rs2) */
633 	if (immflg) {
634 		int imm;
635 		imm  = inst & 0x1fff;		/* mask out immediate field */
636 		imm <<= 19;			/* sign extend it */
637 		imm >>= 19;
638 		addr += imm;			/* compute address */
639 	} else {
640 		if (getreg(rp, rs2, &val, badaddr))
641 			return (SIMU_FAULT);
642 		addr += val;
643 	}
644 
645 	/*
646 	 * T_UNIMP_LDD and T_UNIMP_STD are higher priority than
647 	 * T_ALIGNMENT.  So we have to make sure that the address is
648 	 * kosher before trying to use it, because the hardware hasn't
649 	 * checked it for us yet.
650 	 */
651 	if (((uintptr_t)addr & 0x7) != 0) {
652 		if (curproc->p_fixalignment)
653 			return (do_unaligned(rp, badaddr));
654 		else
655 			return (SIMU_UNALIGN);
656 	}
657 
658 	/*
659 	 * If this is a 32-bit program, chop the address accordingly.  The
660 	 * intermediate uintptr_t casts prevent warnings under a certain
661 	 * compiler, and the temporary 32 bit storage is intended to force
662 	 * proper code generation and break up what would otherwise be a
663 	 * quadruple cast.
664 	 */
665 	if (curproc->p_model == DATAMODEL_ILP32 && usermode) {
666 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
667 		addr = (caddr_t)(uintptr_t)addr32;
668 	}
669 
670 	if ((inst >> 21) & 1) {			/* store */
671 		if (getreg(rp, rd, &reven, badaddr))
672 			return (SIMU_FAULT);
673 		if (getreg(rp, rd+1, &rodd, badaddr))
674 			return (SIMU_FAULT);
675 		if (ltlend) {
676 			reven = BSWAP_32(reven);
677 			rodd  = BSWAP_32(rodd);
678 		}
679 		data = (reven << 32) | rodd;
680 		if (usermode) {
681 			if (suword64_nowatch(addr, data) == -1)
682 				return (SIMU_FAULT);
683 		} else {
684 			*(uint64_t *)addr = data;
685 		}
686 	} else {				/* load */
687 		if (usermode) {
688 			if (fuword64_nowatch(addr, &data)) {
689 				if (nf)
690 					data = 0;
691 				else
692 					return (SIMU_FAULT);
693 			}
694 		} else
695 			data = *(uint64_t *)addr;
696 
697 		reven = (data >> 32);
698 		rodd  = (uint64_t)(uint32_t)data;
699 		if (ltlend) {
700 			reven = BSWAP_32(reven);
701 			rodd  = BSWAP_32(rodd);
702 		}
703 
704 		if (putreg(&reven, rp, rd, badaddr) == -1)
705 			return (SIMU_FAULT);
706 		if (putreg(&rodd, rp, rd+1, badaddr) == -1)
707 			return (SIMU_FAULT);
708 	}
709 	return (SIMU_SUCCESS);
710 }
711 
712 
713 /*
714  * simulate popc
715  */
716 static int
717 simulate_popc(struct regs *rp, caddr_t *badaddr, uint_t inst)
718 {
719 	uint_t	rd, rs2, rs1;
720 	uint_t	immflg;
721 	uint64_t val, cnt = 0;
722 
723 	rd = (inst >> 25) & 0x1f;
724 	rs1 = (inst >> 14) & 0x1f;
725 	rs2 = inst & 0x1f;
726 	immflg = (inst >> 13) & 1;
727 
728 	if (rs1 > 0)
729 		return (SIMU_ILLEGAL);
730 
731 	(void) flush_user_windows_to_stack(NULL);
732 
733 	/* check immediate bit and use immediate field or reg (rs2) */
734 	if (immflg) {
735 		int64_t imm;
736 		imm  = inst & 0x1fff;		/* mask out immediate field */
737 		imm <<= 51;			/* sign extend it */
738 		imm >>= 51;
739 		if (imm != 0) {
740 			for (cnt = 0; imm != 0; imm &= imm-1)
741 				cnt++;
742 		}
743 	} else {
744 		if (getreg(rp, rs2, &val, badaddr))
745 			return (SIMU_FAULT);
746 		if (val != 0) {
747 			for (cnt = 0; val != 0; val &= val-1)
748 				cnt++;
749 		}
750 	}
751 
752 	if (putreg(&cnt, rp, rd, badaddr) == -1)
753 		return (SIMU_FAULT);
754 
755 	return (SIMU_SUCCESS);
756 }
757 
758 /*
759  * simulate mulscc
760  */
761 static int
762 simulate_mulscc(struct regs *rp, caddr_t *badaddr, uint_t inst)
763 {
764 	uint32_t	s1, s2;
765 	uint32_t	c, d, v;
766 	uint_t		rd, rs1;
767 	int64_t		d64;
768 	uint64_t	ud64;
769 	uint64_t	drs1;
770 
771 	(void) flush_user_windows_to_stack(NULL);
772 
773 	if ((inst >> 13) & 1) {		/* immediate */
774 		d64 = inst & 0x1fff;
775 		d64 <<= 51;		/* sign extend it */
776 		d64 >>= 51;
777 	} else {
778 		uint_t		rs2;
779 		uint64_t	drs2;
780 
781 		if (inst & 0x1fe0) {
782 			return (SIMU_ILLEGAL);
783 		}
784 		rs2 = inst & 0x1f;
785 		if (getreg(rp, rs2, &drs2, badaddr)) {
786 			return (SIMU_FAULT);
787 		}
788 		d64 = (int64_t)drs2;
789 	}
790 
791 	rs1 = (inst >> 14) & 0x1f;
792 	if (getreg(rp, rs1, &drs1, badaddr)) {
793 		return (SIMU_FAULT);
794 	}
795 	/* icc.n xor icc.v */
796 	s1 = ((rp->r_tstate & TSTATE_IN) >> (TSTATE_CCR_SHIFT + 3)) ^
797 	    ((rp->r_tstate & TSTATE_IV) >> (TSTATE_CCR_SHIFT + 1));
798 	s1 = (s1 << 31) | (((uint32_t)drs1) >> 1);
799 
800 	if (rp->r_y & 1) {
801 		s2 = (uint32_t)d64;
802 	} else {
803 		s2 = 0;
804 	}
805 	d = s1 + s2;
806 
807 	ud64 = (uint64_t)d;
808 
809 	/* set the icc flags */
810 	v = (s1 & s2 & ~d) | (~s1 & ~s2 & d);
811 	c = (s1 & s2) | (~d & (s1 | s2));
812 	rp->r_tstate &= ~TSTATE_ICC;
813 	rp->r_tstate |= (uint64_t)((c >> 31) & 1) << (TSTATE_CCR_SHIFT + 0);
814 	rp->r_tstate |= (uint64_t)((v >> 31) & 1) << (TSTATE_CCR_SHIFT + 1);
815 	rp->r_tstate |= (uint64_t)(d ? 0 : 1) << (TSTATE_CCR_SHIFT + 2);
816 	rp->r_tstate |= (uint64_t)((d >> 31) & 1) << (TSTATE_CCR_SHIFT + 3);
817 
818 	if (rp->r_tstate & TSTATE_IC) {
819 		ud64 |= (1ULL << 32);
820 	}
821 
822 	/* set the xcc flags */
823 	rp->r_tstate &= ~TSTATE_XCC;
824 	if (ud64 == 0) {
825 		rp->r_tstate |= TSTATE_XZ;
826 	}
827 
828 	rd = (inst >> 25) & 0x1f;
829 	if (putreg(&ud64, rp, rd, badaddr)) {
830 		return (SIMU_FAULT);
831 	}
832 
833 	d64 = (drs1 << 32) | (uint32_t)rp->r_y;
834 	d64 >>= 1;
835 	rp->r_y = (uint32_t)d64;
836 
837 	return (SIMU_SUCCESS);
838 }
839 
840 /*
841  * simulate unimplemented instructions (popc, ldqf{a}, stqf{a})
842  */
843 int
844 simulate_unimp(struct regs *rp, caddr_t *badaddr)
845 {
846 	uint_t	inst, optype, op3, asi;
847 	uint_t	rs1, rd;
848 	uint_t	ignor, i;
849 	machpcb_t *mpcb = lwptompcb(ttolwp(curthread));
850 	int	nomatch = 0;
851 	caddr_t	addr = (caddr_t)rp->r_pc;
852 	struct as *as;
853 	caddr_t	ka;
854 	pfn_t	pfnum;
855 	page_t *pp;
856 	proc_t *p = ttoproc(curthread);
857 	struct seg *mapseg;
858 	struct segvn_data *svd;
859 
860 	ASSERT(USERMODE(rp->r_tstate));
861 	inst = fetch_user_instr(addr);
862 	if (inst == (uint_t)-1) {
863 		mpcb->mpcb_illexcaddr = addr;
864 		mpcb->mpcb_illexcinsn = (uint32_t)-1;
865 		return (SIMU_ILLEGAL);
866 	}
867 
868 	/*
869 	 * When fixing dirty v8 instructions there's a race if two processors
870 	 * are executing the dirty executable at the same time.  If one
871 	 * cleans the instruction as the other is executing it the second
872 	 * processor will see a clean instruction when it comes through this
873 	 * code and will return SIMU_ILLEGAL.  To work around the race
874 	 * this code will keep track of the last illegal instruction seen
875 	 * by each lwp and will only take action if the illegal instruction
876 	 * is repeatable.
877 	 */
878 	if (addr != mpcb->mpcb_illexcaddr ||
879 	    inst != mpcb->mpcb_illexcinsn)
880 		nomatch = 1;
881 	mpcb->mpcb_illexcaddr = addr;
882 	mpcb->mpcb_illexcinsn = inst;
883 
884 	/* instruction fields */
885 	i = (inst >> 13) & 0x1;
886 	rd = (inst >> 25) & 0x1f;
887 	optype = (inst >> 30) & 0x3;
888 	op3 = (inst >> 19) & 0x3f;
889 	ignor = (inst >> 5) & 0xff;
890 	if (IS_IBIT_SET(inst)) {
891 		asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
892 		    TSTATE_ASI_MASK);
893 	} else {
894 		asi = ignor;
895 	}
896 
897 	if (IS_VIS1(optype, op3) ||
898 	    IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi) ||
899 	    IS_FLOAT_QUAD_OP(optype, op3)) {
900 		klwp_t *lwp = ttolwp(curthread);
901 		kfpu_t *fp = lwptofpu(lwp);
902 		if (fpu_exists) {
903 			if (!(_fp_read_fprs() & FPRS_FEF))
904 				fp_enable();
905 			_fp_read_pfsr(&fp->fpu_fsr);
906 		} else {
907 			if (!fp->fpu_en)
908 				fp_enable();
909 		}
910 		fp_precise(rp);
911 		return (SIMU_RETRY);
912 	}
913 
914 	if (optype == 2 && op3 == IOP_V8_POPC) {
915 		return (simulate_popc(rp, badaddr, inst));
916 	} else if (optype == 3 && op3 == IOP_V8_POPC) {
917 		return (SIMU_ILLEGAL);
918 	} else if (optype == OP_V8_ARITH && op3 == IOP_V8_MULScc) {
919 		return (simulate_mulscc(rp, badaddr, inst));
920 	}
921 
922 	if (optype == OP_V8_LDSTR) {
923 		if (op3 == IOP_V8_LDQF || op3 == IOP_V8_LDQFA ||
924 		    op3 == IOP_V8_STQF || op3 == IOP_V8_STQFA)
925 			return (do_unaligned(rp, badaddr));
926 	}
927 
928 	/* This is a new instruction so illexccnt should also be set. */
929 	if (nomatch) {
930 		mpcb->mpcb_illexccnt = 0;
931 		return (SIMU_RETRY);
932 	}
933 
934 	/*
935 	 * In order to keep us from entering into an infinite loop while
936 	 * attempting to clean up faulty instructions, we will return
937 	 * SIMU_ILLEGAL once we've cleaned up the instruction as much
938 	 * as we can, and still end up here.
939 	 */
940 	if (mpcb->mpcb_illexccnt >= 3)
941 		return (SIMU_ILLEGAL);
942 
943 	mpcb->mpcb_illexccnt += 1;
944 
945 	/*
946 	 * The rest of the code handles v8 binaries with instructions
947 	 * that have dirty (non-zero) bits in reserved or 'ignored'
948 	 * fields; these will cause core dumps on v9 machines.
949 	 *
950 	 * We only clean dirty instructions in 32-bit programs (ie, v8)
951 	 * running on SPARCv9 processors.  True v9 programs are forced
952 	 * to use the instruction set as intended.
953 	 */
954 	if (lwp_getdatamodel(curthread->t_lwp) != DATAMODEL_ILP32)
955 		return (SIMU_ILLEGAL);
956 	switch (optype) {
957 	case OP_V8_BRANCH:
958 	case OP_V8_CALL:
959 		return (SIMU_ILLEGAL);	/* these don't have ignored fields */
960 		/*NOTREACHED*/
961 	case OP_V8_ARITH:
962 		switch (op3) {
963 		case IOP_V8_RETT:
964 			if (rd == 0 && !(i == 0 && ignor))
965 				return (SIMU_ILLEGAL);
966 			if (rd)
967 				inst &= ~(0x1f << 25);
968 			if (i == 0 && ignor)
969 				inst &= ~(0xff << 5);
970 			break;
971 		case IOP_V8_TCC:
972 			if (i == 0 && ignor != 0) {
973 				inst &= ~(0xff << 5);
974 			} else if (i == 1 && (((inst >> 7) & 0x3f) != 0)) {
975 				inst &= ~(0x3f << 7);
976 			} else {
977 				return (SIMU_ILLEGAL);
978 			}
979 			break;
980 		case IOP_V8_JMPL:
981 		case IOP_V8_RESTORE:
982 		case IOP_V8_SAVE:
983 			if ((op3 == IOP_V8_RETT && rd) ||
984 			    (i == 0 && ignor)) {
985 				inst &= ~(0xff << 5);
986 			} else {
987 				return (SIMU_ILLEGAL);
988 			}
989 			break;
990 		case IOP_V8_FCMP:
991 			if (rd == 0)
992 				return (SIMU_ILLEGAL);
993 			inst &= ~(0x1f << 25);
994 			break;
995 		case IOP_V8_RDASR:
996 			rs1 = ((inst >> 14) & 0x1f);
997 			if (rs1 == 1 || (rs1 >= 7 && rs1 <= 14)) {
998 				/*
999 				 * The instruction specifies an invalid
1000 				 * state register - better bail out than
1001 				 * "fix" it when we're not sure what was
1002 				 * intended.
1003 				 */
1004 				return (SIMU_ILLEGAL);
1005 			}
1006 				/*
1007 				 * Note: this case includes the 'stbar'
1008 				 * instruction (rs1 == 15 && i == 0).
1009 				 */
1010 				if ((ignor = (inst & 0x3fff)) != 0)
1011 					inst &= ~(0x3fff);
1012 			break;
1013 		case IOP_V8_SRA:
1014 		case IOP_V8_SRL:
1015 		case IOP_V8_SLL:
1016 			if (ignor == 0)
1017 				return (SIMU_ILLEGAL);
1018 			inst &= ~(0xff << 5);
1019 			break;
1020 		case IOP_V8_ADD:
1021 		case IOP_V8_AND:
1022 		case IOP_V8_OR:
1023 		case IOP_V8_XOR:
1024 		case IOP_V8_SUB:
1025 		case IOP_V8_ANDN:
1026 		case IOP_V8_ORN:
1027 		case IOP_V8_XNOR:
1028 		case IOP_V8_ADDC:
1029 		case IOP_V8_UMUL:
1030 		case IOP_V8_SMUL:
1031 		case IOP_V8_SUBC:
1032 		case IOP_V8_UDIV:
1033 		case IOP_V8_SDIV:
1034 		case IOP_V8_ADDcc:
1035 		case IOP_V8_ANDcc:
1036 		case IOP_V8_ORcc:
1037 		case IOP_V8_XORcc:
1038 		case IOP_V8_SUBcc:
1039 		case IOP_V8_ANDNcc:
1040 		case IOP_V8_ORNcc:
1041 		case IOP_V8_XNORcc:
1042 		case IOP_V8_ADDCcc:
1043 		case IOP_V8_UMULcc:
1044 		case IOP_V8_SMULcc:
1045 		case IOP_V8_SUBCcc:
1046 		case IOP_V8_UDIVcc:
1047 		case IOP_V8_SDIVcc:
1048 		case IOP_V8_TADDcc:
1049 		case IOP_V8_TSUBcc:
1050 		case IOP_V8_TADDccTV:
1051 		case IOP_V8_TSUBccTV:
1052 		case IOP_V8_MULScc:
1053 		case IOP_V8_WRASR:
1054 		case IOP_V8_FLUSH:
1055 			if (i != 0 || ignor == 0)
1056 				return (SIMU_ILLEGAL);
1057 			inst &= ~(0xff << 5);
1058 			break;
1059 		default:
1060 			return (SIMU_ILLEGAL);
1061 		}
1062 		break;
1063 	case OP_V8_LDSTR:
1064 		switch (op3) {
1065 		case IOP_V8_STFSR:
1066 		case IOP_V8_LDFSR:
1067 			if (rd == 0 && !(i == 0 && ignor))
1068 				return (SIMU_ILLEGAL);
1069 			if (rd)
1070 				inst &= ~(0x1f << 25);
1071 			if (i == 0 && ignor)
1072 				inst &= ~(0xff << 5);
1073 			break;
1074 		default:
1075 			if (optype == OP_V8_LDSTR && !IS_LDST_ALT(op3) &&
1076 			    i == 0 && ignor)
1077 				inst &= ~(0xff << 5);
1078 			else
1079 				return (SIMU_ILLEGAL);
1080 			break;
1081 		}
1082 		break;
1083 	default:
1084 		return (SIMU_ILLEGAL);
1085 	}
1086 
1087 	as = p->p_as;
1088 
1089 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1090 	mapseg = as_findseg(as, (caddr_t)rp->r_pc, 0);
1091 	ASSERT(mapseg != NULL);
1092 	svd = (struct segvn_data *)mapseg->s_data;
1093 
1094 	/*
1095 	 * We only create COW page for MAP_PRIVATE mappings.
1096 	 */
1097 	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
1098 	if ((svd->type & MAP_TYPE) & MAP_SHARED) {
1099 		SEGVN_LOCK_EXIT(as, &svd->lock);
1100 		AS_LOCK_EXIT(as, &as->a_lock);
1101 		return (SIMU_ILLEGAL);
1102 	}
1103 	SEGVN_LOCK_EXIT(as, &svd->lock);
1104 	AS_LOCK_EXIT(as, &as->a_lock);
1105 
1106 	/*
1107 	 * A "flush" instruction using the user PC's vaddr will not work
1108 	 * here, at least on Spitfire. Instead we create a temporary kernel
1109 	 * mapping to the user's text page, then modify and flush that.
1110 	 * Break COW by locking user page.
1111 	 */
1112 	if (as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK), PAGESIZE,
1113 	    F_SOFTLOCK, S_READ))
1114 		return (SIMU_FAULT);
1115 
1116 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1117 	pfnum = hat_getpfnum(as->a_hat, (caddr_t)rp->r_pc);
1118 	AS_LOCK_EXIT(as, &as->a_lock);
1119 	if (pf_is_memory(pfnum)) {
1120 		pp = page_numtopp_nolock(pfnum);
1121 		ASSERT(pp == NULL || PAGE_LOCKED(pp));
1122 	} else {
1123 		(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1124 		    PAGESIZE, F_SOFTUNLOCK, S_READ);
1125 		return (SIMU_FAULT);
1126 	}
1127 
1128 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1129 	ka = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)rp->r_pc);
1130 	*(uint_t *)(ka + (uintptr_t)(rp->r_pc % PAGESIZE)) = inst;
1131 	doflush(ka + (uintptr_t)(rp->r_pc % PAGESIZE));
1132 	ppmapout(ka);
1133 	AS_LOCK_EXIT(as, &as->a_lock);
1134 
1135 	(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1136 	    PAGESIZE, F_SOFTUNLOCK, S_READ);
1137 	return (SIMU_RETRY);
1138 }
1139 
1140 /*
1141  * Get the value of a register for instruction simulation
1142  * by using the regs or window structure pointers.
1143  * Return 0 for success, and -1 for failure.  If there is a failure,
1144  * save the faulting address using badaddr pointer.
1145  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1146  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1147  */
1148 int
1149 getreg(struct regs *rp, uint_t reg, uint64_t *val, caddr_t *badaddr)
1150 {
1151 	uint64_t *rgs, *sp;
1152 	int rv = 0;
1153 
1154 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1155 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1156 	if (reg == 0) {
1157 		*val = 0;
1158 	} else if (reg < 16) {
1159 		*val = rgs[reg];
1160 	} else if (IS_V9STACK(sp)) {
1161 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1162 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1163 		uint64_t res;
1164 
1165 		if (USERMODE(rp->r_tstate)) {
1166 			if (fuword64_nowatch(addr, &res) == -1) {
1167 				*badaddr = (caddr_t)addr;
1168 				rv = -1;
1169 			}
1170 		} else {
1171 			res = *addr;
1172 		}
1173 		*val = res;
1174 	} else {
1175 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1176 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1177 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1178 		uint32_t res;
1179 
1180 		if (USERMODE(rp->r_tstate)) {
1181 			if (fuword32_nowatch(addr, &res) == -1) {
1182 				*badaddr = (caddr_t)addr;
1183 				rv = -1;
1184 			}
1185 		} else {
1186 			res = *addr;
1187 		}
1188 		*val = (uint64_t)res;
1189 	}
1190 	return (rv);
1191 }
1192 
1193 /*
1194  * Set the value of a register after instruction simulation
1195  * by using the regs or window structure pointers.
1196  * Return 0 for succes -1 failure.
1197  * save the faulting address using badaddr pointer.
1198  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1199  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1200  */
1201 int
1202 putreg(uint64_t	*data, struct regs *rp, uint_t reg, caddr_t *badaddr)
1203 {
1204 	uint64_t *rgs, *sp;
1205 	int rv = 0;
1206 
1207 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1208 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1209 	if (reg == 0) {
1210 		return (0);
1211 	} else if (reg < 16) {
1212 		rgs[reg] = *data;
1213 	} else if (IS_V9STACK(sp)) {
1214 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1215 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1216 		uint64_t res;
1217 
1218 		if (USERMODE(rp->r_tstate)) {
1219 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1220 
1221 			res = *data;
1222 			if (suword64_nowatch(addr, res) != 0) {
1223 				*badaddr = (caddr_t)addr;
1224 				rv = -1;
1225 			}
1226 			/*
1227 			 * We have changed a local or in register;
1228 			 * nuke the watchpoint return windows.
1229 			 */
1230 			mpcb->mpcb_rsp[0] = NULL;
1231 			mpcb->mpcb_rsp[1] = NULL;
1232 		} else {
1233 			res = *data;
1234 			*addr = res;
1235 		}
1236 	} else {
1237 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1238 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1239 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1240 		uint32_t res;
1241 
1242 		if (USERMODE(rp->r_tstate)) {
1243 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1244 
1245 			res = (uint_t)*data;
1246 			if (suword32_nowatch(addr, res) != 0) {
1247 				*badaddr = (caddr_t)addr;
1248 				rv = -1;
1249 			}
1250 			/*
1251 			 * We have changed a local or in register;
1252 			 * nuke the watchpoint return windows.
1253 			 */
1254 			mpcb->mpcb_rsp[0] = NULL;
1255 			mpcb->mpcb_rsp[1] = NULL;
1256 
1257 		} else {
1258 			res = (uint_t)*data;
1259 			*addr = res;
1260 		}
1261 	}
1262 	return (rv);
1263 }
1264 
1265 /*
1266  * Calculate a memory reference address from instruction
1267  * operands, used to return the address of a fault, instead
1268  * of the instruction when an error occurs.  This is code that is
1269  * common with most of the routines that simulate instructions.
1270  */
1271 int
1272 calc_memaddr(struct regs *rp, caddr_t *badaddr)
1273 {
1274 	uint_t	inst;
1275 	uint_t	rd, rs1, rs2;
1276 	int	sz;
1277 	int	immflg;
1278 	int	floatflg;
1279 	caddr_t  addr;
1280 	uint64_t val;
1281 
1282 	if (USERMODE(rp->r_tstate))
1283 		inst = fetch_user_instr((caddr_t)rp->r_pc);
1284 	else
1285 		inst = *(uint_t *)rp->r_pc;
1286 
1287 	rd = (inst >> 25) & 0x1f;
1288 	rs1 = (inst >> 14) & 0x1f;
1289 	rs2 = inst & 0x1f;
1290 	floatflg = (inst >> 24) & 1;
1291 	immflg = (inst >> 13) & 1;
1292 
1293 	if (floatflg) {
1294 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
1295 		case 0: sz = 4; break;		/* ldf/stf */
1296 		case 1: return (0);		/* ld[x]fsr/st[x]fsr */
1297 		case 2: sz = 16; break;		/* ldqf/stqf */
1298 		case 3: sz = 8; break;		/* lddf/stdf */
1299 		}
1300 		/*
1301 		 * Fix to access extra double register encoding plus
1302 		 * compensate to access the correct fpu_dreg.
1303 		 */
1304 		if (sz > 4) {
1305 			if ((rd & 1) == 1)
1306 				rd = (rd & 0x1e) | 0x20;
1307 			rd = rd >> 1;
1308 		}
1309 	} else {
1310 		switch ((inst >> 19) & 0xf) {	/* map size bits to a number */
1311 		case 0:				/* lduw */
1312 		case 4:				/* stw */
1313 		case 8:				/* ldsw */
1314 		case 0xf:			/* swap */
1315 			sz = 4; break;
1316 		case 1:				/* ldub */
1317 		case 5:				/* stb */
1318 		case 9:				/* ldsb */
1319 		case 0xd:			/* ldstub */
1320 			sz = 1; break;
1321 		case 2:				/* lduh */
1322 		case 6:				/* sth */
1323 		case 0xa:			/* ldsh */
1324 			sz = 2; break;
1325 		case 3:				/* ldd */
1326 		case 7:				/* std */
1327 		case 0xb:			/* ldx */
1328 		case 0xe:			/* stx */
1329 			sz = 8; break;
1330 		}
1331 	}
1332 
1333 	if (USERMODE(rp->r_tstate))
1334 		(void) flush_user_windows_to_stack(NULL);
1335 	else
1336 		flush_windows();
1337 
1338 	if (getreg(rp, rs1, &val, badaddr))
1339 		return (SIMU_FAULT);
1340 	addr = (caddr_t)val;
1341 
1342 	/* check immediate bit and use immediate field or reg (rs2) */
1343 	if (immflg) {
1344 		int imm;
1345 		imm = inst & 0x1fff;		/* mask out immediate field */
1346 		imm <<= 19;			/* sign extend it */
1347 		imm >>= 19;
1348 		addr += imm;			/* compute address */
1349 	} else {
1350 		if (getreg(rp, rs2, &val, badaddr))
1351 			return (SIMU_FAULT);
1352 		addr += val;
1353 	}
1354 
1355 	/*
1356 	 * If this is a 32-bit program, chop the address accordingly.  The
1357 	 * intermediate uintptr_t casts prevent warnings under a certain
1358 	 * compiler, and the temporary 32 bit storage is intended to force
1359 	 * proper code generation and break up what would otherwise be a
1360 	 * quadruple cast.
1361 	 */
1362 	if (curproc->p_model == DATAMODEL_ILP32 && USERMODE(rp->r_tstate)) {
1363 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1364 		addr = (caddr_t)(uintptr_t)addr32;
1365 	}
1366 
1367 	*badaddr = addr;
1368 	return ((uintptr_t)addr & (sz - 1) ? SIMU_UNALIGN : SIMU_SUCCESS);
1369 }
1370 
1371 /*
1372  * Return the size of a load or store instruction (1, 2, 4, 8, 16, 64).
1373  * Also compute the precise address by instruction disassembly.
1374  * (v9 page faults only provide the page address via the hardware.)
1375  * Return 0 on failure (not a load or store instruction).
1376  */
1377 int
1378 instr_size(struct regs *rp, caddr_t *addrp, enum seg_rw rdwr)
1379 {
1380 	uint_t	inst, op3, asi;
1381 	uint_t	rd, rs1, rs2;
1382 	int	sz = 0;
1383 	int	immflg;
1384 	int	floatflg;
1385 	caddr_t	addr;
1386 	caddr_t badaddr;
1387 	uint64_t val;
1388 
1389 	if (rdwr == S_EXEC) {
1390 		*addrp = (caddr_t)rp->r_pc;
1391 		return (4);
1392 	}
1393 
1394 	/*
1395 	 * Fetch the instruction from user-level.
1396 	 * We would like to assert this:
1397 	 *   ASSERT(USERMODE(rp->r_tstate));
1398 	 * but we can't because we can reach this point from a
1399 	 * register window underflow/overflow and the v9 wbuf
1400 	 * traps call trap() with T_USER even though r_tstate
1401 	 * indicates a system trap, not a user trap.
1402 	 */
1403 	inst = fetch_user_instr((caddr_t)rp->r_pc);
1404 
1405 	op3 = (inst >> 19) & 0x3f;
1406 	rd = (inst >> 25) & 0x1f;
1407 	rs1 = (inst >> 14) & 0x1f;
1408 	rs2 = inst & 0x1f;
1409 	floatflg = (inst >> 24) & 1;
1410 	immflg = (inst >> 13) & 1;
1411 
1412 	/* if not load or store do nothing.  can't happen? */
1413 	if ((inst >> 30) != 3)
1414 		return (0);
1415 
1416 	if (immflg)
1417 		asi = (uint_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
1418 		    TSTATE_ASI_MASK);
1419 	else
1420 		asi = (inst >> 5) & 0xff;
1421 
1422 	if (floatflg) {
1423 		/* check for ld/st alternate and highest defined V9 asi */
1424 		if ((op3 & 0x30) == 0x30 && asi > ASI_SNFL) {
1425 			sz = extended_asi_size(asi);
1426 		} else {
1427 			switch (op3 & 3) {
1428 			case 0:
1429 				sz = 4;			/* ldf/stf/cas */
1430 				break;
1431 			case 1:
1432 				if (rd == 0)
1433 					sz = 4;		/* ldfsr/stfsr */
1434 				else
1435 					sz = 8;		/* ldxfsr/stxfsr */
1436 				break;
1437 			case 2:
1438 				if (op3 == 0x3e)
1439 					sz = 8;		/* casx */
1440 				else
1441 					sz = 16;	/* ldqf/stqf */
1442 				break;
1443 			case 3:
1444 				sz = 8;			/* lddf/stdf */
1445 				break;
1446 			}
1447 		}
1448 	} else {
1449 		switch (op3 & 0xf) {		/* map size bits to a number */
1450 		case 0:				/* lduw */
1451 		case 4:				/* stw */
1452 		case 8:				/* ldsw */
1453 		case 0xf:			/* swap */
1454 			sz = 4; break;
1455 		case 1:				/* ldub */
1456 		case 5:				/* stb */
1457 		case 9:				/* ldsb */
1458 		case 0xd:			/* ldstub */
1459 			sz = 1; break;
1460 		case 2:				/* lduh */
1461 		case 6:				/* sth */
1462 		case 0xa:			/* ldsh */
1463 			sz = 2; break;
1464 		case 3:				/* ldd */
1465 		case 7:				/* std */
1466 		case 0xb:			/* ldx */
1467 		case 0xe:			/* stx */
1468 			sz = 8; break;
1469 		}
1470 	}
1471 
1472 	if (sz == 0)	/* can't happen? */
1473 		return (0);
1474 	(void) flush_user_windows_to_stack(NULL);
1475 
1476 	if (getreg(rp, rs1, &val, &badaddr))
1477 		return (0);
1478 	addr = (caddr_t)val;
1479 
1480 	/* cas/casx don't use rs2 / simm13 to compute the address */
1481 	if ((op3 & 0x3d) != 0x3c) {
1482 		/* check immediate bit and use immediate field or reg (rs2) */
1483 		if (immflg) {
1484 			int imm;
1485 			imm  = inst & 0x1fff;	/* mask out immediate field */
1486 			imm <<= 19;		/* sign extend it */
1487 			imm >>= 19;
1488 			addr += imm;		/* compute address */
1489 		} else {
1490 			/*
1491 			 * asi's in the 0xCx range are partial store
1492 			 * instructions.  For these, rs2 is a mask, not part of
1493 			 * the address.
1494 			 */
1495 			if (!(floatflg && (asi & 0xf0) == 0xc0)) {
1496 				if (getreg(rp, rs2, &val, &badaddr))
1497 					return (0);
1498 				addr += val;
1499 			}
1500 		}
1501 	}
1502 
1503 	/*
1504 	 * If this is a 32-bit program, chop the address accordingly.  The
1505 	 * intermediate uintptr_t casts prevent warnings under a certain
1506 	 * compiler, and the temporary 32 bit storage is intended to force
1507 	 * proper code generation and break up what would otherwise be a
1508 	 * quadruple cast.
1509 	 */
1510 	if (curproc->p_model == DATAMODEL_ILP32) {
1511 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1512 		addr = (caddr_t)(uintptr_t)addr32;
1513 	}
1514 
1515 	*addrp = addr;
1516 	ASSERT(sz != 0);
1517 	return (sz);
1518 }
1519 
1520 /*
1521  * Fetch an instruction from user-level.
1522  * Deal with watchpoints, if they are in effect.
1523  */
1524 int32_t
1525 fetch_user_instr(caddr_t vaddr)
1526 {
1527 	proc_t *p = curproc;
1528 	int32_t instr;
1529 
1530 	/*
1531 	 * If this is a 32-bit program, chop the address accordingly.  The
1532 	 * intermediate uintptr_t casts prevent warnings under a certain
1533 	 * compiler, and the temporary 32 bit storage is intended to force
1534 	 * proper code generation and break up what would otherwise be a
1535 	 * quadruple cast.
1536 	 */
1537 	if (p->p_model == DATAMODEL_ILP32) {
1538 		caddr32_t vaddr32 = (caddr32_t)(uintptr_t)vaddr;
1539 		vaddr = (caddr_t)(uintptr_t)vaddr32;
1540 	}
1541 
1542 	if (fuword32_nowatch(vaddr, (uint32_t *)&instr) == -1)
1543 		instr = -1;
1544 
1545 	return (instr);
1546 }
1547