xref: /titanic_50/usr/src/uts/sparc/v9/os/simulator.c (revision 461686c359e383739b8e0d23c68520a0e2e2c361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* common code with bug fixes from original version in trap.c */
27 
28 #include <sys/param.h>
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/archsystm.h>
32 #include <sys/vmsystm.h>
33 #include <sys/fpu/fpusystm.h>
34 #include <sys/fpu/fpu_simulator.h>
35 #include <sys/inline.h>
36 #include <sys/debug.h>
37 #include <sys/privregs.h>
38 #include <sys/machpcb.h>
39 #include <sys/simulate.h>
40 #include <sys/proc.h>
41 #include <sys/cmn_err.h>
42 #include <sys/stack.h>
43 #include <sys/watchpoint.h>
44 #include <sys/trap.h>
45 #include <sys/machtrap.h>
46 #include <sys/mman.h>
47 #include <sys/asi.h>
48 #include <sys/copyops.h>
49 #include <vm/as.h>
50 #include <vm/page.h>
51 #include <sys/model.h>
52 #include <vm/seg_vn.h>
53 #include <sys/byteorder.h>
54 #include <sys/time.h>
55 
/*
 * Instruction-decoding predicates.  Every macro argument is wrapped in
 * parentheses so that callers may safely pass arbitrary expressions.
 */

/* Non-zero when bit 13 (mask 0x2000, the "i"/immediate bit) of x is set. */
#define	IS_IBIT_SET(x)	((x) & 0x2000)

/* True for op == 2 with op3 == 0x36 (the VIS1 opcode space, per the name). */
#define	IS_VIS1(op, op3)	((op) == 2 && (op3) == 0x36)

/* True for op == 2 with op3 == 0x34 or 0x35. */
#define	IS_FLOAT_QUAD_OP(op, op3)	((op) == 2 && ((op3) == 0x34 ||	\
		(op3) == 0x35))

/* True for an LDDFA/STDFA (op == 3) whose ASI is above ASI_SNFL. */
#define	IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi)		\
		((op) == 3 && ((op3) == IOP_V8_LDDFA ||		\
		(op3) == IOP_V8_STDFA) && (asi) > ASI_SNFL)

/* Set non-zero to enable the diagnostic printf()s in do_unaligned(). */
static int aligndebug = 0;
65 
66 /*
67  * For the sake of those who must be compatible with unaligned
68  * architectures, users can link their programs to use a
69  * corrective trap handler that will fix unaligned references
70  * a special trap #6 (T_FIX_ALIGN) enables this 'feature'.
71  * Returns 1 for success, 0 for failure.
72  */
73 
74 int
75 do_unaligned(struct regs *rp, caddr_t *badaddr)
76 {
77 	uint_t	inst, op3, asi = 0;
78 	uint_t	rd, rs1, rs2;
79 	int	sz, nf = 0, ltlend = 0;
80 	int	floatflg;
81 	int	fsrflg;
82 	int	immflg;
83 	int	lddstdflg;
84 	caddr_t	addr;
85 	uint64_t val;
86 	union {
87 		uint64_t	l[2];
88 		uint32_t	i[4];
89 		uint16_t	s[8];
90 		uint8_t		c[16];
91 	} data;
92 
93 	ASSERT(USERMODE(rp->r_tstate));
94 	inst = fetch_user_instr((caddr_t)rp->r_pc);
95 
96 	op3 = (inst >> 19) & 0x3f;
97 	rd = (inst >> 25) & 0x1f;
98 	rs1 = (inst >> 14) & 0x1f;
99 	rs2 = inst & 0x1f;
100 	floatflg = (inst >> 24) & 1;
101 	immflg = (inst >> 13) & 1;
102 	lddstdflg = fsrflg = 0;
103 
104 	/* if not load or store do nothing */
105 	if ((inst >> 30) != 3)
106 		return (0);
107 
108 	/* if ldstub or swap, do nothing */
109 	if ((inst & 0xc1680000) == 0xc0680000)
110 		return (0);
111 
112 	/* if cas/casx, do nothing */
113 	if ((inst & 0xc1e00000) == 0xc1e00000)
114 		return (0);
115 
116 	if (floatflg) {
117 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
118 		case 0: sz = 4;
119 			break;			/* ldf{a}/stf{a} */
120 		case 1: fsrflg = 1;
121 			if (rd == 0)
122 				sz = 4;		/* ldfsr/stfsr */
123 			else  if (rd == 1)
124 				sz = 8;		/* ldxfsr/stxfsr */
125 			else
126 				return (SIMU_ILLEGAL);
127 			break;
128 		case 2: sz = 16;
129 			break;		/* ldqf{a}/stqf{a} */
130 		case 3: sz = 8;
131 			break;		/* lddf{a}/stdf{a} */
132 		}
133 		/*
134 		 * Fix to access extra double register encoding plus
135 		 * compensate to access the correct fpu_dreg.
136 		 */
137 		if ((sz > 4) && (fsrflg == 0)) {
138 			if ((rd & 1) == 1)
139 				rd = (rd & 0x1e) | 0x20;
140 			rd = rd >> 1;
141 			if ((sz == 16) && ((rd & 0x1) != 0))
142 				return (SIMU_ILLEGAL);
143 		}
144 	} else {
145 		int sz_bits = (inst >> 19) & 0xf;
146 		switch (sz_bits) {		/* map size bits to a number */
147 		case 0:				/* lduw{a} */
148 		case 4:				/* stw{a} */
149 		case 8:				/* ldsw{a} */
150 		case 0xf:			/* swap */
151 			sz = 4; break;
152 		case 1:				/* ldub{a} */
153 		case 5:				/* stb{a} */
154 		case 9:				/* ldsb{a} */
155 		case 0xd:			/* ldstub */
156 			sz = 1; break;
157 		case 2:				/* lduh{a} */
158 		case 6:				/* sth{a} */
159 		case 0xa:			/* ldsh{a} */
160 			sz = 2; break;
161 		case 3:				/* ldd{a} */
162 		case 7:				/* std{a} */
163 			lddstdflg = 1;
164 			sz = 8; break;
165 		case 0xb:			/* ldx{a} */
166 		case 0xe:			/* stx{a} */
167 			sz = 8; break;
168 		}
169 	}
170 
171 
172 	/* only support primary and secondary asi's */
173 	if ((op3 >> 4) & 1) {
174 		if (immflg) {
175 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
176 			    TSTATE_ASI_MASK;
177 		} else {
178 			asi = (inst >> 5) & 0xff;
179 		}
180 		switch (asi) {
181 		case ASI_P:
182 		case ASI_S:
183 			break;
184 		case ASI_PNF:
185 		case ASI_SNF:
186 			nf = 1;
187 			break;
188 		case ASI_PL:
189 		case ASI_SL:
190 			ltlend = 1;
191 			break;
192 		case ASI_PNFL:
193 		case ASI_SNFL:
194 			ltlend = 1;
195 			nf = 1;
196 			break;
197 		default:
198 			return (0);
199 		}
200 		/*
201 		 * Non-faulting stores generate a data_access_exception trap,
202 		 * according to the Spitfire manual, which should be signaled
203 		 * as an illegal instruction trap, because it can't be fixed.
204 		 */
205 		if ((nf) && ((op3 == IOP_V8_STQFA) || (op3 == IOP_V8_STDFA)))
206 			return (SIMU_ILLEGAL);
207 	}
208 
209 	if (aligndebug) {
210 		printf("unaligned access at %p, instruction: 0x%x\n",
211 		    (void *)rp->r_pc, inst);
212 		printf("type %s", (((inst >> 21) & 1) ? "st" : "ld"));
213 		if (((inst >> 21) & 1) == 0)
214 			printf(" %s", (((inst >> 22) & 1) ?
215 			    "signed" : "unsigned"));
216 		printf(" asi 0x%x size %d immflg %d\n", asi, sz, immflg);
217 		printf("rd = %d, op3 = 0x%x, rs1 = %d, rs2 = %d, imm13=0x%x\n",
218 		    rd, op3, rs1, rs2, (inst & 0x1fff));
219 	}
220 
221 	(void) flush_user_windows_to_stack(NULL);
222 	if (getreg(rp, rs1, &val, badaddr))
223 		return (SIMU_FAULT);
224 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
225 	if (aligndebug)
226 		printf("addr 1 = %p\n", (void *)addr);
227 
228 	/* check immediate bit and use immediate field or reg (rs2) */
229 	if (immflg) {
230 		int imm;
231 		imm  = inst & 0x1fff;		/* mask out immediate field */
232 		imm <<= 19;			/* sign extend it */
233 		imm >>= 19;
234 		addr += imm;			/* compute address */
235 	} else {
236 		if (getreg(rp, rs2, &val, badaddr))
237 			return (SIMU_FAULT);
238 		addr += val;
239 	}
240 
241 	/*
242 	 * If this is a 32-bit program, chop the address accordingly.  The
243 	 * intermediate uintptr_t casts prevent warnings under a certain
244 	 * compiler, and the temporary 32 bit storage is intended to force
245 	 * proper code generation and break up what would otherwise be a
246 	 * quadruple cast.
247 	 */
248 	if (curproc->p_model == DATAMODEL_ILP32) {
249 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
250 		addr = (caddr_t)(uintptr_t)addr32;
251 	}
252 
253 	if (aligndebug)
254 		printf("addr 2 = %p\n", (void *)addr);
255 
256 	if (addr >= curproc->p_as->a_userlimit) {
257 		*badaddr = addr;
258 		goto badret;
259 	}
260 
261 	/* a single bit differentiates ld and st */
262 	if ((inst >> 21) & 1) {			/* store */
263 		if (floatflg) {
264 			klwp_id_t lwp = ttolwp(curthread);
265 			kfpu_t *fp = lwptofpu(lwp);
266 			/* Ensure fp has been enabled */
267 			if (fpu_exists) {
268 				if (!(_fp_read_fprs() & FPRS_FEF))
269 					fp_enable();
270 			} else {
271 				if (!fp->fpu_en)
272 					fp_enable();
273 			}
274 			/* if fpu_exists read fpu reg */
275 			if (fpu_exists) {
276 				if (fsrflg) {
277 					_fp_read_pfsr(&data.l[0]);
278 				} else {
279 					if (sz == 4) {
280 						data.i[0] = 0;
281 						_fp_read_pfreg(
282 						    (unsigned *)&data.i[1], rd);
283 					}
284 					if (sz >= 8)
285 						_fp_read_pdreg(
286 						    &data.l[0], rd);
287 					if (sz == 16)
288 						_fp_read_pdreg(
289 						    &data.l[1], rd+1);
290 				}
291 			} else {
292 				if (fsrflg) {
293 					/* Clear reserved bits, set version=7 */
294 					fp->fpu_fsr &= ~0x30301000;
295 					fp->fpu_fsr |= 0xE0000;
296 					data.l[0] = fp->fpu_fsr;
297 				} else {
298 					if (sz == 4) {
299 						data.i[0] = 0;
300 						data.i[1] =
301 						    (unsigned)fp->
302 						    fpu_fr.fpu_regs[rd];
303 					}
304 					if (sz >= 8)
305 						data.l[0] =
306 						    fp->fpu_fr.fpu_dregs[rd];
307 					if (sz == 16)
308 						data.l[1] =
309 						    fp->fpu_fr.fpu_dregs[rd+1];
310 				}
311 			}
312 		} else {
313 			if (lddstdflg) {		/* combine the data */
314 				if (getreg(rp, rd, &data.l[0], badaddr))
315 					return (SIMU_FAULT);
316 				if (getreg(rp, rd+1, &data.l[1], badaddr))
317 					return (SIMU_FAULT);
318 				if (ltlend) {
319 					/*
320 					 * For STD, each 32-bit word is byte-
321 					 * swapped individually.  For
322 					 * simplicity we don't want to do that
323 					 * below, so we swap the words now to
324 					 * get the desired result in the end.
325 					 */
326 					data.i[0] = data.i[3];
327 				} else {
328 					data.i[0] = data.i[1];
329 					data.i[1] = data.i[3];
330 				}
331 			} else {
332 				if (getreg(rp, rd, &data.l[0], badaddr))
333 					return (SIMU_FAULT);
334 			}
335 		}
336 
337 		if (aligndebug) {
338 			if (sz == 16) {
339 				printf("data %x %x %x %x\n",
340 				    data.i[0], data.i[1], data.i[2], data.c[3]);
341 			} else {
342 				printf("data %x %x %x %x %x %x %x %x\n",
343 				    data.c[0], data.c[1], data.c[2], data.c[3],
344 				    data.c[4], data.c[5], data.c[6], data.c[7]);
345 			}
346 		}
347 
348 		if (ltlend) {
349 			if (sz == 1) {
350 				if (xcopyout_little(&data.c[7], addr,
351 				    (size_t)sz) != 0)
352 					goto badret;
353 			} else if (sz == 2) {
354 				if (xcopyout_little(&data.s[3], addr,
355 				    (size_t)sz) != 0)
356 					goto badret;
357 			} else if (sz == 4) {
358 				if (xcopyout_little(&data.i[1], addr,
359 				    (size_t)sz) != 0)
360 					goto badret;
361 			} else {
362 				if (xcopyout_little(&data.l[0], addr,
363 				    (size_t)sz) != 0)
364 					goto badret;
365 			}
366 		} else {
367 			if (sz == 1) {
368 				if (copyout(&data.c[7], addr, (size_t)sz) == -1)
369 					goto badret;
370 			} else if (sz == 2) {
371 				if (copyout(&data.s[3], addr, (size_t)sz) == -1)
372 					goto badret;
373 			} else if (sz == 4) {
374 				if (copyout(&data.i[1], addr, (size_t)sz) == -1)
375 					goto badret;
376 			} else {
377 				if (copyout(&data.l[0], addr, (size_t)sz) == -1)
378 					goto badret;
379 			}
380 		}
381 	} else {				/* load */
382 		if (sz == 1) {
383 			if (ltlend) {
384 				if (xcopyin_little(addr, &data.c[7],
385 				    (size_t)sz) != 0) {
386 					if (nf)
387 						data.c[7] = 0;
388 					else
389 						goto badret;
390 				}
391 			} else {
392 				if (copyin(addr, &data.c[7],
393 				    (size_t)sz) == -1) {
394 					if (nf)
395 						data.c[7] = 0;
396 					else
397 						goto badret;
398 				}
399 			}
400 			/* if signed and the sign bit is set extend it */
401 			if (((inst >> 22) & 1) && ((data.c[7] >> 7) & 1)) {
402 				data.i[0] = (uint_t)-1;	/* extend sign bit */
403 				data.s[2] = (ushort_t)-1;
404 				data.c[6] = (uchar_t)-1;
405 			} else {
406 				data.i[0] = 0;	/* clear upper 32+24 bits */
407 				data.s[2] = 0;
408 				data.c[6] = 0;
409 			}
410 		} else if (sz == 2) {
411 			if (ltlend) {
412 				if (xcopyin_little(addr, &data.s[3],
413 				    (size_t)sz) != 0) {
414 					if (nf)
415 						data.s[3] = 0;
416 					else
417 						goto badret;
418 				}
419 			} else {
420 				if (copyin(addr, &data.s[3],
421 				    (size_t)sz) == -1) {
422 					if (nf)
423 						data.s[3] = 0;
424 					else
425 						goto badret;
426 				}
427 			}
428 			/* if signed and the sign bit is set extend it */
429 			if (((inst >> 22) & 1) && ((data.s[3] >> 15) & 1)) {
430 				data.i[0] = (uint_t)-1;	/* extend sign bit */
431 				data.s[2] = (ushort_t)-1;
432 			} else {
433 				data.i[0] = 0;	/* clear upper 32+16 bits */
434 				data.s[2] = 0;
435 			}
436 		} else if (sz == 4) {
437 			if (ltlend) {
438 				if (xcopyin_little(addr, &data.i[1],
439 				    (size_t)sz) != 0) {
440 					if (!nf)
441 						goto badret;
442 					data.i[1] = 0;
443 				}
444 			} else {
445 				if (copyin(addr, &data.i[1],
446 				    (size_t)sz) == -1) {
447 					if (!nf)
448 						goto badret;
449 					data.i[1] = 0;
450 				}
451 			}
452 			/* if signed and the sign bit is set extend it */
453 			if (((inst >> 22) & 1) && ((data.i[1] >> 31) & 1)) {
454 				data.i[0] = (uint_t)-1;	/* extend sign bit */
455 			} else {
456 				data.i[0] = 0;	/* clear upper 32 bits */
457 			}
458 		} else {
459 			if (ltlend) {
460 				if (xcopyin_little(addr, &data.l[0],
461 				    (size_t)sz) != 0) {
462 					if (!nf)
463 						goto badret;
464 					data.l[0] = 0;
465 				}
466 			} else {
467 				if (copyin(addr, &data.l[0],
468 				    (size_t)sz) == -1) {
469 					if (!nf)
470 						goto badret;
471 					data.l[0] = 0;
472 				}
473 			}
474 		}
475 
476 		if (aligndebug) {
477 			if (sz == 16) {
478 				printf("data %x %x %x %x\n",
479 				    data.i[0], data.i[1], data.i[2], data.c[3]);
480 			} else {
481 				printf("data %x %x %x %x %x %x %x %x\n",
482 				    data.c[0], data.c[1], data.c[2], data.c[3],
483 				    data.c[4], data.c[5], data.c[6], data.c[7]);
484 			}
485 		}
486 
487 		if (floatflg) {		/* if fpu_exists write fpu reg */
488 			klwp_id_t lwp = ttolwp(curthread);
489 			kfpu_t *fp = lwptofpu(lwp);
490 			/* Ensure fp has been enabled */
491 			if (fpu_exists) {
492 				if (!(_fp_read_fprs() & FPRS_FEF))
493 					fp_enable();
494 			} else {
495 				if (!fp->fpu_en)
496 					fp_enable();
497 			}
498 			/* if fpu_exists read fpu reg */
499 			if (fpu_exists) {
500 				if (fsrflg) {
501 					_fp_write_pfsr(&data.l[0]);
502 				} else {
503 					if (sz == 4)
504 						_fp_write_pfreg(
505 						    (unsigned *)&data.i[1], rd);
506 					if (sz >= 8)
507 						_fp_write_pdreg(
508 						    &data.l[0], rd);
509 					if (sz == 16)
510 						_fp_write_pdreg(
511 						    &data.l[1], rd+1);
512 				}
513 			} else {
514 				if (fsrflg) {
515 					fp->fpu_fsr = data.l[0];
516 				} else {
517 					if (sz == 4)
518 						fp->fpu_fr.fpu_regs[rd] =
519 						    (unsigned)data.i[1];
520 					if (sz >= 8)
521 						fp->fpu_fr.fpu_dregs[rd] =
522 						    data.l[0];
523 					if (sz == 16)
524 						fp->fpu_fr.fpu_dregs[rd+1] =
525 						    data.l[1];
526 				}
527 			}
528 		} else {
529 			if (lddstdflg) {		/* split the data */
530 				if (ltlend) {
531 					/*
532 					 * For LDD, each 32-bit word is byte-
533 					 * swapped individually.  We didn't
534 					 * do that above, but this will give
535 					 * us the desired result.
536 					 */
537 					data.i[3] = data.i[0];
538 				} else {
539 					data.i[3] = data.i[1];
540 					data.i[1] = data.i[0];
541 				}
542 				data.i[0] = 0;
543 				data.i[2] = 0;
544 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
545 					goto badret;
546 				if (putreg(&data.l[1], rp, rd+1, badaddr) == -1)
547 					goto badret;
548 			} else {
549 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
550 					goto badret;
551 			}
552 		}
553 	}
554 	return (SIMU_SUCCESS);
555 badret:
556 	return (SIMU_FAULT);
557 }
558 
559 
560 int
561 simulate_lddstd(struct regs *rp, caddr_t *badaddr)
562 {
563 	uint_t	inst, op3, asi = 0;
564 	uint_t	rd, rs1, rs2;
565 	int	nf = 0, ltlend = 0, usermode;
566 	int	immflg;
567 	uint64_t reven;
568 	uint64_t rodd;
569 	caddr_t	addr;
570 	uint64_t val;
571 	uint64_t data;
572 
573 	usermode = USERMODE(rp->r_tstate);
574 
575 	if (usermode)
576 		inst = fetch_user_instr((caddr_t)rp->r_pc);
577 	else
578 		inst = *(uint_t *)rp->r_pc;
579 
580 	op3 = (inst >> 19) & 0x3f;
581 	rd = (inst >> 25) & 0x1f;
582 	rs1 = (inst >> 14) & 0x1f;
583 	rs2 = inst & 0x1f;
584 	immflg = (inst >> 13) & 1;
585 
586 	if (USERMODE(rp->r_tstate))
587 		(void) flush_user_windows_to_stack(NULL);
588 	else
589 		flush_windows();
590 
591 	if ((op3 >> 4) & 1) {		/* is this LDDA/STDA? */
592 		if (immflg) {
593 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
594 			    TSTATE_ASI_MASK;
595 		} else {
596 			asi = (inst >> 5) & 0xff;
597 		}
598 		switch (asi) {
599 		case ASI_P:
600 		case ASI_S:
601 			break;
602 		case ASI_PNF:
603 		case ASI_SNF:
604 			nf = 1;
605 			break;
606 		case ASI_PL:
607 		case ASI_SL:
608 			ltlend = 1;
609 			break;
610 		case ASI_PNFL:
611 		case ASI_SNFL:
612 			ltlend = 1;
613 			nf = 1;
614 			break;
615 		case ASI_AIUP:
616 		case ASI_AIUS:
617 			usermode = 1;
618 			break;
619 		case ASI_AIUPL:
620 		case ASI_AIUSL:
621 			usermode = 1;
622 			ltlend = 1;
623 			break;
624 		default:
625 			return (SIMU_ILLEGAL);
626 		}
627 	}
628 
629 	if (getreg(rp, rs1, &val, badaddr))
630 		return (SIMU_FAULT);
631 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
632 
633 	/* check immediate bit and use immediate field or reg (rs2) */
634 	if (immflg) {
635 		int imm;
636 		imm  = inst & 0x1fff;		/* mask out immediate field */
637 		imm <<= 19;			/* sign extend it */
638 		imm >>= 19;
639 		addr += imm;			/* compute address */
640 	} else {
641 		if (getreg(rp, rs2, &val, badaddr))
642 			return (SIMU_FAULT);
643 		addr += val;
644 	}
645 
646 	/*
647 	 * T_UNIMP_LDD and T_UNIMP_STD are higher priority than
648 	 * T_ALIGNMENT.  So we have to make sure that the address is
649 	 * kosher before trying to use it, because the hardware hasn't
650 	 * checked it for us yet.
651 	 */
652 	if (((uintptr_t)addr & 0x7) != 0) {
653 		if (curproc->p_fixalignment)
654 			return (do_unaligned(rp, badaddr));
655 		else
656 			return (SIMU_UNALIGN);
657 	}
658 
659 	/*
660 	 * If this is a 32-bit program, chop the address accordingly.  The
661 	 * intermediate uintptr_t casts prevent warnings under a certain
662 	 * compiler, and the temporary 32 bit storage is intended to force
663 	 * proper code generation and break up what would otherwise be a
664 	 * quadruple cast.
665 	 */
666 	if (curproc->p_model == DATAMODEL_ILP32 && usermode) {
667 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
668 		addr = (caddr_t)(uintptr_t)addr32;
669 	}
670 
671 	if ((inst >> 21) & 1) {			/* store */
672 		if (getreg(rp, rd, &reven, badaddr))
673 			return (SIMU_FAULT);
674 		if (getreg(rp, rd+1, &rodd, badaddr))
675 			return (SIMU_FAULT);
676 		if (ltlend) {
677 			reven = BSWAP_32(reven);
678 			rodd  = BSWAP_32(rodd);
679 		}
680 		data = (reven << 32) | rodd;
681 		if (usermode) {
682 			if (suword64_nowatch(addr, data) == -1)
683 				return (SIMU_FAULT);
684 		} else {
685 			*(uint64_t *)addr = data;
686 		}
687 	} else {				/* load */
688 		if (usermode) {
689 			if (fuword64_nowatch(addr, &data)) {
690 				if (nf)
691 					data = 0;
692 				else
693 					return (SIMU_FAULT);
694 			}
695 		} else
696 			data = *(uint64_t *)addr;
697 
698 		reven = (data >> 32);
699 		rodd  = (uint64_t)(uint32_t)data;
700 		if (ltlend) {
701 			reven = BSWAP_32(reven);
702 			rodd  = BSWAP_32(rodd);
703 		}
704 
705 		if (putreg(&reven, rp, rd, badaddr) == -1)
706 			return (SIMU_FAULT);
707 		if (putreg(&rodd, rp, rd+1, badaddr) == -1)
708 			return (SIMU_FAULT);
709 	}
710 	return (SIMU_SUCCESS);
711 }
712 
713 
714 /*
715  * simulate popc
716  */
717 static int
718 simulate_popc(struct regs *rp, caddr_t *badaddr, uint_t inst)
719 {
720 	uint_t	rd, rs2, rs1;
721 	uint_t	immflg;
722 	uint64_t val, cnt = 0;
723 
724 	rd = (inst >> 25) & 0x1f;
725 	rs1 = (inst >> 14) & 0x1f;
726 	rs2 = inst & 0x1f;
727 	immflg = (inst >> 13) & 1;
728 
729 	if (rs1 > 0)
730 		return (SIMU_ILLEGAL);
731 
732 	(void) flush_user_windows_to_stack(NULL);
733 
734 	/* check immediate bit and use immediate field or reg (rs2) */
735 	if (immflg) {
736 		int64_t imm;
737 		imm  = inst & 0x1fff;		/* mask out immediate field */
738 		imm <<= 51;			/* sign extend it */
739 		imm >>= 51;
740 		if (imm != 0) {
741 			for (cnt = 0; imm != 0; imm &= imm-1)
742 				cnt++;
743 		}
744 	} else {
745 		if (getreg(rp, rs2, &val, badaddr))
746 			return (SIMU_FAULT);
747 		if (val != 0) {
748 			for (cnt = 0; val != 0; val &= val-1)
749 				cnt++;
750 		}
751 	}
752 
753 	if (putreg(&cnt, rp, rd, badaddr) == -1)
754 		return (SIMU_FAULT);
755 
756 	return (SIMU_SUCCESS);
757 }
758 
759 /*
760  * simulate mulscc
761  */
762 static int
763 simulate_mulscc(struct regs *rp, caddr_t *badaddr, uint_t inst)
764 {
765 	uint32_t	s1, s2;
766 	uint32_t	c, d, v;
767 	uint_t		rd, rs1;
768 	int64_t		d64;
769 	uint64_t	ud64;
770 	uint64_t	drs1;
771 
772 	(void) flush_user_windows_to_stack(NULL);
773 
774 	if ((inst >> 13) & 1) {		/* immediate */
775 		d64 = inst & 0x1fff;
776 		d64 <<= 51;		/* sign extend it */
777 		d64 >>= 51;
778 	} else {
779 		uint_t		rs2;
780 		uint64_t	drs2;
781 
782 		if (inst & 0x1fe0) {
783 			return (SIMU_ILLEGAL);
784 		}
785 		rs2 = inst & 0x1f;
786 		if (getreg(rp, rs2, &drs2, badaddr)) {
787 			return (SIMU_FAULT);
788 		}
789 		d64 = (int64_t)drs2;
790 	}
791 
792 	rs1 = (inst >> 14) & 0x1f;
793 	if (getreg(rp, rs1, &drs1, badaddr)) {
794 		return (SIMU_FAULT);
795 	}
796 	/* icc.n xor icc.v */
797 	s1 = ((rp->r_tstate & TSTATE_IN) >> (TSTATE_CCR_SHIFT + 3)) ^
798 	    ((rp->r_tstate & TSTATE_IV) >> (TSTATE_CCR_SHIFT + 1));
799 	s1 = (s1 << 31) | (((uint32_t)drs1) >> 1);
800 
801 	if (rp->r_y & 1) {
802 		s2 = (uint32_t)d64;
803 	} else {
804 		s2 = 0;
805 	}
806 	d = s1 + s2;
807 
808 	ud64 = (uint64_t)d;
809 
810 	/* set the icc flags */
811 	v = (s1 & s2 & ~d) | (~s1 & ~s2 & d);
812 	c = (s1 & s2) | (~d & (s1 | s2));
813 	rp->r_tstate &= ~TSTATE_ICC;
814 	rp->r_tstate |= (uint64_t)((c >> 31) & 1) << (TSTATE_CCR_SHIFT + 0);
815 	rp->r_tstate |= (uint64_t)((v >> 31) & 1) << (TSTATE_CCR_SHIFT + 1);
816 	rp->r_tstate |= (uint64_t)(d ? 0 : 1) << (TSTATE_CCR_SHIFT + 2);
817 	rp->r_tstate |= (uint64_t)((d >> 31) & 1) << (TSTATE_CCR_SHIFT + 3);
818 
819 	if (rp->r_tstate & TSTATE_IC) {
820 		ud64 |= (1ULL << 32);
821 	}
822 
823 	/* set the xcc flags */
824 	rp->r_tstate &= ~TSTATE_XCC;
825 	if (ud64 == 0) {
826 		rp->r_tstate |= TSTATE_XZ;
827 	}
828 
829 	rd = (inst >> 25) & 0x1f;
830 	if (putreg(&ud64, rp, rd, badaddr)) {
831 		return (SIMU_FAULT);
832 	}
833 
834 	d64 = (drs1 << 32) | (uint32_t)rp->r_y;
835 	d64 >>= 1;
836 	rp->r_y = (uint32_t)d64;
837 
838 	return (SIMU_SUCCESS);
839 }
840 
841 /*
842  * simulate unimplemented instructions (popc, ldqf{a}, stqf{a})
843  */
844 int
845 simulate_unimp(struct regs *rp, caddr_t *badaddr)
846 {
847 	uint_t	inst, optype, op3, asi;
848 	uint_t	rs1, rd;
849 	uint_t	ignor, i;
850 	machpcb_t *mpcb = lwptompcb(ttolwp(curthread));
851 	int	nomatch = 0;
852 	caddr_t	addr = (caddr_t)rp->r_pc;
853 	struct as *as;
854 	caddr_t	ka;
855 	pfn_t	pfnum;
856 	page_t *pp;
857 	proc_t *p = ttoproc(curthread);
858 	struct seg *mapseg;
859 	struct segvn_data *svd;
860 
861 	ASSERT(USERMODE(rp->r_tstate));
862 	inst = fetch_user_instr(addr);
863 	if (inst == (uint_t)-1) {
864 		mpcb->mpcb_illexcaddr = addr;
865 		mpcb->mpcb_illexcinsn = (uint32_t)-1;
866 		return (SIMU_ILLEGAL);
867 	}
868 
869 	/*
870 	 * When fixing dirty v8 instructions there's a race if two processors
871 	 * are executing the dirty executable at the same time.  If one
872 	 * cleans the instruction as the other is executing it the second
873 	 * processor will see a clean instruction when it comes through this
874 	 * code and will return SIMU_ILLEGAL.  To work around the race
875 	 * this code will keep track of the last illegal instruction seen
876 	 * by each lwp and will only take action if the illegal instruction
877 	 * is repeatable.
878 	 */
879 	if (addr != mpcb->mpcb_illexcaddr ||
880 	    inst != mpcb->mpcb_illexcinsn)
881 		nomatch = 1;
882 	mpcb->mpcb_illexcaddr = addr;
883 	mpcb->mpcb_illexcinsn = inst;
884 
885 	/* instruction fields */
886 	i = (inst >> 13) & 0x1;
887 	rd = (inst >> 25) & 0x1f;
888 	optype = (inst >> 30) & 0x3;
889 	op3 = (inst >> 19) & 0x3f;
890 	ignor = (inst >> 5) & 0xff;
891 	if (IS_IBIT_SET(inst)) {
892 		asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
893 		    TSTATE_ASI_MASK);
894 	} else {
895 		asi = ignor;
896 	}
897 
898 	if (IS_VIS1(optype, op3) ||
899 	    IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi) ||
900 	    IS_FLOAT_QUAD_OP(optype, op3)) {
901 		klwp_t *lwp = ttolwp(curthread);
902 		kfpu_t *fp = lwptofpu(lwp);
903 		if (fpu_exists) {
904 			if (!(_fp_read_fprs() & FPRS_FEF))
905 				fp_enable();
906 			_fp_read_pfsr(&fp->fpu_fsr);
907 		} else {
908 			if (!fp->fpu_en)
909 				fp_enable();
910 		}
911 		fp_precise(rp);
912 		return (SIMU_RETRY);
913 	}
914 
915 	if (optype == 2 && op3 == IOP_V8_POPC) {
916 		return (simulate_popc(rp, badaddr, inst));
917 	} else if (optype == 3 && op3 == IOP_V8_POPC) {
918 		return (SIMU_ILLEGAL);
919 	} else if (optype == OP_V8_ARITH && op3 == IOP_V8_MULScc) {
920 		return (simulate_mulscc(rp, badaddr, inst));
921 	}
922 
923 	if (optype == OP_V8_LDSTR) {
924 		if (op3 == IOP_V8_LDQF || op3 == IOP_V8_LDQFA ||
925 		    op3 == IOP_V8_STQF || op3 == IOP_V8_STQFA)
926 			return (do_unaligned(rp, badaddr));
927 	}
928 
929 	/* This is a new instruction so illexccnt should also be set. */
930 	if (nomatch) {
931 		mpcb->mpcb_illexccnt = 0;
932 		return (SIMU_RETRY);
933 	}
934 
935 	/*
936 	 * In order to keep us from entering into an infinite loop while
937 	 * attempting to clean up faulty instructions, we will return
938 	 * SIMU_ILLEGAL once we've cleaned up the instruction as much
939 	 * as we can, and still end up here.
940 	 */
941 	if (mpcb->mpcb_illexccnt >= 3)
942 		return (SIMU_ILLEGAL);
943 
944 	mpcb->mpcb_illexccnt += 1;
945 
946 	/*
947 	 * The rest of the code handles v8 binaries with instructions
948 	 * that have dirty (non-zero) bits in reserved or 'ignored'
949 	 * fields; these will cause core dumps on v9 machines.
950 	 *
951 	 * We only clean dirty instructions in 32-bit programs (ie, v8)
952 	 * running on SPARCv9 processors.  True v9 programs are forced
953 	 * to use the instruction set as intended.
954 	 */
955 	if (lwp_getdatamodel(curthread->t_lwp) != DATAMODEL_ILP32)
956 		return (SIMU_ILLEGAL);
957 	switch (optype) {
958 	case OP_V8_BRANCH:
959 	case OP_V8_CALL:
960 		return (SIMU_ILLEGAL);	/* these don't have ignored fields */
961 		/*NOTREACHED*/
962 	case OP_V8_ARITH:
963 		switch (op3) {
964 		case IOP_V8_RETT:
965 			if (rd == 0 && !(i == 0 && ignor))
966 				return (SIMU_ILLEGAL);
967 			if (rd)
968 				inst &= ~(0x1f << 25);
969 			if (i == 0 && ignor)
970 				inst &= ~(0xff << 5);
971 			break;
972 		case IOP_V8_TCC:
973 			if (i == 0 && ignor != 0) {
974 				inst &= ~(0xff << 5);
975 			} else if (i == 1 && (((inst >> 7) & 0x3f) != 0)) {
976 				inst &= ~(0x3f << 7);
977 			} else {
978 				return (SIMU_ILLEGAL);
979 			}
980 			break;
981 		case IOP_V8_JMPL:
982 		case IOP_V8_RESTORE:
983 		case IOP_V8_SAVE:
984 			if ((op3 == IOP_V8_RETT && rd) ||
985 			    (i == 0 && ignor)) {
986 				inst &= ~(0xff << 5);
987 			} else {
988 				return (SIMU_ILLEGAL);
989 			}
990 			break;
991 		case IOP_V8_FCMP:
992 			if (rd == 0)
993 				return (SIMU_ILLEGAL);
994 			inst &= ~(0x1f << 25);
995 			break;
996 		case IOP_V8_RDASR:
997 			rs1 = ((inst >> 14) & 0x1f);
998 			if (rs1 == 1 || (rs1 >= 7 && rs1 <= 14)) {
999 				/*
1000 				 * The instruction specifies an invalid
1001 				 * state register - better bail out than
1002 				 * "fix" it when we're not sure what was
1003 				 * intended.
1004 				 */
1005 				return (SIMU_ILLEGAL);
1006 			}
1007 				/*
1008 				 * Note: this case includes the 'stbar'
1009 				 * instruction (rs1 == 15 && i == 0).
1010 				 */
1011 				if ((ignor = (inst & 0x3fff)) != 0)
1012 					inst &= ~(0x3fff);
1013 			break;
1014 		case IOP_V8_SRA:
1015 		case IOP_V8_SRL:
1016 		case IOP_V8_SLL:
1017 			if (ignor == 0)
1018 				return (SIMU_ILLEGAL);
1019 			inst &= ~(0xff << 5);
1020 			break;
1021 		case IOP_V8_ADD:
1022 		case IOP_V8_AND:
1023 		case IOP_V8_OR:
1024 		case IOP_V8_XOR:
1025 		case IOP_V8_SUB:
1026 		case IOP_V8_ANDN:
1027 		case IOP_V8_ORN:
1028 		case IOP_V8_XNOR:
1029 		case IOP_V8_ADDC:
1030 		case IOP_V8_UMUL:
1031 		case IOP_V8_SMUL:
1032 		case IOP_V8_SUBC:
1033 		case IOP_V8_UDIV:
1034 		case IOP_V8_SDIV:
1035 		case IOP_V8_ADDcc:
1036 		case IOP_V8_ANDcc:
1037 		case IOP_V8_ORcc:
1038 		case IOP_V8_XORcc:
1039 		case IOP_V8_SUBcc:
1040 		case IOP_V8_ANDNcc:
1041 		case IOP_V8_ORNcc:
1042 		case IOP_V8_XNORcc:
1043 		case IOP_V8_ADDCcc:
1044 		case IOP_V8_UMULcc:
1045 		case IOP_V8_SMULcc:
1046 		case IOP_V8_SUBCcc:
1047 		case IOP_V8_UDIVcc:
1048 		case IOP_V8_SDIVcc:
1049 		case IOP_V8_TADDcc:
1050 		case IOP_V8_TSUBcc:
1051 		case IOP_V8_TADDccTV:
1052 		case IOP_V8_TSUBccTV:
1053 		case IOP_V8_MULScc:
1054 		case IOP_V8_WRASR:
1055 		case IOP_V8_FLUSH:
1056 			if (i != 0 || ignor == 0)
1057 				return (SIMU_ILLEGAL);
1058 			inst &= ~(0xff << 5);
1059 			break;
1060 		default:
1061 			return (SIMU_ILLEGAL);
1062 		}
1063 		break;
1064 	case OP_V8_LDSTR:
1065 		switch (op3) {
1066 		case IOP_V8_STFSR:
1067 		case IOP_V8_LDFSR:
1068 			if (rd == 0 && !(i == 0 && ignor))
1069 				return (SIMU_ILLEGAL);
1070 			if (rd)
1071 				inst &= ~(0x1f << 25);
1072 			if (i == 0 && ignor)
1073 				inst &= ~(0xff << 5);
1074 			break;
1075 		default:
1076 			if (optype == OP_V8_LDSTR && !IS_LDST_ALT(op3) &&
1077 			    i == 0 && ignor)
1078 				inst &= ~(0xff << 5);
1079 			else
1080 				return (SIMU_ILLEGAL);
1081 			break;
1082 		}
1083 		break;
1084 	default:
1085 		return (SIMU_ILLEGAL);
1086 	}
1087 
1088 	as = p->p_as;
1089 
1090 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1091 	mapseg = as_findseg(as, (caddr_t)rp->r_pc, 0);
1092 	ASSERT(mapseg != NULL);
1093 	svd = (struct segvn_data *)mapseg->s_data;
1094 
1095 	/*
1096 	 * We only create COW page for MAP_PRIVATE mappings.
1097 	 */
1098 	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
1099 	if ((svd->type & MAP_TYPE) & MAP_SHARED) {
1100 		SEGVN_LOCK_EXIT(as, &svd->lock);
1101 		AS_LOCK_EXIT(as, &as->a_lock);
1102 		return (SIMU_ILLEGAL);
1103 	}
1104 	SEGVN_LOCK_EXIT(as, &svd->lock);
1105 	AS_LOCK_EXIT(as, &as->a_lock);
1106 
1107 	/*
1108 	 * A "flush" instruction using the user PC's vaddr will not work
1109 	 * here, at least on Spitfire. Instead we create a temporary kernel
1110 	 * mapping to the user's text page, then modify and flush that.
1111 	 * Break COW by locking user page.
1112 	 */
1113 	if (as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK), PAGESIZE,
1114 	    F_SOFTLOCK, S_READ))
1115 		return (SIMU_FAULT);
1116 
1117 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1118 	pfnum = hat_getpfnum(as->a_hat, (caddr_t)rp->r_pc);
1119 	AS_LOCK_EXIT(as, &as->a_lock);
1120 	if (pf_is_memory(pfnum)) {
1121 		pp = page_numtopp_nolock(pfnum);
1122 		ASSERT(pp == NULL || PAGE_LOCKED(pp));
1123 	} else {
1124 		(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1125 		    PAGESIZE, F_SOFTUNLOCK, S_READ);
1126 		return (SIMU_FAULT);
1127 	}
1128 
1129 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1130 	ka = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)rp->r_pc);
1131 	*(uint_t *)(ka + (uintptr_t)(rp->r_pc % PAGESIZE)) = inst;
1132 	doflush(ka + (uintptr_t)(rp->r_pc % PAGESIZE));
1133 	ppmapout(ka);
1134 	AS_LOCK_EXIT(as, &as->a_lock);
1135 
1136 	(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1137 	    PAGESIZE, F_SOFTUNLOCK, S_READ);
1138 	return (SIMU_RETRY);
1139 }
1140 
1141 /*
1142  * Simulate a "rd %tick" or "rd %stick" (%asr24) instruction.
1143  */
1144 int
1145 simulate_rdtick(struct regs *rp)
1146 {
1147 	uint_t	inst, op, op3, rd, rs1, i;
1148 	caddr_t badaddr;
1149 
1150 	inst = fetch_user_instr((caddr_t)rp->r_pc);
1151 	op   = (inst >> 30) & 0x3;
1152 	rd   = (inst >> 25) & 0x1F;
1153 	op3  = (inst >> 19) & 0x3F;
1154 	i    = (inst >> 13) & 0x1;
1155 
1156 	/*
1157 	 * Make sure this is either a %tick read (rs1 == 0x4) or
1158 	 * a %stick read (rs1 == 0x18) instruction.
1159 	 */
1160 	if (op == 2 && op3 == 0x28 && i == 0) {
1161 		rs1 = (inst >> 14) & 0x1F;
1162 
1163 		if (rs1 == 0x4) {
1164 			uint64_t tick;
1165 			(void) flush_user_windows_to_stack(NULL);
1166 			tick = gettick_counter();
1167 			if (putreg(&tick, rp, rd, &badaddr) == 0)
1168 				return (SIMU_SUCCESS);
1169 		} else if (rs1 == 0x18) {
1170 			uint64_t stick;
1171 			(void) flush_user_windows_to_stack(NULL);
1172 			stick = gethrtime_unscaled();
1173 			if (putreg(&stick, rp, rd, &badaddr) == 0)
1174 				return (SIMU_SUCCESS);
1175 		}
1176 	}
1177 
1178 	return (SIMU_FAULT);
1179 }
1180 
1181 /*
1182  * Get the value of a register for instruction simulation
1183  * by using the regs or window structure pointers.
1184  * Return 0 for success, and -1 for failure.  If there is a failure,
1185  * save the faulting address using badaddr pointer.
1186  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1187  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1188  */
1189 int
1190 getreg(struct regs *rp, uint_t reg, uint64_t *val, caddr_t *badaddr)
1191 {
1192 	uint64_t *rgs, *sp;
1193 	int rv = 0;
1194 
1195 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1196 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1197 	if (reg == 0) {
1198 		*val = 0;
1199 	} else if (reg < 16) {
1200 		*val = rgs[reg];
1201 	} else if (IS_V9STACK(sp)) {
1202 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1203 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1204 		uint64_t res;
1205 
1206 		if (USERMODE(rp->r_tstate)) {
1207 			if (fuword64_nowatch(addr, &res) == -1) {
1208 				*badaddr = (caddr_t)addr;
1209 				rv = -1;
1210 			}
1211 		} else {
1212 			res = *addr;
1213 		}
1214 		*val = res;
1215 	} else {
1216 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1217 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1218 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1219 		uint32_t res;
1220 
1221 		if (USERMODE(rp->r_tstate)) {
1222 			if (fuword32_nowatch(addr, &res) == -1) {
1223 				*badaddr = (caddr_t)addr;
1224 				rv = -1;
1225 			}
1226 		} else {
1227 			res = *addr;
1228 		}
1229 		*val = (uint64_t)res;
1230 	}
1231 	return (rv);
1232 }
1233 
1234 /*
1235  * Set the value of a register after instruction simulation
1236  * by using the regs or window structure pointers.
1237  * Return 0 for succes -1 failure.
1238  * save the faulting address using badaddr pointer.
1239  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1240  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1241  */
1242 int
1243 putreg(uint64_t	*data, struct regs *rp, uint_t reg, caddr_t *badaddr)
1244 {
1245 	uint64_t *rgs, *sp;
1246 	int rv = 0;
1247 
1248 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1249 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1250 	if (reg == 0) {
1251 		return (0);
1252 	} else if (reg < 16) {
1253 		rgs[reg] = *data;
1254 	} else if (IS_V9STACK(sp)) {
1255 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1256 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1257 		uint64_t res;
1258 
1259 		if (USERMODE(rp->r_tstate)) {
1260 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1261 
1262 			res = *data;
1263 			if (suword64_nowatch(addr, res) != 0) {
1264 				*badaddr = (caddr_t)addr;
1265 				rv = -1;
1266 			}
1267 			/*
1268 			 * We have changed a local or in register;
1269 			 * nuke the watchpoint return windows.
1270 			 */
1271 			mpcb->mpcb_rsp[0] = NULL;
1272 			mpcb->mpcb_rsp[1] = NULL;
1273 		} else {
1274 			res = *data;
1275 			*addr = res;
1276 		}
1277 	} else {
1278 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1279 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1280 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1281 		uint32_t res;
1282 
1283 		if (USERMODE(rp->r_tstate)) {
1284 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1285 
1286 			res = (uint_t)*data;
1287 			if (suword32_nowatch(addr, res) != 0) {
1288 				*badaddr = (caddr_t)addr;
1289 				rv = -1;
1290 			}
1291 			/*
1292 			 * We have changed a local or in register;
1293 			 * nuke the watchpoint return windows.
1294 			 */
1295 			mpcb->mpcb_rsp[0] = NULL;
1296 			mpcb->mpcb_rsp[1] = NULL;
1297 
1298 		} else {
1299 			res = (uint_t)*data;
1300 			*addr = res;
1301 		}
1302 	}
1303 	return (rv);
1304 }
1305 
1306 /*
1307  * Calculate a memory reference address from instruction
1308  * operands, used to return the address of a fault, instead
1309  * of the instruction when an error occurs.  This is code that is
1310  * common with most of the routines that simulate instructions.
1311  */
1312 int
1313 calc_memaddr(struct regs *rp, caddr_t *badaddr)
1314 {
1315 	uint_t	inst;
1316 	uint_t	rd, rs1, rs2;
1317 	int	sz;
1318 	int	immflg;
1319 	int	floatflg;
1320 	caddr_t  addr;
1321 	uint64_t val;
1322 
1323 	if (USERMODE(rp->r_tstate))
1324 		inst = fetch_user_instr((caddr_t)rp->r_pc);
1325 	else
1326 		inst = *(uint_t *)rp->r_pc;
1327 
1328 	rd = (inst >> 25) & 0x1f;
1329 	rs1 = (inst >> 14) & 0x1f;
1330 	rs2 = inst & 0x1f;
1331 	floatflg = (inst >> 24) & 1;
1332 	immflg = (inst >> 13) & 1;
1333 
1334 	if (floatflg) {
1335 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
1336 		case 0: sz = 4; break;		/* ldf/stf */
1337 		case 1: return (0);		/* ld[x]fsr/st[x]fsr */
1338 		case 2: sz = 16; break;		/* ldqf/stqf */
1339 		case 3: sz = 8; break;		/* lddf/stdf */
1340 		}
1341 		/*
1342 		 * Fix to access extra double register encoding plus
1343 		 * compensate to access the correct fpu_dreg.
1344 		 */
1345 		if (sz > 4) {
1346 			if ((rd & 1) == 1)
1347 				rd = (rd & 0x1e) | 0x20;
1348 			rd = rd >> 1;
1349 		}
1350 	} else {
1351 		switch ((inst >> 19) & 0xf) {	/* map size bits to a number */
1352 		case 0:				/* lduw */
1353 		case 4:				/* stw */
1354 		case 8:				/* ldsw */
1355 		case 0xf:			/* swap */
1356 			sz = 4; break;
1357 		case 1:				/* ldub */
1358 		case 5:				/* stb */
1359 		case 9:				/* ldsb */
1360 		case 0xd:			/* ldstub */
1361 			sz = 1; break;
1362 		case 2:				/* lduh */
1363 		case 6:				/* sth */
1364 		case 0xa:			/* ldsh */
1365 			sz = 2; break;
1366 		case 3:				/* ldd */
1367 		case 7:				/* std */
1368 		case 0xb:			/* ldx */
1369 		case 0xe:			/* stx */
1370 			sz = 8; break;
1371 		}
1372 	}
1373 
1374 	if (USERMODE(rp->r_tstate))
1375 		(void) flush_user_windows_to_stack(NULL);
1376 	else
1377 		flush_windows();
1378 
1379 	if (getreg(rp, rs1, &val, badaddr))
1380 		return (SIMU_FAULT);
1381 	addr = (caddr_t)val;
1382 
1383 	/* check immediate bit and use immediate field or reg (rs2) */
1384 	if (immflg) {
1385 		int imm;
1386 		imm = inst & 0x1fff;		/* mask out immediate field */
1387 		imm <<= 19;			/* sign extend it */
1388 		imm >>= 19;
1389 		addr += imm;			/* compute address */
1390 	} else {
1391 		if (getreg(rp, rs2, &val, badaddr))
1392 			return (SIMU_FAULT);
1393 		addr += val;
1394 	}
1395 
1396 	/*
1397 	 * If this is a 32-bit program, chop the address accordingly.  The
1398 	 * intermediate uintptr_t casts prevent warnings under a certain
1399 	 * compiler, and the temporary 32 bit storage is intended to force
1400 	 * proper code generation and break up what would otherwise be a
1401 	 * quadruple cast.
1402 	 */
1403 	if (curproc->p_model == DATAMODEL_ILP32 && USERMODE(rp->r_tstate)) {
1404 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1405 		addr = (caddr_t)(uintptr_t)addr32;
1406 	}
1407 
1408 	*badaddr = addr;
1409 	return ((uintptr_t)addr & (sz - 1) ? SIMU_UNALIGN : SIMU_SUCCESS);
1410 }
1411 
1412 /*
1413  * Return the size of a load or store instruction (1, 2, 4, 8, 16, 64).
1414  * Also compute the precise address by instruction disassembly.
1415  * (v9 page faults only provide the page address via the hardware.)
1416  * Return 0 on failure (not a load or store instruction).
1417  */
1418 int
1419 instr_size(struct regs *rp, caddr_t *addrp, enum seg_rw rdwr)
1420 {
1421 	uint_t	inst, op3, asi;
1422 	uint_t	rd, rs1, rs2;
1423 	int	sz = 0;
1424 	int	immflg;
1425 	int	floatflg;
1426 	caddr_t	addr;
1427 	caddr_t badaddr;
1428 	uint64_t val;
1429 
1430 	if (rdwr == S_EXEC) {
1431 		*addrp = (caddr_t)rp->r_pc;
1432 		return (4);
1433 	}
1434 
1435 	/*
1436 	 * Fetch the instruction from user-level.
1437 	 * We would like to assert this:
1438 	 *   ASSERT(USERMODE(rp->r_tstate));
1439 	 * but we can't because we can reach this point from a
1440 	 * register window underflow/overflow and the v9 wbuf
1441 	 * traps call trap() with T_USER even though r_tstate
1442 	 * indicates a system trap, not a user trap.
1443 	 */
1444 	inst = fetch_user_instr((caddr_t)rp->r_pc);
1445 
1446 	op3 = (inst >> 19) & 0x3f;
1447 	rd = (inst >> 25) & 0x1f;
1448 	rs1 = (inst >> 14) & 0x1f;
1449 	rs2 = inst & 0x1f;
1450 	floatflg = (inst >> 24) & 1;
1451 	immflg = (inst >> 13) & 1;
1452 
1453 	/* if not load or store do nothing.  can't happen? */
1454 	if ((inst >> 30) != 3)
1455 		return (0);
1456 
1457 	if (immflg)
1458 		asi = (uint_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
1459 		    TSTATE_ASI_MASK);
1460 	else
1461 		asi = (inst >> 5) & 0xff;
1462 
1463 	if (floatflg) {
1464 		/* check for ld/st alternate and highest defined V9 asi */
1465 		if ((op3 & 0x30) == 0x30 && asi > ASI_SNFL) {
1466 			sz = extended_asi_size(asi);
1467 		} else {
1468 			switch (op3 & 3) {
1469 			case 0:
1470 				sz = 4;			/* ldf/stf/cas */
1471 				break;
1472 			case 1:
1473 				if (rd == 0)
1474 					sz = 4;		/* ldfsr/stfsr */
1475 				else
1476 					sz = 8;		/* ldxfsr/stxfsr */
1477 				break;
1478 			case 2:
1479 				if (op3 == 0x3e)
1480 					sz = 8;		/* casx */
1481 				else
1482 					sz = 16;	/* ldqf/stqf */
1483 				break;
1484 			case 3:
1485 				sz = 8;			/* lddf/stdf */
1486 				break;
1487 			}
1488 		}
1489 	} else {
1490 		switch (op3 & 0xf) {		/* map size bits to a number */
1491 		case 0:				/* lduw */
1492 		case 4:				/* stw */
1493 		case 8:				/* ldsw */
1494 		case 0xf:			/* swap */
1495 			sz = 4; break;
1496 		case 1:				/* ldub */
1497 		case 5:				/* stb */
1498 		case 9:				/* ldsb */
1499 		case 0xd:			/* ldstub */
1500 			sz = 1; break;
1501 		case 2:				/* lduh */
1502 		case 6:				/* sth */
1503 		case 0xa:			/* ldsh */
1504 			sz = 2; break;
1505 		case 3:				/* ldd */
1506 		case 7:				/* std */
1507 		case 0xb:			/* ldx */
1508 		case 0xe:			/* stx */
1509 			sz = 8; break;
1510 		}
1511 	}
1512 
1513 	if (sz == 0)	/* can't happen? */
1514 		return (0);
1515 	(void) flush_user_windows_to_stack(NULL);
1516 
1517 	if (getreg(rp, rs1, &val, &badaddr))
1518 		return (0);
1519 	addr = (caddr_t)val;
1520 
1521 	/* cas/casx don't use rs2 / simm13 to compute the address */
1522 	if ((op3 & 0x3d) != 0x3c) {
1523 		/* check immediate bit and use immediate field or reg (rs2) */
1524 		if (immflg) {
1525 			int imm;
1526 			imm  = inst & 0x1fff;	/* mask out immediate field */
1527 			imm <<= 19;		/* sign extend it */
1528 			imm >>= 19;
1529 			addr += imm;		/* compute address */
1530 		} else {
1531 			/*
1532 			 * asi's in the 0xCx range are partial store
1533 			 * instructions.  For these, rs2 is a mask, not part of
1534 			 * the address.
1535 			 */
1536 			if (!(floatflg && (asi & 0xf0) == 0xc0)) {
1537 				if (getreg(rp, rs2, &val, &badaddr))
1538 					return (0);
1539 				addr += val;
1540 			}
1541 		}
1542 	}
1543 
1544 	/*
1545 	 * If this is a 32-bit program, chop the address accordingly.  The
1546 	 * intermediate uintptr_t casts prevent warnings under a certain
1547 	 * compiler, and the temporary 32 bit storage is intended to force
1548 	 * proper code generation and break up what would otherwise be a
1549 	 * quadruple cast.
1550 	 */
1551 	if (curproc->p_model == DATAMODEL_ILP32) {
1552 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1553 		addr = (caddr_t)(uintptr_t)addr32;
1554 	}
1555 
1556 	*addrp = addr;
1557 	ASSERT(sz != 0);
1558 	return (sz);
1559 }
1560 
1561 /*
1562  * Fetch an instruction from user-level.
1563  * Deal with watchpoints, if they are in effect.
1564  */
1565 int32_t
1566 fetch_user_instr(caddr_t vaddr)
1567 {
1568 	proc_t *p = curproc;
1569 	int32_t instr;
1570 
1571 	/*
1572 	 * If this is a 32-bit program, chop the address accordingly.  The
1573 	 * intermediate uintptr_t casts prevent warnings under a certain
1574 	 * compiler, and the temporary 32 bit storage is intended to force
1575 	 * proper code generation and break up what would otherwise be a
1576 	 * quadruple cast.
1577 	 */
1578 	if (p->p_model == DATAMODEL_ILP32) {
1579 		caddr32_t vaddr32 = (caddr32_t)(uintptr_t)vaddr;
1580 		vaddr = (caddr_t)(uintptr_t)vaddr32;
1581 	}
1582 
1583 	if (fuword32_nowatch(vaddr, (uint32_t *)&instr) == -1)
1584 		instr = -1;
1585 
1586 	return (instr);
1587 }
1588