xref: /titanic_44/usr/src/uts/sparc/v9/os/simulator.c (revision 208e825d0597a017edee1b095c64040043c0c673)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /* common code with bug fixes from original version in trap.c */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/archsystm.h>
35 #include <sys/vmsystm.h>
36 #include <sys/fpu/fpusystm.h>
37 #include <sys/fpu/fpu_simulator.h>
38 #include <sys/inline.h>
39 #include <sys/debug.h>
40 #include <sys/privregs.h>
41 #include <sys/machpcb.h>
42 #include <sys/simulate.h>
43 #include <sys/proc.h>
44 #include <sys/cmn_err.h>
45 #include <sys/stack.h>
46 #include <sys/watchpoint.h>
47 #include <sys/trap.h>
48 #include <sys/machtrap.h>
49 #include <sys/mman.h>
50 #include <sys/asi.h>
51 #include <sys/copyops.h>
52 #include <vm/as.h>
53 #include <vm/page.h>
54 #include <sys/model.h>
55 #include <vm/seg_vn.h>
56 #include <sys/byteorder.h>
57 
/*
 * Instruction decoding helpers.  Every macro argument and every expansion
 * is parenthesized so the macros evaluate correctly when handed an
 * arbitrary expression rather than a plain identifier.
 */

/* Non-zero when bit 13 (the 'i' / immediate bit) of instruction x is set. */
#define	IS_IBIT_SET(x)	((x) & 0x2000)

/* True when op is 2 and op3 is 0x36 (treated as a VIS1 instruction). */
#define	IS_VIS1(op, op3)	((op) == 2 && (op3) == 0x36)

/* True for lddfa/stdfa (op 3) whose ASI is numerically above ASI_SNFL. */
#define	IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi)		\
		((op) == 3 && ((op3) == IOP_V8_LDDFA ||		\
		(op3) == IOP_V8_STDFA) && (asi) > ASI_SNFL)

/* When non-zero, do_unaligned() prints a trace of what it decodes/moves. */
static int aligndebug = 0;
65 
66 /*
67  * For the sake of those who must be compatible with unaligned
68  * architectures, users can link their programs to use a
69  * corrective trap handler that will fix unaligned references
70  * a special trap #6 (T_FIX_ALIGN) enables this 'feature'.
71  * Returns 1 for success, 0 for failure.
72  */
73 
74 int
75 do_unaligned(struct regs *rp, caddr_t *badaddr)
76 {
77 	uint_t	inst, op3, asi = 0;
78 	uint_t	rd, rs1, rs2;
79 	int	sz, nf = 0, ltlend = 0;
80 	int	floatflg;
81 	int	fsrflg;
82 	int	immflg;
83 	int	lddstdflg;
84 	caddr_t	addr;
85 	uint64_t val;
86 	union {
87 		uint64_t	l[2];
88 		uint32_t	i[4];
89 		uint16_t	s[8];
90 		uint8_t		c[16];
91 	} data;
92 
93 	ASSERT(USERMODE(rp->r_tstate));
94 	inst = fetch_user_instr((caddr_t)rp->r_pc);
95 
96 	op3 = (inst >> 19) & 0x3f;
97 	rd = (inst >> 25) & 0x1f;
98 	rs1 = (inst >> 14) & 0x1f;
99 	rs2 = inst & 0x1f;
100 	floatflg = (inst >> 24) & 1;
101 	immflg = (inst >> 13) & 1;
102 	lddstdflg = fsrflg = 0;
103 
104 	/* if not load or store do nothing */
105 	if ((inst >> 30) != 3)
106 		return (0);
107 
108 	/* if ldstub or swap, do nothing */
109 	if ((inst & 0xc1680000) == 0xc0680000)
110 		return (0);
111 
112 	/* if cas/casx, do nothing */
113 	if ((inst & 0xc1e00000) == 0xc1e00000)
114 		return (0);
115 
116 	if (floatflg) {
117 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
118 		case 0: sz = 4;
119 			break;			/* ldf{a}/stf{a} */
120 		case 1: fsrflg = 1;
121 			if (rd == 0)
122 				sz = 4;		/* ldfsr/stfsr */
123 			else  if (rd == 1)
124 				sz = 8;		/* ldxfsr/stxfsr */
125 			else
126 				return (SIMU_ILLEGAL);
127 			break;
128 		case 2: sz = 16;
129 			break;		/* ldqf{a}/stqf{a} */
130 		case 3: sz = 8;
131 			break;		/* lddf{a}/stdf{a} */
132 		}
133 		/*
134 		 * Fix to access extra double register encoding plus
135 		 * compensate to access the correct fpu_dreg.
136 		 */
137 		if ((sz > 4) && (fsrflg == 0)) {
138 			if ((rd & 1) == 1)
139 				rd = (rd & 0x1e) | 0x20;
140 			rd = rd >> 1;
141 			if ((sz == 16) && ((rd & 0x1) != 0))
142 				return (SIMU_ILLEGAL);
143 		}
144 	} else {
145 		int sz_bits = (inst >> 19) & 0xf;
146 		switch (sz_bits) {		/* map size bits to a number */
147 		case 0:				/* lduw{a} */
148 		case 4:				/* stw{a} */
149 		case 8:				/* ldsw{a} */
150 		case 0xf:			/* swap */
151 			sz = 4; break;
152 		case 1:				/* ldub{a} */
153 		case 5:				/* stb{a} */
154 		case 9:				/* ldsb{a} */
155 		case 0xd:			/* ldstub */
156 			sz = 1; break;
157 		case 2:				/* lduh{a} */
158 		case 6:				/* sth{a} */
159 		case 0xa:			/* ldsh{a} */
160 			sz = 2; break;
161 		case 3:				/* ldd{a} */
162 		case 7:				/* std{a} */
163 			lddstdflg = 1;
164 			sz = 8; break;
165 		case 0xb:			/* ldx{a} */
166 		case 0xe:			/* stx{a} */
167 			sz = 8; break;
168 		}
169 	}
170 
171 
172 	/* only support primary and secondary asi's */
173 	if ((op3 >> 4) & 1) {
174 		if (immflg) {
175 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
176 					TSTATE_ASI_MASK;
177 		} else {
178 			asi = (inst >> 5) & 0xff;
179 		}
180 		switch (asi) {
181 		case ASI_P:
182 		case ASI_S:
183 			break;
184 		case ASI_PNF:
185 		case ASI_SNF:
186 			nf = 1;
187 			break;
188 		case ASI_PL:
189 		case ASI_SL:
190 			ltlend = 1;
191 			break;
192 		case ASI_PNFL:
193 		case ASI_SNFL:
194 			ltlend = 1;
195 			nf = 1;
196 			break;
197 		default:
198 			return (0);
199 		}
200 		/*
201 		 * Non-faulting stores generate a data_access_exception trap,
202 		 * according to the Spitfire manual, which should be signaled
203 		 * as an illegal instruction trap, because it can't be fixed.
204 		 */
205 		if ((nf) && ((op3 == IOP_V8_STQFA) || (op3 == IOP_V8_STDFA)))
206 			return (SIMU_ILLEGAL);
207 	}
208 
209 	if (aligndebug) {
210 		printf("unaligned access at %p, instruction: 0x%x\n",
211 		    (void *)rp->r_pc, inst);
212 		printf("type %s", (((inst >> 21) & 1) ? "st" : "ld"));
213 		if (((inst >> 21) & 1) == 0)
214 		    printf(" %s", (((inst >> 22) & 1) ? "signed" : "unsigned"));
215 		printf(" asi 0x%x size %d immflg %d\n", asi, sz, immflg);
216 		printf("rd = %d, op3 = 0x%x, rs1 = %d, rs2 = %d, imm13=0x%x\n",
217 			rd, op3, rs1, rs2, (inst & 0x1fff));
218 	}
219 
220 	(void) flush_user_windows_to_stack(NULL);
221 	if (getreg(rp, rs1, &val, badaddr))
222 		return (SIMU_FAULT);
223 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
224 	if (aligndebug)
225 		printf("addr 1 = %p\n", (void *)addr);
226 
227 	/* check immediate bit and use immediate field or reg (rs2) */
228 	if (immflg) {
229 		int imm;
230 		imm  = inst & 0x1fff;		/* mask out immediate field */
231 		imm <<= 19;			/* sign extend it */
232 		imm >>= 19;
233 		addr += imm;			/* compute address */
234 	} else {
235 		if (getreg(rp, rs2, &val, badaddr))
236 			return (SIMU_FAULT);
237 		addr += val;
238 	}
239 
240 	/*
241 	 * If this is a 32-bit program, chop the address accordingly.  The
242 	 * intermediate uintptr_t casts prevent warnings under a certain
243 	 * compiler, and the temporary 32 bit storage is intended to force
244 	 * proper code generation and break up what would otherwise be a
245 	 * quadruple cast.
246 	 */
247 	if (curproc->p_model == DATAMODEL_ILP32) {
248 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
249 		addr = (caddr_t)(uintptr_t)addr32;
250 	}
251 
252 	if (aligndebug)
253 		printf("addr 2 = %p\n", (void *)addr);
254 
255 	if (addr >= curproc->p_as->a_userlimit) {
256 		*badaddr = addr;
257 		goto badret;
258 	}
259 
260 	/* a single bit differentiates ld and st */
261 	if ((inst >> 21) & 1) {			/* store */
262 		if (floatflg) {
263 			klwp_id_t lwp = ttolwp(curthread);
264 			kfpu_t *fp = lwptofpu(lwp);
265 			/* Ensure fp has been enabled */
266 			if (fpu_exists) {
267 				if (!(_fp_read_fprs() & FPRS_FEF))
268 					fp_enable();
269 			} else {
270 				if (!fp->fpu_en)
271 					fp_enable();
272 			}
273 			/* if fpu_exists read fpu reg */
274 			if (fpu_exists) {
275 				if (fsrflg) {
276 					_fp_read_pfsr(&data.l[0]);
277 				} else {
278 					if (sz == 4) {
279 						data.i[0] = 0;
280 						_fp_read_pfreg(
281 						    (unsigned *)&data.i[1], rd);
282 					}
283 					if (sz >= 8)
284 						_fp_read_pdreg(
285 							&data.l[0], rd);
286 					if (sz == 16)
287 						_fp_read_pdreg(
288 							&data.l[1], rd+1);
289 				}
290 			} else {
291 				if (fsrflg) {
292 					/* Clear reserved bits, set version=7 */
293 					fp->fpu_fsr &= ~0x30301000;
294 					fp->fpu_fsr |= 0xE0000;
295 					data.l[0] = fp->fpu_fsr;
296 				} else {
297 					if (sz == 4) {
298 						data.i[0] = 0;
299 						data.i[1] =
300 					    (unsigned)fp->fpu_fr.fpu_regs[rd];
301 					}
302 					if (sz >= 8)
303 						data.l[0] =
304 						    fp->fpu_fr.fpu_dregs[rd];
305 					if (sz == 16)
306 						data.l[1] =
307 						    fp->fpu_fr.fpu_dregs[rd+1];
308 				}
309 			}
310 		} else {
311 			if (lddstdflg) {		/* combine the data */
312 				if (getreg(rp, rd, &data.l[0], badaddr))
313 					return (SIMU_FAULT);
314 				if (getreg(rp, rd+1, &data.l[1], badaddr))
315 					return (SIMU_FAULT);
316 				if (ltlend) {
317 					/*
318 					 * For STD, each 32-bit word is byte-
319 					 * swapped individually.  For
320 					 * simplicity we don't want to do that
321 					 * below, so we swap the words now to
322 					 * get the desired result in the end.
323 					 */
324 					data.i[0] = data.i[3];
325 				} else {
326 					data.i[0] = data.i[1];
327 					data.i[1] = data.i[3];
328 				}
329 			} else {
330 				if (getreg(rp, rd, &data.l[0], badaddr))
331 					return (SIMU_FAULT);
332 			}
333 		}
334 
335 		if (aligndebug) {
336 			if (sz == 16) {
337 				printf("data %x %x %x %x\n",
338 				    data.i[0], data.i[1], data.i[2], data.c[3]);
339 			} else {
340 				printf("data %x %x %x %x %x %x %x %x\n",
341 				    data.c[0], data.c[1], data.c[2], data.c[3],
342 				    data.c[4], data.c[5], data.c[6], data.c[7]);
343 			}
344 		}
345 
346 		if (ltlend) {
347 			if (sz == 1) {
348 				if (xcopyout_little(&data.c[7], addr,
349 				    (size_t)sz) != 0)
350 					goto badret;
351 			} else if (sz == 2) {
352 				if (xcopyout_little(&data.s[3], addr,
353 				    (size_t)sz) != 0)
354 					goto badret;
355 			} else if (sz == 4) {
356 				if (xcopyout_little(&data.i[1], addr,
357 				    (size_t)sz) != 0)
358 					goto badret;
359 			} else {
360 				if (xcopyout_little(&data.l[0], addr,
361 				    (size_t)sz) != 0)
362 					goto badret;
363 			}
364 		} else {
365 			if (sz == 1) {
366 				if (copyout(&data.c[7], addr, (size_t)sz) == -1)
367 					goto badret;
368 			} else if (sz == 2) {
369 				if (copyout(&data.s[3], addr, (size_t)sz) == -1)
370 					goto badret;
371 			} else if (sz == 4) {
372 				if (copyout(&data.i[1], addr, (size_t)sz) == -1)
373 					goto badret;
374 			} else {
375 				if (copyout(&data.l[0], addr, (size_t)sz) == -1)
376 					goto badret;
377 			}
378 		}
379 	} else {				/* load */
380 		if (sz == 1) {
381 			if (ltlend) {
382 				if (xcopyin_little(addr, &data.c[7],
383 				    (size_t)sz) != 0) {
384 					if (nf)
385 						data.c[7] = 0;
386 					else
387 						goto badret;
388 				}
389 			} else {
390 				if (copyin(addr, &data.c[7],
391 				    (size_t)sz) == -1) {
392 					if (nf)
393 						data.c[7] = 0;
394 					else
395 						goto badret;
396 				}
397 			}
398 			/* if signed and the sign bit is set extend it */
399 			if (((inst >> 22) & 1) && ((data.c[7] >> 7) & 1)) {
400 				data.i[0] = (uint_t)-1;	/* extend sign bit */
401 				data.s[2] = (ushort_t)-1;
402 				data.c[6] = (uchar_t)-1;
403 			} else {
404 				data.i[0] = 0;	/* clear upper 32+24 bits */
405 				data.s[2] = 0;
406 				data.c[6] = 0;
407 			}
408 		} else if (sz == 2) {
409 			if (ltlend) {
410 				if (xcopyin_little(addr, &data.s[3],
411 				    (size_t)sz) != 0) {
412 					if (nf)
413 						data.s[3] = 0;
414 					else
415 						goto badret;
416 				}
417 			} else {
418 				if (copyin(addr, &data.s[3],
419 				    (size_t)sz) == -1) {
420 					if (nf)
421 						data.s[3] = 0;
422 					else
423 						goto badret;
424 				}
425 			}
426 			/* if signed and the sign bit is set extend it */
427 			if (((inst >> 22) & 1) && ((data.s[3] >> 15) & 1)) {
428 				data.i[0] = (uint_t)-1;	/* extend sign bit */
429 				data.s[2] = (ushort_t)-1;
430 			} else {
431 				data.i[0] = 0;	/* clear upper 32+16 bits */
432 				data.s[2] = 0;
433 			}
434 		} else if (sz == 4) {
435 			if (ltlend) {
436 				if (xcopyin_little(addr, &data.i[1],
437 				    (size_t)sz) != 0) {
438 					if (!nf)
439 						goto badret;
440 					data.i[1] = 0;
441 				}
442 			} else {
443 				if (copyin(addr, &data.i[1],
444 				    (size_t)sz) == -1) {
445 					if (!nf)
446 						goto badret;
447 					data.i[1] = 0;
448 				}
449 			}
450 			/* if signed and the sign bit is set extend it */
451 			if (((inst >> 22) & 1) && ((data.i[1] >> 31) & 1)) {
452 				data.i[0] = (uint_t)-1;	/* extend sign bit */
453 			} else {
454 				data.i[0] = 0;	/* clear upper 32 bits */
455 			}
456 		} else {
457 			if (ltlend) {
458 				if (xcopyin_little(addr, &data.l[0],
459 				    (size_t)sz) != 0) {
460 					if (!nf)
461 						goto badret;
462 					data.l[0] = 0;
463 				}
464 			} else {
465 				if (copyin(addr, &data.l[0],
466 				    (size_t)sz) == -1) {
467 					if (!nf)
468 						goto badret;
469 					data.l[0] = 0;
470 				}
471 			}
472 		}
473 
474 		if (aligndebug) {
475 			if (sz == 16) {
476 				printf("data %x %x %x %x\n",
477 				    data.i[0], data.i[1], data.i[2], data.c[3]);
478 			} else {
479 				printf("data %x %x %x %x %x %x %x %x\n",
480 				    data.c[0], data.c[1], data.c[2], data.c[3],
481 				    data.c[4], data.c[5], data.c[6], data.c[7]);
482 			}
483 		}
484 
485 		if (floatflg) {		/* if fpu_exists write fpu reg */
486 			klwp_id_t lwp = ttolwp(curthread);
487 			kfpu_t *fp = lwptofpu(lwp);
488 			/* Ensure fp has been enabled */
489 			if (fpu_exists) {
490 				if (!(_fp_read_fprs() & FPRS_FEF))
491 					fp_enable();
492 			} else {
493 				if (!fp->fpu_en)
494 					fp_enable();
495 			}
496 			/* if fpu_exists read fpu reg */
497 			if (fpu_exists) {
498 				if (fsrflg) {
499 					_fp_write_pfsr(&data.l[0]);
500 				} else {
501 					if (sz == 4)
502 						_fp_write_pfreg(
503 						    (unsigned *)&data.i[1], rd);
504 					if (sz >= 8)
505 						_fp_write_pdreg(
506 							&data.l[0], rd);
507 					if (sz == 16)
508 						_fp_write_pdreg(
509 							&data.l[1], rd+1);
510 				}
511 			} else {
512 				if (fsrflg) {
513 					fp->fpu_fsr = data.l[0];
514 				} else {
515 					if (sz == 4)
516 						fp->fpu_fr.fpu_regs[rd] =
517 							(unsigned)data.i[1];
518 					if (sz >= 8)
519 						fp->fpu_fr.fpu_dregs[rd] =
520 							data.l[0];
521 					if (sz == 16)
522 						fp->fpu_fr.fpu_dregs[rd+1] =
523 							data.l[1];
524 				}
525 			}
526 		} else {
527 			if (lddstdflg) {		/* split the data */
528 				if (ltlend) {
529 					/*
530 					 * For LDD, each 32-bit word is byte-
531 					 * swapped individually.  We didn't
532 					 * do that above, but this will give
533 					 * us the desired result.
534 					 */
535 					data.i[3] = data.i[0];
536 				} else {
537 					data.i[3] = data.i[1];
538 					data.i[1] = data.i[0];
539 				}
540 				data.i[0] = 0;
541 				data.i[2] = 0;
542 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
543 					goto badret;
544 				if (putreg(&data.l[1], rp, rd+1, badaddr) == -1)
545 					goto badret;
546 			} else {
547 				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
548 					goto badret;
549 			}
550 		}
551 	}
552 	return (SIMU_SUCCESS);
553 badret:
554 	return (SIMU_FAULT);
555 }
556 
557 
558 int
559 simulate_lddstd(struct regs *rp, caddr_t *badaddr)
560 {
561 	uint_t	inst, op3, asi = 0;
562 	uint_t	rd, rs1, rs2;
563 	int	nf = 0, ltlend = 0, usermode;
564 	int	immflg;
565 	uint64_t reven;
566 	uint64_t rodd;
567 	caddr_t	addr;
568 	uint64_t val;
569 	uint64_t data;
570 
571 	usermode = USERMODE(rp->r_tstate);
572 
573 	if (usermode)
574 		inst = fetch_user_instr((caddr_t)rp->r_pc);
575 	else
576 		inst = *(uint_t *)rp->r_pc;
577 
578 	op3 = (inst >> 19) & 0x3f;
579 	rd = (inst >> 25) & 0x1f;
580 	rs1 = (inst >> 14) & 0x1f;
581 	rs2 = inst & 0x1f;
582 	immflg = (inst >> 13) & 1;
583 
584 	if (USERMODE(rp->r_tstate))
585 		(void) flush_user_windows_to_stack(NULL);
586 	else
587 		flush_windows();
588 
589 	if ((op3 >> 4) & 1) {		/* is this LDDA/STDA? */
590 		if (immflg) {
591 			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
592 					TSTATE_ASI_MASK;
593 		} else {
594 			asi = (inst >> 5) & 0xff;
595 		}
596 		switch (asi) {
597 		case ASI_P:
598 		case ASI_S:
599 			break;
600 		case ASI_PNF:
601 		case ASI_SNF:
602 			nf = 1;
603 			break;
604 		case ASI_PL:
605 		case ASI_SL:
606 			ltlend = 1;
607 			break;
608 		case ASI_PNFL:
609 		case ASI_SNFL:
610 			ltlend = 1;
611 			nf = 1;
612 			break;
613 		case ASI_AIUP:
614 		case ASI_AIUS:
615 			usermode = 1;
616 			break;
617 		case ASI_AIUPL:
618 		case ASI_AIUSL:
619 			usermode = 1;
620 			ltlend = 1;
621 			break;
622 		default:
623 			return (SIMU_ILLEGAL);
624 		}
625 	}
626 
627 	if (getreg(rp, rs1, &val, badaddr))
628 		return (SIMU_FAULT);
629 	addr = (caddr_t)val;		/* convert to 32/64 bit address */
630 
631 	/* check immediate bit and use immediate field or reg (rs2) */
632 	if (immflg) {
633 		int imm;
634 		imm  = inst & 0x1fff;		/* mask out immediate field */
635 		imm <<= 19;			/* sign extend it */
636 		imm >>= 19;
637 		addr += imm;			/* compute address */
638 	} else {
639 		if (getreg(rp, rs2, &val, badaddr))
640 			return (SIMU_FAULT);
641 		addr += val;
642 	}
643 
644 	/*
645 	 * T_UNIMP_LDD and T_UNIMP_STD are higher priority than
646 	 * T_ALIGNMENT.  So we have to make sure that the address is
647 	 * kosher before trying to use it, because the hardware hasn't
648 	 * checked it for us yet.
649 	 */
650 	if (((uintptr_t)addr & 0x7) != 0) {
651 		if (curproc->p_fixalignment)
652 			return (do_unaligned(rp, badaddr));
653 		else
654 			return (SIMU_UNALIGN);
655 	}
656 
657 	/*
658 	 * If this is a 32-bit program, chop the address accordingly.  The
659 	 * intermediate uintptr_t casts prevent warnings under a certain
660 	 * compiler, and the temporary 32 bit storage is intended to force
661 	 * proper code generation and break up what would otherwise be a
662 	 * quadruple cast.
663 	 */
664 	if (curproc->p_model == DATAMODEL_ILP32 && usermode) {
665 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
666 		addr = (caddr_t)(uintptr_t)addr32;
667 	}
668 
669 	if ((inst >> 21) & 1) {			/* store */
670 		if (getreg(rp, rd, &reven, badaddr))
671 			return (SIMU_FAULT);
672 		if (getreg(rp, rd+1, &rodd, badaddr))
673 			return (SIMU_FAULT);
674 		if (ltlend) {
675 			reven = BSWAP_32(reven);
676 			rodd  = BSWAP_32(rodd);
677 		}
678 		data = (reven << 32) | rodd;
679 		if (usermode) {
680 			if (suword64_nowatch(addr, data) == -1)
681 				return (SIMU_FAULT);
682 		} else {
683 			*(uint64_t *)addr = data;
684 		}
685 	} else {				/* load */
686 		if (usermode) {
687 			if (fuword64_nowatch(addr, &data)) {
688 				if (nf)
689 					data = 0;
690 				else
691 					return (SIMU_FAULT);
692 			}
693 		} else
694 			data = *(uint64_t *)addr;
695 
696 		reven = (data >> 32);
697 		rodd  = (uint64_t)(uint32_t)data;
698 		if (ltlend) {
699 			reven = BSWAP_32(reven);
700 			rodd  = BSWAP_32(rodd);
701 		}
702 
703 		if (putreg(&reven, rp, rd, badaddr) == -1)
704 			return (SIMU_FAULT);
705 		if (putreg(&rodd, rp, rd+1, badaddr) == -1)
706 			return (SIMU_FAULT);
707 	}
708 	return (SIMU_SUCCESS);
709 }
710 
711 
712 /*
713  * simulate popc
714  */
715 static int
716 simulate_popc(struct regs *rp, caddr_t *badaddr, uint_t inst)
717 {
718 	uint_t	rd, rs2, rs1;
719 	uint_t	immflg;
720 	uint64_t val, cnt = 0;
721 
722 	rd = (inst >> 25) & 0x1f;
723 	rs1 = (inst >> 14) & 0x1f;
724 	rs2 = inst & 0x1f;
725 	immflg = (inst >> 13) & 1;
726 
727 	if (rs1 > 0)
728 		return (SIMU_ILLEGAL);
729 
730 	(void) flush_user_windows_to_stack(NULL);
731 
732 	/* check immediate bit and use immediate field or reg (rs2) */
733 	if (immflg) {
734 		int64_t imm;
735 		imm  = inst & 0x1fff;		/* mask out immediate field */
736 		imm <<= 51;			/* sign extend it */
737 		imm >>= 51;
738 		if (imm != 0) {
739 			for (cnt = 0; imm != 0; imm &= imm-1)
740 				cnt++;
741 		}
742 	} else {
743 		if (getreg(rp, rs2, &val, badaddr))
744 			return (SIMU_FAULT);
745 		if (val != 0) {
746 			for (cnt = 0; val != 0; val &= val-1)
747 				cnt++;
748 		}
749 	}
750 
751 	if (putreg(&cnt, rp, rd, badaddr) == -1)
752 		return (SIMU_FAULT);
753 
754 	return (SIMU_SUCCESS);
755 }
756 
757 /*
758  * simulate unimplemented instructions (popc, ldqf{a}, stqf{a})
759  */
760 int
761 simulate_unimp(struct regs *rp, caddr_t *badaddr)
762 {
763 	uint_t	inst, optype, op3, asi;
764 	uint_t	rs1, rd;
765 	uint_t	ignor, i;
766 	machpcb_t *mpcb = lwptompcb(ttolwp(curthread));
767 	int	nomatch = 0;
768 	caddr_t	addr = (caddr_t)rp->r_pc;
769 	struct as *as;
770 	caddr_t	ka;
771 	pfn_t	pfnum;
772 	page_t *pp;
773 	proc_t *p = ttoproc(curthread);
774 	struct seg *mapseg;
775 	struct segvn_data *svd;
776 
777 	ASSERT(USERMODE(rp->r_tstate));
778 	inst = fetch_user_instr(addr);
779 	if (inst == (uint_t)-1) {
780 		mpcb->mpcb_illexcaddr = addr;
781 		mpcb->mpcb_illexcinsn = (uint32_t)-1;
782 		return (SIMU_ILLEGAL);
783 	}
784 
785 	/*
786 	 * When fixing dirty v8 instructions there's a race if two processors
787 	 * are executing the dirty executable at the same time.  If one
788 	 * cleans the instruction as the other is executing it the second
789 	 * processor will see a clean instruction when it comes through this
790 	 * code and will return SIMU_ILLEGAL.  To work around the race
791 	 * this code will keep track of the last illegal instruction seen
792 	 * by each lwp and will only take action if the illegal instruction
793 	 * is repeatable.
794 	 */
795 	if (addr != mpcb->mpcb_illexcaddr ||
796 	    inst != mpcb->mpcb_illexcinsn)
797 		nomatch = 1;
798 	mpcb->mpcb_illexcaddr = addr;
799 	mpcb->mpcb_illexcinsn = inst;
800 
801 	/* instruction fields */
802 	i = (inst >> 13) & 0x1;
803 	rd = (inst >> 25) & 0x1f;
804 	optype = (inst >> 30) & 0x3;
805 	op3 = (inst >> 19) & 0x3f;
806 	ignor = (inst >> 5) & 0xff;
807 	if (IS_IBIT_SET(inst)) {
808 		asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
809 		    TSTATE_ASI_MASK);
810 	} else {
811 		asi = ignor;
812 	}
813 
814 	if (IS_VIS1(optype, op3) ||
815 	    IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi)) {
816 		klwp_t *lwp = ttolwp(curthread);
817 		kfpu_t *fp = lwptofpu(lwp);
818 		if (fpu_exists) {
819 			if (!(_fp_read_fprs() & FPRS_FEF))
820 				fp_enable();
821 			_fp_read_pfsr(&fp->fpu_fsr);
822 		} else {
823 			if (!fp->fpu_en)
824 				fp_enable();
825 		}
826 		fp_precise(rp);
827 		return (SIMU_RETRY);
828 	}
829 
830 	if (optype == 2 && op3 == IOP_V8_POPC) {
831 		return (simulate_popc(rp, badaddr, inst));
832 	} else if (optype == 3 && op3 == IOP_V8_POPC) {
833 		return (SIMU_ILLEGAL);
834 	}
835 
836 	if (optype == OP_V8_LDSTR) {
837 		if (op3 == IOP_V8_LDQF || op3 == IOP_V8_LDQFA ||
838 		    op3 == IOP_V8_STQF || op3 == IOP_V8_STQFA)
839 			return (do_unaligned(rp, badaddr));
840 	}
841 
842 	if (nomatch)
843 		return (SIMU_RETRY);
844 
845 	/*
846 	 * The rest of the code handles v8 binaries with instructions
847 	 * that have dirty (non-zero) bits in reserved or 'ignored'
848 	 * fields; these will cause core dumps on v9 machines.
849 	 *
850 	 * We only clean dirty instructions in 32-bit programs (ie, v8)
851 	 * running on SPARCv9 processors.  True v9 programs are forced
852 	 * to use the instruction set as intended.
853 	 */
854 	if (lwp_getdatamodel(curthread->t_lwp) != DATAMODEL_ILP32)
855 		return (SIMU_ILLEGAL);
856 	switch (optype) {
857 	case OP_V8_BRANCH:
858 	case OP_V8_CALL:
859 		return (SIMU_ILLEGAL);	/* these don't have ignored fields */
860 		/*NOTREACHED*/
861 	case OP_V8_ARITH:
862 		switch (op3) {
863 		case IOP_V8_RETT:
864 			if (rd == 0 && !(i == 0 && ignor))
865 				return (SIMU_ILLEGAL);
866 			if (rd)
867 				inst &= ~(0x1f << 25);
868 			if (i == 0 && ignor)
869 				inst &= ~(0xff << 5);
870 			break;
871 		case IOP_V8_TCC:
872 			if (i == 0 && ignor != 0) {
873 				inst &= ~(0xff << 5);
874 			} else if (i == 1 && (((inst >> 7) & 0x3f) != 0)) {
875 				inst &= ~(0x3f << 7);
876 			} else {
877 				return (SIMU_ILLEGAL);
878 			}
879 			break;
880 		case IOP_V8_JMPL:
881 		case IOP_V8_RESTORE:
882 		case IOP_V8_SAVE:
883 			if ((op3 == IOP_V8_RETT && rd) ||
884 			    (i == 0 && ignor)) {
885 				inst &= ~(0xff << 5);
886 			} else {
887 				return (SIMU_ILLEGAL);
888 			}
889 			break;
890 		case IOP_V8_FCMP:
891 			if (rd == 0)
892 				return (SIMU_ILLEGAL);
893 			inst &= ~(0x1f << 25);
894 			break;
895 		case IOP_V8_RDASR:
896 			rs1 = ((inst >> 14) & 0x1f);
897 			if (rs1 == 1 || (rs1 >= 7 && rs1 <= 14)) {
898 				/*
899 				 * The instruction specifies an invalid
900 				 * state register - better bail out than
901 				 * "fix" it when we're not sure what was
902 				 * intended.
903 				 */
904 				return (SIMU_ILLEGAL);
905 			}
906 				/*
907 				 * Note: this case includes the 'stbar'
908 				 * instruction (rs1 == 15 && i == 0).
909 				 */
910 				if ((ignor = (inst & 0x3fff)) != 0)
911 					inst &= ~(0x3fff);
912 			break;
913 		case IOP_V8_SRA:
914 		case IOP_V8_SRL:
915 		case IOP_V8_SLL:
916 			if (ignor == 0)
917 				return (SIMU_ILLEGAL);
918 			inst &= ~(0xff << 5);
919 			break;
920 		case IOP_V8_ADD:
921 		case IOP_V8_AND:
922 		case IOP_V8_OR:
923 		case IOP_V8_XOR:
924 		case IOP_V8_SUB:
925 		case IOP_V8_ANDN:
926 		case IOP_V8_ORN:
927 		case IOP_V8_XNOR:
928 		case IOP_V8_ADDC:
929 		case IOP_V8_UMUL:
930 		case IOP_V8_SMUL:
931 		case IOP_V8_SUBC:
932 		case IOP_V8_UDIV:
933 		case IOP_V8_SDIV:
934 		case IOP_V8_ADDcc:
935 		case IOP_V8_ANDcc:
936 		case IOP_V8_ORcc:
937 		case IOP_V8_XORcc:
938 		case IOP_V8_SUBcc:
939 		case IOP_V8_ANDNcc:
940 		case IOP_V8_ORNcc:
941 		case IOP_V8_XNORcc:
942 		case IOP_V8_ADDCcc:
943 		case IOP_V8_UMULcc:
944 		case IOP_V8_SMULcc:
945 		case IOP_V8_SUBCcc:
946 		case IOP_V8_UDIVcc:
947 		case IOP_V8_SDIVcc:
948 		case IOP_V8_TADDcc:
949 		case IOP_V8_TSUBcc:
950 		case IOP_V8_TADDccTV:
951 		case IOP_V8_TSUBccTV:
952 		case IOP_V8_MULScc:
953 		case IOP_V8_WRASR:
954 		case IOP_V8_FLUSH:
955 			if (i != 0 || ignor == 0)
956 				return (SIMU_ILLEGAL);
957 			inst &= ~(0xff << 5);
958 			break;
959 		default:
960 			return (SIMU_ILLEGAL);
961 		}
962 		break;
963 	case OP_V8_LDSTR:
964 		switch (op3) {
965 		case IOP_V8_STFSR:
966 		case IOP_V8_LDFSR:
967 			if (rd == 0 && !(i == 0 && ignor))
968 				return (SIMU_ILLEGAL);
969 			if (rd)
970 				inst &= ~(0x1f << 25);
971 			if (i == 0 && ignor)
972 				inst &= ~(0xff << 5);
973 			break;
974 		default:
975 			if (optype == OP_V8_LDSTR && !IS_LDST_ALT(op3) &&
976 			    i == 0 && ignor)
977 				inst &= ~(0xff << 5);
978 			else
979 				return (SIMU_ILLEGAL);
980 			break;
981 		}
982 		break;
983 	default:
984 		return (SIMU_ILLEGAL);
985 	}
986 
987 	as = p->p_as;
988 
989 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
990 	mapseg = as_findseg(as, (caddr_t)rp->r_pc, 0);
991 	ASSERT(mapseg != NULL);
992 	svd = (struct segvn_data *)mapseg->s_data;
993 
994 	/*
995 	 * We only create COW page for MAP_PRIVATE mappings.
996 	 */
997 	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
998 	if ((svd->type & MAP_TYPE) & MAP_SHARED) {
999 		SEGVN_LOCK_EXIT(as, &svd->lock);
1000 		AS_LOCK_EXIT(as, &as->a_lock);
1001 		return (SIMU_ILLEGAL);
1002 	}
1003 	SEGVN_LOCK_EXIT(as, &svd->lock);
1004 	AS_LOCK_EXIT(as, &as->a_lock);
1005 
1006 	/*
1007 	 * A "flush" instruction using the user PC's vaddr will not work
1008 	 * here, at least on Spitfire. Instead we create a temporary kernel
1009 	 * mapping to the user's text page, then modify and flush that.
1010 	 * Break COW by locking user page.
1011 	 */
1012 	if (as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK), PAGESIZE,
1013 	    F_SOFTLOCK, S_READ))
1014 		return (SIMU_FAULT);
1015 
1016 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1017 	pfnum = hat_getpfnum(as->a_hat, (caddr_t)rp->r_pc);
1018 	AS_LOCK_EXIT(as, &as->a_lock);
1019 	if (pf_is_memory(pfnum)) {
1020 		pp = page_numtopp_nolock(pfnum);
1021 		ASSERT(pp == NULL || PAGE_LOCKED(pp));
1022 	} else {
1023 		(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1024 		    PAGESIZE, F_SOFTUNLOCK, S_READ);
1025 		return (SIMU_FAULT);
1026 	}
1027 
1028 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029 	ka = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)rp->r_pc);
1030 	*(uint_t *)(ka + (uintptr_t)(rp->r_pc % PAGESIZE)) = inst;
1031 	doflush(ka + (uintptr_t)(rp->r_pc % PAGESIZE));
1032 	ppmapout(ka);
1033 	AS_LOCK_EXIT(as, &as->a_lock);
1034 
1035 	(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
1036 	    PAGESIZE, F_SOFTUNLOCK, S_READ);
1037 	return (SIMU_RETRY);
1038 }
1039 
1040 /*
1041  * Get the value of a register for instruction simulation
1042  * by using the regs or window structure pointers.
1043  * Return 0 for success, and -1 for failure.  If there is a failure,
1044  * save the faulting address using badaddr pointer.
1045  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1046  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1047  */
1048 int
1049 getreg(struct regs *rp, uint_t reg, uint64_t *val, caddr_t *badaddr)
1050 {
1051 	uint64_t *rgs, *sp;
1052 	int rv = 0;
1053 
1054 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1055 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1056 	if (reg == 0) {
1057 		*val = 0;
1058 	} else if (reg < 16) {
1059 		*val = rgs[reg];
1060 	} else if (IS_V9STACK(sp)) {
1061 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1062 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1063 		uint64_t res;
1064 
1065 		if (USERMODE(rp->r_tstate)) {
1066 			if (fuword64_nowatch(addr, &res) == -1) {
1067 				*badaddr = (caddr_t)addr;
1068 				rv = -1;
1069 			}
1070 		} else {
1071 			res = *addr;
1072 		}
1073 		*val = res;
1074 	} else {
1075 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1076 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1077 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1078 		uint32_t res;
1079 
1080 		if (USERMODE(rp->r_tstate)) {
1081 			if (fuword32_nowatch(addr, &res) == -1) {
1082 				*badaddr = (caddr_t)addr;
1083 				rv = -1;
1084 			}
1085 		} else {
1086 			res = *addr;
1087 		}
1088 		*val = (uint64_t)res;
1089 	}
1090 	return (rv);
1091 }
1092 
1093 /*
1094  * Set the value of a register after instruction simulation
1095  * by using the regs or window structure pointers.
1096  * Return 0 for succes -1 failure.
1097  * save the faulting address using badaddr pointer.
1098  * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
1099  * Don't truncate globals/outs for 32 bit programs, for v8+ support.
1100  */
1101 int
1102 putreg(uint64_t	*data, struct regs *rp, uint_t reg, caddr_t *badaddr)
1103 {
1104 	uint64_t *rgs, *sp;
1105 	int rv = 0;
1106 
1107 	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
1108 	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
1109 	if (reg == 0) {
1110 		return (0);
1111 	} else if (reg < 16) {
1112 		rgs[reg] = *data;
1113 	} else if (IS_V9STACK(sp)) {
1114 		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
1115 		uint64_t *addr = (uint64_t *)&rw[reg - 16];
1116 		uint64_t res;
1117 
1118 		if (USERMODE(rp->r_tstate)) {
1119 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1120 
1121 			res = *data;
1122 			if (suword64_nowatch(addr, res) != 0) {
1123 				*badaddr = (caddr_t)addr;
1124 				rv = -1;
1125 			}
1126 			/*
1127 			 * We have changed a local or in register;
1128 			 * nuke the watchpoint return windows.
1129 			 */
1130 			mpcb->mpcb_rsp[0] = NULL;
1131 			mpcb->mpcb_rsp[1] = NULL;
1132 		} else {
1133 			res = *data;
1134 			*addr = res;
1135 		}
1136 	} else {
1137 		caddr32_t sp32 = (caddr32_t)(uintptr_t)sp;
1138 		uint32_t *rw = (uint32_t *)(uintptr_t)sp32;
1139 		uint32_t *addr = (uint32_t *)&rw[reg - 16];
1140 		uint32_t res;
1141 
1142 		if (USERMODE(rp->r_tstate)) {
1143 			struct machpcb *mpcb = lwptompcb(curthread->t_lwp);
1144 
1145 			res = (uint_t)*data;
1146 			if (suword32_nowatch(addr, res) != 0) {
1147 				*badaddr = (caddr_t)addr;
1148 				rv = -1;
1149 			}
1150 			/*
1151 			 * We have changed a local or in register;
1152 			 * nuke the watchpoint return windows.
1153 			 */
1154 			mpcb->mpcb_rsp[0] = NULL;
1155 			mpcb->mpcb_rsp[1] = NULL;
1156 
1157 		} else {
1158 			res = (uint_t)*data;
1159 			*addr = res;
1160 		}
1161 	}
1162 	return (rv);
1163 }
1164 
1165 /*
1166  * Calculate a memory reference address from instruction
1167  * operands, used to return the address of a fault, instead
1168  * of the instruction when an error occurs.  This is code that is
1169  * common with most of the routines that simulate instructions.
1170  */
1171 int
1172 calc_memaddr(struct regs *rp, caddr_t *badaddr)
1173 {
1174 	uint_t	inst;
1175 	uint_t	rd, rs1, rs2;
1176 	int	sz;
1177 	int	immflg;
1178 	int	floatflg;
1179 	caddr_t  addr;
1180 	uint64_t val;
1181 
1182 	if (USERMODE(rp->r_tstate))
1183 		inst = fetch_user_instr((caddr_t)rp->r_pc);
1184 	else
1185 		inst = *(uint_t *)rp->r_pc;
1186 
1187 	rd = (inst >> 25) & 0x1f;
1188 	rs1 = (inst >> 14) & 0x1f;
1189 	rs2 = inst & 0x1f;
1190 	floatflg = (inst >> 24) & 1;
1191 	immflg = (inst >> 13) & 1;
1192 
1193 	if (floatflg) {
1194 		switch ((inst >> 19) & 3) {	/* map size bits to a number */
1195 		case 0: sz = 4; break;		/* ldf/stf */
1196 		case 1: return (0);		/* ld[x]fsr/st[x]fsr */
1197 		case 2: sz = 16; break;		/* ldqf/stqf */
1198 		case 3: sz = 8; break;		/* lddf/stdf */
1199 		}
1200 		/*
1201 		 * Fix to access extra double register encoding plus
1202 		 * compensate to access the correct fpu_dreg.
1203 		 */
1204 		if (sz > 4) {
1205 			if ((rd & 1) == 1)
1206 				rd = (rd & 0x1e) | 0x20;
1207 			rd = rd >> 1;
1208 		}
1209 	} else {
1210 		switch ((inst >> 19) & 0xf) {	/* map size bits to a number */
1211 		case 0:				/* lduw */
1212 		case 4:				/* stw */
1213 		case 8:				/* ldsw */
1214 		case 0xf:			/* swap */
1215 			sz = 4; break;
1216 		case 1:				/* ldub */
1217 		case 5:				/* stb */
1218 		case 9:				/* ldsb */
1219 		case 0xd:			/* ldstub */
1220 			sz = 1; break;
1221 		case 2:				/* lduh */
1222 		case 6:				/* sth */
1223 		case 0xa:			/* ldsh */
1224 			sz = 2; break;
1225 		case 3:				/* ldd */
1226 		case 7:				/* std */
1227 		case 0xb:			/* ldx */
1228 		case 0xe:			/* stx */
1229 			sz = 8; break;
1230 		}
1231 	}
1232 
1233 	if (USERMODE(rp->r_tstate))
1234 		(void) flush_user_windows_to_stack(NULL);
1235 	else
1236 		flush_windows();
1237 
1238 	if (getreg(rp, rs1, &val, badaddr))
1239 		return (SIMU_FAULT);
1240 	addr = (caddr_t)val;
1241 
1242 	/* check immediate bit and use immediate field or reg (rs2) */
1243 	if (immflg) {
1244 		int imm;
1245 		imm = inst & 0x1fff;		/* mask out immediate field */
1246 		imm <<= 19;			/* sign extend it */
1247 		imm >>= 19;
1248 		addr += imm;			/* compute address */
1249 	} else {
1250 		if (getreg(rp, rs2, &val, badaddr))
1251 			return (SIMU_FAULT);
1252 		addr += val;
1253 	}
1254 
1255 	/*
1256 	 * If this is a 32-bit program, chop the address accordingly.  The
1257 	 * intermediate uintptr_t casts prevent warnings under a certain
1258 	 * compiler, and the temporary 32 bit storage is intended to force
1259 	 * proper code generation and break up what would otherwise be a
1260 	 * quadruple cast.
1261 	 */
1262 	if (curproc->p_model == DATAMODEL_ILP32 && USERMODE(rp->r_tstate)) {
1263 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1264 		addr = (caddr_t)(uintptr_t)addr32;
1265 	}
1266 
1267 	*badaddr = addr;
1268 	return ((uintptr_t)addr & (sz - 1) ? SIMU_UNALIGN : SIMU_SUCCESS);
1269 }
1270 
1271 /*
1272  * Return the size of a load or store instruction (1, 2, 4, 8, 16, 64).
1273  * Also compute the precise address by instruction disassembly.
1274  * (v9 page faults only provide the page address via the hardware.)
1275  * Return 0 on failure (not a load or store instruction).
1276  */
1277 int
1278 instr_size(struct regs *rp, caddr_t *addrp, enum seg_rw rdwr)
1279 {
1280 	uint_t	inst, op3, asi;
1281 	uint_t	rd, rs1, rs2;
1282 	int	sz = 0;
1283 	int	immflg;
1284 	int	floatflg;
1285 	caddr_t	addr;
1286 	caddr_t badaddr;
1287 	uint64_t val;
1288 
1289 	if (rdwr == S_EXEC) {
1290 		*addrp = (caddr_t)rp->r_pc;
1291 		return (4);
1292 	}
1293 
1294 	/*
1295 	 * Fetch the instruction from user-level.
1296 	 * We would like to assert this:
1297 	 *   ASSERT(USERMODE(rp->r_tstate));
1298 	 * but we can't because we can reach this point from a
1299 	 * register window underflow/overflow and the v9 wbuf
1300 	 * traps call trap() with T_USER even though r_tstate
1301 	 * indicates a system trap, not a user trap.
1302 	 */
1303 	inst = fetch_user_instr((caddr_t)rp->r_pc);
1304 
1305 	op3 = (inst >> 19) & 0x3f;
1306 	rd = (inst >> 25) & 0x1f;
1307 	rs1 = (inst >> 14) & 0x1f;
1308 	rs2 = inst & 0x1f;
1309 	floatflg = (inst >> 24) & 1;
1310 	immflg = (inst >> 13) & 1;
1311 
1312 	/* if not load or store do nothing.  can't happen? */
1313 	if ((inst >> 30) != 3)
1314 		return (0);
1315 
1316 	if (immflg)
1317 		asi = (uint_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
1318 				TSTATE_ASI_MASK);
1319 	else
1320 		asi = (inst >> 5) & 0xff;
1321 
1322 	if (floatflg) {
1323 		/* check for ld/st alternate and highest defined V9 asi */
1324 		if ((op3 & 0x30) == 0x30 && asi > ASI_SNFL) {
1325 			sz = extended_asi_size(asi);
1326 		} else {
1327 			switch (op3 & 3) {
1328 			case 0:
1329 				sz = 4;			/* ldf/stf/cas */
1330 				break;
1331 			case 1:
1332 				if (rd == 0)
1333 					sz = 4;		/* ldfsr/stfsr */
1334 				else
1335 					sz = 8;		/* ldxfsr/stxfsr */
1336 				break;
1337 			case 2:
1338 				if (op3 == 0x3e)
1339 					sz = 8;		/* casx */
1340 				else
1341 					sz = 16;	/* ldqf/stqf */
1342 				break;
1343 			case 3:
1344 				sz = 8;			/* lddf/stdf */
1345 				break;
1346 			}
1347 		}
1348 	} else {
1349 		switch (op3 & 0xf) {		/* map size bits to a number */
1350 		case 0:				/* lduw */
1351 		case 4:				/* stw */
1352 		case 8:				/* ldsw */
1353 		case 0xf:			/* swap */
1354 			sz = 4; break;
1355 		case 1:				/* ldub */
1356 		case 5:				/* stb */
1357 		case 9:				/* ldsb */
1358 		case 0xd:			/* ldstub */
1359 			sz = 1; break;
1360 		case 2:				/* lduh */
1361 		case 6:				/* sth */
1362 		case 0xa:			/* ldsh */
1363 			sz = 2; break;
1364 		case 3:				/* ldd */
1365 		case 7:				/* std */
1366 		case 0xb:			/* ldx */
1367 		case 0xe:			/* stx */
1368 			sz = 8; break;
1369 		}
1370 	}
1371 
1372 	if (sz == 0)	/* can't happen? */
1373 		return (0);
1374 	(void) flush_user_windows_to_stack(NULL);
1375 
1376 	if (getreg(rp, rs1, &val, &badaddr))
1377 		return (0);
1378 	addr = (caddr_t)val;
1379 
1380 	/* cas/casx don't use rs2 / simm13 to compute the address */
1381 	if ((op3 & 0x3d) != 0x3c) {
1382 		/* check immediate bit and use immediate field or reg (rs2) */
1383 		if (immflg) {
1384 			int imm;
1385 			imm  = inst & 0x1fff;	/* mask out immediate field */
1386 			imm <<= 19;		/* sign extend it */
1387 			imm >>= 19;
1388 			addr += imm;		/* compute address */
1389 		} else {
1390 			/*
1391 			 * asi's in the 0xCx range are partial store
1392 			 * instructions.  For these, rs2 is a mask, not part of
1393 			 * the address.
1394 			 */
1395 			if (!(floatflg && (asi & 0xf0) == 0xc0)) {
1396 				if (getreg(rp, rs2, &val, &badaddr))
1397 					return (0);
1398 				addr += val;
1399 			}
1400 		}
1401 	}
1402 
1403 	/*
1404 	 * If this is a 32-bit program, chop the address accordingly.  The
1405 	 * intermediate uintptr_t casts prevent warnings under a certain
1406 	 * compiler, and the temporary 32 bit storage is intended to force
1407 	 * proper code generation and break up what would otherwise be a
1408 	 * quadruple cast.
1409 	 */
1410 	if (curproc->p_model == DATAMODEL_ILP32) {
1411 		caddr32_t addr32 = (caddr32_t)(uintptr_t)addr;
1412 		addr = (caddr_t)(uintptr_t)addr32;
1413 	}
1414 
1415 	*addrp = addr;
1416 	ASSERT(sz != 0);
1417 	return (sz);
1418 }
1419 
1420 /*
1421  * Fetch an instruction from user-level.
1422  * Deal with watchpoints, if they are in effect.
1423  */
1424 int32_t
1425 fetch_user_instr(caddr_t vaddr)
1426 {
1427 	proc_t *p = curproc;
1428 	int32_t instr;
1429 
1430 	/*
1431 	 * If this is a 32-bit program, chop the address accordingly.  The
1432 	 * intermediate uintptr_t casts prevent warnings under a certain
1433 	 * compiler, and the temporary 32 bit storage is intended to force
1434 	 * proper code generation and break up what would otherwise be a
1435 	 * quadruple cast.
1436 	 */
1437 	if (p->p_model == DATAMODEL_ILP32) {
1438 		caddr32_t vaddr32 = (caddr32_t)(uintptr_t)vaddr;
1439 		vaddr = (caddr_t)(uintptr_t)vaddr32;
1440 	}
1441 
1442 	if (fuword32_nowatch(vaddr, (uint32_t *)&instr) == -1)
1443 		instr = -1;
1444 
1445 	return (instr);
1446 }
1447