xref: /illumos-gate/usr/src/lib/libm/common/m9x/__fex_sse.c (revision 7d0b359ca572cd04474eb1f2ceec5a8ff39e36c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <ucontext.h>
31 #include <fenv.h>
32 #if defined(__SUNPRO_C)
33 #include <sunmath.h>
34 #else
35 #include <sys/ieeefp.h>
36 #endif
37 #include "fex_handler.h"
38 #include "fenv_inlines.h"
39 
/*
 * Fall back to the 32-bit register index names when the system headers
 * have not already defined the generic REG_PC and REG_PS aliases.
 */
#if !defined(REG_PC)
#define REG_PC	EIP
#endif

#if !defined(REG_PS)
#define REG_PS	EFL
#endif

/*
 * regno(X) converts a general purpose register number X, as it appears
 * in an instruction encoding, into an index into uc_mcontext.gregs[].
 *
 * The argument is parenthesized everywhere it is used so that callers
 * may pass an expression (e.g. regno(hi | lo)) without operator
 * precedence changing the result.  X is still evaluated more than once
 * in the amd64 form, so it must not have side effects.
 */
#ifdef __amd64
#define regno(X)	(((X) < 4)? REG_RAX - (X) : \
			(((X) > 4)? REG_RAX + 1 - (X) : REG_RSP))
#else
#define regno(X)	(EAX - (X))
#endif
54 
55 /*
56  * Support for SSE instructions
57  */
58 
59 /*
60  * Decode an SSE instruction.  Fill in *inst and return the length of the
61  * instruction in bytes.  Return 0 if the instruction is not recognized.
62  */
63 int
64 __fex_parse_sse(ucontext_t *uap, sseinst_t *inst)
65 {
66 	unsigned char	*ip;
67 	char		*addr;
68 	int		i, dbl, simd, rex, modrm, sib, r;
69 
70 	i = 0;
71 	ip = (unsigned char *)uap->uc_mcontext.gregs[REG_PC];
72 
73 	/* look for pseudo-prefixes */
74 	dbl = 0;
75 	simd = SIMD;
76 	if (ip[i] == 0xF3) {
77 		simd = 0;
78 		i++;
79 	} else if (ip[i] == 0x66) {
80 		dbl = DOUBLE;
81 		i++;
82 	} else if (ip[i] == 0xF2) {
83 		dbl = DOUBLE;
84 		simd = 0;
85 		i++;
86 	}
87 
88 	/* look for AMD64 REX prefix */
89 	rex = 0;
90 	if (ip[i] >= 0x40 && ip[i] <= 0x4F) {
91 		rex = ip[i];
92 		i++;
93 	}
94 
95 	/* parse opcode */
96 	if (ip[i++] != 0x0F)
97 		return 0;
98 	switch (ip[i++]) {
99 	case 0x2A:
100 		inst->op = (int)cvtsi2ss + simd + dbl;
101 		if (!simd)
102 			inst->op = (int)inst->op + (rex & 8);
103 		break;
104 
105 	case 0x2C:
106 		inst->op = (int)cvttss2si + simd + dbl;
107 		if (!simd)
108 			inst->op = (int)inst->op + (rex & 8);
109 		break;
110 
111 	case 0x2D:
112 		inst->op = (int)cvtss2si + simd + dbl;
113 		if (!simd)
114 			inst->op = (int)inst->op + (rex & 8);
115 		break;
116 
117 	case 0x2E:
118 		/* oddball: scalar instruction in a SIMD opcode group */
119 		if (!simd)
120 			return 0;
121 		inst->op = (int)ucomiss + dbl;
122 		break;
123 
124 	case 0x2F:
125 		/* oddball: scalar instruction in a SIMD opcode group */
126 		if (!simd)
127 			return 0;
128 		inst->op = (int)comiss + dbl;
129 		break;
130 
131 	case 0x51:
132 		inst->op = (int)sqrtss + simd + dbl;
133 		break;
134 
135 	case 0x58:
136 		inst->op = (int)addss + simd + dbl;
137 		break;
138 
139 	case 0x59:
140 		inst->op = (int)mulss + simd + dbl;
141 		break;
142 
143 	case 0x5A:
144 		inst->op = (int)cvtss2sd + simd + dbl;
145 		break;
146 
147 	case 0x5B:
148 		if (dbl) {
149 			if (simd)
150 				inst->op = cvtps2dq;
151 			else
152 				return 0;
153 		} else {
154 			inst->op = (simd)? cvtdq2ps : cvttps2dq;
155 		}
156 		break;
157 
158 	case 0x5C:
159 		inst->op = (int)subss + simd + dbl;
160 		break;
161 
162 	case 0x5D:
163 		inst->op = (int)minss + simd + dbl;
164 		break;
165 
166 	case 0x5E:
167 		inst->op = (int)divss + simd + dbl;
168 		break;
169 
170 	case 0x5F:
171 		inst->op = (int)maxss + simd + dbl;
172 		break;
173 
174 	case 0xC2:
175 		inst->op = (int)cmpss + simd + dbl;
176 		break;
177 
178 	case 0xE6:
179 		if (simd) {
180 			if (dbl)
181 				inst->op = cvttpd2dq;
182 			else
183 				return 0;
184 		} else {
185 			inst->op = (dbl)? cvtpd2dq : cvtdq2pd;
186 		}
187 		break;
188 
189 	default:
190 		return 0;
191 	}
192 
193 	/* locate operands */
194 	modrm = ip[i++];
195 
196 	if (inst->op == cvtss2si || inst->op == cvttss2si ||
197 	    inst->op == cvtsd2si || inst->op == cvttsd2si ||
198 	    inst->op == cvtss2siq || inst->op == cvttss2siq ||
199 	    inst->op == cvtsd2siq || inst->op == cvttsd2siq) {
200 		/* op1 is a gp register */
201 		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
202 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.gregs[regno(r)];
203 	} else if (inst->op == cvtps2pi || inst->op == cvttps2pi ||
204 	    inst->op == cvtpd2pi || inst->op == cvttpd2pi) {
205 		/* op1 is a mmx register */
206 #ifdef __amd64
207 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.fp_reg_set.
208 		    fpchip_state.st[(modrm >> 3) & 7];
209 #else
210 		inst->op1 = (sseoperand_t *)(10 * ((modrm >> 3) & 7) +
211 		    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
212 		    fpchip_state.state[7]);
213 #endif
214 	} else {
215 		/* op1 is a xmm register */
216 		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
217 		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
218 		    fp_reg_set.fpchip_state.xmm[r];
219 	}
220 
221 	if ((modrm >> 6) == 3) {
222 		if (inst->op == cvtsi2ss || inst->op == cvtsi2sd ||
223 		    inst->op == cvtsi2ssq || inst->op == cvtsi2sdq) {
224 			/* op2 is a gp register */
225 			r = ((rex & 1) << 3) | (modrm & 7);
226 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.
227 			    gregs[regno(r)];
228 		} else if (inst->op == cvtpi2ps || inst->op == cvtpi2pd) {
229 			/* op2 is a mmx register */
230 #ifdef __amd64
231 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
232 			    fp_reg_set.fpchip_state.st[modrm & 7];
233 #else
234 			inst->op2 = (sseoperand_t *)(10 * (modrm & 7) +
235 			    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
236 			    fpchip_state.state[7]);
237 #endif
238 		} else {
239 			/* op2 is a xmm register */
240 			r = ((rex & 1) << 3) | (modrm & 7);
241 			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
242 			    fp_reg_set.fpchip_state.xmm[r];
243 		}
244 	} else if ((modrm & 0xc7) == 0x05) {
245 #ifdef __amd64
246 		/* address of next instruction + offset */
247 		r = i + 4;
248 		if (inst->op == cmpss || inst->op == cmpps ||
249 		    inst->op == cmpsd || inst->op == cmppd)
250 			r++;
251 		inst->op2 = (sseoperand_t *)(ip + r + *(int *)(ip + i));
252 #else
253 		/* absolute address */
254 		inst->op2 = (sseoperand_t *)(*(int *)(ip + i));
255 #endif
256 		i += 4;
257 	} else {
258 		/* complex address */
259 		if ((modrm & 7) == 4) {
260 			/* parse sib byte */
261 			sib = ip[i++];
262 			if ((sib & 7) == 5 && (modrm >> 6) == 0) {
263 				/* start with absolute address */
264 				addr = (char *)(uintptr_t)(*(int *)(ip + i));
265 				i += 4;
266 			} else {
267 				/* start with base */
268 				r = ((rex & 1) << 3) | (sib & 7);
269 				addr = (char *)uap->uc_mcontext.gregs[regno(r)];
270 			}
271 			r = ((rex & 2) << 2) | ((sib >> 3) & 7);
272 			if (r != 4) {
273 				/* add scaled index */
274 				addr += uap->uc_mcontext.gregs[regno(r)]
275 				    << (sib >> 6);
276 			}
277 		} else {
278 			r = ((rex & 1) << 3) | (modrm & 7);
279 			addr = (char *)uap->uc_mcontext.gregs[regno(r)];
280 		}
281 
282 		/* add displacement, if any */
283 		if ((modrm >> 6) == 1) {
284 			addr += (char)ip[i++];
285 		} else if ((modrm >> 6) == 2) {
286 			addr += *(int *)(ip + i);
287 			i += 4;
288 		}
289 		inst->op2 = (sseoperand_t *)addr;
290 	}
291 
292 	if (inst->op == cmpss || inst->op == cmpps || inst->op == cmpsd ||
293 	    inst->op == cmppd) {
294 		/* get the immediate operand */
295 		inst->imm = ip[i++];
296 	}
297 
298 	return i;
299 }
300 
301 static enum fp_class_type
302 my_fp_classf(float *x)
303 {
304 	int	i = *(int *)x & ~0x80000000;
305 
306 	if (i < 0x7f800000) {
307 		if (i < 0x00800000)
308 			return ((i == 0)? fp_zero : fp_subnormal);
309 		return fp_normal;
310 	}
311 	else if (i == 0x7f800000)
312 		return fp_infinity;
313 	else if (i & 0x400000)
314 		return fp_quiet;
315 	else
316 		return fp_signaling;
317 }
318 
319 static enum fp_class_type
320 my_fp_class(double *x)
321 {
322 	int	i = *(1+(int *)x) & ~0x80000000;
323 
324 	if (i < 0x7ff00000) {
325 		if (i < 0x00100000)
326 			return (((i | *(int *)x) == 0)? fp_zero : fp_subnormal);
327 		return fp_normal;
328 	}
329 	else if (i == 0x7ff00000 && *(int *)x == 0)
330 		return fp_infinity;
331 	else if (i & 0x80000)
332 		return fp_quiet;
333 	else
334 		return fp_signaling;
335 }
336 
337 /*
338  * Inspect a scalar SSE instruction that incurred an invalid operation
339  * exception to determine which type of exception it was.
340  */
341 static enum fex_exception
342 __fex_get_sse_invalid_type(sseinst_t *inst)
343 {
344 	enum fp_class_type	t1, t2;
345 
346 	/* check op2 for signaling nan */
347 	t2 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op2->d[0]) :
348 	    my_fp_classf(&inst->op2->f[0]);
349 	if (t2 == fp_signaling)
350 		return fex_inv_snan;
351 
352 	/* eliminate all single-operand instructions */
353 	switch (inst->op) {
354 	case cvtsd2ss:
355 	case cvtss2sd:
356 		/* hmm, this shouldn't have happened */
357 		return (enum fex_exception) -1;
358 
359 	case sqrtss:
360 	case sqrtsd:
361 		return fex_inv_sqrt;
362 
363 	case cvtss2si:
364 	case cvtsd2si:
365 	case cvttss2si:
366 	case cvttsd2si:
367 	case cvtss2siq:
368 	case cvtsd2siq:
369 	case cvttss2siq:
370 	case cvttsd2siq:
371 		return fex_inv_int;
372 	default:
373 		break;
374 	}
375 
376 	/* check op1 for signaling nan */
377 	t1 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op1->d[0]) :
378 	    my_fp_classf(&inst->op1->f[0]);
379 	if (t1 == fp_signaling)
380 		return fex_inv_snan;
381 
382 	/* check two-operand instructions for other cases */
383 	switch (inst->op) {
384 	case cmpss:
385 	case cmpsd:
386 	case minss:
387 	case minsd:
388 	case maxss:
389 	case maxsd:
390 	case comiss:
391 	case comisd:
392 		return fex_inv_cmp;
393 
394 	case addss:
395 	case addsd:
396 	case subss:
397 	case subsd:
398 		if (t1 == fp_infinity && t2 == fp_infinity)
399 			return fex_inv_isi;
400 		break;
401 
402 	case mulss:
403 	case mulsd:
404 		if ((t1 == fp_zero && t2 == fp_infinity) ||
405 		    (t2 == fp_zero && t1 == fp_infinity))
406 			return fex_inv_zmi;
407 		break;
408 
409 	case divss:
410 	case divsd:
411 		if (t1 == fp_zero && t2 == fp_zero)
412 			return fex_inv_zdz;
413 		if (t1 == fp_infinity && t2 == fp_infinity)
414 			return fex_inv_idi;
415 	default:
416 		break;
417 	}
418 
419 	return (enum fex_exception)-1;
420 }
421 
/*
 * inline templates
 *
 * The definitions are not in this file.  As they are called here, the
 * leading pointers are the operand(s) and the last pointer receives the
 * result; the comis/ucomis routines take only the two operands.
 */

/* single precision */
extern void sse_cmpeqss(float *x, float *y, int *res);
extern void sse_cmpltss(float *x, float *y, int *res);
extern void sse_cmpless(float *x, float *y, int *res);
extern void sse_cmpunordss(float *x, float *y, int *res);
extern void sse_minss(float *x, float *y, float *res);
extern void sse_maxss(float *x, float *y, float *res);
extern void sse_addss(float *x, float *y, float *res);
extern void sse_subss(float *x, float *y, float *res);
extern void sse_mulss(float *x, float *y, float *res);
extern void sse_divss(float *x, float *y, float *res);
extern void sse_sqrtss(float *x, float *res);
extern void sse_ucomiss(float *x, float *y);
extern void sse_comiss(float *x, float *y);
extern void sse_cvtss2sd(float *x, double *res);
extern void sse_cvtsi2ss(int *x, float *res);
extern void sse_cvttss2si(float *x, int *res);
extern void sse_cvtss2si(float *x, int *res);
#ifdef __amd64
extern void sse_cvtsi2ssq(long long *x, float *res);
extern void sse_cvttss2siq(float *x, long long *res);
extern void sse_cvtss2siq(float *x, long long *res);
#endif

/* double precision */
extern void sse_cmpeqsd(double *x, double *y, long long *res);
extern void sse_cmpltsd(double *x, double *y, long long *res);
extern void sse_cmplesd(double *x, double *y, long long *res);
extern void sse_cmpunordsd(double *x, double *y, long long *res);
extern void sse_minsd(double *x, double *y, double *res);
extern void sse_maxsd(double *x, double *y, double *res);
extern void sse_addsd(double *x, double *y, double *res);
extern void sse_subsd(double *x, double *y, double *res);
extern void sse_mulsd(double *x, double *y, double *res);
extern void sse_divsd(double *x, double *y, double *res);
extern void sse_sqrtsd(double *x, double *res);
extern void sse_ucomisd(double *x, double *y);
extern void sse_comisd(double *x, double *y);
extern void sse_cvtsd2ss(double *x, float *res);
extern void sse_cvtsi2sd(int *x, double *res);
extern void sse_cvttsd2si(double *x, int *res);
extern void sse_cvtsd2si(double *x, int *res);
#ifdef __amd64
extern void sse_cvtsi2sdq(long long *x, double *res);
extern void sse_cvttsd2siq(double *x, long long *res);
extern void sse_cvtsd2siq(double *x, long long *res);
#endif
467 
468 /*
469  * Fill in *info with the operands, default untrapped result, and
470  * flags produced by a scalar SSE instruction, and return the type
471  * of trapped exception (if any).  On entry, the mxcsr must have
472  * all exceptions masked and all flags clear.  The same conditions
473  * will hold on exit.
474  *
475  * This routine does not work if the instruction specified by *inst
476  * is not a scalar instruction.
477  */
478 enum fex_exception
479 __fex_get_sse_op(ucontext_t *uap, sseinst_t *inst, fex_info_t *info)
480 {
481 	unsigned int	e, te, mxcsr, oldmxcsr, subnorm;
482 
483 	/*
484 	 * Perform the operation with traps disabled and check the
485 	 * exception flags.  If the underflow trap was enabled, also
486 	 * check for an exact subnormal result.
487 	 */
488 	__fenv_getmxcsr(&oldmxcsr);
489 	subnorm = 0;
490 	if ((int)inst->op & DOUBLE) {
491 		if (inst->op == cvtsi2sd) {
492 			info->op1.type = fex_int;
493 			info->op1.val.i = inst->op2->i[0];
494 			info->op2.type = fex_nodata;
495 		} else if (inst->op == cvtsi2sdq) {
496 			info->op1.type = fex_llong;
497 			info->op1.val.l = inst->op2->l[0];
498 			info->op2.type = fex_nodata;
499 		} else if (inst->op == sqrtsd || inst->op == cvtsd2ss ||
500 		    inst->op == cvttsd2si || inst->op == cvtsd2si ||
501 		    inst->op == cvttsd2siq || inst->op == cvtsd2siq) {
502 			info->op1.type = fex_double;
503 			info->op1.val.d = inst->op2->d[0];
504 			info->op2.type = fex_nodata;
505 		} else {
506 			info->op1.type = fex_double;
507 			info->op1.val.d = inst->op1->d[0];
508 			info->op2.type = fex_double;
509 			info->op2.val.d = inst->op2->d[0];
510 		}
511 		info->res.type = fex_double;
512 		switch (inst->op) {
513 		case cmpsd:
514 			info->op = fex_cmp;
515 			info->res.type = fex_llong;
516 			switch (inst->imm & 3) {
517 			case 0:
518 				sse_cmpeqsd(&info->op1.val.d, &info->op2.val.d,
519 				    &info->res.val.l);
520 				break;
521 
522 			case 1:
523 				sse_cmpltsd(&info->op1.val.d, &info->op2.val.d,
524 				    &info->res.val.l);
525 				break;
526 
527 			case 2:
528 				sse_cmplesd(&info->op1.val.d, &info->op2.val.d,
529 				    &info->res.val.l);
530 				break;
531 
532 			case 3:
533 				sse_cmpunordsd(&info->op1.val.d,
534 				    &info->op2.val.d, &info->res.val.l);
535 			}
536 			if (inst->imm & 4)
537 				info->res.val.l ^= 0xffffffffffffffffull;
538 			break;
539 
540 		case minsd:
541 			info->op = fex_other;
542 			sse_minsd(&info->op1.val.d, &info->op2.val.d,
543 			    &info->res.val.d);
544 			break;
545 
546 		case maxsd:
547 			info->op = fex_other;
548 			sse_maxsd(&info->op1.val.d, &info->op2.val.d,
549 			    &info->res.val.d);
550 			break;
551 
552 		case addsd:
553 			info->op = fex_add;
554 			sse_addsd(&info->op1.val.d, &info->op2.val.d,
555 			    &info->res.val.d);
556 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
557 				subnorm = 1;
558 			break;
559 
560 		case subsd:
561 			info->op = fex_sub;
562 			sse_subsd(&info->op1.val.d, &info->op2.val.d,
563 			    &info->res.val.d);
564 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
565 				subnorm = 1;
566 			break;
567 
568 		case mulsd:
569 			info->op = fex_mul;
570 			sse_mulsd(&info->op1.val.d, &info->op2.val.d,
571 			    &info->res.val.d);
572 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
573 				subnorm = 1;
574 			break;
575 
576 		case divsd:
577 			info->op = fex_div;
578 			sse_divsd(&info->op1.val.d, &info->op2.val.d,
579 			    &info->res.val.d);
580 			if (my_fp_class(&info->res.val.d) == fp_subnormal)
581 				subnorm = 1;
582 			break;
583 
584 		case sqrtsd:
585 			info->op = fex_sqrt;
586 			sse_sqrtsd(&info->op1.val.d, &info->res.val.d);
587 			break;
588 
589 		case cvtsd2ss:
590 			info->op = fex_cnvt;
591 			info->res.type = fex_float;
592 			sse_cvtsd2ss(&info->op1.val.d, &info->res.val.f);
593 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
594 				subnorm = 1;
595 			break;
596 
597 		case cvtsi2sd:
598 			info->op = fex_cnvt;
599 			sse_cvtsi2sd(&info->op1.val.i, &info->res.val.d);
600 			break;
601 
602 		case cvttsd2si:
603 			info->op = fex_cnvt;
604 			info->res.type = fex_int;
605 			sse_cvttsd2si(&info->op1.val.d, &info->res.val.i);
606 			break;
607 
608 		case cvtsd2si:
609 			info->op = fex_cnvt;
610 			info->res.type = fex_int;
611 			sse_cvtsd2si(&info->op1.val.d, &info->res.val.i);
612 			break;
613 
614 #ifdef __amd64
615 		case cvtsi2sdq:
616 			info->op = fex_cnvt;
617 			sse_cvtsi2sdq(&info->op1.val.l, &info->res.val.d);
618 			break;
619 
620 		case cvttsd2siq:
621 			info->op = fex_cnvt;
622 			info->res.type = fex_llong;
623 			sse_cvttsd2siq(&info->op1.val.d, &info->res.val.l);
624 			break;
625 
626 		case cvtsd2siq:
627 			info->op = fex_cnvt;
628 			info->res.type = fex_llong;
629 			sse_cvtsd2siq(&info->op1.val.d, &info->res.val.l);
630 			break;
631 #endif
632 
633 		case ucomisd:
634 			info->op = fex_cmp;
635 			info->res.type = fex_nodata;
636 			sse_ucomisd(&info->op1.val.d, &info->op2.val.d);
637 			break;
638 
639 		case comisd:
640 			info->op = fex_cmp;
641 			info->res.type = fex_nodata;
642 			sse_comisd(&info->op1.val.d, &info->op2.val.d);
643 			break;
644 		default:
645 			break;
646 		}
647 	} else {
648 		if (inst->op == cvtsi2ss) {
649 			info->op1.type = fex_int;
650 			info->op1.val.i = inst->op2->i[0];
651 			info->op2.type = fex_nodata;
652 		} else if (inst->op == cvtsi2ssq) {
653 			info->op1.type = fex_llong;
654 			info->op1.val.l = inst->op2->l[0];
655 			info->op2.type = fex_nodata;
656 		} else if (inst->op == sqrtss || inst->op == cvtss2sd ||
657 		    inst->op == cvttss2si || inst->op == cvtss2si ||
658 		    inst->op == cvttss2siq || inst->op == cvtss2siq) {
659 			info->op1.type = fex_float;
660 			info->op1.val.f = inst->op2->f[0];
661 			info->op2.type = fex_nodata;
662 		} else {
663 			info->op1.type = fex_float;
664 			info->op1.val.f = inst->op1->f[0];
665 			info->op2.type = fex_float;
666 			info->op2.val.f = inst->op2->f[0];
667 		}
668 		info->res.type = fex_float;
669 		switch (inst->op) {
670 		case cmpss:
671 			info->op = fex_cmp;
672 			info->res.type = fex_int;
673 			switch (inst->imm & 3) {
674 			case 0:
675 				sse_cmpeqss(&info->op1.val.f, &info->op2.val.f,
676 				    &info->res.val.i);
677 				break;
678 
679 			case 1:
680 				sse_cmpltss(&info->op1.val.f, &info->op2.val.f,
681 				    &info->res.val.i);
682 				break;
683 
684 			case 2:
685 				sse_cmpless(&info->op1.val.f, &info->op2.val.f,
686 				    &info->res.val.i);
687 				break;
688 
689 			case 3:
690 				sse_cmpunordss(&info->op1.val.f,
691 				    &info->op2.val.f, &info->res.val.i);
692 			}
693 			if (inst->imm & 4)
694 				info->res.val.i ^= 0xffffffffu;
695 			break;
696 
697 		case minss:
698 			info->op = fex_other;
699 			sse_minss(&info->op1.val.f, &info->op2.val.f,
700 			    &info->res.val.f);
701 			break;
702 
703 		case maxss:
704 			info->op = fex_other;
705 			sse_maxss(&info->op1.val.f, &info->op2.val.f,
706 			    &info->res.val.f);
707 			break;
708 
709 		case addss:
710 			info->op = fex_add;
711 			sse_addss(&info->op1.val.f, &info->op2.val.f,
712 			    &info->res.val.f);
713 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
714 				subnorm = 1;
715 			break;
716 
717 		case subss:
718 			info->op = fex_sub;
719 			sse_subss(&info->op1.val.f, &info->op2.val.f,
720 			    &info->res.val.f);
721 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
722 				subnorm = 1;
723 			break;
724 
725 		case mulss:
726 			info->op = fex_mul;
727 			sse_mulss(&info->op1.val.f, &info->op2.val.f,
728 			    &info->res.val.f);
729 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
730 				subnorm = 1;
731 			break;
732 
733 		case divss:
734 			info->op = fex_div;
735 			sse_divss(&info->op1.val.f, &info->op2.val.f,
736 			    &info->res.val.f);
737 			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
738 				subnorm = 1;
739 			break;
740 
741 		case sqrtss:
742 			info->op = fex_sqrt;
743 			sse_sqrtss(&info->op1.val.f, &info->res.val.f);
744 			break;
745 
746 		case cvtss2sd:
747 			info->op = fex_cnvt;
748 			info->res.type = fex_double;
749 			sse_cvtss2sd(&info->op1.val.f, &info->res.val.d);
750 			break;
751 
752 		case cvtsi2ss:
753 			info->op = fex_cnvt;
754 			sse_cvtsi2ss(&info->op1.val.i, &info->res.val.f);
755 			break;
756 
757 		case cvttss2si:
758 			info->op = fex_cnvt;
759 			info->res.type = fex_int;
760 			sse_cvttss2si(&info->op1.val.f, &info->res.val.i);
761 			break;
762 
763 		case cvtss2si:
764 			info->op = fex_cnvt;
765 			info->res.type = fex_int;
766 			sse_cvtss2si(&info->op1.val.f, &info->res.val.i);
767 			break;
768 
769 #ifdef __amd64
770 		case cvtsi2ssq:
771 			info->op = fex_cnvt;
772 			sse_cvtsi2ssq(&info->op1.val.l, &info->res.val.f);
773 			break;
774 
775 		case cvttss2siq:
776 			info->op = fex_cnvt;
777 			info->res.type = fex_llong;
778 			sse_cvttss2siq(&info->op1.val.f, &info->res.val.l);
779 			break;
780 
781 		case cvtss2siq:
782 			info->op = fex_cnvt;
783 			info->res.type = fex_llong;
784 			sse_cvtss2siq(&info->op1.val.f, &info->res.val.l);
785 			break;
786 #endif
787 
788 		case ucomiss:
789 			info->op = fex_cmp;
790 			info->res.type = fex_nodata;
791 			sse_ucomiss(&info->op1.val.f, &info->op2.val.f);
792 			break;
793 
794 		case comiss:
795 			info->op = fex_cmp;
796 			info->res.type = fex_nodata;
797 			sse_comiss(&info->op1.val.f, &info->op2.val.f);
798 			break;
799 		default:
800 			break;
801 		}
802 	}
803 	__fenv_getmxcsr(&mxcsr);
804 	info->flags = mxcsr & 0x3d;
805 	__fenv_setmxcsr(&oldmxcsr);
806 
807 	/* determine which exception would have been trapped */
808 	te = ~(uap->uc_mcontext.fpregs.fp_reg_set.fpchip_state.mxcsr
809 	    >> 7) & 0x3d;
810 	e = mxcsr & te;
811 	if (e & FE_INVALID)
812 		return __fex_get_sse_invalid_type(inst);
813 	if (e & FE_DIVBYZERO)
814 		return fex_division;
815 	if (e & FE_OVERFLOW)
816 		return fex_overflow;
817 	if ((e & FE_UNDERFLOW) || (subnorm && (te & FE_UNDERFLOW)))
818 		return fex_underflow;
819 	if (e & FE_INEXACT)
820 		return fex_inexact;
821 	return (enum fex_exception)-1;
822 }
823 
824 /*
825  * Emulate a SIMD SSE instruction to determine which exceptions occur
826  * in each part.  For i = 0, 1, 2, and 3, set e[i] to indicate the
827  * trapped exception that would occur if the i-th part of the SIMD
828  * instruction were executed in isolation; set e[i] to -1 if no
829  * trapped exception would occur in this part.  Also fill in info[i]
830  * with the corresponding operands, default untrapped result, and
831  * flags.
832  *
833  * This routine does not work if the instruction specified by *inst
834  * is not a SIMD instruction.
835  */
836 void
837 __fex_get_simd_op(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
838     fex_info_t *info)
839 {
840 	sseinst_t	dummy;
841 	int		i;
842 
843 	e[0] = e[1] = e[2] = e[3] = -1;
844 
845 	/* perform each part of the SIMD operation */
846 	switch (inst->op) {
847 	case cmpps:
848 		dummy.op = cmpss;
849 		dummy.imm = inst->imm;
850 		for (i = 0; i < 4; i++) {
851 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
852 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
853 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
854 		}
855 		break;
856 
857 	case minps:
858 		dummy.op = minss;
859 		for (i = 0; i < 4; i++) {
860 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
861 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
862 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
863 		}
864 		break;
865 
866 	case maxps:
867 		dummy.op = maxss;
868 		for (i = 0; i < 4; i++) {
869 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
870 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
871 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
872 		}
873 		break;
874 
875 	case addps:
876 		dummy.op = addss;
877 		for (i = 0; i < 4; i++) {
878 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
879 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
880 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
881 		}
882 		break;
883 
884 	case subps:
885 		dummy.op = subss;
886 		for (i = 0; i < 4; i++) {
887 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
888 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
889 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
890 		}
891 		break;
892 
893 	case mulps:
894 		dummy.op = mulss;
895 		for (i = 0; i < 4; i++) {
896 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
897 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
898 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
899 		}
900 		break;
901 
902 	case divps:
903 		dummy.op = divss;
904 		for (i = 0; i < 4; i++) {
905 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
906 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
907 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
908 		}
909 		break;
910 
911 	case sqrtps:
912 		dummy.op = sqrtss;
913 		for (i = 0; i < 4; i++) {
914 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
915 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
916 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
917 		}
918 		break;
919 
920 	case cvtdq2ps:
921 		dummy.op = cvtsi2ss;
922 		for (i = 0; i < 4; i++) {
923 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
924 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
925 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
926 		}
927 		break;
928 
929 	case cvttps2dq:
930 		dummy.op = cvttss2si;
931 		for (i = 0; i < 4; i++) {
932 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
933 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
934 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
935 		}
936 		break;
937 
938 	case cvtps2dq:
939 		dummy.op = cvtss2si;
940 		for (i = 0; i < 4; i++) {
941 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
942 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
943 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
944 		}
945 		break;
946 
947 	case cvtpi2ps:
948 		dummy.op = cvtsi2ss;
949 		for (i = 0; i < 2; i++) {
950 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
951 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
952 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
953 		}
954 		break;
955 
956 	case cvttps2pi:
957 		dummy.op = cvttss2si;
958 		for (i = 0; i < 2; i++) {
959 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
960 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
961 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
962 		}
963 		break;
964 
965 	case cvtps2pi:
966 		dummy.op = cvtss2si;
967 		for (i = 0; i < 2; i++) {
968 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
969 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
970 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
971 		}
972 		break;
973 
974 	case cmppd:
975 		dummy.op = cmpsd;
976 		dummy.imm = inst->imm;
977 		for (i = 0; i < 2; i++) {
978 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
979 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
980 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
981 		}
982 		break;
983 
984 	case minpd:
985 		dummy.op = minsd;
986 		for (i = 0; i < 2; i++) {
987 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
988 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
989 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
990 		}
991 		break;
992 
993 	case maxpd:
994 		dummy.op = maxsd;
995 		for (i = 0; i < 2; i++) {
996 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
997 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
998 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
999 		}
1000 		break;
1001 
1002 	case addpd:
1003 		dummy.op = addsd;
1004 		for (i = 0; i < 2; i++) {
1005 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1006 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1007 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1008 		}
1009 		break;
1010 
1011 	case subpd:
1012 		dummy.op = subsd;
1013 		for (i = 0; i < 2; i++) {
1014 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1015 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1016 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1017 		}
1018 		break;
1019 
1020 	case mulpd:
1021 		dummy.op = mulsd;
1022 		for (i = 0; i < 2; i++) {
1023 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1024 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1025 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1026 		}
1027 		break;
1028 
1029 	case divpd:
1030 		dummy.op = divsd;
1031 		for (i = 0; i < 2; i++) {
1032 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1033 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1034 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1035 		}
1036 		break;
1037 
1038 	case sqrtpd:
1039 		dummy.op = sqrtsd;
1040 		for (i = 0; i < 2; i++) {
1041 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1042 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1043 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1044 		}
1045 		break;
1046 
1047 	case cvtpi2pd:
1048 	case cvtdq2pd:
1049 		dummy.op = cvtsi2sd;
1050 		for (i = 0; i < 2; i++) {
1051 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1052 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1053 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1054 		}
1055 		break;
1056 
1057 	case cvttpd2pi:
1058 	case cvttpd2dq:
1059 		dummy.op = cvttsd2si;
1060 		for (i = 0; i < 2; i++) {
1061 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1062 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1063 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1064 		}
1065 		break;
1066 
1067 	case cvtpd2pi:
1068 	case cvtpd2dq:
1069 		dummy.op = cvtsd2si;
1070 		for (i = 0; i < 2; i++) {
1071 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1072 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1073 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1074 		}
1075 		break;
1076 
1077 	case cvtps2pd:
1078 		dummy.op = cvtss2sd;
1079 		for (i = 0; i < 2; i++) {
1080 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1081 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1082 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1083 		}
1084 		break;
1085 
1086 	case cvtpd2ps:
1087 		dummy.op = cvtsd2ss;
1088 		for (i = 0; i < 2; i++) {
1089 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1090 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1091 			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1092 		}
1093 	default:
1094 		break;
1095 	}
1096 }
1097 
1098 /*
1099  * Store the result value from *info in the destination of the scalar
1100  * SSE instruction specified by *inst.  If no result is given but the
1101  * exception is underflow or overflow, supply the default trapped result.
1102  *
1103  * This routine does not work if the instruction specified by *inst
1104  * is not a scalar instruction.
1105  */
1106 void
1107 __fex_st_sse_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception e,
1108     fex_info_t *info)
1109 {
1110 	int		i = 0;
1111 	long long	l = 0L;;
1112 	float		f = 0.0, fscl;
1113 	double		d = 0.0L, dscl;
1114 
1115 	/* for compares that write eflags, just set the flags
1116 	   to indicate "unordered" */
1117 	if (inst->op == ucomiss || inst->op == comiss ||
1118 	    inst->op == ucomisd || inst->op == comisd) {
1119 		uap->uc_mcontext.gregs[REG_PS] |= 0x45;
1120 		return;
1121 	}
1122 
1123 	/* if info doesn't specify a result value, try to generate
1124 	   the default trapped result */
1125 	if (info->res.type == fex_nodata) {
1126 		/* set scale factors for exponent wrapping */
1127 		switch (e) {
1128 		case fex_overflow:
1129 			fscl = 1.262177448e-29f; /* 2^-96 */
1130 			dscl = 6.441148769597133308e-232; /* 2^-768 */
1131 			break;
1132 
1133 		case fex_underflow:
1134 			fscl = 7.922816251e+28f; /* 2^96 */
1135 			dscl = 1.552518092300708935e+231; /* 2^768 */
1136 			break;
1137 
1138 		default:
1139 			(void) __fex_get_sse_op(uap, inst, info);
1140 			if (info->res.type == fex_nodata)
1141 				return;
1142 			goto stuff;
1143 		}
1144 
1145 		/* generate the wrapped result */
1146 		if (inst->op == cvtsd2ss) {
1147 			info->op1.type = fex_double;
1148 			info->op1.val.d = inst->op2->d[0];
1149 			info->op2.type = fex_nodata;
1150 			info->res.type = fex_float;
1151 			info->res.val.f = (float)(fscl * (fscl *
1152 			    info->op1.val.d));
1153 		} else if ((int)inst->op & DOUBLE) {
1154 			info->op1.type = fex_double;
1155 			info->op1.val.d = inst->op1->d[0];
1156 			info->op2.type = fex_double;
1157 			info->op2.val.d = inst->op2->d[0];
1158 			info->res.type = fex_double;
1159 			switch (inst->op) {
1160 			case addsd:
1161 				info->res.val.d = dscl * (dscl *
1162 				    info->op1.val.d + dscl * info->op2.val.d);
1163 				break;
1164 
1165 			case subsd:
1166 				info->res.val.d = dscl * (dscl *
1167 				    info->op1.val.d - dscl * info->op2.val.d);
1168 				break;
1169 
1170 			case mulsd:
1171 				info->res.val.d = (dscl * info->op1.val.d) *
1172 				    (dscl * info->op2.val.d);
1173 				break;
1174 
1175 			case divsd:
1176 				info->res.val.d = (dscl * info->op1.val.d) /
1177 				    (info->op2.val.d / dscl);
1178 				break;
1179 
1180 			default:
1181 				return;
1182 			}
1183 		} else {
1184 			info->op1.type = fex_float;
1185 			info->op1.val.f = inst->op1->f[0];
1186 			info->op2.type = fex_float;
1187 			info->op2.val.f = inst->op2->f[0];
1188 			info->res.type = fex_float;
1189 			switch (inst->op) {
1190 			case addss:
1191 				info->res.val.f = fscl * (fscl *
1192 				    info->op1.val.f + fscl * info->op2.val.f);
1193 				break;
1194 
1195 			case subss:
1196 				info->res.val.f = fscl * (fscl *
1197 				    info->op1.val.f - fscl * info->op2.val.f);
1198 				break;
1199 
1200 			case mulss:
1201 				info->res.val.f = (fscl * info->op1.val.f) *
1202 				    (fscl * info->op2.val.f);
1203 				break;
1204 
1205 			case divss:
1206 				info->res.val.f = (fscl * info->op1.val.f) /
1207 				    (info->op2.val.f / fscl);
1208 				break;
1209 
1210 			default:
1211 				return;
1212 			}
1213 		}
1214 	}
1215 
1216 	/* put the result in the destination */
1217 stuff:
1218 	if (inst->op == cmpss || inst->op == cvttss2si || inst->op == cvtss2si
1219 	    || inst->op == cvttsd2si || inst->op == cvtsd2si) {
1220 		switch (info->res.type) {
1221 		case fex_int:
1222 			i = info->res.val.i;
1223 			break;
1224 
1225 		case fex_llong:
1226 			i = info->res.val.l;
1227 			break;
1228 
1229 		case fex_float:
1230 			i = info->res.val.f;
1231 			break;
1232 
1233 		case fex_double:
1234 			i = info->res.val.d;
1235 			break;
1236 
1237 		case fex_ldouble:
1238 			i = info->res.val.q;
1239 			break;
1240 
1241 		default:
1242 			break;
1243 		}
1244 		inst->op1->i[0] = i;
1245 	} else if (inst->op == cmpsd || inst->op == cvttss2siq ||
1246 	    inst->op == cvtss2siq || inst->op == cvttsd2siq ||
1247 	    inst->op == cvtsd2siq) {
1248 		switch (info->res.type) {
1249 		case fex_int:
1250 			l = info->res.val.i;
1251 			break;
1252 
1253 		case fex_llong:
1254 			l = info->res.val.l;
1255 			break;
1256 
1257 		case fex_float:
1258 			l = info->res.val.f;
1259 			break;
1260 
1261 		case fex_double:
1262 			l = info->res.val.d;
1263 			break;
1264 
1265 		case fex_ldouble:
1266 			l = info->res.val.q;
1267 			break;
1268 
1269 		default:
1270 			break;
1271 		}
1272 		inst->op1->l[0] = l;
1273 	} else if ((((int)inst->op & DOUBLE) && inst->op != cvtsd2ss) ||
1274 	    inst->op == cvtss2sd) {
1275 		switch (info->res.type) {
1276 		case fex_int:
1277 			d = info->res.val.i;
1278 			break;
1279 
1280 		case fex_llong:
1281 			d = info->res.val.l;
1282 			break;
1283 
1284 		case fex_float:
1285 			d = info->res.val.f;
1286 			break;
1287 
1288 		case fex_double:
1289 			d = info->res.val.d;
1290 			break;
1291 
1292 		case fex_ldouble:
1293 			d = info->res.val.q;
1294 			break;
1295 
1296 		default:
1297 			break;
1298 		}
1299 		inst->op1->d[0] = d;
1300 	} else {
1301 		switch (info->res.type) {
1302 		case fex_int:
1303 			f = info->res.val.i;
1304 			break;
1305 
1306 		case fex_llong:
1307 			f = info->res.val.l;
1308 			break;
1309 
1310 		case fex_float:
1311 			f = info->res.val.f;
1312 			break;
1313 
1314 		case fex_double:
1315 			f = info->res.val.d;
1316 			break;
1317 
1318 		case fex_ldouble:
1319 			f = info->res.val.q;
1320 			break;
1321 
1322 		default:
1323 			break;
1324 		}
1325 		inst->op1->f[0] = f;
1326 	}
1327 }
1328 
1329 /*
1330  * Store the results from a SIMD instruction.  For each i, store
1331  * the result value from info[i] in the i-th part of the destination
1332  * of the SIMD SSE instruction specified by *inst.  If no result
1333  * is given but the exception indicated by e[i] is underflow or
1334  * overflow, supply the default trapped result.
1335  *
1336  * This routine does not work if the instruction specified by *inst
1337  * is not a SIMD instruction.
1338  */
1339 void
1340 __fex_st_simd_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
1341     fex_info_t *info)
1342 {
1343 	sseinst_t	dummy;
1344 	int		i;
1345 
1346 	/* store each part */
1347 	switch (inst->op) {
1348 	case cmpps:
1349 		dummy.op = cmpss;
1350 		dummy.imm = inst->imm;
1351 		for (i = 0; i < 4; i++) {
1352 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1353 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1354 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1355 		}
1356 		break;
1357 
1358 	case minps:
1359 		dummy.op = minss;
1360 		for (i = 0; i < 4; i++) {
1361 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1362 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1363 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1364 		}
1365 		break;
1366 
1367 	case maxps:
1368 		dummy.op = maxss;
1369 		for (i = 0; i < 4; i++) {
1370 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1371 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1372 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1373 		}
1374 		break;
1375 
1376 	case addps:
1377 		dummy.op = addss;
1378 		for (i = 0; i < 4; i++) {
1379 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1380 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1381 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1382 		}
1383 		break;
1384 
1385 	case subps:
1386 		dummy.op = subss;
1387 		for (i = 0; i < 4; i++) {
1388 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1389 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1390 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1391 		}
1392 		break;
1393 
1394 	case mulps:
1395 		dummy.op = mulss;
1396 		for (i = 0; i < 4; i++) {
1397 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1398 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1399 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1400 		}
1401 		break;
1402 
1403 	case divps:
1404 		dummy.op = divss;
1405 		for (i = 0; i < 4; i++) {
1406 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1407 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1408 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1409 		}
1410 		break;
1411 
1412 	case sqrtps:
1413 		dummy.op = sqrtss;
1414 		for (i = 0; i < 4; i++) {
1415 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1416 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1417 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1418 		}
1419 		break;
1420 
1421 	case cvtdq2ps:
1422 		dummy.op = cvtsi2ss;
1423 		for (i = 0; i < 4; i++) {
1424 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1425 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1426 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1427 		}
1428 		break;
1429 
1430 	case cvttps2dq:
1431 		dummy.op = cvttss2si;
1432 		for (i = 0; i < 4; i++) {
1433 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1434 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1435 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1436 		}
1437 		break;
1438 
1439 	case cvtps2dq:
1440 		dummy.op = cvtss2si;
1441 		for (i = 0; i < 4; i++) {
1442 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1443 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1444 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1445 		}
1446 		break;
1447 
1448 	case cvtpi2ps:
1449 		dummy.op = cvtsi2ss;
1450 		for (i = 0; i < 2; i++) {
1451 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1452 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1453 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1454 		}
1455 		break;
1456 
1457 	case cvttps2pi:
1458 		dummy.op = cvttss2si;
1459 		for (i = 0; i < 2; i++) {
1460 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1461 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1462 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1463 		}
1464 		break;
1465 
1466 	case cvtps2pi:
1467 		dummy.op = cvtss2si;
1468 		for (i = 0; i < 2; i++) {
1469 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1470 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1471 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1472 		}
1473 		break;
1474 
1475 	case cmppd:
1476 		dummy.op = cmpsd;
1477 		dummy.imm = inst->imm;
1478 		for (i = 0; i < 2; i++) {
1479 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1480 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1481 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1482 		}
1483 		break;
1484 
1485 	case minpd:
1486 		dummy.op = minsd;
1487 		for (i = 0; i < 2; i++) {
1488 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1489 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1490 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1491 		}
1492 		break;
1493 
1494 	case maxpd:
1495 		dummy.op = maxsd;
1496 		for (i = 0; i < 2; i++) {
1497 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1498 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1499 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1500 		}
1501 		break;
1502 
1503 	case addpd:
1504 		dummy.op = addsd;
1505 		for (i = 0; i < 2; i++) {
1506 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1507 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1508 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1509 		}
1510 		break;
1511 
1512 	case subpd:
1513 		dummy.op = subsd;
1514 		for (i = 0; i < 2; i++) {
1515 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1516 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1517 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1518 		}
1519 		break;
1520 
1521 	case mulpd:
1522 		dummy.op = mulsd;
1523 		for (i = 0; i < 2; i++) {
1524 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1525 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1526 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1527 		}
1528 		break;
1529 
1530 	case divpd:
1531 		dummy.op = divsd;
1532 		for (i = 0; i < 2; i++) {
1533 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1534 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1535 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1536 		}
1537 		break;
1538 
1539 	case sqrtpd:
1540 		dummy.op = sqrtsd;
1541 		for (i = 0; i < 2; i++) {
1542 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1543 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1544 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1545 		}
1546 		break;
1547 
1548 	case cvtpi2pd:
1549 	case cvtdq2pd:
1550 		dummy.op = cvtsi2sd;
1551 		for (i = 0; i < 2; i++) {
1552 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1553 			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1554 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1555 		}
1556 		break;
1557 
1558 	case cvttpd2pi:
1559 	case cvttpd2dq:
1560 		dummy.op = cvttsd2si;
1561 		for (i = 0; i < 2; i++) {
1562 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1563 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1564 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1565 		}
1566 		/* for cvttpd2dq, zero the high 64 bits of the destination */
1567 		if (inst->op == cvttpd2dq)
1568 			inst->op1->l[1] = 0ll;
1569 		break;
1570 
1571 	case cvtpd2pi:
1572 	case cvtpd2dq:
1573 		dummy.op = cvtsd2si;
1574 		for (i = 0; i < 2; i++) {
1575 			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1576 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1577 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1578 		}
1579 		/* for cvtpd2dq, zero the high 64 bits of the destination */
1580 		if (inst->op == cvtpd2dq)
1581 			inst->op1->l[1] = 0ll;
1582 		break;
1583 
1584 	case cvtps2pd:
1585 		dummy.op = cvtss2sd;
1586 		for (i = 0; i < 2; i++) {
1587 			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1588 			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1589 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1590 		}
1591 		break;
1592 
1593 	case cvtpd2ps:
1594 		dummy.op = cvtsd2ss;
1595 		for (i = 0; i < 2; i++) {
1596 			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1597 			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1598 			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1599 		}
1600 		/* zero the high 64 bits of the destination */
1601 		inst->op1->l[1] = 0ll;
1602 
1603 	default:
1604 		break;
1605 	}
1606 }
1607 
1608