xref: /titanic_52/usr/src/cmd/sgs/rtld/amd64/boot_elf.s (revision f936286c99fb83153e4bfd870eb2830a990a82c1)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2012 Joyent, Inc. All rights reserved.
26 */
27
28#if	defined(lint)
29
30#include	<sys/types.h>
31#include	<_rtld.h>
32#include	<_audit.h>
33#include	<_elf.h>
34#include	<sys/regset.h>
35#include	<sys/auxv_386.h>
36
37/* ARGSUSED0 */
38int
39elf_plt_trace()
40{
41	return (0);	/* lint-only stub; the real routine is the assembly in the #else branch */
42}
43#else
44
45#include	<link.h>
46#include	<_audit.h>
47#include	<sys/asm_linkage.h>
48#include	<sys/auxv_386.h>
49
50	.file	"boot_elf.s"
51	.text
52
53/*
54 * On entry the 'glue code' has already  done the following:
55 *
56 *	pushq	%rbp
57 *	movq	%rsp, %rbp
58 *	subq	$0x10, %rsp
59 *	leaq	trace_fields(%rip), %r11
60 *	movq	%r11, -0x8(%rbp)
61 *	movq	$elf_plt_trace, %r11
62 *	jmp	*%r11
63 *
64 * so - -8(%rbp) contains the dyndata ptr
65 *
66 *	0x0	Addr		*reflmp
67 *	0x8	Addr		*deflmp
68 *	0x10	Word		symndx
69 *	0x14	Word		sb_flags
70 *	0x18	Sym		symdef.st_name
71 *	0x1c			symdef.st_info
72 *	0x1d			symdef.st_other
73 *	0x1e			symdef.st_shndx
74 *	0x20			symdef.st_value
75 *	0x28			symdef.st_size
76 *
77 * Also note - on entry 16 bytes have already been subtracted
78 * from the %rsp.  The first 8 bytes is for the dyn_data_ptr,
79 * the second 8 bytes are to align the stack and are available
80 * for use.
81 */
82#define	REFLMP_OFF		0x0
83#define	DEFLMP_OFF		0x8
84#define	SYMNDX_OFF		0x10
85#define	SBFLAGS_OFF		0x14
86#define	SYMDEF_OFF		0x18
87#define	SYMDEF_VALUE_OFF	0x20
88/*
89 * Local stack space storage for elf_plt_trace is allocated
90 * as follows:
91 *
92 *  First - before we got here - %rsp has been decremented
93 *  by 0x10 to make space for the dyndata ptr (and another
94 *  free word).  In addition to that, we create space
95 *  for the following:
96 *
97 *	La_amd64_regs	    8 * 8:	64
98 *	prev_stack_size	    8		 8
99 *	Saved regs:
100 *	    %rdi			 8
101 *	    %rsi			 8
102 *	    %rdx			 8
103 *	    %rcx			 8
104 *	    %r8				 8
105 *	    %r9				 8
106 *	    %r10			 8
107 *	    %r11			 8
108 *	    %rax			 8
109 *				    =======
110 *			    Subtotal:	144
111 *
112 *	Saved Media Regs (used to pass floating point args):
113 *	    %xmm0 - %xmm7   32 * 8:	256	(room for a full %ymm each)
114 *	Padding to a multiple of 32:	 16
115 *			    Total:	416 (multiple of 32)
116 *
117 *  The slots just below %rbp are addressed relative to %rbp:
118 *
119 *	-8(%rbp)	store dyndata ptr
120 *	-16(%rbp)	store call destination
121 *	-80(%rbp)	space for La_amd64_regs
122 *	-88(%rbp)	prev stack size
123 *  The remaining slots are addressed from the realigned %rsp, because
124 *  their distance from %rbp depends on how far the andq in the prologue
125 *  had to move %rsp.  See the note above SPRDIOFF.
126 *	320(%rsp)	entering %rdi
127 *	312(%rsp)	entering %rsi
128 *	304(%rsp)	entering %rdx
129 *	296(%rsp)	entering %rcx
130 *	288(%rsp)	entering %r8
131 *	280(%rsp)	entering %r9
132 *	272(%rsp)	entering %r10
133 *	264(%rsp)	entering %r11
134 *	256(%rsp)	entering %rax
135 *	224(%rsp)	entering %xmm0 (or %ymm0)
136 *	192(%rsp)	entering %xmm1 (or %ymm1)
137 *	160(%rsp)	entering %xmm2 (or %ymm2)
138 *	128(%rsp)	entering %xmm3 (or %ymm3)
139 *	96(%rsp)	entering %xmm4 (or %ymm4)
140 *	64(%rsp)	entering %xmm5 (or %ymm5)
141 *	32(%rsp)	entering %xmm6 (or %ymm6)
142 *	0(%rsp)		entering %xmm7 (or %ymm7)
143 *
144 */
145#define	SPDYNOFF    -8
146#define	SPDESTOFF   -16
147#define	SPLAREGOFF  -80
148#define	SPPRVSTKOFF -88
149
150/*
151 * The next set of offsets are relative to %rsp.
152 * The prologue leaves %rsp 32-byte aligned, and every SPXMMnOFF below is a
153 * multiple of 32, so the ymm registers are saved to 32-byte aligned addresses.
154 * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code.
155 */
156#define	SPRDIOFF	320
157#define	SPRSIOFF	312
158#define	SPRDXOFF	304
159#define	SPRCXOFF	296
160#define	SPR8OFF		288
161#define	SPR9OFF		280
162#define	SPR10OFF	272
163#define	SPR11OFF	264
164#define	SPRAXOFF	256
165#define	SPXMM0OFF	224
166#define	SPXMM1OFF	192
167#define	SPXMM2OFF	160
168#define	SPXMM3OFF	128
169#define	SPXMM4OFF	96
170#define	SPXMM5OFF	64
171#define	SPXMM6OFF	32
172#define	SPXMM7OFF	0
173
174	/* See elf_rtbndr for explanation behind org_scapset */
175	.extern org_scapset
176	.globl	elf_plt_trace
177	.type	elf_plt_trace,@function
178	.align 16
179elf_plt_trace:
180	/*
181	 * Round %rsp down to a 32-byte boundary, then reserve a multiple of
182	 * 32 bytes so that %rsp stays 32-byte aligned.  vmovdqa on a %ymm
183	 * register faults unless its memory operand is 32-byte aligned.
184	 */
185	andq    $-32, %rsp	/* round %rsp down to a 32-byte boundary */
186	subq	$416,%rsp	/ local storage; 416 = 13 * 32 keeps alignment
187
188	movq	%rdi, SPRDIOFF(%rsp)
189	movq	%rsi, SPRSIOFF(%rsp)
190	movq	%rdx, SPRDXOFF(%rsp)
191	movq	%rcx, SPRCXOFF(%rsp)
192	movq	%r8, SPR8OFF(%rsp)
193	movq	%r9, SPR9OFF(%rsp)
194	movq	%r10, SPR10OFF(%rsp)
195	movq	%r11, SPR11OFF(%rsp)
196	movq	%rax, SPRAXOFF(%rsp)
197
198	movq	org_scapset@GOTPCREL(%rip),%r9	/ %r9, %edx are scratch: saved above
199	movq	(%r9),%r9
200	movl	(%r9),%edx			/ %edx = first capability word
201	testl	$AV_386_AVX,%edx
202	jne	.trace_save_ymm
203
204.trace_save_xmm:
205	movdqa	%xmm0, SPXMM0OFF(%rsp)
206	movdqa	%xmm1, SPXMM1OFF(%rsp)
207	movdqa	%xmm2, SPXMM2OFF(%rsp)
208	movdqa	%xmm3, SPXMM3OFF(%rsp)
209	movdqa	%xmm4, SPXMM4OFF(%rsp)
210	movdqa	%xmm5, SPXMM5OFF(%rsp)
211	movdqa	%xmm6, SPXMM6OFF(%rsp)
212	movdqa	%xmm7, SPXMM7OFF(%rsp)
213	jmp	.trace_save_finish
214
215.trace_save_ymm:
216	vmovdqa	%ymm0, SPXMM0OFF(%rsp)
217	vmovdqa	%ymm1, SPXMM1OFF(%rsp)
218	vmovdqa	%ymm2, SPXMM2OFF(%rsp)
219	vmovdqa	%ymm3, SPXMM3OFF(%rsp)
220	vmovdqa	%ymm4, SPXMM4OFF(%rsp)
221	vmovdqa	%ymm5, SPXMM5OFF(%rsp)
222	vmovdqa	%ymm6, SPXMM6OFF(%rsp)
223	vmovdqa	%ymm7, SPXMM7OFF(%rsp)
224
225.trace_save_finish:
226
227	movq	SPDYNOFF(%rbp), %rax			/ %rax = dyndata
228	testb	$LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)	/ <link.h>
229	je	.start_pltenter
230	movq	SYMDEF_VALUE_OFF(%rax), %rdi
231	movq	%rdi, SPDESTOFF(%rbp)		/ save destination address
232	jmp	.end_pltenter
233
234.start_pltenter:
235	/*
236	 * save all registers into La_amd64_regs
237	 */
238	leaq	SPLAREGOFF(%rbp), %rsi	/ %rsi = &La_amd64_regs
239	leaq	8(%rbp), %rdi
240	movq	%rdi, 0(%rsi)		/ la_rsp
241	movq	0(%rbp), %rdi
242	movq	%rdi, 8(%rsi)		/ la_rbp
243	movq	SPRDIOFF(%rsp), %rdi
244	movq	%rdi, 16(%rsi)		/ la_rdi
245	movq	SPRSIOFF(%rsp), %rdi
246	movq	%rdi, 24(%rsi)		/ la_rsi
247	movq	SPRDXOFF(%rsp), %rdi
248	movq	%rdi, 32(%rsi)		/ la_rdx
249	movq	SPRCXOFF(%rsp), %rdi
250	movq	%rdi, 40(%rsi)		/ la_rcx
251	movq	SPR8OFF(%rsp), %rdi
252	movq	%rdi, 48(%rsi)		/ la_r8
253	movq	SPR9OFF(%rsp), %rdi
254	movq	%rdi, 56(%rsi)		/ la_r9
255
256	/*
257	 * prepare for call to audit_pltenter()
258	 */
259	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
260	leaq	SBFLAGS_OFF(%r11), %r9		/ arg6 (&sb_flags)
261	leaq	SPLAREGOFF(%rbp), %r8		/ arg5 (&La_amd64_regs)
262	movl	SYMNDX_OFF(%r11), %ecx		/ arg4 (symndx)
263	leaq	SYMDEF_OFF(%r11), %rdx		/ arg3 (&Sym)
264	movq	DEFLMP_OFF(%r11), %rsi		/ arg2 (dlmp)
265	movq	REFLMP_OFF(%r11), %rdi		/ arg1 (rlmp)
266	call	audit_pltenter@PLT
267	movq	%rax, SPDESTOFF(%rbp)		/ save calling address
268.end_pltenter:
269
270	/*
271	 * If *no* la_pltexit() routines exist
272	 * we do not need to keep the stack frame
273	 * before we call the actual routine.  Instead we
274	 * jump to it and remove our stack from the stack
275	 * at the same time.
276	 */
277	movl	audit_flags(%rip), %eax
278	andl	$AF_PLTEXIT, %eax		/ value of audit.h:AF_PLTEXIT
279	cmpl	$0, %eax
280	je	.bypass_pltexit
281	/*
282	 * Has the *nopltexit* flag been set for this entry point
283	 */
284	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
285	testb	$LA_SYMB_NOPLTEXIT, SBFLAGS_OFF(%r11)
286	je	.start_pltexit
287
288.bypass_pltexit:
289	/*
290	 * No PLTEXIT processing required.
291	 */
292	movq	0(%rbp), %r11
293	movq	%r11, -8(%rbp)			/ move prev %rbp
294	movq	SPDESTOFF(%rbp), %r11		/ r11 == calling destination
295	movq	%r11, 0(%rbp)			/ store destination at top
296
297	/
298	/ Restore registers
299	/
300	movq	org_scapset@GOTPCREL(%rip),%r9	/ %r9, %edx are reloaded below
301	movq	(%r9),%r9
302	movl	(%r9),%edx
303	testl	$AV_386_AVX,%edx
304	jne	.trace_restore_ymm
305
306.trace_restore_xmm:
307	movdqa	SPXMM0OFF(%rsp), %xmm0
308	movdqa	SPXMM1OFF(%rsp), %xmm1
309	movdqa	SPXMM2OFF(%rsp), %xmm2
310	movdqa	SPXMM3OFF(%rsp), %xmm3
311	movdqa	SPXMM4OFF(%rsp), %xmm4
312	movdqa	SPXMM5OFF(%rsp), %xmm5
313	movdqa	SPXMM6OFF(%rsp), %xmm6
314	movdqa	SPXMM7OFF(%rsp), %xmm7
315	jmp	.trace_restore_finish
316
317.trace_restore_ymm:
318	vmovdqa	SPXMM0OFF(%rsp), %ymm0
319	vmovdqa	SPXMM1OFF(%rsp), %ymm1
320	vmovdqa	SPXMM2OFF(%rsp), %ymm2
321	vmovdqa	SPXMM3OFF(%rsp), %ymm3
322	vmovdqa	SPXMM4OFF(%rsp), %ymm4
323	vmovdqa	SPXMM5OFF(%rsp), %ymm5
324	vmovdqa	SPXMM6OFF(%rsp), %ymm6
325	vmovdqa	SPXMM7OFF(%rsp), %ymm7
326
327.trace_restore_finish:
328	movq	SPRDIOFF(%rsp), %rdi
329	movq	SPRSIOFF(%rsp), %rsi
330	movq	SPRDXOFF(%rsp), %rdx
331	movq	SPRCXOFF(%rsp), %rcx
332	movq	SPR8OFF(%rsp), %r8
333	movq	SPR9OFF(%rsp), %r9
334	movq	SPR10OFF(%rsp), %r10
335	movq	SPR11OFF(%rsp), %r11
336	movq	SPRAXOFF(%rsp), %rax
337
338	subq	$8, %rbp			/ adjust %rbp for 'ret'
339	movq	%rbp, %rsp			/
340	/*
341	 * At this point, after a little doctoring, we should
342	 * have the following on the stack:
343	 *
344	 *	16(%rsp):  ret addr
345	 *	8(%rsp):  dest_addr
346	 *	0(%rsp):  Previous %rbp
347	 *
348	 * So - we pop the previous %rbp, and then
349	 * ret to our final destination.
350	 */
351	popq	%rbp				/
352	ret					/ jmp to final destination
353						/ and clean up stack :)
354
355.start_pltexit:
356	/*
357	 * In order to call the destination procedure and then return
358	 * to audit_pltexit() for post analysis we must first grow
359	 * our stack frame and then duplicate the original callers
360	 * stack state.  This duplicates all of the arguments
361	 * that were to be passed to the destination procedure.
362	 */
363	movq	%rbp, %rdi			/
364	addq	$16, %rdi			/    %rdi = src
365	movq	(%rbp), %rdx			/
366	subq	%rdi, %rdx			/    %rdx == prev frame sz
367	/*
368	 * If audit_argcnt > 0 then we limit the number of
369	 * arguments that will be duplicated to audit_argcnt.
370	 *
371	 * If (prev_stack_size > (audit_argcnt * 8))
372	 *	prev_stack_size = audit_argcnt * 8;
373	 */
374	movl	audit_argcnt(%rip),%eax		/   %eax = audit_argcnt
375	cmpl	$0, %eax
376	jle	.grow_stack
377	leaq	(,%rax,8), %rax			/    %rax = audit_argcnt * 8
378	cmpq	%rax,%rdx
379	jle	.grow_stack
380	movq	%rax, %rdx
381	/*
382	 * Grow the stack and duplicate the arguments of the
383	 * original caller.
384	 *
385	 * We save %rsp in %r11 since we need to use the current rsp for
386	 * accessing the registers saved in our stack frame.
387	 */
388.grow_stack:
389	movq	%rsp, %r11
390	subq	%rdx, %rsp			/    grow the stack
391	movq	%rdx, SPPRVSTKOFF(%rbp)		/    -88(%rbp) == prev frame sz
392	movq	%rsp, %rcx			/    %rcx = dest
393	addq	%rcx, %rdx			/    %rdx == tail of dest
394.while_base:
395	cmpq	%rdx, %rcx			/   while (dest < tail of dest) {
396	jge	.end_while			/
397	movq	(%rdi), %rsi
398	movq	%rsi,(%rcx)			/        *dest = *src
399	addq	$8, %rdi			/	 src++
400	addq	$8, %rcx			/        dest++
401	jmp	.while_base			/    }
402
403	/*
404	 * The above stack is now a duplicate of the (possibly truncated)
405	 * stack of the original calling procedure.
406	 */
407.end_while:
408	/
409	/ Restore registers using %r11 which contains our old %rsp value
410	/ before growing the stack.
411	/
412
413	/ Yes, we have to do this dance again. Sorry.
414	movq	org_scapset@GOTPCREL(%rip),%r9
415	movq	(%r9),%r9
416	movl	(%r9),%edx
417	testl	$AV_386_AVX,%edx
418	jne	.trace_r2_ymm
419
420.trace_r2_xmm:
421	movdqa	SPXMM0OFF(%r11), %xmm0
422	movdqa	SPXMM1OFF(%r11), %xmm1
423	movdqa	SPXMM2OFF(%r11), %xmm2
424	movdqa	SPXMM3OFF(%r11), %xmm3
425	movdqa	SPXMM4OFF(%r11), %xmm4
426	movdqa	SPXMM5OFF(%r11), %xmm5
427	movdqa	SPXMM6OFF(%r11), %xmm6
428	movdqa	SPXMM7OFF(%r11), %xmm7
429	jmp	.trace_r2_finish
430
431.trace_r2_ymm:
432	vmovdqa	SPXMM0OFF(%r11), %ymm0
433	vmovdqa	SPXMM1OFF(%r11), %ymm1
434	vmovdqa	SPXMM2OFF(%r11), %ymm2
435	vmovdqa	SPXMM3OFF(%r11), %ymm3
436	vmovdqa	SPXMM4OFF(%r11), %ymm4
437	vmovdqa	SPXMM5OFF(%r11), %ymm5
438	vmovdqa	SPXMM6OFF(%r11), %ymm6
439	vmovdqa	SPXMM7OFF(%r11), %ymm7
440
441.trace_r2_finish:
442	movq	SPRDIOFF(%r11), %rdi
443	movq	SPRSIOFF(%r11), %rsi
444	movq	SPRDXOFF(%r11), %rdx
445	movq	SPRCXOFF(%r11), %rcx
446	movq	SPR8OFF(%r11), %r8
447	movq	SPR9OFF(%r11), %r9
448	movq	SPR10OFF(%r11), %r10
449	movq	SPRAXOFF(%r11), %rax
450	movq	SPR11OFF(%r11), %r11		/ restore %r11 last (it is the base)
451
452	/*
453	 * Call to destination function - we'll return here
454	 * for pltexit monitoring.
455	 */
456	call	*SPDESTOFF(%rbp)
457
458	addq	SPPRVSTKOFF(%rbp), %rsp	/ cleanup dupped stack
459
460	/
461	/ prepare for call to audit_pltexit()
462	/
463	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
464	movl	SYMNDX_OFF(%r11), %r8d		/ arg5 (symndx, a 32-bit Word)
465	leaq	SYMDEF_OFF(%r11), %rcx		/ arg4 (&Sym)
466	movq	DEFLMP_OFF(%r11), %rdx		/ arg3 (dlmp)
467	movq	REFLMP_OFF(%r11), %rsi		/ arg2 (rlmp)
468	movq	%rax, %rdi			/ arg1 (returnval)
469	call	audit_pltexit@PLT
470
471	/*
472	 * Clean up after ourselves and return to the
473	 * original calling procedure.
474	 */
475
476	/
477	/ Restore registers
478	/
479	movq	SPRDIOFF(%rsp), %rdi
480	movq	SPRSIOFF(%rsp), %rsi
481	movq	SPRDXOFF(%rsp), %rdx
482	movq	SPRCXOFF(%rsp), %rcx
483	movq	SPR8OFF(%rsp), %r8
484	movq	SPR9OFF(%rsp), %r9
485	movq	SPR10OFF(%rsp), %r10
486	movq	SPR11OFF(%rsp), %r11
487	// rax already contains return value
488	movdqa	SPXMM0OFF(%rsp), %xmm0
489	movdqa	SPXMM1OFF(%rsp), %xmm1
490	movdqa	SPXMM2OFF(%rsp), %xmm2
491	movdqa	SPXMM3OFF(%rsp), %xmm3
492	movdqa	SPXMM4OFF(%rsp), %xmm4
493	movdqa	SPXMM5OFF(%rsp), %xmm5
494	movdqa	SPXMM6OFF(%rsp), %xmm6
495	movdqa	SPXMM7OFF(%rsp), %xmm7
496
497	movq	%rbp, %rsp			/
498	popq	%rbp				/
499	ret					/ return to caller
500	.size	elf_plt_trace, .-elf_plt_trace
501#endif
502
503/*
504 * We got here because a call to a function resolved to a procedure
505 * linkage table entry.  That entry did a jmp to the first PLT entry (PLT0),
506 * which in turn did an indirect jmp through GOT[2] to elf_rtbndr.
507 *
508 * the code sequence that got us here was:
509 *
510 * .PLT0:
511 *	pushq	GOT+8(%rip)	#GOT[1]
512 *	jmp	*GOT+16(%rip)	#GOT[2]
513 *	nop
514 *	nop
515 *	nop
516 *	nop
517 *	...
518 * PLT entry for foo:
519 *	jmp	*name1@GOTPCREL(%rip)
520 *	pushl	$rel.plt.foo
521 *	jmp	PLT0
522 *
523 * At entry, the stack looks like this:
524 *
525 *	return address			16(%rsp)
526 *	$rel.plt.foo	(plt index)	8(%rsp)
527 *	lmp				0(%rsp)
528 *
529 */
530#if defined(lint)
531
532extern unsigned long	elf_bndr(Rt_map *, unsigned long, caddr_t);
533
534void
535elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
536{
537	(void) elf_bndr(lmp, reloc, pc);	/* lint-only stand-in for the assembly routine below */
538}
539
540#else
541
542/*
543 * The PLT code that landed us here placed 2 arguments on the stack as
544 * arguments to elf_rtbndr.
545 * Additionally the pc of caller is below these 2 args.
546 * Our stack will look like this after we establish a stack frame with
547 * push %rbp; movq %rsp, %rbp sequence:
548 *
549 *	8(%rbp)			arg1 - *lmp
550 *	16(%rbp), %rsi		arg2 - reloc index
551 *	24(%rbp), %rdx		arg3 - pc of caller
552 */
553#define	LBPLMPOFF	8	/* arg1 - *lmp */
554#define	LBPRELOCOFF	16	/* arg2 - reloc index */
555#define	LBRPCOFF	24	/* arg3 - pc of caller */
556
557/*
558 * Possible arguments for the resolved function are in registers as per
559 * the AMD64 ABI.  We must save on the local stack all possible register
560 * arguments before interposing functions to resolve the called function.
561 * Possible arguments must be restored before invoking the resolved function.
562 *
563 * Before the AVX instruction set enhancements to AMD64 there were no changes in
564 * the set of registers and their sizes across different processors. With AVX,
565 * the xmm registers became the lower 128 bits of the ymm registers. Because of
566 * this, we need to conditionally save 256 bits instead of 128 bits. Regardless
567 * of whether we have ymm registers or not, we're always going to push the stack
568 * space assuming that we do to simplify the code.
569 *
570 * Local stack space storage for elf_rtbndr is allocated as follows:
571 *
572 *	Saved regs:
573 *	    %rax			 8
574 *	    %rdi			 8
575 *	    %rsi			 8
576 *	    %rdx			 8
577 *	    %rcx			 8
578 *	    %r8				 8
579 *	    %r9				 8
580 *	    %r10			 8
581 *				    =======
582 *			    Subtotal:   64 (32byte aligned)
583 *
584 *	Saved Media Regs (used to pass floating point args):
585 *	    %ymm0 - %ymm7   32 * 8     256
586 *				    =======
587 *			    Total:     320 (32byte aligned)
588 *
589 *  So - will subtract the following to create enough space
590 *
591 *	0(%rsp)		save %rax
592 *	8(%rsp)		save %rdi
593 *	16(%rsp)	save %rsi
594 *	24(%rsp)	save %rdx
595 *	32(%rsp)	save %rcx
596 *	40(%rsp)	save %r8
597 *	48(%rsp)	save %r9
598 *	56(%rsp)	save %r10
599 *	64(%rsp)	save %ymm0
600 *	96(%rsp)	save %ymm1
601 *	128(%rsp)	save %ymm2
602 *	160(%rsp)	save %ymm3
603 *	192(%rsp)	save %ymm4
604 *	224(%rsp)	save %ymm5
605 *	256(%rsp)	save %ymm6
606 *	288(%rsp)	save %ymm7
607 *
608 * Note: Some callers may use 8-byte stack alignment instead of the
609 * ABI required 16-byte alignment.  We use %rsp offsets to save/restore
610 * registers because %rbp may not be 16-byte aligned.  We guarantee %rsp
611 * is 32-byte aligned in the function preamble.
612 */
613/*
614 * As the registers may either be xmm or ymm, we've left the name as xmm, but
615 * increased the offset between them to always cover the xmm and ymm cases.
616 */
617#define	LS_SIZE	$320	/* local stack space to save all possible arguments */
618#define	LSRAXOFF	0	/* for SSE register count */
619#define	LSRDIOFF	8	/* arg 0 ... */
620#define	LSRSIOFF	16
621#define	LSRDXOFF	24
622#define	LSRCXOFF	32
623#define	LSR8OFF		40
624#define	LSR9OFF		48
625#define	LSR10OFF	56	/* ... arg 5 */
626#define	LSXMM0OFF	64	/* SSE arg 0 ... */
627#define	LSXMM1OFF	96
628#define	LSXMM2OFF	128
629#define	LSXMM3OFF	160
630#define	LSXMM4OFF	192
631#define	LSXMM5OFF	224
632#define	LSXMM6OFF	256
633#define	LSXMM7OFF	288	/* ... SSE arg 7 */
634
635	/*
636	 * The org_scapset is a global variable that is a part of rtld. It
637	 * contains the capabilities that the kernel has told us are supported
638	 * (auxv_hwcap). This is necessary for determining whether or not we
639	 * need to save and restore AVX registers or simple SSE registers. Note,
640	 * that the field we care about is currently at offset 0, if that
641	 * changes, this code will have to be updated.
642	 */
643	.extern org_scapset
644	.weak	_elf_rtbndr
645	_elf_rtbndr = elf_rtbndr
646
647	ENTRY(elf_rtbndr)
648
649	pushq	%rbp
650	movq	%rsp, %rbp
651
652	/*
653	 * Some libraries may (incorrectly) use non-ABI compliant 8-byte stack
654	 * alignment.  Round %rsp down to a 32-byte boundary here, so that
655	 * the save area below is 32-byte aligned whatever the caller did;
656	 * LS_SIZE and the LSXMM*OFF offsets are all multiples of 32.
657	 */
658	andq	$-32, %rsp	/* round %rsp down to a 32-byte boundary */
659
660	subq	LS_SIZE, %rsp	/* room to save all ABI defined argument registers */
661
662	movq	%rax, LSRAXOFF(%rsp)	/* for SSE register count */
663	movq	%rdi, LSRDIOFF(%rsp)	/*  arg 0 .. */
664	movq	%rsi, LSRSIOFF(%rsp)
665	movq	%rdx, LSRDXOFF(%rsp)
666	movq	%rcx, LSRCXOFF(%rsp)
667	movq	%r8, LSR8OFF(%rsp)
668	movq	%r9, LSR9OFF(%rsp)	/* .. arg 5 */
669	movq	%r10, LSR10OFF(%rsp)	/* call chain reg */
670
671	/*
672	 * Our xmm registers could secretly be ymm registers in disguise.
673	 */
674	movq	org_scapset@GOTPCREL(%rip),%r9	/* %r9/%edx are scratch: saved above */
675	movq	(%r9),%r9
676	movl	(%r9),%edx		/* capability word at offset 0 */
677	testl	$AV_386_AVX,%edx
678	jne	.save_ymm
679
680.save_xmm:
681	movdqa	%xmm0, LSXMM0OFF(%rsp)	/* SSE arg 0 ... */
682	movdqa	%xmm1, LSXMM1OFF(%rsp)
683	movdqa	%xmm2, LSXMM2OFF(%rsp)
684	movdqa	%xmm3, LSXMM3OFF(%rsp)
685	movdqa	%xmm4, LSXMM4OFF(%rsp)
686	movdqa	%xmm5, LSXMM5OFF(%rsp)
687	movdqa	%xmm6, LSXMM6OFF(%rsp)
688	movdqa	%xmm7, LSXMM7OFF(%rsp)	/* ... SSE arg 7 */
689	jmp	.save_finish
690
691.save_ymm:
692	vmovdqa	%ymm0, LSXMM0OFF(%rsp)	/* AVX arg 0 ... */
693	vmovdqa	%ymm1, LSXMM1OFF(%rsp)
694	vmovdqa	%ymm2, LSXMM2OFF(%rsp)
695	vmovdqa	%ymm3, LSXMM3OFF(%rsp)
696	vmovdqa	%ymm4, LSXMM4OFF(%rsp)
697	vmovdqa	%ymm5, LSXMM5OFF(%rsp)
698	vmovdqa	%ymm6, LSXMM6OFF(%rsp)
699	vmovdqa	%ymm7, LSXMM7OFF(%rsp)	/* ... AVX arg 7 */
700
701.save_finish:
702	movq	LBPLMPOFF(%rbp), %rdi	/* arg1 - *lmp */
703	movq	LBPRELOCOFF(%rbp), %rsi	/* arg2 - reloc index */
704	movq	LBRPCOFF(%rbp), %rdx	/* arg3 - pc of caller */
705	call	elf_bndr@PLT		/* call elf_bndr(lmp, relndx, pc) */
706	movq	%rax, LBPRELOCOFF(%rbp)	/* store final destination */
707
708	/*
709	 * Restore possible arguments before invoking resolved function. We
710	 * check the xmm vs. ymm regs first so we can use the others.
711	 */
712	movq	org_scapset@GOTPCREL(%rip),%r9	/* %r9/%edx are reloaded below */
713	movq	(%r9),%r9
714	movl	(%r9),%edx
715	testl	$AV_386_AVX,%edx
716	jne	.restore_ymm
717
718.restore_xmm:
719	movdqa	LSXMM0OFF(%rsp), %xmm0
720	movdqa	LSXMM1OFF(%rsp), %xmm1
721	movdqa	LSXMM2OFF(%rsp), %xmm2
722	movdqa	LSXMM3OFF(%rsp), %xmm3
723	movdqa	LSXMM4OFF(%rsp), %xmm4
724	movdqa	LSXMM5OFF(%rsp), %xmm5
725	movdqa	LSXMM6OFF(%rsp), %xmm6
726	movdqa	LSXMM7OFF(%rsp), %xmm7
727	jmp .restore_finish
728
729.restore_ymm:
730	vmovdqa	LSXMM0OFF(%rsp), %ymm0
731	vmovdqa	LSXMM1OFF(%rsp), %ymm1
732	vmovdqa	LSXMM2OFF(%rsp), %ymm2
733	vmovdqa	LSXMM3OFF(%rsp), %ymm3
734	vmovdqa	LSXMM4OFF(%rsp), %ymm4
735	vmovdqa	LSXMM5OFF(%rsp), %ymm5
736	vmovdqa	LSXMM6OFF(%rsp), %ymm6
737	vmovdqa	LSXMM7OFF(%rsp), %ymm7
738
739.restore_finish:
740	movq	LSRAXOFF(%rsp), %rax
741	movq	LSRDIOFF(%rsp), %rdi
742	movq	LSRSIOFF(%rsp), %rsi
743	movq	LSRDXOFF(%rsp), %rdx
744	movq	LSRCXOFF(%rsp), %rcx
745	movq	LSR8OFF(%rsp), %r8
746	movq	LSR9OFF(%rsp), %r9
747	movq	LSR10OFF(%rsp), %r10
748
749	movq	%rbp, %rsp
750	popq	%rbp
751
752	addq	$8, %rsp	/* pop 1st plt-pushed args */
753				/* the second argument slot now holds */
754				/* the 'return' address to our */
755				/* final destination */
756
757	ret			/* invoke resolved function */
758	.size 	elf_rtbndr, .-elf_rtbndr
759#endif
760