xref: /freebsd/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 6966ac055c3b7a39266fb982493330df7a097997)
//  z_Linux_asm.S:  - microtasking routines specifically
//                    written for Intel platforms running Linux* OS

//
////===----------------------------------------------------------------------===//
////
//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
//// See https://llvm.org/LICENSE.txt for license information.
//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
////
////===----------------------------------------------------------------------===//
//

// -----------------------------------------------------------------------
// macros
// -----------------------------------------------------------------------
17
#include "kmp_config.h"

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

# if KMP_MIC
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
//.macro pause_op
//	mov    $100, %rax
//	delay  %rax
//.endm
# else
// Raw encoding of the 'pause' instruction (F3 90); emitted as bytes so the
// same source assembles even with tools that predate the mnemonic.
#  define pause_op   .byte 0xf3,0x90
# endif // KMP_MIC

# if KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label
// CFI directives are not used on Darwin; define the macros as no-ops so the
// function bodies below can use them unconditionally.
.macro KMP_CFI_DEF_OFFSET
.endmacro
.macro KMP_CFI_OFFSET
.endmacro
.macro KMP_CFI_REGISTER
.endmacro
.macro KMP_CFI_DEF
.endmacro
.macro ALIGN
	.align $0
.endmacro
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
// on OS X*)
# if KMP_MIC
#  define KMP_LABEL(x) L_##x          // local label
# else
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
# endif // KMP_MIC
.macro ALIGN size
	.align 1<<(\size)
.endm
// DEBUG_INFO proc: close the CFI region opened by PROC and emit ELF
// .type/.size metadata so debuggers and profilers see proper function bounds.
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	.align 16
	.type  \proc,@function
	.size  \proc,.-\proc
.endm
// PROC proc: align, export the (optionally underscore-prefixed) symbol,
// define its label, and open a CFI region.
.macro PROC proc
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
.macro KMP_CFI_DEF_OFFSET sz
	.cfi_def_cfa_offset	\sz
.endm
.macro KMP_CFI_OFFSET reg, sz
	.cfi_offset	\reg,\sz
.endm
.macro KMP_CFI_REGISTER reg
	.cfi_def_cfa_register	\reg
.endm
.macro KMP_CFI_DEF reg, sz
	.cfi_def_cfa	\reg,\sz
.endm
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64

#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64

# if KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label

.macro ALIGN
	.align $0
.endmacro

.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro

.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces

// ALIGN size: align to 2^size bytes (matches the x86 flavor above).
.macro ALIGN size
	.align 1<<(\size)
.endm

// DEBUG_INFO proc: close CFI and emit ELF .type/.size for the function.
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	ALIGN 2
	.type  \proc,@function
	.size  \proc,.-\proc
.endm

// PROC proc: align to 4 bytes (AArch64 instruction size), export the
// symbol, define its label, and open a CFI region.
.macro PROC proc
	ALIGN 2
	.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
# endif // KMP_OS_DARWIN

#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64

// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//
// __kmp_unnamed_critical_addr holds the address of the 32-byte
// .gomp_critical_user_ common block; C code dereferences the pointer
// since it cannot name a symbol containing a dot.

# if KMP_ARCH_X86
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
	ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_          // 32-bit pointer
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
	ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_          // 64-bit pointer
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */

#if KMP_ARCH_X86 && !KMP_ARCH_PPC64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------

	.ident "Intel Corporation"
	.data
	ALIGN 4

// void
// __kmp_x86_pause( void );
//
// Spin-wait hint: executes a single 'pause' (emitted via pause_op) and
// returns. No arguments, no return value, clobbers nothing.
        .text
	PROC  __kmp_x86_pause

        pause_op
        ret

	DEBUG_INFO __kmp_x86_pause

# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add: *p += d, returns the OLD value of *p in %eax.
        PROC      __kmp_test_then_add32

        movl      4(%esp), %ecx    // "p"
        movl      8(%esp), %eax    // "d"
        lock
        xaddl     %eax,(%ecx)      // atomically: old *p -> %eax, *p += d
        ret

	DEBUG_INFO __kmp_test_then_add32

//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%al   (old value of *p)
        PROC  __kmp_xchg_fixed8

        movl      4(%esp), %ecx    // "p"
        movb      8(%esp), %al     // "d"

        lock                       // lock prefix is redundant with xchg-to-memory
        xchgb     %al,(%ecx)       // but kept for clarity/history
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
// return:     %ax   (old value of *p)
        PROC  __kmp_xchg_fixed16

        movl      4(%esp), %ecx    // "p"
        movw      8(%esp), %ax     // "d"

        lock
        xchgw     %ax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax  (old value of *p)
        PROC  __kmp_xchg_fixed32

        movl      4(%esp), %ecx    // "p"
        movl      8(%esp), %eax    // "d"

        lock
        xchgl     %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed32

// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomic CAS: if (*p == cv) *p = sv. Returns 1 on success, 0 on failure.
        PROC  __kmp_compare_and_store8

        movl      4(%esp), %ecx
        movb      8(%esp), %al
        movb      12(%esp), %dl
        lock
        cmpxchgb  %dl,(%ecx)
        sete      %al           // %al = 1 if the exchange happened, else 0
        and       $1, %eax      // clear upper bits of %eax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store8

// kmp_int16
// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
        PROC  __kmp_compare_and_store16

        movl      4(%esp), %ecx
        movw      8(%esp), %ax
        movw      12(%esp), %dx
        lock
        cmpxchgw  %dx,(%ecx)
        sete      %al           // %al = 1 if the exchange happened, else 0
        and       $1, %eax      // clear upper bits of %eax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store16

// kmp_int32
// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
        PROC  __kmp_compare_and_store32

        movl      4(%esp), %ecx
        movl      8(%esp), %eax
        movl      12(%esp), %edx
        lock
        cmpxchgl  %edx,(%ecx)
        sete      %al          // %al = 1 if the exchange happened, else 0
        and       $1, %eax     // clear upper bits of %eax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store32

// kmp_int32
// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
//
// 64-bit CAS on IA-32 via cmpxchg8b: compares %edx:%eax with *p and, if
// equal, stores %ecx:%ebx. %ebx/%edi are callee-saved, hence the pushes.
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi
        movl      8(%ebp), %edi
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word
        lock
        cmpxchg8b (%edi)
        sete      %al      // %al = 1 if %edx:%eax == *(%edi) (exchange done), else 0
        and       $1, %eax // clear upper bits of %eax (sete writes only %al)
        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64

// kmp_int8
// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
//
// Same CAS as above, but returns the value that was in *p (cmpxchg leaves
// it in %al) rather than a success flag.
        PROC  __kmp_compare_and_store_ret8

        movl      4(%esp), %ecx
        movb      8(%esp), %al
        movb      12(%esp), %dl
        lock
        cmpxchgb  %dl,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8

// kmp_int16
// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
//                               kmp_int16 sv);
        PROC  __kmp_compare_and_store_ret16

        movl      4(%esp), %ecx
        movw      8(%esp), %ax
        movw      12(%esp), %dx
        lock
        cmpxchgw  %dx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

// kmp_int32
// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
//                               kmp_int32 sv);
        PROC  __kmp_compare_and_store_ret32

        movl      4(%esp), %ecx
        movl      8(%esp), %eax
        movl      12(%esp), %edx
        lock
        cmpxchgl  %edx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32

// kmp_int64
// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
//                               kmp_int64 sv);
//
// 64-bit CAS returning the old value in %edx:%eax (cdecl 64-bit return).
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi
        movl      8(%ebp), %edi
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word
        lock
        cmpxchg8b (%edi)
        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters (relative to %ebp after the prologue):
// 	addr:	8(%ebp)
// 	data:	12(%ebp)
//
// return:	old *addr in st(0) (x87 return register for float)
        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp        // one 4-byte local at -4(%ebp)
        pushl   %esi            // %esi is callee-saved

        // BUGFIX: the original read 4(%ebp)/8(%ebp), but after 'pushl %ebp'
        // those hold the return address and the FIRST argument; cdecl args
        // live at 8(%ebp) and 12(%ebp). (This path is compiled only when
        // !KMP_ASM_INTRINS, so the defect went unnoticed.)
        movl    8(%ebp), %esi   // "addr"
        flds    (%esi)
                        // load <addr>
        fsts    -4(%ebp)
                        // store old value into the local

        movl    12(%ebp), %eax  // "data"

        lock
        xchgl   %eax, (%esi)    // atomically store new value

        flds    -4(%ebp)
                        // return old value in st(0)

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32

# endif /* !KMP_ASM_INTRINS */

//------------------------------------------------------------------------
// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//    (*pkfn)( & gtid, & tid, argv[0], ... );
//    return 1;
// }
//
// Aligns the stack to a 128-byte boundary at the call, pushes the argc
// entries of p_argv[] in reverse order, then &tid and &gtid, and calls pkfn.

// -- Begin __kmp_invoke_microtask
// mark_begin;
	PROC  __kmp_invoke_microtask

	pushl %ebp
	KMP_CFI_DEF_OFFSET 8
	KMP_CFI_OFFSET ebp,-8
	movl %esp,%ebp		// establish the base pointer for this routine.
	KMP_CFI_REGISTER ebp
	subl $8,%esp		// allocate space for two local variables.
				// These variables are:
				//	argv: -4(%ebp)
				//	temp: -8(%ebp)
				//
	pushl %ebx		// save %ebx (callee-saved) to use during this routine
				//
#if OMPT_SUPPORT
	movl 28(%ebp),%ebx	// get exit_frame address
	movl %ebp,(%ebx)	// save exit_frame
#endif

	movl 20(%ebp),%ebx	// Stack alignment - # args
	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
	movl %esp,%eax		//
	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
	movl %eax,%ebx		// Save to %ebx
	andl $0xFFFFFF80,%eax	// mask off 7 bits (round down to 128-byte boundary)
	subl %eax,%ebx		// Amount to subtract from %esp
	subl %ebx,%esp		// Prepare the stack ptr --
				//   now it will be aligned on 128-byte boundary at the call

	movl 24(%ebp),%eax	// copy from p_argv[]
	movl %eax,-4(%ebp)	// into the local variable *argv.

	movl 20(%ebp),%ebx	// argc is 20(%ebp)
	shll $2,%ebx		// %ebx = argc*4 = byte offset past last arg

KMP_LABEL(invoke_2):
	cmpl $0,%ebx
	jg  KMP_LABEL(invoke_4)
	jmp KMP_LABEL(invoke_3)
	ALIGN 2
KMP_LABEL(invoke_4):
	movl -4(%ebp),%eax
	subl $4,%ebx			// decrement argc.
	addl %ebx,%eax			// index into argv.
	movl (%eax),%edx
	pushl %edx			// push p_argv[i] (reverse order)

	jmp KMP_LABEL(invoke_2)
	ALIGN 2
KMP_LABEL(invoke_3):
	leal 16(%ebp),%eax		// push & tid
	pushl %eax

	leal 12(%ebp),%eax		// push & gtid
	pushl %eax

	movl 8(%ebp),%ebx
	call *%ebx			// call (*pkfn)();

	movl $1,%eax			// return 1;

	movl -12(%ebp),%ebx		// restore %ebx (saved just below the two locals)
	leave
	KMP_CFI_DEF esp,4
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask


// kmp_uint64
// __kmp_hardware_timestamp(void)
	PROC  __kmp_hardware_timestamp
	rdtsc				// TSC in %edx:%eax -- already the cdecl 64-bit return pair
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp

#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */

#if KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture and
// Intel(R) 64 running Linux* OS
// -----------------------------------------------------------------------

// -- Machine type P
// mark_description "Intel Corporation";
	.ident "Intel Corporation"
// --	.file "z_Linux_asm.S"
	.data
	ALIGN 4

// To prevent getting our code into .data section .text added to every routine
// definition for x86_64.
//------------------------------------------------------------------------
# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add: *p += d, returns the OLD value of *p.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax
        .text
        PROC  __kmp_test_then_add32

        movl      %esi, %eax	// "d"
        lock
        xaddl     %eax,(%rdi)	// atomically: old *p -> %eax, *p += d
        ret

        DEBUG_INFO __kmp_test_then_add32


//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
//	return:	%rax  (old value of *p)
        .text
        PROC  __kmp_test_then_add64

        movq      %rsi, %rax	// "d"
        lock
        xaddq     %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add64


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
// 	p:	%rdi
// 	d:	%sil
//
// return:	%al   (old value of *p)
        .text
        PROC  __kmp_xchg_fixed8

        movb      %sil, %al	// "d"

        lock
        xchgb     %al,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
// 	p:	%rdi
// 	d:	%si
// return:     %ax   (old value of *p)
        .text
        PROC  __kmp_xchg_fixed16

        movw      %si, %ax	// "d"

        lock
        xchgw     %ax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax  (old value of *p)
        .text
        PROC  __kmp_xchg_fixed32

        movl      %esi, %eax	// "d"

        lock
        xchgl     %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
// return:	%rax  (old value of *p)
        .text
        PROC  __kmp_xchg_fixed64

        movq      %rsi, %rax	// "d"

        lock
        xchgq     %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed64

//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomic CAS: if (*p == cv) *p = sv. Returns 1 on success, 0 on failure.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al	// "cv"
        lock
        cmpxchgb  %dl,(%rdi)
        sete      %al           // %al = 1 if the exchange happened, else 0
        andq      $1, %rax      // clear upper bits of %rax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store8


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax	// "cv"
        lock
        cmpxchgw  %dx,(%rdi)
        sete      %al           // %al = 1 if the exchange happened, else 0
        andq      $1, %rax      // clear upper bits of %rax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store16


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax	// "cv"
        lock
        cmpxchgl  %edx,(%rdi)
        sete      %al           // %al = 1 if the exchange happened, else 0
        andq      $1, %rax      // clear upper bits of %rax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store32


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//	return:	%eax
        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax    // "cv"
        lock
        cmpxchgq  %rdx,(%rdi)
        sete      %al           // %al = 1 if the exchange happened, else 0
        andq      $1, %rax      // clear upper bits of %rax (sete writes only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store64

//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Same CAS as __kmp_compare_and_store8, but returns the value that was in
// *p (cmpxchg leaves it in %al) rather than a success flag.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al	// "cv"
        lock
        cmpxchgb  %dl,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax	// "cv"
        lock
        cmpxchgw  %dx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax	// "cv"
        lock
        cmpxchgl  %edx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32


//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//	return:	%rax  (old value of *p)
        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax    // "cv"
        lock
        cmpxchgq  %rdx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

# endif /* !KMP_ASM_INTRINS */

# if !KMP_MIC

# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
// 	addr:	%rdi
// 	data:	%xmm0 (lower 4 bytes)
//
// return:	%xmm0 (lower 4 bytes) -- old value of *addr
        .text
        PROC  __kmp_xchg_real32

	movd	%xmm0, %eax	// load "data" to eax

        lock
        xchgl   %eax, (%rdi)	// atomically swap; old *addr -> %eax

	movd	%eax, %xmm0	// load old value into return register

        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//      return: %xmm0 (lower 8 bytes) -- old value of *addr
        .text
        PROC  __kmp_xchg_real64

	movd	%xmm0, %rax	// load "data" to rax

        lock
	xchgq	%rax, (%rdi)	// atomically swap; old *addr -> %rax

	movd	%rax, %xmm0	// load old value into return register
        ret

        DEBUG_INFO __kmp_xchg_real64


// NOTE: the two #endif comments below were previously swapped; the inner
// conditional is !KMP_ASM_INTRINS and the outer one is !KMP_MIC.
# endif /* !KMP_ASM_INTRINS */

# endif /* !KMP_MIC */

//------------------------------------------------------------------------
// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//		           int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//    (*pkfn)( & gtid, & tid, argv[0], ... );
//    return 1;
// }
//
// note: at call to pkfn must have %rsp 128-byte aligned for compiler
//
// parameters:
//      %rdi:  	pkfn
//	%esi:	gtid
//	%edx:	tid
//	%ecx:	argc
//	%r8:	p_argv
//	%r9:	&exit_frame
//
// locals:
//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
//
// reg temps:
//	%rax:	used all over the place
//	%rdx:	used in stack pointer alignment calculation
//	%r11:	used to traverse p_argv array
//	%rsi:	used as temporary for stack parameters
//		used as temporary for number of pkfn parms to push
//	%rbx:	used to hold pkfn address, and zero constant, callee-save
//
// return:	%eax 	(always 1/TRUE)
__gtid = -16
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
	PROC  __kmp_invoke_microtask

	pushq 	%rbp		// save base pointer
	KMP_CFI_DEF_OFFSET 16
	KMP_CFI_OFFSET rbp,-16
	movq 	%rsp,%rbp	// establish the base pointer for this routine.
	KMP_CFI_REGISTER rbp

#if OMPT_SUPPORT
	movq	%rbp, (%r9)	// save exit_frame
#endif

	pushq 	%rbx		// %rbx is callee-saved register
	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn

	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
	movq	$0, %rbx	// constant for cmovs later
	subq	$4, %rax	// subtract four args passed in registers to pkfn
#if KMP_MIC
	js	KMP_LABEL(kmp_0)	// jump to movq
	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
KMP_LABEL(kmp_0):
	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
KMP_LABEL(kmp_0_exit):
#else
	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
#endif // KMP_MIC

	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8

	movq 	%rsp, %rdx	//
	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
				// without align, stack ptr would be this
	movq 	%rdx, %rax	// Save to %rax

	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
	subq 	%rax, %rdx	// Amount to subtract from %rsp
	subq 	%rdx, %rsp	// Prepare the stack ptr --
				// now %rsp will align to 128-byte boundary at call site

				// setup pkfn parameter reg and stack
	movq	%rcx, %rax	// argc -> %rax
	cmpq	$0, %rsi
	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
	shlq	$3, %rcx	// argc*8 -> %rcx
	movq 	%r8, %rdx	// p_argv -> %rdx
	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx

	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx

KMP_LABEL(kmp_invoke_push_parms):
	// push nth - 7th parms to pkfn on stack
	subq	$8, %rdx	// decrement p_argv pointer to previous parm
	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
	subl	$1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
//		if the name of the label that is an operand of this jecxz starts with a dot (".");
//	   Apple's linker does not support 1-byte length relocation;
//         Resolution: replace all .labelX entries with L_labelX.

	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
	jmp	KMP_LABEL(kmp_invoke_push_parms)
	ALIGN 3
KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
				// order here is important to avoid trashing
				// registers used for both input and output parms!
	movq	%rdi, %rbx	// pkfn -> %rbx
	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)

	movq	%r8, %r11	// p_argv -> %r11

#if KMP_MIC
	cmpq	$4, %rax	// argc >= 4?
	jns	KMP_LABEL(kmp_4)	// jump to movq
	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
KMP_LABEL(kmp_4):
	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
KMP_LABEL(kmp_4_exit):

	cmpq	$3, %rax	// argc >= 3?
	jns	KMP_LABEL(kmp_3)	// jump to movq
	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
KMP_LABEL(kmp_3):
	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
KMP_LABEL(kmp_3_exit):

	cmpq	$2, %rax	// argc >= 2?
	jns	KMP_LABEL(kmp_2)	// jump to movq
	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
KMP_LABEL(kmp_2):
	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
KMP_LABEL(kmp_2_exit):

	cmpq	$1, %rax	// argc >= 1?
	jns	KMP_LABEL(kmp_1)	// jump to movq
	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
KMP_LABEL(kmp_1):
	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
KMP_LABEL(kmp_1_exit):
#else
	cmpq	$4, %rax	// argc >= 4?
	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)

	cmpq	$3, %rax	// argc >= 3?
	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)

	cmpq	$2, %rax	// argc >= 2?
	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)

	cmpq	$1, %rax	// argc >= 1?
	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // KMP_MIC

	call	*%rbx		// call (*pkfn)();
	movq	$1, %rax	// move 1 into return register;

	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
	movq 	%rbp, %rsp	// restore stack pointer
	popq 	%rbp		// restore frame pointer
	KMP_CFI_DEF rsp,8
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask

// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Read the CPU time-stamp counter and return it as a single 64-bit value
// in %rax (SysV AMD64 return register).  RDTSC delivers the counter split
// across %edx (high half) and %eax (low half), so the two halves are
// merged below.
        .text
	PROC  __kmp_hardware_timestamp
	rdtsc			// %edx:%eax = current TSC value
	shlq    $32, %rdx	// move the high half into bits 63:32
	orq     %rdx, %rax	// %rax = (high << 32) | low
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
1174
1175//------------------------------------------------------------------------
1176// FUNCTION __kmp_bsr32
1177//
1178// int
1179// __kmp_bsr32( int );
1180        .text
1181        PROC  __kmp_bsr32
1182
1183        bsr    %edi,%eax
1184        ret
1185
1186        DEBUG_INFO __kmp_bsr32
1187
1188// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1190
1191// '
1192#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
1193
1194//------------------------------------------------------------------------
1195//
1196// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
1197//
1198// int
1199// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1200//		           int gtid, int tid,
1201//                         int argc, void *p_argv[] ) {
1202//    (*pkfn)( & gtid, & tid, argv[0], ... );
1203//    return 1;
1204// }
1205//
1206// parameters:
1207//	x0:	pkfn
1208//	w1:	gtid
1209//	w2:	tid
1210//	w3:	argc
1211//	x4:	p_argv
1212//	x5:	&exit_frame
1213//
1214// locals:
1215//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1216//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1217//
1218// reg temps:
1219//	 x8:	used to hold pkfn address
1220//	 w9:	used as temporary for number of pkfn parms
1221//	x10:	used to traverse p_argv array
1222//	x11:	used as temporary for stack placement calculation
1223//	x12:	used as temporary for stack parameters
1224//	x19:	used to preserve exit_frame_ptr, callee-save
1225//
1226// return:	w0	(always 1/TRUE)
1227//
1228
__gtid = 4			// byte offset of the spilled gtid below x29
__tid = 8			// byte offset of the spilled tid below x29

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	PROC __kmp_invoke_microtask

	// Standard AAPCS64 frame: save fp/lr; with OMPT also save x19/x20,
	// since callee-saved x19 keeps &exit_frame live across the call.
	stp	x29, x30, [sp, #-16]!
# if OMPT_SUPPORT
	stp	x19, x20, [sp, #-16]!
# endif
	mov	x29, sp

	// Reserve 16*(1 + argc/2) bytes: room for the gtid/tid spill slots
	// plus any stack-passed microtask parameters, while keeping sp
	// 16-byte aligned as AAPCS64 requires.
	orr	w9, wzr, #1		// w9 = 1
	add	w9, w9, w3, lsr #1	// w9 = 1 + argc/2
	sub	sp, sp, w9, uxtw #4	// sp -= w9 * 16
	mov	x11, sp			// x11 = base of out-going stack parms

	mov	x8, x0			// x8 = pkfn (call target)
	str	w1, [x29, #-__gtid]	// spill gtid so its address can be passed
	str	w2, [x29, #-__tid]	// spill tid so its address can be passed
	mov	w9, w3			// w9 = count of p_argv entries left
	mov	x10, x4			// x10 = p_argv traversal cursor
# if OMPT_SUPPORT
	mov	x19, x5			// preserve &exit_frame across the call
	str	x29, [x19]		// *exit_frame = current frame pointer
# endif

	// 1st and 2nd pkfn parameters: addresses of the spilled gtid/tid.
	sub	x0, x29, #__gtid
	sub	x1, x29, #__tid

	// Load up to six p_argv[] entries into x2..x7, stopping as soon as
	// the remaining count hits zero.  The [x10, #8]! forms pre-increment
	// the cursor before loading.
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x2, [x10]		// p_argv[0] -> x2 (3rd parm)

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x3, [x10, #8]!		// p_argv[1] -> x3 (4th parm)

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x4, [x10, #8]!		// p_argv[2] -> x4 (5th parm)

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x5, [x10, #8]!		// p_argv[3] -> x5 (6th parm)

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x6, [x10, #8]!		// p_argv[4] -> x6 (7th parm)

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x7, [x10, #8]!		// p_argv[5] -> x7 (8th parm)

// Copy any remaining parameters into the stack-parameter area through x12.
KMP_LABEL(kmp_0):
	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_1)
	ldr	x12, [x10, #8]!		// next p_argv[] entry
	str	x12, [x11], #8		// post-increment store into out-arg area
	b	KMP_LABEL(kmp_0)
KMP_LABEL(kmp_1):
	blr	x8			// (*pkfn)(&gtid, &tid, argv[0], ...)
	orr	w0, wzr, #1		// return value: always 1 (TRUE)
	mov	sp, x29			// discard the dynamic stack area
# if OMPT_SUPPORT
	str	xzr, [x19]		// *exit_frame = 0 on the way out
	ldp	x19, x20, [sp], #16
# endif
	ldp	x29, x30, [sp], #16
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
1303
1304#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
1305
1306#if KMP_ARCH_PPC64
1307
1308//------------------------------------------------------------------------
1309//
1310// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
1311//
1312// int
1313// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1314//		           int gtid, int tid,
1315//                         int argc, void *p_argv[] ) {
1316//    (*pkfn)( & gtid, & tid, argv[0], ... );
1317//    return 1;
1318// }
1319//
1320// parameters:
1321//	r3:	pkfn
1322//	r4:	gtid
1323//	r5:	tid
1324//	r6:	argc
1325//	r7:	p_argv
1326//	r8:	&exit_frame
1327//
1328// return:	r3	(always 1/TRUE)
1329//
	.text
# if KMP_ARCH_PPC64_ELFv2
	.abiversion 2
# endif
	.globl	__kmp_invoke_microtask

# if KMP_ARCH_PPC64_ELFv2
	.p2align	4
# else
	.p2align	2
# endif

	.type	__kmp_invoke_microtask,@function

// ELFv2 uses a dual-entry-point scheme: the global entry point computes the
// TOC pointer in r2 from r12; .localentry marks the local entry that skips
// that setup.  ELFv1 instead publishes a 3-doubleword function descriptor
// (entry, TOC, environment) in the .opd section.
# if KMP_ARCH_PPC64_ELFv2
__kmp_invoke_microtask:
.Lfunc_begin0:
.Lfunc_gep0:
	addis 2, 12, .TOC.-.Lfunc_gep0@ha
	addi 2, 2, .TOC.-.Lfunc_gep0@l
.Lfunc_lep0:
	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
# else
	.section	.opd,"aw",@progbits
__kmp_invoke_microtask:
	.p2align	3
	.quad	.Lfunc_begin0
	.quad	.TOC.@tocbase
	.quad	0
	.text
.Lfunc_begin0:
# endif

// -- Begin __kmp_invoke_microtask
// mark_begin;

// We need to allocate a stack frame large enough to hold all of the parameters
// on the stack for the microtask plus what this function needs. That's 48
// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
// to save r30 to hold a copy of r8.

	.cfi_startproc
	mflr 0
	std 31, -8(1)
	std 0, 16(1)

// This is unusual because normally we'd set r31 equal to r1 after the stack
// frame is established. In this case, however, we need to dynamically compute
// the stack frame size, and so we keep a direct copy of r1 to access our
// register save areas and restore the r1 value before returning.
	mr 31, 1
	.cfi_def_cfa_register r31
	.cfi_offset r31, -8
	.cfi_offset lr, 16

// Compute the size necessary for the local stack frame.
// r12 = -(fixed overhead + 8*argc), before alignment.
# if KMP_ARCH_PPC64_ELFv2
	li 12, 72
# else
	li 12, 88
# endif
	sldi 0, 6, 3
	add 12, 0, 12
	neg 12, 12

// We need to make sure that the stack frame stays aligned (to 16 bytes, except
// under the BG/Q CNK, where it must be to 32 bytes).
# if KMP_OS_CNK
	li 0, -32
# else
	li 0, -16
# endif
	and 12, 0, 12

// Establish the local stack frame.
// stdux stores the back-chain pointer and decrements r1 in one step.
	stdux 1, 1, 12

# if OMPT_SUPPORT
	.cfi_offset r30, -16
	std 30, -16(31)
	std 1, 0(8)
	mr 30, 8
# endif

// Store gtid and tid to the stack because they're passed by reference to the microtask.
	stw 4, -20(31)
	stw 5, -24(31)

// r12 = argc (compare counter), r4 = p_argv cursor.
	mr 12, 6
	mr 4, 7

// Load up to six p_argv[] entries into r5..r10; r3/r4 are reserved for
// &gtid and &tid.  Fall through to .Lcall as soon as argc is exhausted.
	cmpwi 0, 12, 1
	blt	 0, .Lcall

	ld 5, 0(4)

	cmpwi 0, 12, 2
	blt	 0, .Lcall

	ld 6, 8(4)

	cmpwi 0, 12, 3
	blt	 0, .Lcall

	ld 7, 16(4)

	cmpwi 0, 12, 4
	blt	 0, .Lcall

	ld 8, 24(4)

	cmpwi 0, 12, 5
	blt	 0, .Lcall

	ld 9, 32(4)

	cmpwi 0, 12, 6
	blt	 0, .Lcall

	ld 10, 40(4)

	cmpwi 0, 12, 7
	blt	 0, .Lcall

// There are more than 6 microtask parameters, so we need to store the
// remainder to the stack.
	addi 12, 12, -6
	mtctr 12

// These are set to 8 bytes before the first desired store address (we're using
// pre-increment loads and stores in the loop below). The parameter save area
// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
// 32 + 8*8 == 96 bytes above r1 for ELFv2.
	addi 4, 4, 40
# if KMP_ARCH_PPC64_ELFv2
	addi 12, 1, 88
# else
	addi 12, 1, 104
# endif

// Copy the remaining argc-6 parameters into the parameter save area;
// the count was loaded into CTR above.
.Lnext:
	ldu 0, 8(4)
	stdu 0, 8(12)
	bdnz .Lnext

.Lcall:
// Save our TOC pointer in its frame slot, then load the call target into r12.
# if KMP_ARCH_PPC64_ELFv2
	std 2, 24(1)
	mr 12, 3
#else
	std 2, 40(1)
// For ELFv1, we need to load the actual function address from the function descriptor.
	ld 12, 0(3)
	ld 2, 8(3)
	ld 11, 16(3)
#endif

// 1st and 2nd microtask parameters: addresses of the spilled gtid/tid.
	addi 3, 31, -20
	addi 4, 31, -24

	mtctr 12
	bctrl
// Restore our TOC pointer, which the callee may have replaced.
# if KMP_ARCH_PPC64_ELFv2
	ld 2, 24(1)
# else
	ld 2, 40(1)
# endif

# if OMPT_SUPPORT
// Clear *exit_frame (r30 holds the saved r8 = &exit_frame) on the way out.
	li 3, 0
	std 3, 0(30)
# endif

// Return value: always 1 (TRUE).
	li 3, 1

# if OMPT_SUPPORT
	ld 30, -16(31)
# endif

// Tear down the frame: restore r1 from r31, then lr and r31.
	mr 1, 31
	ld 0, 16(1)
	ld 31, -8(1)
	mtlr 0
	blr

	.long	0
	.quad	0
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
	.cfi_endproc
1522
1523// -- End  __kmp_invoke_microtask
1524
1525#endif /* KMP_ARCH_PPC64 */
1526
#if KMP_ARCH_ARM || KMP_ARCH_MIPS
// 32-byte common lock area named .gomp_critical_user_ (name matches the
// symbol libgomp uses; presumably shared for unnamed critical sections —
// confirm against the GOMP compatibility layer), plus a 4-byte global
// holding its address for these 32-bit-pointer targets.
    .data
    .comm .gomp_critical_user_,32,8
    .data
    .align 4
    .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
    .4byte .gomp_critical_user_
    .size __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
1537
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
// Same as the 32-bit variant above this #if block in the original file:
// a 32-byte common lock area plus a global holding its address, but with
// an 8-byte pointer for these 64-bit targets.
    .data
    .comm .gomp_critical_user_,32,8
    .data
    .align 8
    .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
    .8byte .gomp_critical_user_
    .size __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 */
1548
#if KMP_OS_LINUX
// Emit the .note.GNU-stack marker so the linker knows this object does not
// need an executable stack.  The ARM assembler treats '@' as its comment
// character, so the section flags must be spelled '%progbits' there.
# if KMP_ARCH_ARM
.section .note.GNU-stack,"",%progbits
# else
.section .note.GNU-stack,"",@progbits
# endif
#endif
1556