xref: /freebsd/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 77a1348b3c1cfe8547be49a121b56299a1e18b69)
1//  z_Linux_asm.S:  - microtasking routines specifically
2//                    written for Intel platforms running Linux* OS
3
4//
5////===----------------------------------------------------------------------===//
6////
7//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8//// See https://llvm.org/LICENSE.txt for license information.
9//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10////
11////===----------------------------------------------------------------------===//
12//
13
14// -----------------------------------------------------------------------
15// macros
16// -----------------------------------------------------------------------
17
18#include "kmp_config.h"
19
20#if KMP_ARCH_X86 || KMP_ARCH_X86_64
21
// pause_op: the spin-wait hint used by __kmp_x86_pause.
// NOTE(review): in the KMP_MIC branch below the pause_op macro is entirely
// commented out, so pause_op is left undefined there -- confirm that no
// KMP_MIC code path in this file expands it.
22# if KMP_MIC
23// On MIC the 'delay r16/r32/r64' instruction should be used instead of 'pause'.
24// The delay operation has the effect of removing the current thread from
25// the round-robin HT mechanism, and therefore speeds up the issue rate of
26// the other threads on the same core.
27//
28// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
29// barrier time to increase greatly for 3 or more threads per core.
30//
31// A value of 100 works pretty well for up to 4 threads per core, but isn't
32// quite as fast as 0 for 2 threads per core.
33//
34// We need to check what happens for oversubscription / > 4 threads per core.
35// It is possible that we need to pass the delay value in as a parameter
36// that the caller determines based on the total # threads / # cores.
37//
38//.macro pause_op
39//	mov    $100, %rax
40//	delay  %rax
41//.endm
42# else
// Emit the two bytes F3 90 directly (the encoding of the x86 'pause'
// instruction) instead of relying on the assembler knowing the mnemonic.
43#  define pause_op   .byte 0xf3,0x90
44# endif // KMP_MIC
45
// Helper macros shared by every x86 / x86_64 routine in this file:
//   KMP_PREFIX_UNDERSCORE(x)  spelling of the global symbol for x
//   KMP_LABEL(x)              spelling of a function-local label
//   ALIGN n                   alignment request
//   PROC name                 open a global function
//   DEBUG_INFO name           close a function opened with PROC
//   KMP_CFI_*                 wrappers around the .cfi_* unwind directives
46# if KMP_OS_DARWIN
47#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
48#  define KMP_LABEL(x) L_##x             // form the name of label
// On Darwin the four CFI wrappers expand to nothing, so no unwind
// annotations are emitted by the routines below.
49.macro KMP_CFI_DEF_OFFSET
50.endmacro
51.macro KMP_CFI_OFFSET
52.endmacro
53.macro KMP_CFI_REGISTER
54.endmacro
55.macro KMP_CFI_DEF
56.endmacro
// ALIGN n: the argument ($0) is handed to .align unchanged.
// NOTE(review): presumably the Darwin assembler reads it as a power-of-two
// exponent, matching the 1<<n form used in the #else branch -- confirm.
57.macro ALIGN
58	.align $0
59.endmacro
// DEBUG_INFO: intentionally empty on Darwin.
60.macro DEBUG_INFO
61/* Not sure what .size does in icc, not sure if we need to do something
62   similar for OS X*.
63*/
64.endmacro
// PROC name: align, export the underscore-prefixed symbol and define it here.
65.macro PROC
66	ALIGN  4
67	.globl KMP_PREFIX_UNDERSCORE($0)
68KMP_PREFIX_UNDERSCORE($0):
69.endmacro
70# else // KMP_OS_DARWIN
71#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
72// Format labels so that they don't override function names in gdb's backtraces
73// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
74// on OS X*)
75# if KMP_MIC
76#  define KMP_LABEL(x) L_##x          // local label
77# else
78#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
79# endif // KMP_MIC
// ALIGN n: n is a power-of-two exponent; the macro requests .align (1 << n),
// so "ALIGN 4" asks for .align 16.
80.macro ALIGN size
81	.align 1<<(\size)
82.endm
// DEBUG_INFO name: close the CFI region opened by PROC and record the symbol
// type and size.
// NOTE(review): .size is evaluated after the .align 16, so any padding that
// directive emits is counted in the symbol size.
83.macro DEBUG_INFO proc
84	.cfi_endproc
85// Not sure why we need .type and .size for the functions
86	.align 16
87	.type  \proc,@function
88        .size  \proc,.-\proc
89.endm
// PROC name: align (ALIGN 4), export the symbol, define it here and open a
// CFI region. Every PROC must be paired with DEBUG_INFO, which closes it.
90.macro PROC proc
91	ALIGN  4
92        .globl KMP_PREFIX_UNDERSCORE(\proc)
93KMP_PREFIX_UNDERSCORE(\proc):
94	.cfi_startproc
95.endm
// Thin one-to-one wrappers over the .cfi_* directives.
96.macro KMP_CFI_DEF_OFFSET sz
97	.cfi_def_cfa_offset	\sz
98.endm
99.macro KMP_CFI_OFFSET reg, sz
100	.cfi_offset	\reg,\sz
101.endm
102.macro KMP_CFI_REGISTER reg
103	.cfi_def_cfa_register	\reg
104.endm
105.macro KMP_CFI_DEF reg, sz
106	.cfi_def_cfa	\reg,\sz
107.endm
108# endif // KMP_OS_DARWIN
109#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
110
// AArch64 flavour of the helper macros (KMP_PREFIX_UNDERSCORE, KMP_LABEL,
// ALIGN, DEBUG_INFO, PROC). No KMP_CFI_* wrappers are defined in this section.
111#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
112
113# if KMP_OS_DARWIN
114#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
115#  define KMP_LABEL(x) L_##x             // form the name of label
116
// ALIGN n: the argument ($0) is handed to .align unchanged.
117.macro ALIGN
118	.align $0
119.endmacro
120
// DEBUG_INFO: intentionally empty on Darwin.
121.macro DEBUG_INFO
122/* Not sure what .size does in icc, not sure if we need to do something
123   similar for OS X*.
124*/
125.endmacro
126
// PROC name: align, export the underscore-prefixed symbol and define it here.
127.macro PROC
128	ALIGN  4
129	.globl KMP_PREFIX_UNDERSCORE($0)
130KMP_PREFIX_UNDERSCORE($0):
131.endmacro
132# else // KMP_OS_DARWIN
133#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
134// Format labels so that they don't override function names in gdb's backtraces
135#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
136
// ALIGN n: expands to .align (1 << n).
// NOTE(review): how the AArch64 assembler interprets the .align operand
// (byte count vs. exponent) is not visible here -- confirm the resulting
// alignment is the intended one.
137.macro ALIGN size
138	.align 1<<(\size)
139.endm
140
// DEBUG_INFO name: close the CFI region opened by PROC and record the symbol
// type and size. .size is evaluated after the ALIGN, so it includes any
// padding that ALIGN emits.
141.macro DEBUG_INFO proc
142	.cfi_endproc
143// Not sure why we need .type and .size for the functions
144	ALIGN 2
145	.type  \proc,@function
146	.size  \proc,.-\proc
147.endm
148
// PROC name: align, export the symbol, define it here and open a CFI region.
149.macro PROC proc
150	ALIGN 2
151	.globl KMP_PREFIX_UNDERSCORE(\proc)
152KMP_PREFIX_UNDERSCORE(\proc):
153	.cfi_startproc
154.endm
155# endif // KMP_OS_DARWIN
156
157#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
158
159// -----------------------------------------------------------------------
160// data
161// -----------------------------------------------------------------------
162
163#ifdef KMP_GOMP_COMPAT
164
165// Support for unnamed common blocks.
166//
167// Because the symbol ".gomp_critical_user_" contains a ".", we have to
168// put this stuff in assembly.
169
// IA-32: define the common symbol ".gomp_critical_user_" (32 bytes) and a
// pointer-sized data object, __kmp_unnamed_critical_addr, initialised with
// its address.
170# if KMP_ARCH_X86
171#  if KMP_OS_DARWIN
172        .data
173        .comm .gomp_critical_user_,32
174        .data
// Darwin spelling: the extra leading underscore is written out by hand.
175        .globl ___kmp_unnamed_critical_addr
176___kmp_unnamed_critical_addr:
177        .long .gomp_critical_user_
178#  else /* Linux* OS */
179        .data
// Third .comm operand (8) is the requested alignment of the common block.
180        .comm .gomp_critical_user_,32,8
181        .data
// ALIGN takes an exponent here: "ALIGN 4" expands to .align 1<<(4) = .align 16.
182	ALIGN 4
183        .global __kmp_unnamed_critical_addr
184__kmp_unnamed_critical_addr:
// 4-byte (32-bit pointer) slot holding &.gomp_critical_user_
185        .4byte .gomp_critical_user_
186        .type __kmp_unnamed_critical_addr,@object
187        .size __kmp_unnamed_critical_addr,4
188#  endif /* KMP_OS_DARWIN */
189# endif /* KMP_ARCH_X86 */
190
// x86-64: define the common symbol ".gomp_critical_user_" (32 bytes) and a
// pointer-sized data object, __kmp_unnamed_critical_addr, initialised with
// its address.
191# if KMP_ARCH_X86_64
192#  if KMP_OS_DARWIN
193        .data
194        .comm .gomp_critical_user_,32
195        .data
// Darwin spelling: the extra leading underscore is written out by hand.
196        .globl ___kmp_unnamed_critical_addr
197___kmp_unnamed_critical_addr:
198        .quad .gomp_critical_user_
199#  else /* Linux* OS */
200        .data
// Third .comm operand (8) is the requested alignment of the common block.
201        .comm .gomp_critical_user_,32,8
202        .data
// NOTE(review): the ALIGN macro for this branch expands to
// .align 1<<(\size), so "ALIGN 8" requests .align 256 for an 8-byte object.
// If 8-byte alignment was intended this should be "ALIGN 3" -- confirm.
203	ALIGN 8
204        .global __kmp_unnamed_critical_addr
205__kmp_unnamed_critical_addr:
// 8-byte (64-bit pointer) slot holding &.gomp_critical_user_
206        .8byte .gomp_critical_user_
207        .type __kmp_unnamed_critical_addr,@object
208        .size __kmp_unnamed_critical_addr,8
209#  endif /* KMP_OS_DARWIN */
210# endif /* KMP_ARCH_X86_64 */
211
212#endif /* KMP_GOMP_COMPAT */
213
214
215#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
216
217// -----------------------------------------------------------------------
218// microtasking routines specifically written for IA-32 architecture
219// running Linux* OS
220// -----------------------------------------------------------------------
221
222	.ident "Intel Corporation"
// The section is switched to .data for the ALIGN and back to .text just
// before the first routine.
223	.data
224	ALIGN 4
//------------------------------------------------------------------------
// FUNCTION __kmp_x86_pause
//
225// void
226// __kmp_x86_pause( void );
//
// Emits the pause_op sequence and returns. Takes no arguments, sets no
// return register and touches neither the stack nor memory.
227
228        .text
229	PROC  __kmp_x86_pause
230
231        pause_op
232        ret
233
234	DEBUG_INFO __kmp_x86_pause
235
236# if !KMP_ASM_INTRINS
237
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add: adds "d" to *p under a lock prefix and hands back
// the value *p held before the addition.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax
        PROC      __kmp_test_then_add32

        movl      8(%esp), %eax         // "d"
        movl      4(%esp), %ecx         // "p"
        lock xaddl %eax, (%ecx)         // %eax <- old *p ; *p <- old *p + d
        ret

        DEBUG_INFO __kmp_test_then_add32
251
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int8
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores "d" into *p and returns the byte that was there before.
// Only %al carries the result; the rest of %eax is left untouched.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%al
        PROC  __kmp_xchg_fixed8

        movb      8(%esp), %al          // "d"
        movl      4(%esp), %ecx         // "p"
        lock xchgb %al, (%ecx)          // swap: %al <- old *p ; *p <- d
        ret

        DEBUG_INFO __kmp_xchg_fixed8
273
274
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores "d" into *p and returns the 16-bit value that was there
// before. Only %ax carries the result.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%ax
        PROC  __kmp_xchg_fixed16

        movw      8(%esp), %ax          // "d"
        movl      4(%esp), %ecx         // "p"
        lock xchgw %ax, (%ecx)          // swap: %ax <- old *p ; *p <- d
        ret

        DEBUG_INFO __kmp_xchg_fixed16
295
296
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores "d" into *p and returns the 32-bit value that was there
// before.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax
        PROC  __kmp_xchg_fixed32

        movl      8(%esp), %eax         // "d"
        movl      4(%esp), %ecx         // "p"
        lock xchgl %eax, (%ecx)         // swap: %eax <- old *p ; *p <- d
        ret

        DEBUG_INFO __kmp_xchg_fixed32
318
319
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomic compare-and-swap on a byte: if *p == cv then *p = sv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store8

        movb      12(%esp), %dl         // "sv"
        movb      8(%esp), %al          // "cv" (cmpxchg compares against %al)
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgb %dl, (%ecx)       // ZF set when %al matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store8
334
// kmp_int16
// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
//
// Atomic compare-and-swap on a 16-bit word: if *p == cv then *p = sv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store16

        movw      12(%esp), %dx         // "sv"
        movw      8(%esp), %ax          // "cv" (cmpxchg compares against %ax)
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgw %dx, (%ecx)       // ZF set when %ax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store16
349
// kmp_int32
// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
//
// Atomic compare-and-swap on a 32-bit word: if *p == cv then *p = sv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store32

        movl      12(%esp), %edx        // "sv"
        movl      8(%esp), %eax         // "cv" (cmpxchg compares against %eax)
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgl %edx, (%ecx)      // ZF set when %eax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag to 32 bits
        ret

        DEBUG_INFO __kmp_compare_and_store32
364
// kmp_int32
// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
//
// Atomic compare-and-swap on a 64-bit value using cmpxchg8b, which compares
// %edx:%eax with the memory operand and, on a match, stores %ecx:%ebx.
// %ebx and %edi are saved and restored around the operation.
//
// parameters (relative to %ebp once the frame is set up):
// 	p:	8(%ebp)
// 	cv:	12(%ebp) low word, 16(%ebp) high word
// 	sv:	20(%ebp) low word, 24(%ebp) high word
//
// return:	%eax = 1 if the store happened, 0 otherwise
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        movl      24(%ebp), %ecx        // "sv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      12(%ebp), %eax        // "cv" low order word
        movl      8(%ebp), %edi         // "p"
        lock cmpxchg8b (%edi)           // ZF set when %edx:%eax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag to 32 bits

        popl      %edi
        popl      %ebx
        leave                           // movl %ebp,%esp ; popl %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64
389
// kmp_int8
// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
//
// Atomic compare-and-swap on a byte that returns the value observed in *p:
// cmpxchg leaves %al equal to "cv" on success and loads the current *p into
// %al on failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%al
        PROC  __kmp_compare_and_store_ret8

        movb      12(%esp), %dl         // "sv"
        movb      8(%esp), %al          // "cv"
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgb %dl, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
402
// kmp_int16
// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
//                               kmp_int16 sv);
//
// Atomic compare-and-swap on a 16-bit word that returns the value observed
// in *p: %ax keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%ax
        PROC  __kmp_compare_and_store_ret16

        movw      12(%esp), %dx         // "sv"
        movw      8(%esp), %ax          // "cv"
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgw %dx, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
416
// kmp_int32
// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
//                               kmp_int32 sv);
//
// Atomic compare-and-swap on a 32-bit word that returns the value observed
// in *p: %eax keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax
        PROC  __kmp_compare_and_store_ret32

        movl      12(%esp), %edx        // "sv"
        movl      8(%esp), %eax         // "cv"
        movl      4(%esp), %ecx         // "p"
        lock cmpxchgl %edx, (%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
430
// kmp_int64
// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
//                               kmp_int64 sv);
//
// Atomic 64-bit compare-and-swap via cmpxchg8b that returns the value
// observed in *p: %edx:%eax keeps "cv" on success and receives the current
// *p on failure. %ebx and %edi are saved and restored around the operation.
//
// parameters (relative to %ebp once the frame is set up):
// 	p:	8(%ebp)
// 	cv:	12(%ebp) low word, 16(%ebp) high word
// 	sv:	20(%ebp) low word, 24(%ebp) high word
//
// return:	%edx:%eax
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        movl      24(%ebp), %ecx        // "sv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      12(%ebp), %eax        // "cv" low order word
        movl      8(%ebp), %edi         // "p"
        lock cmpxchg8b (%edi)

        popl      %edi
        popl      %ebx
        leave                           // movl %ebp,%esp ; popl %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
454
455
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores "data" into *addr and returns the value that was there
// before. The exchange is done on the raw 32-bit pattern with xchgl; the
// old pattern is then loaded onto the x87 stack as the float return value.
//
// parameters (at entry / after the frame is set up):
// 	addr:	4(%esp)  /  8(%ebp)
// 	data:	8(%esp)  / 12(%ebp)
//
// locals:
// 	-4(%ebp): scratch slot used to move the old bits from %eax to st(0)
//
// return:	st(0)
        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp
                        // reserve the scratch slot at -4(%ebp)
        pushl   %esi
                        // %esi is callee-saved

        // With the frame in place, 4(%ebp) is the return address; the
        // arguments start at 8(%ebp).
        movl    8(%ebp), %esi
                        // load <addr>
        movl    12(%ebp), %eax
                        // load raw bits of <data>

        lock
        xchgl   %eax, (%esi)
                        // %eax <- old bits of *addr ; *addr <- data

        // Return exactly the value the exchange replaced, and push it only
        // once so st(0) is the sole x87 register in use at return.
        movl    %eax, -4(%ebp)
        flds    -4(%ebp)
                        // return old value

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32
494
495# endif /* !KMP_ASM_INTRINS */
496
497//------------------------------------------------------------------------
498// int
499// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
500//                         int gtid, int tid,
501//                         int argc, void *p_argv[]
502// #if OMPT_SUPPORT
503//                         ,
504//                         void **exit_frame_ptr
505// #endif
506//                       ) {
507// #if OMPT_SUPPORT
508//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
509// #endif
510//
511//   (*pkfn)( & gtid, & tid, argv[0], ... );
512//   return 1;
513// }
//
// IA-32 version. Calls pkfn with the addresses of this routine's own gtid
// and tid argument slots followed by the argc entries of p_argv[], all
// passed on the stack, then returns 1.
//
// frame layout once %ebp is established:
//	 8(%ebp)	pkfn
//	12(%ebp)	gtid
//	16(%ebp)	tid
//	20(%ebp)	argc
//	24(%ebp)	p_argv
//	28(%ebp)	exit_frame_ptr (read only when OMPT_SUPPORT)
//	-4(%ebp)	local copy of p_argv
//	-8(%ebp)	second local ("temp"; not referenced by the code below)
//	-12(%ebp)	saved %ebx
//
// return:	%eax (always 1)
514
515// -- Begin __kmp_invoke_microtask
516// mark_begin;
517	PROC  __kmp_invoke_microtask
518
519	pushl %ebp
520	KMP_CFI_DEF_OFFSET 8
521	KMP_CFI_OFFSET ebp,-8
522	movl %esp,%ebp		// establish the base pointer for this routine.
523	KMP_CFI_REGISTER ebp
524	subl $8,%esp		// allocate space for two local variables.
525				// These variables are:
526				//	argv: -4(%ebp)
527				//	temp: -8(%ebp)
528				//
529	pushl %ebx		// save %ebx to use during this routine
530				//
531#if OMPT_SUPPORT
532	movl 28(%ebp),%ebx	// get exit_frame address
533	movl %ebp,(%ebx)	// save exit_frame
534#endif
535
// Pre-adjust %esp so that, after the (argc+2) 4-byte pushes below, it lands
// on a 128-byte boundary: compute where %esp would end up, take the low
// 7 bits of that address, and drop %esp by that amount now.
536	movl 20(%ebp),%ebx	// Stack alignment - # args
537	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
538	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
539	movl %esp,%eax		//
540	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
541	movl %eax,%ebx		// Save to %ebx
542	andl $0xFFFFFF80,%eax	// mask off 7 bits
543	subl %eax,%ebx		// Amount to subtract from %esp
544	subl %ebx,%esp		// Prepare the stack ptr --
545				//   now it will be aligned on 128-byte boundary at the call
546
547	movl 24(%ebp),%eax	// copy from p_argv[]
548	movl %eax,-4(%ebp)	// into the local variable *argv.
549
// %ebx is the byte offset one past the last p_argv entry (argc*4); the loop
// walks it down to 0, pushing p_argv[argc-1] first and p_argv[0] last.
550	movl 20(%ebp),%ebx	// argc is 20(%ebp)
551	shll $2,%ebx
552
553KMP_LABEL(invoke_2):
554	cmpl $0,%ebx
555	jg  KMP_LABEL(invoke_4)	// entries left: push the next one
556	jmp KMP_LABEL(invoke_3)	// none left: go push &tid / &gtid
557	ALIGN 2
558KMP_LABEL(invoke_4):
559	movl -4(%ebp),%eax
560	subl $4,%ebx			// step the byte offset back one entry.
561	addl %ebx,%eax			// index into argv.
562	movl (%eax),%edx
563	pushl %edx
564
565	jmp KMP_LABEL(invoke_2)
566	ALIGN 2
567KMP_LABEL(invoke_3):
// pkfn receives pointers to the gtid/tid argument slots of this frame.
568	leal 16(%ebp),%eax		// push & tid
569	pushl %eax
570
571	leal 12(%ebp),%eax		// push & gtid
572	pushl %eax
573
574	movl 8(%ebp),%ebx
575	call *%ebx			// call (*pkfn)();
576
577	movl $1,%eax			// return 1;
578
// %esp was adjusted by a run-time amount, so %ebx is reloaded through %ebp
// and "leave" discards everything below the frame pointer in one step.
579	movl -12(%ebp),%ebx		// restore %ebx
580	leave
581	KMP_CFI_DEF esp,4
582	ret
583
584	DEBUG_INFO __kmp_invoke_microtask
585// -- End  __kmp_invoke_microtask
586
587
588// kmp_uint64
589// __kmp_hardware_timestamp(void)
//
// Executes rdtsc and returns immediately; rdtsc leaves the 64-bit counter
// in %edx:%eax (high:low). No arguments, no stack or memory access.
//
// return:	%edx:%eax
590	PROC  __kmp_hardware_timestamp
591	rdtsc
592	ret
593
594	DEBUG_INFO __kmp_hardware_timestamp
595// -- End  __kmp_hardware_timestamp
596
597#endif /* KMP_ARCH_X86 */
598
599
600#if KMP_ARCH_X86_64
601
602// -----------------------------------------------------------------------
603// microtasking routines specifically written for IA-32 architecture and
604// Intel(R) 64 running Linux* OS
605// -----------------------------------------------------------------------
606
607// -- Machine type P
608// mark_description "Intel Corporation";
609	.ident "Intel Corporation"
610// --	.file "z_Linux_asm.S"
// The section is left as .data after these directives.
611	.data
612	ALIGN 4
613
614// Because the current section is .data at this point, every x86_64 routine
615// definition below is preceded by its own .text so the code does not land in .data.
616//------------------------------------------------------------------------
617# if !KMP_ASM_INTRINS
618
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add: adds "d" to *p under a lock prefix and returns the
// value *p held before the addition. The exchange is done directly in the
// incoming argument register, which is then copied to the return register.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax
        .text
        PROC  __kmp_test_then_add32

        lock xaddl %esi, (%rdi)         // %esi <- old *p ; *p <- old *p + d
        movl      %esi, %eax            // previous value is the result
        ret

        DEBUG_INFO __kmp_test_then_add32
639
640
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomic 64-bit fetch-and-add: adds "d" to *p under a lock prefix and
// returns the value *p held before the addition.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
//
// return:	%rax
        .text
        PROC  __kmp_test_then_add64

        lock xaddq %rsi, (%rdi)         // %rsi <- old *p ; *p <- old *p + d
        movq      %rsi, %rax            // previous value is the result
        ret

        DEBUG_INFO __kmp_test_then_add64
660
661
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int8
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores "d" into *p and returns the byte that was there before.
// Only %al carries the result; the remaining bits of %rax are not written.
//
// parameters:
// 	p:	%rdi
// 	d:	%sil
//
// return:	%al
        .text
        PROC  __kmp_xchg_fixed8

        lock xchgb %sil, (%rdi)         // %sil <- old *p ; *p <- d
        movb      %sil, %al             // previous value is the result
        ret

        DEBUG_INFO __kmp_xchg_fixed8
683
684
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores "d" into *p and returns the 16-bit value that was there
// before. Only %ax carries the result.
//
// parameters:
// 	p:	%rdi
// 	d:	%si
//
// return:	%ax
        .text
        PROC  __kmp_xchg_fixed16

        lock xchgw %si, (%rdi)          // %si <- old *p ; *p <- d
        movw      %si, %ax              // previous value is the result
        ret

        DEBUG_INFO __kmp_xchg_fixed16
705
706
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores "d" into *p and returns the 32-bit value that was there
// before.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax
        .text
        PROC  __kmp_xchg_fixed32

        lock xchgl %esi, (%rdi)         // %esi <- old *p ; *p <- d
        movl      %esi, %eax            // previous value is the result
        ret

        DEBUG_INFO __kmp_xchg_fixed32
728
729
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically stores "d" into *p and returns the 64-bit value that was there
// before.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
//
// return:	%rax
        .text
        PROC  __kmp_xchg_fixed64

        lock xchgq %rsi, (%rdi)         // %rsi <- old *p ; *p <- d
        movq      %rsi, %rax            // previous value is the result
        ret

        DEBUG_INFO __kmp_xchg_fixed64
750
751
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomic compare-and-swap on a byte: if *p == cv then *p = sv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%sil
// 	sv:	%dl
//
// return:	%rax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al             // "cv" (cmpxchg compares against %al)
        lock cmpxchgb %dl, (%rdi)       // ZF set when %al matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag through %rax
        ret

        DEBUG_INFO __kmp_compare_and_store8
775
776
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomic compare-and-swap on a 16-bit word: if *p == cv then *p = sv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
// 	sv:	%dx
//
// return:	%rax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax              // "cv" (cmpxchg compares against %ax)
        lock cmpxchgw %dx, (%rdi)       // ZF set when %ax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag through %rax
        ret

        DEBUG_INFO __kmp_compare_and_store16
800
801
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomic compare-and-swap on a 32-bit word: if *p == cv then *p = sv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
// 	sv:	%edx
//
// return:	%rax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax            // "cv" (cmpxchg compares against %eax)
        lock cmpxchgl %edx, (%rdi)      // ZF set when %eax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag through %rax
        ret

        DEBUG_INFO __kmp_compare_and_store32
825
826
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomic compare-and-swap on a 64-bit word: if *p == cv then *p = sv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
// 	sv:	%rdx
//
// return:	%rax = 1 if the store happened, 0 otherwise
        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax            // "cv" (cmpxchg compares against %rax)
        lock cmpxchgq %rdx, (%rdi)      // ZF set when %rax matched *p
        sete      %al                   // %al = ZF
        movzbl    %al, %eax             // zero-extend the flag through %rax
        ret

        DEBUG_INFO __kmp_compare_and_store64
849
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomic compare-and-swap on a byte that returns the value observed in *p:
// %al keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	%rdi
// 	cv:	%sil
// 	sv:	%dl
//
// return:	%al
        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al             // "cv"
        lock cmpxchgb %dl, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
871
872
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomic compare-and-swap on a 16-bit word that returns the value observed
// in *p: %ax keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
// 	sv:	%dx
//
// return:	%ax
        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax              // "cv"
        lock cmpxchgw %dx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
894
895
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomic compare-and-swap on a 32-bit word that returns the value observed
// in *p: %eax keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
// 	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax            // "cv"
        lock cmpxchgl %edx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
917
918
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomic compare-and-swap on a 64-bit word that returns the value observed
// in *p: %rax keeps "cv" on success and receives the current *p on failure.
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
// 	sv:	%rdx
//
// return:	%rax
        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax            // "cv"
        lock cmpxchgq %rdx, (%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
939
940# endif /* !KMP_ASM_INTRINS */
941
942
943# if !KMP_MIC
944
945# if !KMP_ASM_INTRINS
946
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores "data" into *addr and returns the previous contents.
// The float is moved through %eax so the swap can be done on its raw
// 32-bit pattern with an integer xchg.
//
// parameters:
// 	addr:	%rdi
// 	data:	%xmm0 (lower 4 bytes)
//
// return:	%xmm0 (lower 4 bytes)
        .text
        PROC  __kmp_xchg_real32

        movd      %xmm0, %eax           // raw bits of "data"
        lock xchgl %eax, (%rdi)         // %eax <- old bits ; *addr <- data
        movd      %eax, %xmm0           // old value into the return register
        ret

        DEBUG_INFO __kmp_xchg_real32
971
972
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically stores "data" into *addr and returns the previous contents.
// The double is moved through %rax so the swap can be done on its raw
// 64-bit pattern with an integer xchg.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//
// return:      %xmm0 (lower 8 bytes)
        .text
        PROC  __kmp_xchg_real64

        movd      %xmm0, %rax           // raw bits of "data"
        lock xchgq %rax, (%rdi)         // %rax <- old bits ; *addr <- data
        movd      %rax, %xmm0           // old value into the return register
        ret

        DEBUG_INFO __kmp_xchg_real64
995
996
997# endif /* !KMP_ASM_INTRINS */
998
999# endif /* !KMP_MIC */
1000
1001//------------------------------------------------------------------------
1002// int
1003// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1004//                         int gtid, int tid,
1005//                         int argc, void *p_argv[]
1006// #if OMPT_SUPPORT
1007//                         ,
1008//                         void **exit_frame_ptr
1009// #endif
1010//                       ) {
1011// #if OMPT_SUPPORT
1012//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1013// #endif
1014//
1015//   (*pkfn)( & gtid, & tid, argv[0], ... );
1016//   return 1;
1017// }
1018//
1019// note: at call to pkfn must have %rsp 128-byte aligned for compiler
1020//
1021// parameters:
1022//      %rdi:  	pkfn
1023//	%esi:	gtid
1024//	%edx:	tid
1025//	%ecx:	argc
1026//	%r8:	p_argv
1027//	%r9:	&exit_frame
1028//
1029// locals:
1030//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1031//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1032//
1033// reg temps:
1034//	%rax:	used all over the place
1035//	%rdx:	used in stack pointer alignment calculation
1036//	%r11:	used to traverse p_argv array
1037//	%rsi:	used as temporary for stack parameters
1038//		used as temporary for number of pkfn parms to push
1039//	%rbx:	used to hold pkfn address, and zero constant, callee-save
1040//
1041// return:	%eax 	(always 1/TRUE)
//
// frame layout once %rbp is established (three 8-byte pushes follow it):
//	 -8(%rbp)	saved %rbx
//	-16(%rbp)	gtid  (__gtid)
//	-24(%rbp)	tid   (__tid)
//
// pkfn is called with &gtid in %rdi, &tid in %rsi, p_argv[0..3] in
// %rdx, %rcx, %r8, %r9 (as many as argc provides) and p_argv[4..argc-1]
// on the stack.
//
// NOTE(review): argc is documented as arriving in %ecx, but the code below
// copies and shifts the full %rcx; this relies on the upper 32 bits of %rcx
// being zero on entry -- confirm against the callers.
1042__gtid = -16
1043__tid = -24
1044
1045// -- Begin __kmp_invoke_microtask
1046// mark_begin;
1047        .text
1048	PROC  __kmp_invoke_microtask
1049
1050	pushq 	%rbp		// save base pointer
1051	KMP_CFI_DEF_OFFSET 16
1052	KMP_CFI_OFFSET rbp,-16
1053	movq 	%rsp,%rbp	// establish the base pointer for this routine.
1054	KMP_CFI_REGISTER rbp
1055
1056#if OMPT_SUPPORT
1057	movq	%rbp, (%r9)	// save exit_frame
1058#endif
1059
1060	pushq 	%rbx		// %rbx is callee-saved register
1061	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
1062	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn
1063
1064	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
1065	movq	$0, %rbx	// constant for cmovs later
1066	subq	$4, %rax	// subtract four args passed in registers to pkfn
1067#if KMP_MIC
// Branch-based equivalent of the cmovsq in the #else branch.
1068	js	KMP_LABEL(kmp_0)	// jump to movq
1069	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
1070KMP_LABEL(kmp_0):
1071	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1072KMP_LABEL(kmp_0_exit):
1073#else
1074	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1075#endif // KMP_MIC
1076
1077	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
1078	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
1079
// Pre-adjust %rsp so that, after the max(0, argc-4) 8-byte pushes below, it
// lands on a 128-byte boundary: compute where %rsp would end up, take the
// low 7 bits of that address, and drop %rsp by that amount now.
1080	movq 	%rsp, %rdx	//
1081	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
1082				// without align, stack ptr would be this
1083	movq 	%rdx, %rax	// Save to %rax
1084
1085	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
1086	subq 	%rax, %rdx	// Amount to subtract from %rsp
1087	subq 	%rdx, %rsp	// Prepare the stack ptr --
1088				// now %rsp will align to 128-byte boundary at call site
1089
1090				// setup pkfn parameter reg and stack
1091	movq	%rcx, %rax	// argc -> %rax
1092	cmpq	$0, %rsi
1093	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
1094	shlq	$3, %rcx	// argc*8 -> %rcx
1095	movq 	%r8, %rdx	// p_argv -> %rdx
1096	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
1097
1098	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
1099
// Loop invariant: %rdx points one past the next entry to push and %rcx
// counts the entries still to push; p_argv[argc-1] is pushed first and
// p_argv[4] last.
1100KMP_LABEL(kmp_invoke_push_parms):
1101	// push nth - 7th parms to pkfn on stack
1102	subq	$8, %rdx	// decrement p_argv pointer to previous parm
1103	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
1104	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
1105	subl	$1, %ecx
1106
1107// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1108//		if the name of the label that is an operand of this jecxz starts with a dot (".");
1109//	   Apple's linker does not support 1-byte length relocation;
1110//         Resolution: replace all .labelX entries with L_labelX.
1111
1112	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
1113	jmp	KMP_LABEL(kmp_invoke_push_parms)
1114	ALIGN 3
1115KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
1116				// order here is important to avoid trashing
1117				// registers used for both input and output parms!
1118	movq	%rdi, %rbx	// pkfn -> %rbx
1119	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1120	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
1121
// p_argv is copied out of %r8 because %r8 itself is overwritten below
// (5th parm) before p_argv[1] and p_argv[0] are read.
1122	movq	%r8, %r11	// p_argv -> %r11
1123
// %rax still holds the original argc; each "cmp $k / not-sign" test below
// is true when argc - k >= 0.
1124#if KMP_MIC
1125	cmpq	$4, %rax	// argc >= 4?
1126	jns	KMP_LABEL(kmp_4)	// jump to movq
1127	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
1128KMP_LABEL(kmp_4):
1129	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1130KMP_LABEL(kmp_4_exit):
1131
1132	cmpq	$3, %rax	// argc >= 3?
1133	jns	KMP_LABEL(kmp_3)	// jump to movq
1134	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
1135KMP_LABEL(kmp_3):
1136	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1137KMP_LABEL(kmp_3_exit):
1138
1139	cmpq	$2, %rax	// argc >= 2?
1140	jns	KMP_LABEL(kmp_2)	// jump to movq
1141	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
1142KMP_LABEL(kmp_2):
1143	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1144KMP_LABEL(kmp_2_exit):
1145
1146	cmpq	$1, %rax	// argc >= 1?
1147	jns	KMP_LABEL(kmp_1)	// jump to movq
1148	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
1149KMP_LABEL(kmp_1):
1150	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1151KMP_LABEL(kmp_1_exit):
1152#else
// NOTE(review): a cmov with a memory source presumably still performs the
// load when the condition is false, so p_argv[0..3] would be read even for
// argc < 4 -- confirm that p_argv always points at readable memory of at
// least 32 bytes.
1153	cmpq	$4, %rax	// argc >= 4?
1154	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1155
1156	cmpq	$3, %rax	// argc >= 3?
1157	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1158
1159	cmpq	$2, %rax	// argc >= 2?
1160	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1161
1162	cmpq	$1, %rax	// argc >= 1?
1163	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1164#endif // KMP_MIC
1165
1166	call	*%rbx		// call (*pkfn)();
1167	movq	$1, %rax	// move 1 into return register;
1168
1169	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
1170	movq 	%rbp, %rsp	// restore stack pointer
1171	popq 	%rbp		// restore frame pointer
1172	KMP_CFI_DEF rsp,8
1173	ret
1174
1175	DEBUG_INFO __kmp_invoke_microtask
1176// -- End  __kmp_invoke_microtask
1177
1178// kmp_uint64
1179// __kmp_hardware_timestamp(void)
//
// Reads the processor time-stamp counter with rdtsc and returns it as a
// single 64-bit value in %rax. Takes no arguments. Besides %rax, the routine
// overwrites %rdx and the flags.
1180        .text
1181	PROC  __kmp_hardware_timestamp
1182	rdtsc			// counter -> %edx (high 32 bits) : %eax (low 32 bits)
1183	shlq    $32, %rdx	// move the high half into bits 63..32 of %rdx
1184	orq     %rdx, %rax	// %rax = (high << 32) | low
1185	ret
1186
1187	DEBUG_INFO __kmp_hardware_timestamp
1188// -- End  __kmp_hardware_timestamp
1189
1190//------------------------------------------------------------------------
1191// FUNCTION __kmp_bsr32
1192//
1193// int
1194// __kmp_bsr32( int );
//
// Returns in %eax the bit index of the most significant set bit of the
// 32-bit argument passed in %edi (bit scan reverse).
//
// NOTE(review): the bsr instruction leaves its destination undefined when the
// source operand is zero, and no zero check is made here; presumably callers
// never pass 0 -- confirm against the call sites.
1195        .text
1196        PROC  __kmp_bsr32
1197
1198        bsr    %edi,%eax	// %eax = index of the highest 1 bit in %edi
1199        ret
1200
1201        DEBUG_INFO __kmp_bsr32
1202
1203// -----------------------------------------------------------------------
1204#endif /* KMP_ARCH_X86_64 */
1205
1206// '
1207#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
1208
1209//------------------------------------------------------------------------
1210// int
1211// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1212//                         int gtid, int tid,
1213//                         int argc, void *p_argv[]
1214// #if OMPT_SUPPORT
1215//                         ,
1216//                         void **exit_frame_ptr
1217// #endif
1218//                       ) {
1219// #if OMPT_SUPPORT
1220//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1221// #endif
1222//
1223//   (*pkfn)( & gtid, & tid, argv[0], ... );
1224//
1225// // FIXME: This is done at call-site and can be removed here.
1226// #if OMPT_SUPPORT
1227//   *exit_frame_ptr = 0;
1228// #endif
1229//
1230//   return 1;
1231// }
1232//
1233// parameters:
1234//	x0:	pkfn
1235//	w1:	gtid
1236//	w2:	tid
1237//	w3:	argc
1238//	x4:	p_argv
1239//	x5:	&exit_frame
1240//
1241// locals:
1242//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1243//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1244//
1245// reg temps:
1246//	 x8:	used to hold pkfn address
1247//	 w9:	used as temporary for number of pkfn parms
1248//	x10:	used to traverse p_argv array
1249//	x11:	used as temporary for stack placement calculation
1250//	x12:	used as temporary for stack parameters
1251//	x19:	used to preserve exit_frame_ptr, callee-save
1252//
1253// return:	w0	(always 1/TRUE)
1254//
1255
1256__gtid = 4		// gtid local is kept at [x29 - 4]
1257__tid = 8		// tid local is kept at [x29 - 8]
1258
1259// -- Begin __kmp_invoke_microtask
1260// mark_begin;
//
// Flow of the routine:
//   1. push x29/x30 (and x19/x20 with OMPT_SUPPORT), then point x29 at sp;
//   2. reserve 16 * (1 + argc/2) bytes below x29 for the gtid/tid locals and
//      for the parms that do not fit in registers;
//   3. pass &gtid and &tid in x0/x1, p_argv[0..5] in x2..x7, and copy any
//      remaining p_argv entries to the reserved stack area;
//   4. call pkfn through x8, return 1 in w0 and unwind the frame.
1261	.text
1262	PROC __kmp_invoke_microtask
1263
1264	stp	x29, x30, [sp, #-16]!	// push frame pointer and link register
1265# if OMPT_SUPPORT
1266	stp	x19, x20, [sp, #-16]!	// push x19 (reused below) together with x20
1267# endif
1268	mov	x29, sp			// x29 = base for the locals and for restoring sp
1269
1270	orr	w9, wzr, #1		// w9 = 1
1271	add	w9, w9, w3, lsr #1	// w9 = 1 + argc/2
1272	sub	sp, sp, w9, uxtw #4	// reserve 16 * (1 + argc/2) bytes below x29
1273	mov	x11, sp			// x11 = next free outgoing stack slot
1274
1275	mov	x8, x0			// pkfn -> x8, because x0 is reused for &gtid
1276	str	w1, [x29, #-__gtid]	// spill gtid so that its address can be passed
1277	str	w2, [x29, #-__tid]	// spill tid so that its address can be passed
1278	mov	w9, w3			// w9 = argc, counts the parms still to place
1279	mov	x10, x4			// x10 = cursor into p_argv
1280# if OMPT_SUPPORT
1281	mov	x19, x5			// keep exit_frame_ptr in x19 across the call
1282	str	x29, [x19]		// *exit_frame_ptr = x29
1283# endif
1284
1285	sub	x0, x29, #__gtid	// 1st parm to pkfn: &gtid
1286	sub	x1, x29, #__tid		// 2nd parm to pkfn: &tid
1287
	// Each step below jumps to the call once no parms remain; otherwise it
	// loads the next p_argv entry into the next argument register.
1288	cbz	w9, KMP_LABEL(kmp_1)	// argc == 0: nothing more to pass
1289	ldr	x2, [x10]		// p_argv[0] -> x2 (3rd parm)
1290
1291	sub	w9, w9, #1
1292	cbz	w9, KMP_LABEL(kmp_1)
1293	ldr	x3, [x10, #8]!		// advance cursor; p_argv[1] -> x3 (4th parm)
1294
1295	sub	w9, w9, #1
1296	cbz	w9, KMP_LABEL(kmp_1)
1297	ldr	x4, [x10, #8]!		// p_argv[2] -> x4 (5th parm)
1298
1299	sub	w9, w9, #1
1300	cbz	w9, KMP_LABEL(kmp_1)
1301	ldr	x5, [x10, #8]!		// p_argv[3] -> x5 (6th parm)
1302
1303	sub	w9, w9, #1
1304	cbz	w9, KMP_LABEL(kmp_1)
1305	ldr	x6, [x10, #8]!		// p_argv[4] -> x6 (7th parm)
1306
1307	sub	w9, w9, #1
1308	cbz	w9, KMP_LABEL(kmp_1)
1309	ldr	x7, [x10, #8]!		// p_argv[5] -> x7 (8th parm)
1310
	// Argument registers are exhausted: copy p_argv[6] onwards to the
	// reserved stack area, lowest address first.
1311KMP_LABEL(kmp_0):
1312	sub	w9, w9, #1
1313	cbz	w9, KMP_LABEL(kmp_1)	// every parm has been placed
1314	ldr	x12, [x10, #8]!		// advance cursor and fetch the next parm
1315	str	x12, [x11], #8		// store it, then bump the stack slot pointer
1316	b	KMP_LABEL(kmp_0)
1317KMP_LABEL(kmp_1):
1318	blr	x8			// call (*pkfn)(&gtid, &tid, ...)
1319	orr	w0, wzr, #1		// return value: 1
1320	mov	sp, x29			// release the locals and the outgoing parms
1321# if OMPT_SUPPORT
1322	str	xzr, [x19]		// *exit_frame_ptr = 0
1323	ldp	x19, x20, [sp], #16	// restore x19/x20
1324# endif
1325	ldp	x29, x30, [sp], #16	// restore frame pointer and link register
1326	ret
1327
1328	DEBUG_INFO __kmp_invoke_microtask
1329// -- End  __kmp_invoke_microtask
1330
1331#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
1332
1333#if KMP_ARCH_PPC64
1334
1335//------------------------------------------------------------------------
1336// int
1337// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1338//                         int gtid, int tid,
1339//                         int argc, void *p_argv[]
1340// #if OMPT_SUPPORT
1341//                         ,
1342//                         void **exit_frame_ptr
1343// #endif
1344//                       ) {
1345// #if OMPT_SUPPORT
1346//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1347// #endif
1348//
1349//   (*pkfn)( & gtid, & tid, argv[0], ... );
1350//
1351// // FIXME: This is done at call-site and can be removed here.
1352// #if OMPT_SUPPORT
1353//   *exit_frame_ptr = 0;
1354// #endif
1355//
1356//   return 1;
1357// }
1358//
1359// parameters:
1360//	r3:	pkfn
1361//	r4:	gtid
1362//	r5:	tid
1363//	r6:	argc
1364//	r7:	p_argv
1365//	r8:	&exit_frame
1366//
1367// return:	r3	(always 1/TRUE)
1368//
1369	.text
1370# if KMP_ARCH_PPC64_ELFv2
1371	.abiversion 2
1372# endif
1373	.globl	__kmp_invoke_microtask
1374
1375# if KMP_ARCH_PPC64_ELFv2
1376	.p2align	4
1377# else
1378	.p2align	2
1379# endif
1380
1381	.type	__kmp_invoke_microtask,@function
1382
// ELFv2: the symbol labels the code itself. The first two instructions form
// r2 = r12 + (.TOC. - .Lfunc_gep0); r12 is expected to hold the address of
// .Lfunc_gep0 on entry (ELFv2 global entry convention). .localentry records
// the entry point that skips this setup.
// ELFv1: the symbol labels a three-doubleword function descriptor in .opd
// (code address, TOC base, 0) and the code starts at .Lfunc_begin0.
1383# if KMP_ARCH_PPC64_ELFv2
1384__kmp_invoke_microtask:
1385.Lfunc_begin0:
1386.Lfunc_gep0:
1387	addis 2, 12, .TOC.-.Lfunc_gep0@ha
1388	addi 2, 2, .TOC.-.Lfunc_gep0@l
1389.Lfunc_lep0:
1390	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1391# else
1392	.section	.opd,"aw",@progbits
1393__kmp_invoke_microtask:
1394	.p2align	3
1395	.quad	.Lfunc_begin0
1396	.quad	.TOC.@tocbase
1397	.quad	0
1398	.text
1399.Lfunc_begin0:
1400# endif
1401
1402// -- Begin __kmp_invoke_microtask
1403// mark_begin;
1404
1405// We need to allocate a stack frame large enough to hold all of the parameters
1406// on the stack for the microtask plus what this function needs. That's 48
1407// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1408// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1409// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1410// to save r30 to hold a copy of r8.
1411
1412	.cfi_startproc
1413	mflr 0			// r0 = return address
1414	std 31, -8(1)		// save r31 just below the incoming stack pointer
1415	std 0, 16(1)		// save the return address at 16(r1)
1416
1417// This is unusual because normally we'd set r31 equal to r1 after the stack
1418// frame is established. In this case, however, we need to dynamically compute
1419// the stack frame size, and so we keep a direct copy of r1 to access our
1420// register save areas and restore the r1 value before returning.
1421	mr 31, 1
1422	.cfi_def_cfa_register r31
1423	.cfi_offset r31, -8
1424	.cfi_offset lr, 16
1425
1426// Compute the size necessary for the local stack frame.
// r12 = -(fixed part + 8*argc); the fixed part is 72 bytes for ELFv2 and
// 88 bytes for ELFv1.
1427# if KMP_ARCH_PPC64_ELFv2
1428	li 12, 72
1429# else
1430	li 12, 88
1431# endif
1432	sldi 0, 6, 3		// r0 = argc * 8
1433	add 12, 0, 12		// r12 = fixed part + argc * 8
1434	neg 12, 12		// negate: the stack grows towards lower addresses
1435
1436// We need to make sure that the stack frame stays aligned (to 16 bytes, except
1437// under the BG/Q CNK, where it must be to 32 bytes).
// Masking the negative size rounds the frame size up to the alignment.
1438# if KMP_OS_CNK
1439	li 0, -32
1440# else
1441	li 0, -16
1442# endif
1443	and 12, 0, 12
1444
1445// Establish the local stack frame.
// stdux stores the old r1 at r1 + r12 and then sets r1 to that address.
1446	stdux 1, 1, 12
1447
1448# if OMPT_SUPPORT
1449	.cfi_offset r30, -16
1450	std 30, -16(31)		// save r30 so that it can carry exit_frame_ptr
1451	std 1, 0(8)		// *exit_frame_ptr = new stack pointer
1452	mr 30, 8		// r8 is reused for a parm below; keep a copy in r30
1453# endif
1454
1455// Store gtid and tid to the stack because they're passed by reference to the microtask.
1456	stw 4, -20(31)		// gtid
1457	stw 5, -24(31)		// tid
1458
1459	mr 12, 6		// r12 = argc
1460	mr 4, 7			// r4 = p_argv
1461
// Load up to six parms into r5..r10, branching to the call as soon as argc
// is smaller than the index about to be loaded.
1462	cmpwi 0, 12, 1
1463	blt	 0, .Lcall
1464
1465	ld 5, 0(4)		// p_argv[0] -> r5 (3rd parm)
1466
1467	cmpwi 0, 12, 2
1468	blt	 0, .Lcall
1469
1470	ld 6, 8(4)		// p_argv[1] -> r6 (4th parm)
1471
1472	cmpwi 0, 12, 3
1473	blt	 0, .Lcall
1474
1475	ld 7, 16(4)		// p_argv[2] -> r7 (5th parm)
1476
1477	cmpwi 0, 12, 4
1478	blt	 0, .Lcall
1479
1480	ld 8, 24(4)		// p_argv[3] -> r8 (6th parm)
1481
1482	cmpwi 0, 12, 5
1483	blt	 0, .Lcall
1484
1485	ld 9, 32(4)		// p_argv[4] -> r9 (7th parm)
1486
1487	cmpwi 0, 12, 6
1488	blt	 0, .Lcall
1489
1490	ld 10, 40(4)		// p_argv[5] -> r10 (8th parm)
1491
1492	cmpwi 0, 12, 7
1493	blt	 0, .Lcall
1494
1495// There are more than 6 microtask parameters, so we need to store the
1496// remainder to the stack.
1497	addi 12, 12, -6		// r12 = argc - 6 = number of parms to copy
1498	mtctr 12		// loop counter
1499
1500// These are set to 8 bytes before the first desired store address (we're using
1501// pre-increment loads and stores in the loop below). The parameter save area
1502// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1503// 32 + 8*8 == 96 bytes above r1 for ELFv2.
1504	addi 4, 4, 40		// r4 = &p_argv[5]
1505# if KMP_ARCH_PPC64_ELFv2
1506	addi 12, 1, 88
1507# else
1508	addi 12, 1, 104
1509# endif
1510
1511.Lnext:
1512	ldu 0, 8(4)		// advance r4 by 8 and load the next parm
1513	stdu 0, 8(12)		// advance r12 by 8 and store the parm there
1514	bdnz .Lnext		// repeat until the counter reaches zero
1515
// Save the TOC pointer in the new frame and put the code address of pkfn
// into r12.
1516.Lcall:
1517# if KMP_ARCH_PPC64_ELFv2
1518	std 2, 24(1)
1519	mr 12, 3
1520#else
1521	std 2, 40(1)
1522// For ELFv1, we need to load the actual function address from the function descriptor.
1523	ld 12, 0(3)		// first descriptor doubleword: code address
1524	ld 2, 8(3)		// second descriptor doubleword -> r2
1525	ld 11, 16(3)		// third descriptor doubleword -> r11
1526#endif
1527
1528	addi 3, 31, -20		// 1st parm to pkfn: &gtid
1529	addi 4, 31, -24		// 2nd parm to pkfn: &tid
1530
1531	mtctr 12
1532	bctrl			// call pkfn through the count register
// Reload the TOC pointer that was saved before the call.
1533# if KMP_ARCH_PPC64_ELFv2
1534	ld 2, 24(1)
1535# else
1536	ld 2, 40(1)
1537# endif
1538
1539# if OMPT_SUPPORT
1540	li 3, 0
1541	std 3, 0(30)		// *exit_frame_ptr = 0
1542# endif
1543
1544	li 3, 1			// return value: 1
1545
1546# if OMPT_SUPPORT
1547	ld 30, -16(31)		// restore r30
1548# endif
1549
1550	mr 1, 31		// restore the incoming stack pointer
1551	ld 0, 16(1)		// reload the return address
1552	ld 31, -8(1)		// restore r31
1553	mtlr 0
1554	blr
1555
// NOTE(review): twelve zero bytes are emitted after the return; their purpose
// is not stated in this file (presumably a traceback-table placeholder) --
// confirm before changing.
1556	.long	0
1557	.quad	0
1558.Lfunc_end0:
1559	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1560	.cfi_endproc
1561
1562// -- End  __kmp_invoke_microtask
1563
1564#endif /* KMP_ARCH_PPC64 */
1565
1566#if KMP_ARCH_RISCV64
1567
1568//------------------------------------------------------------------------
1569//
1570// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1571//
1572// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1573//                            void *p_argv[]
1574// #if OMPT_SUPPORT
1575//                            ,
1576//                            void **exit_frame_ptr
1577// #endif
1578//                            ) {
1579// #if OMPT_SUPPORT
1580//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1581// #endif
1582//
1583//   (*pkfn)(&gtid, &tid, argv[0], ...);
1584//
1585//   return 1;
1586// }
1587//
1588// Parameters:
1589//   a0: pkfn
1590//   a1: gtid
1591//   a2: tid
1592//   a3: argc
1593//   a4: p_argv
1594//   a5: exit_frame_ptr
1595//
1596// Locals:
1597//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1598//   __tid: tid param pushed on stack so can pass &tid to pkfn
1599//
1600// Temp. registers:
1601//
1602//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1603//  t1: used as temporary for stack placement calculation
1604//  t2: used as temporary for stack arguments
1605//  t3: used as temporary for number of remaining pkfn parms
1606//  t4: used to traverse p_argv array
1607//
1608// return: a0 (always 1/TRUE)
1609//
1610
1611__gtid = -20		// offset of the gtid local relative to fp
1612__tid = -24		// offset of the tid local relative to fp
1613
1614// -- Begin __kmp_invoke_microtask
1615// mark_begin;
1616	.text
1617	.globl	__kmp_invoke_microtask
1618	.p2align	1
1619	.type	__kmp_invoke_microtask,@function
1620__kmp_invoke_microtask:
1621	.cfi_startproc
1622
1623	// First, save ra and fp
	// Afterwards fp equals the value sp had on entry; ra is at fp - 8 and
	// the saved fp at fp - 16.
1624	addi	sp, sp, -16
1625	sd	ra, 8(sp)
1626	sd	fp, 0(sp)
1627	addi	fp, sp, 16
1628	.cfi_def_cfa	fp, 0
1629	.cfi_offset	ra, -8
1630	.cfi_offset	fp, -16
1631
1632	// Compute the dynamic stack size:
1633	//
1634	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1635	//   reference
1636	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
1637	//   function by register. Given that we have 8 of such registers (a[0-7])
1638	//   and two + 'argc' arguments (consider &gtid and &tid), we need to
1639	//   reserve max(0, argc - 6)*8 extra bytes
1640	//
1641	// The total number of bytes is then max(0, argc - 6)*8 + 8
1642
1643	// Compute max(0, argc - 6) using the following bithack:
1644	// max(0, x) = x - (x & (x >> 31)), where x := argc - 6
1645	// Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
1646	addi	t0, a3, -6	// t0 = x = argc - 6
1647	srai	t1, t0, 31	// t1 = x >> 31 (arithmetic shift)
1648	and	t1, t0, t1	// t1 = x & (x >> 31)
1649	sub	t0, t0, t1	// t0 = max(0, argc - 6)
1650
1651	addi	t0, t0, 1	// one more 8-byte slot for gtid and tid
1652
1653	slli	t0, t0, 3	// slots -> bytes
1654	sub	sp, sp, t0
1655
1656	// Align the stack to 16 bytes
1657	andi	sp, sp, -16
1658
1659	mv	t0, a0		// t0 = pkfn (a0 is reused for &gtid)
1660	mv	t3, a3		// t3 = argc, counts the parms still to place
1661	mv	t4, a4		// t4 = p_argv
1662
1663#if OMPT_SUPPORT
1664	// Save frame pointer into exit_frame
1665	sd	fp, 0(a5)
1666#endif
1667
1668	// Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1669
1670	sw	a1, __gtid(fp)
1671	sw	a2, __tid(fp)
1672
1673	addi	a0, fp, __gtid	// 1st parm to pkfn: &gtid
1674	addi	a1, fp, __tid	// 2nd parm to pkfn: &tid
1675
	// Each step below jumps to the call once no parms remain; otherwise it
	// loads the next p_argv entry into the next argument register.
1676	beqz	t3, .L_kmp_3
1677	ld	a2, 0(t4)	// p_argv[0] -> a2
1678
1679	addi	t3, t3, -1
1680	beqz	t3, .L_kmp_3
1681	ld	a3, 8(t4)	// p_argv[1] -> a3
1682
1683	addi	t3, t3, -1
1684	beqz	t3, .L_kmp_3
1685	ld	a4, 16(t4)	// p_argv[2] -> a4
1686
1687	addi	t3, t3, -1
1688	beqz	t3, .L_kmp_3
1689	ld	a5, 24(t4)	// p_argv[3] -> a5
1690
1691	addi	t3, t3, -1
1692	beqz	t3, .L_kmp_3
1693	ld	a6, 32(t4)	// p_argv[4] -> a6
1694
1695	addi	t3, t3, -1
1696	beqz	t3, .L_kmp_3
1697	ld	a7, 40(t4)	// p_argv[5] -> a7
1698
1699	// Prepare any additional argument passed through the stack
1700	addi	t4, t4, 48	// t4 = &p_argv[6]
1701	mv	t1, sp		// t1 = first outgoing stack slot
1702	j .L_kmp_2		// test the remaining count before the first copy
1703.L_kmp_1:
1704	ld	t2, 0(t4)	// fetch the next parm
1705	sd	t2, 0(t1)	// store it into the next stack slot
1706	addi	t4, t4, 8
1707	addi	t1, t1, 8
1708.L_kmp_2:
1709	addi	t3, t3, -1
1710	bnez	t3, .L_kmp_1	// keep copying while parms remain
1711
1712.L_kmp_3:
1713	// Call pkfn function
1714	jalr	t0
1715
1716	// Restore stack and return
1717
1718	addi	a0, zero, 1	// return value: 1
1719
1720	addi	sp, fp, -16	// sp = address of the saved fp/ra pair
1721	ld	fp, 0(sp)
1722	ld	ra, 8(sp)
1723	addi	sp, sp, 16
1724	ret
1725.Lfunc_end0:
1726	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
1727	.cfi_endproc
1728
1729// -- End  __kmp_invoke_microtask
1730
1731#endif /* KMP_ARCH_RISCV64 */
1732
1733#if KMP_ARCH_ARM || KMP_ARCH_MIPS
// Data for ARM and MIPS: declares the 32-byte common symbol
// .gomp_critical_user_ and defines the global __kmp_unnamed_critical_addr as
// a 4-byte word holding the address of that symbol.
1734    .data
1735    .comm .gomp_critical_user_,32,8
1736    .data
1737    .align 4
1738    .global __kmp_unnamed_critical_addr
1739__kmp_unnamed_critical_addr:
1740    .4byte .gomp_critical_user_
1741    .size __kmp_unnamed_critical_addr,4
1742#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
1743
1744#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
// Data for the listed 64-bit targets: declares the 32-byte common symbol
// .gomp_critical_user_ and defines the global __kmp_unnamed_critical_addr as
// an 8-byte word holding the address of that symbol.
1745    .data
1746    .comm .gomp_critical_user_,32,8
1747    .data
1748    .align 8
1749    .global __kmp_unnamed_critical_addr
1750__kmp_unnamed_critical_addr:
1751    .8byte .gomp_critical_user_
1752    .size __kmp_unnamed_critical_addr,8
1753#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
1754          KMP_ARCH_RISCV64 */
1755
1756#if KMP_OS_LINUX
// On Linux, emit an empty .note.GNU-stack section with no flags; by GNU
// toolchain convention this marks the object as not needing an executable
// stack. The ARM assembler treats @ as a comment character, so the section
// type is spelled %progbits there and @progbits everywhere else.
1757# if KMP_ARCH_ARM
1758.section .note.GNU-stack,"",%progbits
1759# else
1760.section .note.GNU-stack,"",@progbits
1761# endif
1762#endif
1763