//  z_Linux_asm.S:  - microtasking routines specifically
//                    written for Intel platforms running Linux* OS

//
////===----------------------------------------------------------------------===//
////
//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
//// See https://llvm.org/LICENSE.txt for license information.
//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
////
////===----------------------------------------------------------------------===//
//

14// -----------------------------------------------------------------------
15// macros
16// -----------------------------------------------------------------------
17
18#include "kmp_config.h"
19
20#if KMP_ARCH_X86 || KMP_ARCH_X86_64
21
22# if KMP_MIC
// On KMP_MIC targets, the 'delay r16/r32/r64' instruction should be used
// instead of 'pause'. The delay operation removes the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
27//
28// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
29// barrier time to increase greatly for 3 or more threads per core.
30//
31// A value of 100 works pretty well for up to 4 threads per core, but isn't
32// quite as fast as 0 for 2 threads per core.
33//
34// We need to check what happens for oversubscription / > 4 threads per core.
35// It is possible that we need to pass the delay value in as a parameter
36// that the caller determines based on the total # threads / # cores.
37//
38//.macro pause_op
39//	mov    $100, %rax
40//	delay  %rax
41//.endm
42# else
43#  define pause_op   .byte 0xf3,0x90
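// Note: 0xf3,0x90 is the raw encoding of the 'pause' instruction (rep; nop);
// emitting the bytes directly presumably keeps the definition independent of
// whether the assembler recognizes the 'pause' mnemonic.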
44# endif // KMP_MIC
45
46# if KMP_OS_DARWIN
47#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
48#  define KMP_LABEL(x) L_##x             // form the name of label
49.macro KMP_CFI_DEF_OFFSET
50.endmacro
51.macro KMP_CFI_OFFSET
52.endmacro
53.macro KMP_CFI_REGISTER
54.endmacro
55.macro KMP_CFI_DEF
56.endmacro
57.macro ALIGN
58	.align $0
59.endmacro
60.macro DEBUG_INFO
61/* Not sure what .size does in icc, not sure if we need to do something
62   similar for OS X*.
63*/
64.endmacro
65.macro PROC
66	ALIGN  4
67	.globl KMP_PREFIX_UNDERSCORE($0)
68KMP_PREFIX_UNDERSCORE($0):
69.endmacro
70# else // KMP_OS_DARWIN
71#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
72// Format labels so that they don't override function names in gdb's backtraces
73// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
74// on OS X*)
75# if KMP_MIC
76#  define KMP_LABEL(x) L_##x          // local label
77# else
78#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
79# endif // KMP_MIC
80.macro ALIGN size
81	.align 1<<(\size)
82.endm
83.macro DEBUG_INFO proc
84	.cfi_endproc
85// Not sure why we need .type and .size for the functions
86	.align 16
87	.type  \proc,@function
88        .size  \proc,.-\proc
89.endm
90.macro PROC proc
91	ALIGN  4
92        .globl KMP_PREFIX_UNDERSCORE(\proc)
93KMP_PREFIX_UNDERSCORE(\proc):
94	.cfi_startproc
95.endm
96.macro KMP_CFI_DEF_OFFSET sz
97	.cfi_def_cfa_offset	\sz
98.endm
99.macro KMP_CFI_OFFSET reg, sz
100	.cfi_offset	\reg,\sz
101.endm
102.macro KMP_CFI_REGISTER reg
103	.cfi_def_cfa_register	\reg
104.endm
105.macro KMP_CFI_DEF reg, sz
106	.cfi_def_cfa	\reg,\sz
107.endm
108# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
110
111#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
112
113# if KMP_OS_DARWIN
114#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
115#  define KMP_LABEL(x) L_##x             // form the name of label
116
117.macro ALIGN
118	.align $0
119.endmacro
120
121.macro DEBUG_INFO
122/* Not sure what .size does in icc, not sure if we need to do something
123   similar for OS X*.
124*/
125.endmacro
126
127.macro PROC
128	ALIGN  4
129	.globl KMP_PREFIX_UNDERSCORE($0)
130KMP_PREFIX_UNDERSCORE($0):
131.endmacro
132# elif KMP_OS_WINDOWS
133#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Windows/ARM64 symbols
134// Format labels so that they don't override function names in gdb's backtraces
135#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
136
137.macro ALIGN size
138	.align 1<<(\size)
139.endm
140
141.macro DEBUG_INFO proc
142	ALIGN 2
143.endm
144
145.macro PROC proc
146	ALIGN 2
147	.globl KMP_PREFIX_UNDERSCORE(\proc)
148KMP_PREFIX_UNDERSCORE(\proc):
149.endm
150# else // KMP_OS_DARWIN || KMP_OS_WINDOWS
151#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
152// Format labels so that they don't override function names in gdb's backtraces
153#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
154
155.macro ALIGN size
156	.align 1<<(\size)
157.endm
158
159.macro DEBUG_INFO proc
160	.cfi_endproc
161// Not sure why we need .type and .size for the functions
162	ALIGN 2
163#if KMP_ARCH_ARM
164	.type  \proc,%function
165#else
166	.type  \proc,@function
167#endif
168	.size  \proc,.-\proc
169.endm
170
171.macro PROC proc
172	ALIGN 2
173	.globl KMP_PREFIX_UNDERSCORE(\proc)
174KMP_PREFIX_UNDERSCORE(\proc):
175	.cfi_startproc
176.endm
177# endif // KMP_OS_DARWIN
178
179#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
180
181.macro COMMON name, size, align_power
182#if KMP_OS_DARWIN
183	.comm \name, \size
184#elif KMP_OS_WINDOWS
185	.comm \name, \size, \align_power
186#else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
187	.comm \name, \size, (1<<(\align_power))
188#endif
189.endm
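
// For reference, a use such as "COMMON .gomp_critical_user_, 32, 3" expands to
// ".comm .gomp_critical_user_, 32" on OS X*, ".comm .gomp_critical_user_, 32, 3"
// on Windows, and ".comm .gomp_critical_user_, 32, (1<<3)" elsewhere, i.e. the
// last argument is an alignment given as a power of two.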
190
191// -----------------------------------------------------------------------
192// data
193// -----------------------------------------------------------------------
194
195#ifdef KMP_GOMP_COMPAT
196
197// Support for unnamed common blocks.
198//
199// Because the symbol ".gomp_critical_user_" contains a ".", we have to
200// put this stuff in assembly.
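//
// Roughly, the data below is the assembly-level equivalent of the following C,
// which would not compile only because of the "." in the symbol name (an
// illustrative sketch, not part of the build):
//
//   char gomp_critical_user_[32];                          // the lock storage
//   void *__kmp_unnamed_critical_addr = gomp_critical_user_;  // its address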
201
202# if KMP_ARCH_X86
203#  if KMP_OS_DARWIN
204        .data
205        .comm .gomp_critical_user_,32
206        .data
207        .globl ___kmp_unnamed_critical_addr
208___kmp_unnamed_critical_addr:
209        .long .gomp_critical_user_
210#  else /* Linux* OS */
211        .data
212        .comm .gomp_critical_user_,32,8
213        .data
214	ALIGN 4
215        .global __kmp_unnamed_critical_addr
216__kmp_unnamed_critical_addr:
217        .4byte .gomp_critical_user_
218        .type __kmp_unnamed_critical_addr,@object
219        .size __kmp_unnamed_critical_addr,4
220#  endif /* KMP_OS_DARWIN */
221# endif /* KMP_ARCH_X86 */
222
223# if KMP_ARCH_X86_64
224#  if KMP_OS_DARWIN
225        .data
226        .comm .gomp_critical_user_,32
227        .data
228        .globl ___kmp_unnamed_critical_addr
229___kmp_unnamed_critical_addr:
230        .quad .gomp_critical_user_
231#  else /* Linux* OS */
232        .data
233        .comm .gomp_critical_user_,32,8
234        .data
235	ALIGN 8
236        .global __kmp_unnamed_critical_addr
237__kmp_unnamed_critical_addr:
238        .8byte .gomp_critical_user_
239        .type __kmp_unnamed_critical_addr,@object
240        .size __kmp_unnamed_critical_addr,8
241#  endif /* KMP_OS_DARWIN */
242# endif /* KMP_ARCH_X86_64 */
243
244#endif /* KMP_GOMP_COMPAT */
245
246
247#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
248
249// -----------------------------------------------------------------------
250// microtasking routines specifically written for IA-32 architecture
251// running Linux* OS
252// -----------------------------------------------------------------------
253
254	.ident "Intel Corporation"
255	.data
256	ALIGN 4
257// void
258// __kmp_x86_pause( void );
259
260        .text
261	PROC  __kmp_x86_pause
262
263        pause_op
264        ret
265
266	DEBUG_INFO __kmp_x86_pause
267
268# if !KMP_ASM_INTRINS
269
270//------------------------------------------------------------------------
271// kmp_int32
272// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
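//
// As a rough sketch of the intended semantics (this is what 'lock xadd'
// provides): atomically { kmp_int32 old = *p; *p = old + d; return old; }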
273
274        PROC      __kmp_test_then_add32
275
276        movl      4(%esp), %ecx
277        movl      8(%esp), %eax
278        lock
279        xaddl     %eax,(%ecx)
280        ret
281
282	DEBUG_INFO __kmp_test_then_add32
283
284//------------------------------------------------------------------------
285// FUNCTION __kmp_xchg_fixed8
286//
// kmp_int8
288// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
289//
290// parameters:
291// 	p:	4(%esp)
292// 	d:	8(%esp)
293//
294// return:	%al
295        PROC  __kmp_xchg_fixed8
296
297        movl      4(%esp), %ecx    // "p"
298        movb      8(%esp), %al	// "d"
299
300        lock
301        xchgb     %al,(%ecx)
302        ret
303
304        DEBUG_INFO __kmp_xchg_fixed8
305
306
307//------------------------------------------------------------------------
308// FUNCTION __kmp_xchg_fixed16
309//
310// kmp_int16
311// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
312//
313// parameters:
314// 	p:	4(%esp)
315// 	d:	8(%esp)
316// return:     %ax
317        PROC  __kmp_xchg_fixed16
318
319        movl      4(%esp), %ecx    // "p"
320        movw      8(%esp), %ax	// "d"
321
322        lock
323        xchgw     %ax,(%ecx)
324        ret
325
326        DEBUG_INFO __kmp_xchg_fixed16
327
328
329//------------------------------------------------------------------------
330// FUNCTION __kmp_xchg_fixed32
331//
332// kmp_int32
333// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
334//
335// parameters:
336// 	p:	4(%esp)
337// 	d:	8(%esp)
338//
339// return:	%eax
340        PROC  __kmp_xchg_fixed32
341
342        movl      4(%esp), %ecx    // "p"
343        movl      8(%esp), %eax	// "d"
344
345        lock
346        xchgl     %eax,(%ecx)
347        ret
348
349        DEBUG_INFO __kmp_xchg_fixed32
350
351
352// kmp_int8
353// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
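//
// Sketch of the semantics shared by the __kmp_compare_and_store{8,16,32,64}
// routines (a boolean compare-and-swap built on 'lock cmpxchg'):
//   atomically { if (*p == cv) { *p = sv; return 1; } return 0; }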
354        PROC  __kmp_compare_and_store8
355
356        movl      4(%esp), %ecx
357        movb      8(%esp), %al
358        movb      12(%esp), %dl
359        lock
360        cmpxchgb  %dl,(%ecx)
361        sete      %al           // if %al == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax      // clear the upper bits of %eax so it holds just the 0/1 result
363        ret
364
365        DEBUG_INFO __kmp_compare_and_store8
366
367// kmp_int16
368// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
369        PROC  __kmp_compare_and_store16
370
371        movl      4(%esp), %ecx
372        movw      8(%esp), %ax
373        movw      12(%esp), %dx
374        lock
375        cmpxchgw  %dx,(%ecx)
376        sete      %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax      // clear the upper bits of %eax so it holds just the 0/1 result
378        ret
379
380        DEBUG_INFO __kmp_compare_and_store16
381
382// kmp_int32
383// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
384        PROC  __kmp_compare_and_store32
385
386        movl      4(%esp), %ecx
387        movl      8(%esp), %eax
388        movl      12(%esp), %edx
389        lock
390        cmpxchgl  %edx,(%ecx)
391        sete      %al          // if %eax == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax     // clear the upper bits of %eax so it holds just the 0/1 result
393        ret
394
395        DEBUG_INFO __kmp_compare_and_store32
396
397// kmp_int32
398// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
399        PROC  __kmp_compare_and_store64
400
401        pushl     %ebp
402        movl      %esp, %ebp
403        pushl     %ebx
404        pushl     %edi
405        movl      8(%ebp), %edi
406        movl      12(%ebp), %eax        // "cv" low order word
407        movl      16(%ebp), %edx        // "cv" high order word
408        movl      20(%ebp), %ebx        // "sv" low order word
409        movl      24(%ebp), %ecx        // "sv" high order word
410        lock
411        cmpxchg8b (%edi)
412        sete      %al      // if %edx:eax == (%edi) set %al = 1 else set %al = 0
        and       $1, %eax // clear the upper bits of %eax so it holds just the 0/1 result
414        popl      %edi
415        popl      %ebx
416        movl      %ebp, %esp
417        popl      %ebp
418        ret
419
420        DEBUG_INFO __kmp_compare_and_store64
421
422// kmp_int8
423// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
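//
// The _ret variants differ from __kmp_compare_and_store* only in what they
// return: the value that was in *p before the operation (left in the
// accumulator by 'lock cmpxchg') rather than a success flag:
//   atomically { old = *p; if (old == cv) *p = sv; return old; }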
424        PROC  __kmp_compare_and_store_ret8
425
426        movl      4(%esp), %ecx
427        movb      8(%esp), %al
428        movb      12(%esp), %dl
429        lock
430        cmpxchgb  %dl,(%ecx)
431        ret
432
433        DEBUG_INFO __kmp_compare_and_store_ret8
434
435// kmp_int16
436// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
437//                               kmp_int16 sv);
438        PROC  __kmp_compare_and_store_ret16
439
440        movl      4(%esp), %ecx
441        movw      8(%esp), %ax
442        movw      12(%esp), %dx
443        lock
444        cmpxchgw  %dx,(%ecx)
445        ret
446
447        DEBUG_INFO __kmp_compare_and_store_ret16
448
449// kmp_int32
450// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
451//                               kmp_int32 sv);
452        PROC  __kmp_compare_and_store_ret32
453
454        movl      4(%esp), %ecx
455        movl      8(%esp), %eax
456        movl      12(%esp), %edx
457        lock
458        cmpxchgl  %edx,(%ecx)
459        ret
460
461        DEBUG_INFO __kmp_compare_and_store_ret32
462
463// kmp_int64
464// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
465//                               kmp_int64 sv);
466        PROC  __kmp_compare_and_store_ret64
467
468        pushl     %ebp
469        movl      %esp, %ebp
470        pushl     %ebx
471        pushl     %edi
472        movl      8(%ebp), %edi
473        movl      12(%ebp), %eax        // "cv" low order word
474        movl      16(%ebp), %edx        // "cv" high order word
475        movl      20(%ebp), %ebx        // "sv" low order word
476        movl      24(%ebp), %ecx        // "sv" high order word
477        lock
478        cmpxchg8b (%edi)
479        popl      %edi
480        popl      %ebx
481        movl      %ebp, %esp
482        popl      %ebp
483        ret
484
485        DEBUG_INFO __kmp_compare_and_store_ret64
486
487
488//------------------------------------------------------------------------
489// FUNCTION __kmp_xchg_real32
490//
491// kmp_real32
492// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
493//
494// parameters:
495// 	addr:	4(%esp)
496// 	data:	8(%esp)
497//
498// return:	%eax
499        PROC  __kmp_xchg_real32
500
501        pushl   %ebp
502        movl    %esp, %ebp
503        subl    $4, %esp
504        pushl   %esi
505
        movl    8(%ebp), %esi
507        flds    (%esi)
508                        // load <addr>
509        fsts    -4(%ebp)
510                        // store old value
511
        movl    12(%ebp), %eax
513
514        lock
515        xchgl   %eax, (%esi)
516
517        flds    -4(%ebp)
518                        // return old value
519
520        popl    %esi
521        movl    %ebp, %esp
522        popl    %ebp
523        ret
524
525        DEBUG_INFO __kmp_xchg_real32
526
527# endif /* !KMP_ASM_INTRINS */
528
529//------------------------------------------------------------------------
530// int
531// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
532//                         int gtid, int tid,
533//                         int argc, void *p_argv[]
534// #if OMPT_SUPPORT
535//                         ,
536//                         void **exit_frame_ptr
537// #endif
538//                       ) {
539// #if OMPT_SUPPORT
540//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
541// #endif
542//
543//   (*pkfn)( & gtid, & tid, argv[0], ... );
544//   return 1;
545// }
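//
// Before the call, the body below also realigns the stack; a rough C-level
// sketch of that arithmetic (names are illustrative only):
//   bytes  = (argc + 2) * 4;           // outgoing args incl. &gtid and &tid
//   target = (esp - bytes) & ~127;     // round down to a 128-byte boundary
//   esp   -= (esp - bytes) - target;   // after the pushes, esp == target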
546
547// -- Begin __kmp_invoke_microtask
548// mark_begin;
549	PROC  __kmp_invoke_microtask
550
551	pushl %ebp
552	KMP_CFI_DEF_OFFSET 8
553	KMP_CFI_OFFSET ebp,-8
554	movl %esp,%ebp		// establish the base pointer for this routine.
555	KMP_CFI_REGISTER ebp
556	subl $8,%esp		// allocate space for two local variables.
				// These variables are:
558				//	argv: -4(%ebp)
559				//	temp: -8(%ebp)
560				//
561	pushl %ebx		// save %ebx to use during this routine
562				//
563#if OMPT_SUPPORT
564	movl 28(%ebp),%ebx	// get exit_frame address
565	movl %ebp,(%ebx)	// save exit_frame
566#endif
567
568	movl 20(%ebp),%ebx	// Stack alignment - # args
569	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
570	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
571	movl %esp,%eax		//
572	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
573	movl %eax,%ebx		// Save to %ebx
	andl $0xFFFFFF80,%eax	// mask off the low 7 bits (128-byte alignment)
575	subl %eax,%ebx		// Amount to subtract from %esp
576	subl %ebx,%esp		// Prepare the stack ptr --
577				//   now it will be aligned on 128-byte boundary at the call
578
579	movl 24(%ebp),%eax	// copy from p_argv[]
580	movl %eax,-4(%ebp)	// into the local variable *argv.
581
582	movl 20(%ebp),%ebx	// argc is 20(%ebp)
583	shll $2,%ebx
584
585KMP_LABEL(invoke_2):
586	cmpl $0,%ebx
587	jg  KMP_LABEL(invoke_4)
588	jmp KMP_LABEL(invoke_3)
589	ALIGN 2
590KMP_LABEL(invoke_4):
591	movl -4(%ebp),%eax
592	subl $4,%ebx			// decrement argc.
593	addl %ebx,%eax			// index into argv.
594	movl (%eax),%edx
595	pushl %edx
596
597	jmp KMP_LABEL(invoke_2)
598	ALIGN 2
599KMP_LABEL(invoke_3):
600	leal 16(%ebp),%eax		// push & tid
601	pushl %eax
602
603	leal 12(%ebp),%eax		// push & gtid
604	pushl %eax
605
606	movl 8(%ebp),%ebx
607	call *%ebx			// call (*pkfn)();
608
609	movl $1,%eax			// return 1;
610
611	movl -12(%ebp),%ebx		// restore %ebx
612	leave
613	KMP_CFI_DEF esp,4
614	ret
615
616	DEBUG_INFO __kmp_invoke_microtask
617// -- End  __kmp_invoke_microtask
618
619
620// kmp_uint64
621// __kmp_hardware_timestamp(void)
622	PROC  __kmp_hardware_timestamp
623	rdtsc
624	ret
625
626	DEBUG_INFO __kmp_hardware_timestamp
627// -- End  __kmp_hardware_timestamp
628
#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */
630
631
632#if KMP_ARCH_X86_64
633
634// -----------------------------------------------------------------------
635// microtasking routines specifically written for IA-32 architecture and
636// Intel(R) 64 running Linux* OS
637// -----------------------------------------------------------------------
638
639// -- Machine type P
640// mark_description "Intel Corporation";
641	.ident "Intel Corporation"
642// --	.file "z_Linux_asm.S"
643	.data
644	ALIGN 4
645
// To prevent our code from ending up in the .data section, .text is added to
// every routine definition for x86_64.
648//------------------------------------------------------------------------
649# if !KMP_ASM_INTRINS
650
651//------------------------------------------------------------------------
652// FUNCTION __kmp_test_then_add32
653//
654// kmp_int32
655// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
656//
657// parameters:
658// 	p:	%rdi
659// 	d:	%esi
660//
661// return:	%eax
662        .text
663        PROC  __kmp_test_then_add32
664
665        movl      %esi, %eax	// "d"
666        lock
667        xaddl     %eax,(%rdi)
668        ret
669
670        DEBUG_INFO __kmp_test_then_add32
671
672
673//------------------------------------------------------------------------
674// FUNCTION __kmp_test_then_add64
675//
676// kmp_int64
677// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
678//
679// parameters:
680// 	p:	%rdi
681// 	d:	%rsi
682//	return:	%rax
683        .text
684        PROC  __kmp_test_then_add64
685
686        movq      %rsi, %rax	// "d"
687        lock
688        xaddq     %rax,(%rdi)
689        ret
690
691        DEBUG_INFO __kmp_test_then_add64
692
693
694//------------------------------------------------------------------------
695// FUNCTION __kmp_xchg_fixed8
696//
// kmp_int8
698// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
699//
700// parameters:
701// 	p:	%rdi
702// 	d:	%sil
703//
704// return:	%al
705        .text
706        PROC  __kmp_xchg_fixed8
707
708        movb      %sil, %al	// "d"
709
710        lock
711        xchgb     %al,(%rdi)
712        ret
713
714        DEBUG_INFO __kmp_xchg_fixed8
715
716
717//------------------------------------------------------------------------
718// FUNCTION __kmp_xchg_fixed16
719//
720// kmp_int16
721// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
722//
723// parameters:
724// 	p:	%rdi
725// 	d:	%si
726// return:     %ax
727        .text
728        PROC  __kmp_xchg_fixed16
729
730        movw      %si, %ax	// "d"
731
732        lock
733        xchgw     %ax,(%rdi)
734        ret
735
736        DEBUG_INFO __kmp_xchg_fixed16
737
738
739//------------------------------------------------------------------------
740// FUNCTION __kmp_xchg_fixed32
741//
742// kmp_int32
743// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
744//
745// parameters:
746// 	p:	%rdi
747// 	d:	%esi
748//
749// return:	%eax
750        .text
751        PROC  __kmp_xchg_fixed32
752
753        movl      %esi, %eax	// "d"
754
755        lock
756        xchgl     %eax,(%rdi)
757        ret
758
759        DEBUG_INFO __kmp_xchg_fixed32
760
761
762//------------------------------------------------------------------------
763// FUNCTION __kmp_xchg_fixed64
764//
765// kmp_int64
766// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
767//
768// parameters:
769// 	p:	%rdi
770// 	d:	%rsi
771// return:	%rax
772        .text
773        PROC  __kmp_xchg_fixed64
774
775        movq      %rsi, %rax	// "d"
776
777        lock
778        xchgq     %rax,(%rdi)
779        ret
780
781        DEBUG_INFO __kmp_xchg_fixed64
782
783
784//------------------------------------------------------------------------
785// FUNCTION __kmp_compare_and_store8
786//
787// kmp_int8
788// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
789//
790// parameters:
791// 	p:	%rdi
792// 	cv:	%esi
793//	sv:	%edx
794//
795// return:	%eax
796        .text
797        PROC  __kmp_compare_and_store8
798
799        movb      %sil, %al	// "cv"
800        lock
801        cmpxchgb  %dl,(%rdi)
802        sete      %al           // if %al == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // clear the upper bits of %rax so the return value is just 0/1
804        ret
805
806        DEBUG_INFO __kmp_compare_and_store8
807
808
809//------------------------------------------------------------------------
810// FUNCTION __kmp_compare_and_store16
811//
812// kmp_int16
813// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
814//
815// parameters:
816// 	p:	%rdi
817// 	cv:	%si
818//	sv:	%dx
819//
820// return:	%eax
821        .text
822        PROC  __kmp_compare_and_store16
823
824        movw      %si, %ax	// "cv"
825        lock
826        cmpxchgw  %dx,(%rdi)
827        sete      %al           // if %ax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // clear the upper bits of %rax so the return value is just 0/1
829        ret
830
831        DEBUG_INFO __kmp_compare_and_store16
832
833
834//------------------------------------------------------------------------
835// FUNCTION __kmp_compare_and_store32
836//
837// kmp_int32
838// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
839//
840// parameters:
841// 	p:	%rdi
842// 	cv:	%esi
843//	sv:	%edx
844//
845// return:	%eax
846        .text
847        PROC  __kmp_compare_and_store32
848
849        movl      %esi, %eax	// "cv"
850        lock
851        cmpxchgl  %edx,(%rdi)
852        sete      %al           // if %eax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // clear the upper bits of %rax so the return value is just 0/1
854        ret
855
856        DEBUG_INFO __kmp_compare_and_store32
857
858
859//------------------------------------------------------------------------
860// FUNCTION __kmp_compare_and_store64
861//
862// kmp_int32
863// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
864//
865// parameters:
866// 	p:	%rdi
867// 	cv:	%rsi
868//	sv:	%rdx
869//	return:	%eax
870        .text
871        PROC  __kmp_compare_and_store64
872
873        movq      %rsi, %rax    // "cv"
874        lock
875        cmpxchgq  %rdx,(%rdi)
876        sete      %al           // if %rax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // clear the upper bits of %rax so the return value is just 0/1
878        ret
879
880        DEBUG_INFO __kmp_compare_and_store64
881
882//------------------------------------------------------------------------
883// FUNCTION __kmp_compare_and_store_ret8
884//
885// kmp_int8
886// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
887//
888// parameters:
889// 	p:	%rdi
890// 	cv:	%esi
891//	sv:	%edx
892//
893// return:	%eax
894        .text
895        PROC  __kmp_compare_and_store_ret8
896
897        movb      %sil, %al	// "cv"
898        lock
899        cmpxchgb  %dl,(%rdi)
900        ret
901
902        DEBUG_INFO __kmp_compare_and_store_ret8
903
904
905//------------------------------------------------------------------------
906// FUNCTION __kmp_compare_and_store_ret16
907//
908// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
910//
911// parameters:
912// 	p:	%rdi
913// 	cv:	%si
914//	sv:	%dx
915//
916// return:	%eax
917        .text
918        PROC  __kmp_compare_and_store_ret16
919
920        movw      %si, %ax	// "cv"
921        lock
922        cmpxchgw  %dx,(%rdi)
923        ret
924
925        DEBUG_INFO __kmp_compare_and_store_ret16
926
927
928//------------------------------------------------------------------------
929// FUNCTION __kmp_compare_and_store_ret32
930//
931// kmp_int32
932// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
933//
934// parameters:
935// 	p:	%rdi
936// 	cv:	%esi
937//	sv:	%edx
938//
939// return:	%eax
940        .text
941        PROC  __kmp_compare_and_store_ret32
942
943        movl      %esi, %eax	// "cv"
944        lock
945        cmpxchgl  %edx,(%rdi)
946        ret
947
948        DEBUG_INFO __kmp_compare_and_store_ret32
949
950
951//------------------------------------------------------------------------
952// FUNCTION __kmp_compare_and_store_ret64
953//
954// kmp_int64
955// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
956//
957// parameters:
958// 	p:	%rdi
959// 	cv:	%rsi
960//	sv:	%rdx
//	return:	%rax
962        .text
963        PROC  __kmp_compare_and_store_ret64
964
965        movq      %rsi, %rax    // "cv"
966        lock
967        cmpxchgq  %rdx,(%rdi)
968        ret
969
970        DEBUG_INFO __kmp_compare_and_store_ret64
971
972# endif /* !KMP_ASM_INTRINS */
973
974
975# if !KMP_MIC
976
977# if !KMP_ASM_INTRINS
978
979//------------------------------------------------------------------------
980// FUNCTION __kmp_xchg_real32
981//
982// kmp_real32
983// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
984//
985// parameters:
986// 	addr:	%rdi
987// 	data:	%xmm0 (lower 4 bytes)
988//
989// return:	%xmm0 (lower 4 bytes)
990        .text
991        PROC  __kmp_xchg_real32
992
993	movd	%xmm0, %eax	// load "data" to eax
994
995         lock
996         xchgl %eax, (%rdi)
997
998	movd	%eax, %xmm0	// load old value into return register
999
1000        ret
1001
1002        DEBUG_INFO __kmp_xchg_real32
1003
1004
1005//------------------------------------------------------------------------
1006// FUNCTION __kmp_xchg_real64
1007//
1008// kmp_real64
1009// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
1010//
1011// parameters:
1012//      addr:   %rdi
1013//      data:   %xmm0 (lower 8 bytes)
1014//      return: %xmm0 (lower 8 bytes)
1015        .text
1016        PROC  __kmp_xchg_real64
1017
1018	movd	%xmm0, %rax	// load "data" to rax
1019
1020         lock
1021	xchgq  %rax, (%rdi)
1022
1023	movd	%rax, %xmm0	// load old value into return register
1024        ret
1025
1026        DEBUG_INFO __kmp_xchg_real64
1027
1028
# endif /* !KMP_ASM_INTRINS */

# endif /* !KMP_MIC */
1032
1033//------------------------------------------------------------------------
1034// int
1035// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1036//                         int gtid, int tid,
1037//                         int argc, void *p_argv[]
1038// #if OMPT_SUPPORT
1039//                         ,
1040//                         void **exit_frame_ptr
1041// #endif
1042//                       ) {
1043// #if OMPT_SUPPORT
1044//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1045// #endif
1046//
1047//   (*pkfn)( & gtid, & tid, argv[0], ... );
1048//   return 1;
1049// }
1050//
// note: %rsp must be 128-byte aligned at the call to pkfn, as the compiler expects
1052//
1053// parameters:
1054//      %rdi:  	pkfn
1055//	%esi:	gtid
1056//	%edx:	tid
1057//	%ecx:	argc
1058//	%r8:	p_argv
1059//	%r9:	&exit_frame
1060//
1061// locals:
1062//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1063//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1064//
1065// reg temps:
1066//	%rax:	used all over the place
1067//	%rdx:	used in stack pointer alignment calculation
1068//	%r11:	used to traverse p_argv array
1069//	%rsi:	used as temporary for stack parameters
1070//		used as temporary for number of pkfn parms to push
1071//	%rbx:	used to hold pkfn address, and zero constant, callee-save
1072//
1073// return:	%eax 	(always 1/TRUE)
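//
// A rough C-level sketch of the stack handling below (illustrative names):
//   nstack = max(0, argc - 4);           // p_argv[4..] go on the stack; the
//                                        // first four ride in rdx, rcx, r8, r9
//   target = (rsp - 8*nstack) & ~127;    // 128-byte boundary reached after the
//   rsp   -= (rsp - 8*nstack) - target;  // pushes, as required at the call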
1074__gtid = -16
1075__tid = -24
1076
1077// -- Begin __kmp_invoke_microtask
1078// mark_begin;
1079        .text
1080	PROC  __kmp_invoke_microtask
1081
1082	pushq 	%rbp		// save base pointer
1083	KMP_CFI_DEF_OFFSET 16
1084	KMP_CFI_OFFSET rbp,-16
1085	movq 	%rsp,%rbp	// establish the base pointer for this routine.
1086	KMP_CFI_REGISTER rbp
1087
1088#if OMPT_SUPPORT
1089	movq	%rbp, (%r9)	// save exit_frame
1090#endif
1091
1092	pushq 	%rbx		// %rbx is callee-saved register
	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
1094	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn
1095
1096	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
1097	movq	$0, %rbx	// constant for cmovs later
1098	subq	$4, %rax	// subtract four args passed in registers to pkfn
1099#if KMP_MIC
1100	js	KMP_LABEL(kmp_0)	// jump to movq
1101	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
1102KMP_LABEL(kmp_0):
1103	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1104KMP_LABEL(kmp_0_exit):
1105#else
1106	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1107#endif // KMP_MIC
1108
1109	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
1110	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
1111
1112	movq 	%rsp, %rdx	//
1113	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
1114				// without align, stack ptr would be this
1115	movq 	%rdx, %rax	// Save to %rax
1116
1117	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
1118	subq 	%rax, %rdx	// Amount to subtract from %rsp
1119	subq 	%rdx, %rsp	// Prepare the stack ptr --
1120				// now %rsp will align to 128-byte boundary at call site
1121
1122				// setup pkfn parameter reg and stack
1123	movq	%rcx, %rax	// argc -> %rax
1124	cmpq	$0, %rsi
1125	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
1126	shlq	$3, %rcx	// argc*8 -> %rcx
1127	movq 	%r8, %rdx	// p_argv -> %rdx
1128	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
1129
1130	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
1131
1132KMP_LABEL(kmp_invoke_push_parms):
1133	// push nth - 7th parms to pkfn on stack
1134	subq	$8, %rdx	// decrement p_argv pointer to previous parm
1135	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
1136	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
1137	subl	$1, %ecx
1138
// C69570: an "X86_64_RELOC_BRANCH not supported" error occurs at link time on
// mac_32e if the label used as the operand of this jecxz starts with a dot (".");
// Apple's linker does not support 1-byte relocations.
// Resolution: replace all .labelX entries with L_labelX.
1143
1144	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
1145	jmp	KMP_LABEL(kmp_invoke_push_parms)
1146	ALIGN 3
1147KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
1148				// order here is important to avoid trashing
1149				// registers used for both input and output parms!
1150	movq	%rdi, %rbx	// pkfn -> %rbx
1151	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1152	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
1153
1154	movq	%r8, %r11	// p_argv -> %r11
1155
1156#if KMP_MIC
1157	cmpq	$4, %rax	// argc >= 4?
1158	jns	KMP_LABEL(kmp_4)	// jump to movq
1159	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
1160KMP_LABEL(kmp_4):
1161	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1162KMP_LABEL(kmp_4_exit):
1163
1164	cmpq	$3, %rax	// argc >= 3?
1165	jns	KMP_LABEL(kmp_3)	// jump to movq
1166	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
1167KMP_LABEL(kmp_3):
1168	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1169KMP_LABEL(kmp_3_exit):
1170
1171	cmpq	$2, %rax	// argc >= 2?
1172	jns	KMP_LABEL(kmp_2)	// jump to movq
1173	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
1174KMP_LABEL(kmp_2):
1175	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1176KMP_LABEL(kmp_2_exit):
1177
1178	cmpq	$1, %rax	// argc >= 1?
1179	jns	KMP_LABEL(kmp_1)	// jump to movq
1180	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
1181KMP_LABEL(kmp_1):
1182	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1183KMP_LABEL(kmp_1_exit):
1184#else
1185	cmpq	$4, %rax	// argc >= 4?
1186	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1187
1188	cmpq	$3, %rax	// argc >= 3?
1189	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1190
1191	cmpq	$2, %rax	// argc >= 2?
1192	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1193
1194	cmpq	$1, %rax	// argc >= 1?
1195	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1196#endif // KMP_MIC
1197
1198	call	*%rbx		// call (*pkfn)();
1199	movq	$1, %rax	// move 1 into return register;
1200
1201	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
1202	movq 	%rbp, %rsp	// restore stack pointer
1203	popq 	%rbp		// restore frame pointer
1204	KMP_CFI_DEF rsp,8
1205	ret
1206
1207	DEBUG_INFO __kmp_invoke_microtask
1208// -- End  __kmp_invoke_microtask
1209
1210// kmp_uint64
1211// __kmp_hardware_timestamp(void)
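// rdtsc returns the 64-bit time-stamp counter split across %edx:%eax, so the
// body below shifts %rdx left by 32 bits and ORs it into %rax to form the
// 64-bit return value.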
1212        .text
1213	PROC  __kmp_hardware_timestamp
1214	rdtsc
1215	shlq    $32, %rdx
1216	orq     %rdx, %rax
1217	ret
1218
1219	DEBUG_INFO __kmp_hardware_timestamp
1220// -- End  __kmp_hardware_timestamp
1221
1222//------------------------------------------------------------------------
1223// FUNCTION __kmp_bsr32
1224//
1225// int
1226// __kmp_bsr32( int );
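// 'bsr' stores the bit index of the most significant set bit of the argument
// (undefined if the argument is 0); e.g. __kmp_bsr32(0x20) yields 5.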
1227        .text
1228        PROC  __kmp_bsr32
1229
1230        bsr    %edi,%eax
1231        ret
1232
1233        DEBUG_INFO __kmp_bsr32
1234
1235// -----------------------------------------------------------------------
1236#endif /* KMP_ARCH_X86_64 */
1237
1238// '
1239#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
1240
1241//------------------------------------------------------------------------
1242// int
1243// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1244//                         int gtid, int tid,
1245//                         int argc, void *p_argv[]
1246// #if OMPT_SUPPORT
1247//                         ,
1248//                         void **exit_frame_ptr
1249// #endif
1250//                       ) {
1251// #if OMPT_SUPPORT
1252//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1253// #endif
1254//
1255//   (*pkfn)( & gtid, & tid, argv[0], ... );
1256//
1257// // FIXME: This is done at call-site and can be removed here.
1258// #if OMPT_SUPPORT
1259//   *exit_frame_ptr = 0;
1260// #endif
1261//
1262//   return 1;
1263// }
1264//
1265// parameters:
1266//	x0:	pkfn
1267//	w1:	gtid
1268//	w2:	tid
1269//	w3:	argc
1270//	x4:	p_argv
1271//	x5:	&exit_frame
1272//
1273// locals:
1274//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1275//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1276//
1277// reg temps:
1278//	 x8:	used to hold pkfn address
1279//	 w9:	used as temporary for number of pkfn parms
1280//	x10:	used to traverse p_argv array
1281//	x11:	used as temporary for stack placement calculation
1282//	x12:	used as temporary for stack parameters
1283//	x19:	used to preserve exit_frame_ptr, callee-save
1284//
1285// return:	w0	(always 1/TRUE)
1286//
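// A rough note on the stack reservation in the prologue below: it computes
//   sp -= 16 * (1 + argc/2)
// which reserves roughly 8*argc bytes for stacked arguments (more than the
// strictly needed 8*(argc-6)) plus room for gtid and tid, while keeping sp
// 16-byte aligned as the AArch64 ABI requires.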
1287
1288__gtid = 4
1289__tid = 8
1290
1291// -- Begin __kmp_invoke_microtask
1292// mark_begin;
1293	.text
1294	PROC __kmp_invoke_microtask
1295
1296	stp	x29, x30, [sp, #-16]!
1297# if OMPT_SUPPORT
1298	stp	x19, x20, [sp, #-16]!
1299# endif
1300	mov	x29, sp
1301
1302	orr	w9, wzr, #1
1303	add	w9, w9, w3, lsr #1
1304	sub	sp, sp, w9, uxtw #4
1305	mov	x11, sp
1306
1307	mov	x8, x0
1308	str	w1, [x29, #-__gtid]
1309	str	w2, [x29, #-__tid]
1310	mov	w9, w3
1311	mov	x10, x4
1312# if OMPT_SUPPORT
1313	mov	x19, x5
1314	str	x29, [x19]
1315# endif
1316
1317	sub	x0, x29, #__gtid
1318	sub	x1, x29, #__tid
1319
1320	cbz	w9, KMP_LABEL(kmp_1)
1321	ldr	x2, [x10]
1322
1323	sub	w9, w9, #1
1324	cbz	w9, KMP_LABEL(kmp_1)
1325	ldr	x3, [x10, #8]!
1326
1327	sub	w9, w9, #1
1328	cbz	w9, KMP_LABEL(kmp_1)
1329	ldr	x4, [x10, #8]!
1330
1331	sub	w9, w9, #1
1332	cbz	w9, KMP_LABEL(kmp_1)
1333	ldr	x5, [x10, #8]!
1334
1335	sub	w9, w9, #1
1336	cbz	w9, KMP_LABEL(kmp_1)
1337	ldr	x6, [x10, #8]!
1338
1339	sub	w9, w9, #1
1340	cbz	w9, KMP_LABEL(kmp_1)
1341	ldr	x7, [x10, #8]!
1342
1343KMP_LABEL(kmp_0):
1344	sub	w9, w9, #1
1345	cbz	w9, KMP_LABEL(kmp_1)
1346	ldr	x12, [x10, #8]!
1347	str	x12, [x11], #8
1348	b	KMP_LABEL(kmp_0)
1349KMP_LABEL(kmp_1):
1350	blr	x8
1351	orr	w0, wzr, #1
1352	mov	sp, x29
1353# if OMPT_SUPPORT
1354	str	xzr, [x19]
1355	ldp	x19, x20, [sp], #16
1356# endif
1357	ldp	x29, x30, [sp], #16
1358	ret
1359
1360	DEBUG_INFO __kmp_invoke_microtask
1361// -- End  __kmp_invoke_microtask
1362
1363#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
1364
1365#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
1366
1367//------------------------------------------------------------------------
1368// int
1369// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1370//                         int gtid, int tid,
1371//                         int argc, void *p_argv[]
1372// #if OMPT_SUPPORT
1373//                         ,
1374//                         void **exit_frame_ptr
1375// #endif
1376//                       ) {
1377// #if OMPT_SUPPORT
1378//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1379// #endif
1380//
1381//   (*pkfn)( & gtid, & tid, argv[0], ... );
1382//
1383// // FIXME: This is done at call-site and can be removed here.
1384// #if OMPT_SUPPORT
1385//   *exit_frame_ptr = 0;
1386// #endif
1387//
1388//   return 1;
1389// }
1390//
1391// parameters:
1392//	r0:	pkfn
1393//	r1:	gtid
1394//	r2:	tid
1395//	r3:	argc
1396//	r4(stack):	p_argv
1397//	r5(stack):	&exit_frame
1398//
1399// locals:
1400//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1401//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1402//
1403// reg temps:
1404//	 r4:	used to hold pkfn address
1405//	 r5:	used as temporary for number of pkfn parms
1406//	 r6:	used to traverse p_argv array
1407//	 r7:	frame pointer (in some configurations)
1408//	 r8:	used as temporary for stack placement calculation
1409//	 	and as pointer to base of callee saved area
1410//	 r9:	used as temporary for stack parameters
1411//	r10:	used to preserve exit_frame_ptr, callee-save
1412//	r11:	frame pointer (in some configurations)
1413//
1414// return:	r0	(always 1/TRUE)
1415//
1416
1417__gtid = 4
1418__tid = 8
1419
1420// -- Begin __kmp_invoke_microtask
1421// mark_begin;
1422	.text
1423	PROC __kmp_invoke_microtask
1424
1425	// Pushing one extra register (r3) to keep the stack aligned
1426	// for when we call pkfn below
1427	push	{r3-r11,lr}
1428	// Load p_argv and &exit_frame
1429	ldr	r4, [sp, #10*4]
1430# if OMPT_SUPPORT
1431	ldr	r5, [sp, #11*4]
1432# endif
1433
1434# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
1435# define FP r7
1436# define FPOFF 4*4
1437#else
1438# define FP r11
1439# define FPOFF 8*4
1440#endif
1441	add	FP, sp, #FPOFF
1442# if OMPT_SUPPORT
1443	mov	r10, r5
1444	str	FP, [r10]
1445# endif
1446	mov	r8, sp
1447
1448	// Calculate how much stack to allocate, in increments of 8 bytes.
1449	// We strictly need 4*(argc-2) bytes (2 arguments are passed in
1450	// registers) but allocate 4*argc for simplicity (to avoid needing
1451	// to handle the argc<2 cases). We align the number of bytes
1452	// allocated to 8 bytes, to keep the stack aligned. (Since we
1453	// already allocate more than enough, it's ok to round down
1454	// instead of up for the alignment.) We allocate another extra
1455	// 8 bytes for gtid and tid.
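	// In other words, the next three instructions compute approximately
	//   sp -= 8 * (1 + argc/2)
	// which is at least 4*argc bytes (arguments plus gtid and tid) and keeps
	// the allocation a multiple of 8.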
1456	mov	r5, #1
1457	add	r5, r5, r3, lsr #1
1458	sub	sp, sp, r5, lsl #3
1459
1460	str	r1, [r8, #-__gtid]
1461	str	r2, [r8, #-__tid]
1462	mov	r5, r3
1463	mov	r6, r4
1464	mov	r4, r0
1465
1466	// Prepare the first 2 parameters to pkfn - pointers to gtid and tid
1467	// in our stack frame.
1468	sub	r0, r8, #__gtid
1469	sub	r1, r8, #__tid
1470
1471	mov	r8, sp
1472
1473	// Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
1474	cmp	r5, #0
1475	beq	KMP_LABEL(kmp_1)
1476	ldr	r2, [r6]
1477
1478	subs	r5, r5, #1
1479	beq	KMP_LABEL(kmp_1)
1480	ldr	r3, [r6, #4]!
1481
1482	// Loop, loading the rest of p_argv and writing the elements on the
1483	// stack.
1484KMP_LABEL(kmp_0):
1485	subs	r5, r5, #1
1486	beq	KMP_LABEL(kmp_1)
1487	ldr	r12, [r6, #4]!
1488	str	r12, [r8], #4
1489	b	KMP_LABEL(kmp_0)
1490KMP_LABEL(kmp_1):
1491	blx	r4
1492	mov	r0, #1
1493
1494	sub	r4, FP, #FPOFF
1495	mov	sp, r4
1496# undef FP
1497# undef FPOFF
1498
1499# if OMPT_SUPPORT
1500	mov	r1, #0
1501	str	r1, [r10]
1502# endif
1503	pop	{r3-r11,pc}
1504
1505	DEBUG_INFO __kmp_invoke_microtask
1506// -- End  __kmp_invoke_microtask
1507
#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
1509
1510#if KMP_ARCH_PPC64
1511
1512//------------------------------------------------------------------------
1513// int
1514// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1515//                         int gtid, int tid,
1516//                         int argc, void *p_argv[]
1517// #if OMPT_SUPPORT
1518//                         ,
1519//                         void **exit_frame_ptr
1520// #endif
1521//                       ) {
1522// #if OMPT_SUPPORT
1523//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1524// #endif
1525//
1526//   (*pkfn)( & gtid, & tid, argv[0], ... );
1527//
1528// // FIXME: This is done at call-site and can be removed here.
1529// #if OMPT_SUPPORT
1530//   *exit_frame_ptr = 0;
1531// #endif
1532//
1533//   return 1;
1534// }
1535//
1536// parameters:
1537//	r3:	pkfn
1538//	r4:	gtid
1539//	r5:	tid
1540//	r6:	argc
1541//	r7:	p_argv
1542//	r8:	&exit_frame
1543//
1544// return:	r3	(always 1/TRUE)
1545//
1546	.text
1547# if KMP_ARCH_PPC64_ELFv2
1548	.abiversion 2
1549# endif
1550	.globl	__kmp_invoke_microtask
1551
1552# if KMP_ARCH_PPC64_ELFv2
1553	.p2align	4
1554# else
1555	.p2align	2
1556# endif
1557
1558	.type	__kmp_invoke_microtask,@function
1559
1560# if KMP_ARCH_PPC64_ELFv2
1561__kmp_invoke_microtask:
1562.Lfunc_begin0:
1563.Lfunc_gep0:
1564	addis 2, 12, .TOC.-.Lfunc_gep0@ha
1565	addi 2, 2, .TOC.-.Lfunc_gep0@l
1566.Lfunc_lep0:
1567	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1568# else
1569	.section	.opd,"aw",@progbits
1570__kmp_invoke_microtask:
1571	.p2align	3
1572	.quad	.Lfunc_begin0
1573	.quad	.TOC.@tocbase
1574	.quad	0
1575	.text
1576.Lfunc_begin0:
1577# endif
1578
1579// -- Begin __kmp_invoke_microtask
1580// mark_begin;
1581
1582// We need to allocate a stack frame large enough to hold all of the parameters
1583// on the stack for the microtask plus what this function needs. That's 48
1584// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1585// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1586// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1587// to save r30 to hold a copy of r8.
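//
// As a worked sketch of that arithmetic: ELFv1 uses 48 + 16 + 8 + 8 + 8 = 88
// and ELFv2 uses 32 + 16 + 8 + 8 + 8 = 72 (the constants below appear to
// reserve the r30 slot whether or not OMPT is enabled); the code then adds
// 8*argc and rounds the total up to a multiple of 16 to keep the stack frame
// aligned.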
1588
1589	.cfi_startproc
1590	mflr 0
1591	std 31, -8(1)
1592	std 0, 16(1)
1593
1594// This is unusual because normally we'd set r31 equal to r1 after the stack
1595// frame is established. In this case, however, we need to dynamically compute
1596// the stack frame size, and so we keep a direct copy of r1 to access our
1597// register save areas and restore the r1 value before returning.
1598	mr 31, 1
1599	.cfi_def_cfa_register r31
1600	.cfi_offset r31, -8
1601	.cfi_offset lr, 16
1602
1603// Compute the size necessary for the local stack frame.
1604# if KMP_ARCH_PPC64_ELFv2
1605	li 12, 72
1606# else
1607	li 12, 88
1608# endif
1609	sldi 0, 6, 3
1610	add 12, 0, 12
1611	neg 12, 12
1612
1613// We need to make sure that the stack frame stays aligned (to 16 bytes).
1614	li 0, -16
1615	and 12, 0, 12
1616
1617// Establish the local stack frame.
1618	stdux 1, 1, 12
1619
1620# if OMPT_SUPPORT
1621	.cfi_offset r30, -16
1622	std 30, -16(31)
1623	std 1, 0(8)
1624	mr 30, 8
1625# endif
1626
1627// Store gtid and tid to the stack because they're passed by reference to the microtask.
1628	stw 4, -20(31)
1629	stw 5, -24(31)
1630
1631	mr 12, 6
1632	mr 4, 7
1633
1634	cmpwi 0, 12, 1
1635	blt	 0, .Lcall
1636
1637	ld 5, 0(4)
1638
1639	cmpwi 0, 12, 2
1640	blt	 0, .Lcall
1641
1642	ld 6, 8(4)
1643
1644	cmpwi 0, 12, 3
1645	blt	 0, .Lcall
1646
1647	ld 7, 16(4)
1648
1649	cmpwi 0, 12, 4
1650	blt	 0, .Lcall
1651
1652	ld 8, 24(4)
1653
1654	cmpwi 0, 12, 5
1655	blt	 0, .Lcall
1656
1657	ld 9, 32(4)
1658
1659	cmpwi 0, 12, 6
1660	blt	 0, .Lcall
1661
1662	ld 10, 40(4)
1663
1664	cmpwi 0, 12, 7
1665	blt	 0, .Lcall
1666
1667// There are more than 6 microtask parameters, so we need to store the
1668// remainder to the stack.
1669	addi 12, 12, -6
1670	mtctr 12
1671
1672// These are set to 8 bytes before the first desired store address (we're using
1673// pre-increment loads and stores in the loop below). The parameter save area
1674// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1675// 32 + 8*8 == 96 bytes above r1 for ELFv2.
1676	addi 4, 4, 40
1677# if KMP_ARCH_PPC64_ELFv2
1678	addi 12, 1, 88
1679# else
1680	addi 12, 1, 104
1681# endif
1682
1683.Lnext:
1684	ldu 0, 8(4)
1685	stdu 0, 8(12)
1686	bdnz .Lnext
1687
1688.Lcall:
1689# if KMP_ARCH_PPC64_ELFv2
1690	std 2, 24(1)
1691	mr 12, 3
1692#else
1693	std 2, 40(1)
1694// For ELFv1, we need to load the actual function address from the function descriptor.
1695	ld 12, 0(3)
1696	ld 2, 8(3)
1697	ld 11, 16(3)
1698#endif
1699
1700	addi 3, 31, -20
1701	addi 4, 31, -24
1702
1703	mtctr 12
1704	bctrl
1705# if KMP_ARCH_PPC64_ELFv2
1706	ld 2, 24(1)
1707# else
1708	ld 2, 40(1)
1709# endif
1710
1711# if OMPT_SUPPORT
1712	li 3, 0
1713	std 3, 0(30)
1714# endif
1715
1716	li 3, 1
1717
1718# if OMPT_SUPPORT
1719	ld 30, -16(31)
1720# endif
1721
1722	mr 1, 31
1723	ld 0, 16(1)
1724	ld 31, -8(1)
1725	mtlr 0
1726	blr
1727
1728	.long	0
1729	.quad	0
1730.Lfunc_end0:
1731	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1732	.cfi_endproc
1733
1734// -- End  __kmp_invoke_microtask
1735
1736#endif /* KMP_ARCH_PPC64 */
1737
1738#if KMP_ARCH_RISCV64
1739
1740//------------------------------------------------------------------------
1741//
1742// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1743//
1744// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1745//                            void *p_argv[]
1746// #if OMPT_SUPPORT
1747//                            ,
1748//                            void **exit_frame_ptr
1749// #endif
1750//                            ) {
1751// #if OMPT_SUPPORT
1752//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1753// #endif
1754//
1755//   (*pkfn)(&gtid, &tid, argv[0], ...);
1756//
1757//   return 1;
1758// }
1759//
1760// Parameters:
1761//   a0: pkfn
1762//   a1: gtid
1763//   a2: tid
1764//   a3: argc
1765//   a4: p_argv
1766//   a5: exit_frame_ptr
1767//
1768// Locals:
1769//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1770//   __tid: tid param pushed on stack so can pass &tid to pkfn
1771//
1772// Temp. registers:
1773//
1774//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1775//  t1: used as temporary for stack placement calculation
1776//  t2: used as temporary for stack arguments
1777//  t3: used as temporary for number of remaining pkfn parms
1778//  t4: used to traverse p_argv array
1779//
1780// return: a0 (always 1/TRUE)
1781//
1782
1783__gtid = -20
1784__tid = -24
1785
1786// -- Begin __kmp_invoke_microtask
1787// mark_begin;
1788	.text
1789	.globl	__kmp_invoke_microtask
1790	.p2align	1
1791	.type	__kmp_invoke_microtask,@function
1792__kmp_invoke_microtask:
1793	.cfi_startproc
1794
1795	// First, save ra and fp
1796	addi	sp, sp, -16
1797	sd	ra, 8(sp)
1798	sd	fp, 0(sp)
1799	addi	fp, sp, 16
1800	.cfi_def_cfa	fp, 0
1801	.cfi_offset	ra, -8
1802	.cfi_offset	fp, -16
1803
1804	// Compute the dynamic stack size:
1805	//
1806	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1807	//   reference
1808	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
	//   function by register. Given that we have 8 such registers (a[0-7])
1810	//   and two + 'argc' arguments (consider &gtid and &tid), we need to
1811	//   reserve max(0, argc - 6)*8 extra bytes
1812	//
1813	// The total number of bytes is then max(0, argc - 6)*8 + 8
1814
1815	// Compute max(0, argc - 6) using the following bithack:
1816	// max(0, x) = x - (x & (x >> 31)), where x := argc - 6
1817	// Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
1818	addi	t0, a3, -6
1819	srai	t1, t0, 31
1820	and	t1, t0, t1
1821	sub	t0, t0, t1
1822
1823	addi	t0, t0, 1
1824
1825	slli	t0, t0, 3
1826	sub	sp, sp, t0
1827
1828	// Align the stack to 16 bytes
1829	andi	sp, sp, -16
1830
1831	mv	t0, a0
1832	mv	t3, a3
1833	mv	t4, a4
1834
1835#if OMPT_SUPPORT
1836	// Save frame pointer into exit_frame
1837	sd	fp, 0(a5)
1838#endif
1839
1840	// Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1841
1842	sw	a1, __gtid(fp)
1843	sw	a2, __tid(fp)
1844
1845	addi	a0, fp, __gtid
1846	addi	a1, fp, __tid
1847
1848	beqz	t3, .L_kmp_3
1849	ld	a2, 0(t4)
1850
1851	addi	t3, t3, -1
1852	beqz	t3, .L_kmp_3
1853	ld	a3, 8(t4)
1854
1855	addi	t3, t3, -1
1856	beqz	t3, .L_kmp_3
1857	ld	a4, 16(t4)
1858
1859	addi	t3, t3, -1
1860	beqz	t3, .L_kmp_3
1861	ld	a5, 24(t4)
1862
1863	addi	t3, t3, -1
1864	beqz	t3, .L_kmp_3
1865	ld	a6, 32(t4)
1866
1867	addi	t3, t3, -1
1868	beqz	t3, .L_kmp_3
1869	ld	a7, 40(t4)
1870
1871	// Prepare any additional argument passed through the stack
1872	addi	t4, t4, 48
1873	mv	t1, sp
1874	j .L_kmp_2
1875.L_kmp_1:
1876	ld	t2, 0(t4)
1877	sd	t2, 0(t1)
1878	addi	t4, t4, 8
1879	addi	t1, t1, 8
1880.L_kmp_2:
1881	addi	t3, t3, -1
1882	bnez	t3, .L_kmp_1
1883
1884.L_kmp_3:
1885	// Call pkfn function
1886	jalr	t0
1887
1888	// Restore stack and return
1889
1890	addi	a0, zero, 1
1891
1892	addi	sp, fp, -16
1893	ld	fp, 0(sp)
1894	ld	ra, 8(sp)
1895	addi	sp, sp, 16
1896	ret
1897.Lfunc_end0:
1898	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
1899	.cfi_endproc
1900
1901// -- End  __kmp_invoke_microtask
1902
1903#endif /* KMP_ARCH_RISCV64 */
1904
1905#if KMP_ARCH_LOONGARCH64
1906
1907//------------------------------------------------------------------------
1908//
1909// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1910//
1911// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1912//                            void *p_argv[]
1913// #if OMPT_SUPPORT
1914//                            ,
1915//                            void **exit_frame_ptr
1916// #endif
1917//                            ) {
1918// #if OMPT_SUPPORT
1919//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1920// #endif
1921//
1922//   (*pkfn)(&gtid, &tid, argv[0], ...);
1923//
1924//   return 1;
1925// }
1926//
1927// Parameters:
1928//   a0: pkfn
1929//   a1: gtid
1930//   a2: tid
1931//   a3: argc
1932//   a4: p_argv
1933//   a5: exit_frame_ptr
1934//
1935// Locals:
1936//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1937//   __tid: tid param pushed on stack so can pass &tid to pkfn
1938//
1939// Temp registers:
1940//
1941//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1942//  t1: used as temporary for stack placement calculation
1943//  t2: used as temporary for stack arguments
1944//  t3: used as temporary for number of remaining pkfn parms
1945//  t4: used to traverse p_argv array
1946//
1947// return: a0 (always 1/TRUE)
1948//
1949
1950// -- Begin __kmp_invoke_microtask
1951// mark_begin;
1952	.text
1953	.globl	__kmp_invoke_microtask
1954	.p2align	2
1955	.type	__kmp_invoke_microtask,@function
1956__kmp_invoke_microtask:
1957	.cfi_startproc
1958
1959	// First, save ra and fp
1960	addi.d	$sp, $sp, -16
1961	st.d	$ra, $sp, 8
1962	st.d	$fp, $sp, 0
1963	addi.d	$fp, $sp, 16
1964	.cfi_def_cfa	22, 0
1965	.cfi_offset	1, -8
1966	.cfi_offset	22, -16
1967
1968	// Compute the dynamic stack size:
1969	//
1970	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1971	//   reference
1972	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
	//   function by register. Given that we have 8 such registers (a[0-7])
1974	//   and two + 'argc' arguments (consider &gtid and &tid), we need to
1975	//   reserve max(0, argc - 6)*8 extra bytes
1976	//
1977	// The total number of bytes is then max(0, argc - 6)*8 + 8
1978
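	// The next three instructions compute max(0, argc - 6) branchlessly:
	// 'slt' sets $t1 to 1 when (argc - 6) is negative, and 'masknez' then
	// zeroes $t0 exactly in that case.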
1979	addi.d  $t0, $a3, -6
1980	slt  $t1, $t0, $zero
1981	masknez  $t0, $t0, $t1
1982	addi.d  $t0, $t0, 1
1983	slli.d	$t0, $t0, 3
1984	sub.d	$sp, $sp, $t0
1985
1986	// Align the stack to 16 bytes
1987	bstrins.d $sp, $zero, 3, 0
1988
1989	move	$t0, $a0
1990	move	$t3, $a3
1991	move	$t4, $a4
1992
1993#if OMPT_SUPPORT
1994	// Save frame pointer into exit_frame
1995	st.d	$fp, $a5, 0
1996#endif
1997
1998	// Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1999
2000	st.w	$a1, $fp, -20
2001	st.w	$a2, $fp, -24
2002
2003	addi.d	$a0, $fp, -20
2004	addi.d	$a1, $fp, -24
2005
2006	beqz	$t3, .L_kmp_3
2007	ld.d	$a2, $t4, 0
2008
2009	addi.d	$t3, $t3, -1
2010	beqz	$t3, .L_kmp_3
2011	ld.d	$a3, $t4, 8
2012
2013	addi.d	$t3, $t3, -1
2014	beqz	$t3, .L_kmp_3
2015	ld.d	$a4, $t4, 16
2016
2017	addi.d	$t3, $t3, -1
2018	beqz	$t3, .L_kmp_3
2019	ld.d	$a5, $t4, 24
2020
2021	addi.d	$t3, $t3, -1
2022	beqz	$t3, .L_kmp_3
2023	ld.d	$a6, $t4, 32
2024
2025	addi.d	$t3, $t3, -1
2026	beqz	$t3, .L_kmp_3
2027	ld.d	$a7, $t4, 40
2028
2029	// Prepare any additional argument passed through the stack
2030	addi.d	$t4, $t4, 48
2031	move	$t1, $sp
2032	b .L_kmp_2
2033.L_kmp_1:
2034	ld.d	$t2, $t4, 0
2035	st.d	$t2, $t1, 0
2036	addi.d	$t4, $t4, 8
2037	addi.d	$t1, $t1, 8
2038.L_kmp_2:
2039	addi.d	$t3, $t3, -1
2040	bnez	$t3, .L_kmp_1
2041
2042.L_kmp_3:
2043	// Call pkfn function
2044	jirl	$ra, $t0, 0
2045
2046	// Restore stack and return
2047
2048	addi.d	$a0, $zero, 1
2049
2050	addi.d	$sp, $fp, -16
2051	ld.d	$fp, $sp, 0
2052	ld.d	$ra, $sp, 8
2053	addi.d	$sp, $sp, 16
2054	jr $ra
2055.Lfunc_end0:
2056	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
2057	.cfi_endproc
2058
2059// -- End  __kmp_invoke_microtask
2060
2061#endif /* KMP_ARCH_LOONGARCH64 */
2062
2063#if KMP_ARCH_ARM || KMP_ARCH_MIPS
2064    .data
2065    COMMON .gomp_critical_user_, 32, 3
2066    .data
2067    .align 4
2068    .global __kmp_unnamed_critical_addr
2069__kmp_unnamed_critical_addr:
2070    .4byte .gomp_critical_user_
2071#ifdef __ELF__
2072    .size __kmp_unnamed_critical_addr,4
2073#endif
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
2075
2076#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
2077#ifndef KMP_PREFIX_UNDERSCORE
2078# define KMP_PREFIX_UNDERSCORE(x) x
2079#endif
2080    .data
2081    COMMON .gomp_critical_user_, 32, 3
2082    .data
2083    .align 8
2084    .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
2085KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
2086    .8byte .gomp_critical_user_
2087#ifdef __ELF__
2088    .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
2089#endif
2090#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
2091          KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 */
2092
2093#if KMP_OS_LINUX
2094# if KMP_ARCH_ARM
2095.section .note.GNU-stack,"",%progbits
2096# else
2097.section .note.GNU-stack,"",@progbits
2098# endif
2099#endif
2100