xref: /freebsd/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 5036d9652a5701d00e9e40ea942c278e9f77d33d)
1//  z_Linux_asm.S:  - microtasking routines specifically
2//                    written for Intel platforms running Linux* OS
3
4//
5////===----------------------------------------------------------------------===//
6////
7//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8//// See https://llvm.org/LICENSE.txt for license information.
9//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10////
11////===----------------------------------------------------------------------===//
12//
13
14// -----------------------------------------------------------------------
15// macros
16// -----------------------------------------------------------------------
17
18#include "kmp_config.h"
19
20#if KMP_ARCH_X86 || KMP_ARCH_X86_64
21
# if KMP_MIC
// pause_op selection for KMP_MIC builds.
//
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
// NOTE: the macro below is commented out, so nothing defines pause_op when
// KMP_MIC is set. Any use of pause_op in a KMP_MIC build would therefore be
// left unexpanded.
//
//.macro pause_op
//	mov    $100, %rax
//	delay  %rax
//.endm
# else
// Emit the two bytes F3 90 directly; this is the encoding of the 'pause'
// instruction mentioned above (presumably spelled as raw bytes so that
// assemblers lacking the mnemonic still accept it - TODO confirm).
#  define pause_op   .byte 0xf3,0x90
# endif // KMP_MIC
45
# if KMP_OS_DARWIN
// ---- OS X* flavour of the helper macros (x86 / x86_64) ----
// Macro arguments are referenced positionally as $0 and macros are closed
// with .endmacro in this branch.
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label
// The four KMP_CFI_* macros are intentionally empty here: no call-frame
// directives are emitted in this branch.
.macro KMP_CFI_DEF_OFFSET
.endmacro
.macro KMP_CFI_OFFSET
.endmacro
.macro KMP_CFI_REGISTER
.endmacro
.macro KMP_CFI_DEF
.endmacro
// ALIGN n: forwards its argument unchanged to .align.
.macro ALIGN
	.align $0
.endmacro
// DEBUG_INFO name: emits nothing in this branch.
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
// PROC name: align, export the underscore-prefixed symbol and define its label.
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // !KMP_OS_DARWIN
// ---- non-Darwin flavour of the helper macros (x86 / x86_64) ----
#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
// on OS X*)
# if KMP_MIC
#  define KMP_LABEL(x) L_##x          // local label
# else
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
# endif // KMP_MIC
// ALIGN size: "size" is an exponent; the directive receives 1<<size
// (ALIGN 4 expands to ".align 1<<(4)").
.macro ALIGN size
	.align 1<<(\size)
.endm
// DEBUG_INFO proc: closes the CFI region opened by PROC and records the
// symbol type and size. The .align 16 precedes .size, so the padding it
// inserts is counted in the reported size.
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	.align 16
	.type  \proc,@function
        .size  \proc,.-\proc
.endm
// PROC proc: align, export the symbol, define its label and open a CFI region.
// Every PROC must be paired with a DEBUG_INFO for the same name.
.macro PROC proc
	ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
// Thin wrappers around the .cfi_* directives so that callers can use the same
// spelling in both branches (the Darwin versions above expand to nothing).
.macro KMP_CFI_DEF_OFFSET sz
	.cfi_def_cfa_offset	\sz
.endm
.macro KMP_CFI_OFFSET reg, sz
	.cfi_offset	\reg,\sz
.endm
.macro KMP_CFI_REGISTER reg
	.cfi_def_cfa_register	\reg
.endm
.macro KMP_CFI_DEF reg, sz
	.cfi_def_cfa	\reg,\sz
.endm
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
110
111#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
112
# if KMP_OS_DARWIN
// ---- OS X* flavour of the helper macros (ARM / AArch64) ----
// Macro arguments are referenced positionally as $0 in this branch.
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label

// ALIGN n: forwards its argument unchanged to .align.
.macro ALIGN
	.align $0
.endmacro

// DEBUG_INFO name: emits nothing in this branch.
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro

// PROC name: align, export the underscore-prefixed symbol and define its label.
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# elif KMP_OS_WINDOWS
// ---- Windows flavour: no CFI directives, no .type/.size ----
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Windows/ARM64 symbols
// Format labels so that they don't override function names in gdb's backtraces
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces

// ALIGN size: the directive receives 1<<size.
.macro ALIGN size
	.align 1<<(\size)
.endm

// DEBUG_INFO proc: only re-aligns; the proc argument is unused here.
.macro DEBUG_INFO proc
	ALIGN 2
.endm

// PROC proc: align, export the symbol and define its label.
.macro PROC proc
	ALIGN 2
	.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.endm
# else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
// ---- remaining targets: CFI region plus .type/.size bookkeeping ----
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces

// ALIGN size: the directive receives 1<<size.
.macro ALIGN size
	.align 1<<(\size)
.endm

// DEBUG_INFO proc: closes the CFI region opened by PROC and records the
// symbol type and size. KMP_ARCH_ARM uses the %function spelling of the
// type, every other architecture in this branch uses @function.
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	ALIGN 2
#if KMP_ARCH_ARM
	.type  \proc,%function
#else
	.type  \proc,@function
#endif
	.size  \proc,.-\proc
.endm

// PROC proc: align, export the symbol, define its label and open a CFI region.
.macro PROC proc
	ALIGN 2
	.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
# endif // KMP_OS_DARWIN
178
# if KMP_OS_LINUX
// BTI and PAC gnu property note
//
// GNU_PROPERTY(type, value) appends one record to the .note.gnu.property
// section and then returns to the previous section. The words are emitted
// in this order:
//   4                        size of the name field ("GNU" plus its NUL)
//   16                       size of the descriptor that follows the name
//   NT_GNU_PROPERTY_TYPE_0   note type
//   "GNU\0"                  name
//   type                     property type
//   4                        size of the property data
//   value                    property data
//   0                        padding word
// The section is aligned with .p2align 3 before the record is written.
#  define NT_GNU_PROPERTY_TYPE_0 5
#  define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
#  define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1
#  define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2

#  define GNU_PROPERTY(type, value)                                            \
  .pushsection .note.gnu.property, "a";                                        \
  .p2align 3;                                                                  \
  .word 4;                                                                     \
  .word 16;                                                                    \
  .word NT_GNU_PROPERTY_TYPE_0;                                                \
  .asciz "GNU";                                                                \
  .word type;                                                                  \
  .word 4;                                                                     \
  .word value;                                                                 \
  .word 0;                                                                     \
  .popsection
# endif
199
// Select the branch-protection helpers from the compiler feature macros.
//   BTI_FLAG / PAC_FLAG   feature bits to advertise (0 when not enabled)
//   PACBTI_C              instruction to place at a function entry
//   PACBTI_RET            instruction to place before a return
//   GNU_PROPERTY_BTI_PAC  emits the property note carrying both flags
// The GNU_PROPERTY_AARCH64_FEATURE_1_* constants and GNU_PROPERTY itself are
// only defined under KMP_OS_LINUX above; on other systems the identifiers are
// unknown to the preprocessor and evaluate as 0 inside the #if below.
# if defined(__ARM_FEATURE_BTI_DEFAULT)
#  define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI
# else
#  define BTI_FLAG 0
# endif
# if __ARM_FEATURE_PAC_DEFAULT & 3
#  define PAC_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_PAC
# else
#  define PAC_FLAG 0
# endif

# if (BTI_FLAG | PAC_FLAG) != 0
#  if PAC_FLAG != 0
// hint #25 / hint #29 are the hint-space encodings of paciasp / autiasp.
#   define PACBTI_C hint #25
#   define PACBTI_RET hint #29
#  else
// hint #34 is the hint-space encoding of "bti c"; nothing is needed on return.
#   define PACBTI_C hint #34
#   define PACBTI_RET
#  endif
#  define GNU_PROPERTY_BTI_PAC \
    GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG)
# else
// Neither feature enabled: all three helpers expand to nothing.
#  define PACBTI_C
#  define PACBTI_RET
#  define GNU_PROPERTY_BTI_PAC
# endif
226#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
227
// COMMON name, size, align_power
//
// Declares a common symbol through .comm, hiding the per-OS differences in
// the directive's operands:
//   Darwin   - align_power is dropped
//   Windows  - align_power is passed through unchanged
//   others   - align_power is converted to 1<<align_power
.macro COMMON name, size, align_power
#if KMP_OS_DARWIN
	.comm \name, \size
#elif KMP_OS_WINDOWS
	.comm \name, \size, \align_power
#else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
	.comm \name, \size, (1<<(\align_power))
#endif
.endm
237
238// -----------------------------------------------------------------------
239// data
240// -----------------------------------------------------------------------
241
#ifdef KMP_GOMP_COMPAT

// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//
// Each variant below reserves a 32-byte common block named
// .gomp_critical_user_ and defines the global __kmp_unnamed_critical_addr
// as a pointer-sized datum initialised with the address of that block.
// Only IA-32 and x86_64 are covered here.

# if KMP_ARCH_X86
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
// ALIGN takes an exponent, so this expands to ".align 1<<(4)".
	ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
// ALIGN takes an exponent, so this expands to ".align 1<<(8)", far more
// than the 8-byte pointer stored here needs.
	ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */
292
293
294#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
295
296// -----------------------------------------------------------------------
297// microtasking routines specifically written for IA-32 architecture
298// running Linux* OS
299// -----------------------------------------------------------------------
300
301	.ident "Intel Corporation"
302	.data
303	ALIGN 4
// void __kmp_x86_pause( void );
//
// Executes a single pause_op and returns. No arguments, no result, no
// registers touched. The .text directive below also moves the assembler
// out of the .data section selected just above.

        .text
        PROC  __kmp_x86_pause

        pause_op                        // spin-wait hint
        ret

        DEBUG_INFO __kmp_x86_pause
314
315# if !KMP_ASM_INTRINS
316
//------------------------------------------------------------------------
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add on a 32-bit word.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax = contents of *p before d was added
// scratch:	%ecx

        PROC      __kmp_test_then_add32

        movl      8(%esp), %eax         // %eax = d
        movl      4(%esp), %ecx         // %ecx = p
        lock
        xaddl     %eax, (%ecx)          // *p += d, %eax receives the old *p
        ret

	DEBUG_INFO __kmp_test_then_add32
330
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores the byte d into *p and returns the byte it replaced.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%al (bits 8..31 of %eax are not written by this routine)
// scratch:	%ecx
        PROC  __kmp_xchg_fixed8

        movb      8(%esp), %al          // %al = d
        movl      4(%esp), %ecx         // %ecx = p

        lock
        xchgb     %al, (%ecx)           // swap: *p = d, %al = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8
352
353
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores the 16-bit value d into *p and returns the value it
// replaced.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%ax (bits 16..31 of %eax are not written by this routine)
// scratch:	%ecx
        PROC  __kmp_xchg_fixed16

        movw      8(%esp), %ax          // %ax = d
        movl      4(%esp), %ecx         // %ecx = p

        lock
        xchgw     %ax, (%ecx)           // swap: *p = d, %ax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16
374
375
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores the 32-bit value d into *p and returns the value it
// replaced.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax
// scratch:	%ecx
        PROC  __kmp_xchg_fixed32

        movl      8(%esp), %eax         // %eax = d
        movl      4(%esp), %ecx         // %ecx = p

        lock
        xchgl     %eax, (%ecx)          // swap: *p = d, %eax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32
397
398
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Byte-wide atomic compare-and-swap that reports success or failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%eax = 1 if *p equalled cv and now holds sv, otherwise 0
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store8

        movb      12(%esp), %dl         // %dl = sv
        movb      8(%esp), %al          // %al = cv, the comparand cmpxchg uses
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgb  %dl, (%ecx)           // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andl      $1, %eax              // clear bits 1..31: %eax is exactly 0 or 1
        ret

        DEBUG_INFO __kmp_compare_and_store8
413
// kmp_int16
// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
//
// 16-bit atomic compare-and-swap that reports success or failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%eax = 1 if *p equalled cv and now holds sv, otherwise 0
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store16

        movw      12(%esp), %dx         // %dx = sv
        movw      8(%esp), %ax          // %ax = cv, the comparand cmpxchg uses
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgw  %dx, (%ecx)           // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andl      $1, %eax              // clear bits 1..31: %eax is exactly 0 or 1
        ret

        DEBUG_INFO __kmp_compare_and_store16
428
// kmp_int32
// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
//
// 32-bit atomic compare-and-swap that reports success or failure.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%eax = 1 if *p equalled cv and now holds sv, otherwise 0
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store32

        movl      12(%esp), %edx        // %edx = sv
        movl      8(%esp), %eax         // %eax = cv, the comparand cmpxchg uses
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgl  %edx, (%ecx)          // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andl      $1, %eax              // clear bits 1..31: %eax is exactly 0 or 1
        ret

        DEBUG_INFO __kmp_compare_and_store32
443
// kmp_int32
// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
//
// 64-bit atomic compare-and-swap on IA-32, built on cmpxchg8b, that reports
// success or failure.
//
// frame layout once %ebp is established:
// 	 8(%ebp)	p
// 	12(%ebp)	cv, low  half
// 	16(%ebp)	cv, high half
// 	20(%ebp)	sv, low  half
// 	24(%ebp)	sv, high half
//
// return:	%eax = 1 if *p equalled cv and now holds sv, otherwise 0
// preserved:	%ebx and %edi (saved and restored here)
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx                  // cmpxchg8b needs %ebx for the new value
        pushl     %edi

        movl      24(%ebp), %ecx        // %ecx:%ebx = sv
        movl      20(%ebp), %ebx
        movl      16(%ebp), %edx        // %edx:%eax = cv
        movl      12(%ebp), %eax
        movl      8(%ebp), %edi         // %edi = p
        lock
        cmpxchg8b (%edi)                // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andl      $1, %eax              // clear bits 1..31: %eax is exactly 0 or 1

        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64
468
// kmp_int8
// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
//
// Byte-wide atomic compare-and-swap that returns the value observed in *p.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%al = cv when the swap happened, otherwise the value found in *p
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store_ret8

        movb      12(%esp), %dl         // %dl = sv
        movb      8(%esp), %al          // %al = cv
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgb  %dl, (%ecx)           // on mismatch %al is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
481
// kmp_int16
// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
//                               kmp_int16 sv);
//
// 16-bit atomic compare-and-swap that returns the value observed in *p.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%ax = cv when the swap happened, otherwise the value found in *p
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store_ret16

        movw      12(%esp), %dx         // %dx = sv
        movw      8(%esp), %ax          // %ax = cv
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgw  %dx, (%ecx)           // on mismatch %ax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
495
// kmp_int32
// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
//                               kmp_int32 sv);
//
// 32-bit atomic compare-and-swap that returns the value observed in *p.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)		expected value
// 	sv:	12(%esp)	replacement value
//
// return:	%eax = cv when the swap happened, otherwise the value found in *p
// scratch:	%ecx, %edx
        PROC  __kmp_compare_and_store_ret32

        movl      12(%esp), %edx        // %edx = sv
        movl      8(%esp), %eax         // %eax = cv
        movl      4(%esp), %ecx         // %ecx = p
        lock
        cmpxchgl  %edx, (%ecx)          // on mismatch %eax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
509
// kmp_int64
// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
//                               kmp_int64 sv);
//
// 64-bit atomic compare-and-swap on IA-32, built on cmpxchg8b, that returns
// the value observed in *p.
//
// frame layout once %ebp is established:
// 	 8(%ebp)	p
// 	12(%ebp)	cv, low  half
// 	16(%ebp)	cv, high half
// 	20(%ebp)	sv, low  half
// 	24(%ebp)	sv, high half
//
// return:	%edx:%eax = cv when the swap happened, otherwise the value
//		found in *p
// preserved:	%ebx and %edi (saved and restored here)
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx                  // cmpxchg8b needs %ebx for the new value
        pushl     %edi

        movl      24(%ebp), %ecx        // %ecx:%ebx = sv
        movl      20(%ebp), %ebx
        movl      16(%ebp), %edx        // %edx:%eax = cv
        movl      12(%ebp), %eax
        movl      8(%ebp), %edi         // %edi = p
        lock
        cmpxchg8b (%edi)                // on mismatch %edx:%eax is reloaded from *p

        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
533
534
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically replaces *addr with data and returns the value that was
// displaced. The exchange is done on the raw 32-bit pattern with a locked
// xchgl; the returned value is the one the exchange itself produced, so no
// other writer can slip in between "read old" and "write new".
//
// parameters (at entry):
// 	addr:	4(%esp)
// 	data:	8(%esp)
// Once the frame is set up (return address at 4(%ebp)) they are reached as
// 8(%ebp) and 12(%ebp).
//
// return:	%st(0) = previous contents of *addr
// scratch:	%eax
// preserved:	%esi (saved and restored here)
        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp                // one 4-byte scratch slot at -4(%ebp)
        pushl   %esi

        movl    8(%ebp), %esi           // %esi = addr
        movl    12(%ebp), %eax          // %eax = bit pattern of data

        lock
        xchgl   %eax, (%esi)            // *addr = data, %eax = previous bits

        movl    %eax, -4(%ebp)          // move the old bits through memory ...
        flds    -4(%ebp)                // ... onto the x87 stack as the result
                                        // (exactly one push, so the x87 stack
                                        // holds only the return value)

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32
573
574# endif /* !KMP_ASM_INTRINS */
575
//------------------------------------------------------------------------
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[]
// #if OMPT_SUPPORT
//                         ,
//                         void **exit_frame_ptr
// #endif
//                       ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)( & gtid, & tid, argv[0], ... );
//   return 1;
// }
//
// IA-32 version: every argument arrives on the stack. Once %ebp has been
// established the frame looks like this:
//	 8(%ebp)	pkfn
//	12(%ebp)	gtid		(its address is passed to pkfn)
//	16(%ebp)	tid		(its address is passed to pkfn)
//	20(%ebp)	argc
//	24(%ebp)	p_argv
//	28(%ebp)	exit_frame_ptr	(read only when OMPT_SUPPORT)
//	-4(%ebp)	local copy of p_argv
//	-8(%ebp)	second local slot, never referenced below
//	-12(%ebp)	saved %ebx
//
// return:	%eax = 1
// scratch:	%eax, %edx; %ebx is used but restored before returning

// -- Begin __kmp_invoke_microtask
// mark_begin;
	PROC  __kmp_invoke_microtask

	pushl %ebp
	KMP_CFI_DEF_OFFSET 8
	KMP_CFI_OFFSET ebp,-8
	movl %esp,%ebp		// establish the base pointer for this routine.
	KMP_CFI_REGISTER ebp
	subl $8,%esp		// allocate space for two local variables.
				// These variables are:
				//	argv: -4(%ebp)
				//	temp: -8(%ebp)
				//
	pushl %ebx		// save %ebx to use during this routine
				// (lands at -12(%ebp), reloaded before leave)
#if OMPT_SUPPORT
	movl 28(%ebp),%ebx	// get exit_frame address
	movl %ebp,(%ebx)	// save exit_frame
#endif

	// Pre-adjust %esp so that, after the argc+2 pushes below, it sits on a
	// 128-byte boundary when pkfn is called.
	movl 20(%ebp),%ebx	// Stack alignment - # args
	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
	movl %esp,%eax		//
	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
	movl %eax,%ebx		// Save to %ebx
	andl $0xFFFFFF80,%eax	// mask off 7 bits
	subl %eax,%ebx		// Amount to subtract from %esp (the 7 bits masked off)
	subl %ebx,%esp		// Prepare the stack ptr --
				//   now it will be aligned on 128-byte boundary at the call

	movl 24(%ebp),%eax	// copy from p_argv[]
	movl %eax,-4(%ebp)	// into the local variable *argv.

	movl 20(%ebp),%ebx	// argc is 20(%ebp)
	shll $2,%ebx		// %ebx = argc*4, byte offset one past the last argv entry

	// Push p_argv[argc-1] down to p_argv[0]; %ebx counts down in steps of 4
	// and the loop ends once it is no longer positive.
KMP_LABEL(invoke_2):
	cmpl $0,%ebx
	jg  KMP_LABEL(invoke_4)
	jmp KMP_LABEL(invoke_3)
	ALIGN 2
KMP_LABEL(invoke_4):
	movl -4(%ebp),%eax
	subl $4,%ebx			// step back one argv slot.
	addl %ebx,%eax			// index into argv.
	movl (%eax),%edx
	pushl %edx

	jmp KMP_LABEL(invoke_2)
	ALIGN 2
KMP_LABEL(invoke_3):
	leal 16(%ebp),%eax		// push & tid
	pushl %eax

	leal 12(%ebp),%eax		// push & gtid
	pushl %eax

	movl 8(%ebp),%ebx
	call *%ebx			// call (*pkfn)();

	movl $1,%eax			// return 1;

	movl -12(%ebp),%ebx		// restore %ebx
	leave				// drops the pushed arguments, the alignment pad and the locals
	KMP_CFI_DEF esp,4
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
665
666
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Reads the processor time-stamp counter.
//
// return:	%edx:%eax = counter value as delivered by rdtsc
	PROC  __kmp_hardware_timestamp

	rdtsc				// high half -> %edx, low half -> %eax
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
675
676#endif /* KMP_ARCH_X86 */
677
678
679#if KMP_ARCH_X86_64
680
681// -----------------------------------------------------------------------
682// microtasking routines specifically written for IA-32 architecture and
683// Intel(R) 64 running Linux* OS
684// -----------------------------------------------------------------------
685
686// -- Machine type P
687// mark_description "Intel Corporation";
688	.ident "Intel Corporation"
689// --	.file "z_Linux_asm.S"
690	.data
691	ALIGN 4
692
693// To prevent getting our code into .data section .text added to every routine
694// definition for x86_64.
695//------------------------------------------------------------------------
696# if !KMP_ASM_INTRINS
697
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomic fetch-and-add on a 32-bit word.
//
// inputs:
// 	%rdi	p
// 	%esi	d
//
// output:	%eax = contents of *p before d was added
        .text
        PROC  __kmp_test_then_add32

        movl      %esi, %eax            // xadd hands the old value back in its source
        lock
        xaddl     %eax, (%rdi)          // *p += d, %eax = previous *p
        ret

        DEBUG_INFO __kmp_test_then_add32
718
719
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomic fetch-and-add on a 64-bit word.
//
// inputs:
// 	%rdi	p
// 	%rsi	d
//
// output:	%rax = contents of *p before d was added
        .text
        PROC  __kmp_test_then_add64

        movq      %rsi, %rax            // xadd hands the old value back in its source
        lock
        xaddq     %rax, (%rdi)          // *p += d, %rax = previous *p
        ret

        DEBUG_INFO __kmp_test_then_add64
739
740
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores the byte d into *p and returns the byte it replaced.
//
// inputs:
// 	%rdi	p
// 	%sil	d
//
// output:	%al (the rest of %rax is not written by this routine)
        .text
        PROC  __kmp_xchg_fixed8

        movb      %sil, %al             // %al = d

        lock
        xchgb     %al, (%rdi)           // swap: *p = d, %al = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8
762
763
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores the 16-bit value d into *p and returns the value it
// replaced.
//
// inputs:
// 	%rdi	p
// 	%si	d
//
// output:	%ax (the rest of %rax is not written by this routine)
        .text
        PROC  __kmp_xchg_fixed16

        movw      %si, %ax              // %ax = d

        lock
        xchgw     %ax, (%rdi)           // swap: *p = d, %ax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16
784
785
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores the 32-bit value d into *p and returns the value it
// replaced.
//
// inputs:
// 	%rdi	p
// 	%esi	d
//
// output:	%eax
        .text
        PROC  __kmp_xchg_fixed32

        movl      %esi, %eax            // %eax = d

        lock
        xchgl     %eax, (%rdi)          // swap: *p = d, %eax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32
807
808
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically stores the 64-bit value d into *p and returns the value it
// replaced.
//
// inputs:
// 	%rdi	p
// 	%rsi	d
//
// output:	%rax
        .text
        PROC  __kmp_xchg_fixed64

        movq      %rsi, %rax            // %rax = d

        lock
        xchgq     %rax, (%rdi)          // swap: *p = d, %rax = previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed64
829
830
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Byte-wide atomic compare-and-swap that reports success or failure.
//
// inputs:
// 	%rdi	p
// 	%sil	cv	(expected value)
// 	%dl	sv	(replacement value)
//
// output:	%rax = 1 if *p equalled cv and now holds sv, otherwise 0
        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al             // cmpxchg takes its comparand from %al
        lock
        cmpxchgb  %dl, (%rdi)           // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andq      $1, %rax              // clear every bit above bit 0
        ret

        DEBUG_INFO __kmp_compare_and_store8
854
855
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// 16-bit atomic compare-and-swap that reports success or failure.
//
// inputs:
// 	%rdi	p
// 	%si	cv	(expected value)
// 	%dx	sv	(replacement value)
//
// output:	%rax = 1 if *p equalled cv and now holds sv, otherwise 0
        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax              // cmpxchg takes its comparand from %ax
        lock
        cmpxchgw  %dx, (%rdi)           // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andq      $1, %rax              // clear every bit above bit 0
        ret

        DEBUG_INFO __kmp_compare_and_store16
879
880
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// 32-bit atomic compare-and-swap that reports success or failure.
//
// inputs:
// 	%rdi	p
// 	%esi	cv	(expected value)
// 	%edx	sv	(replacement value)
//
// output:	%rax = 1 if *p equalled cv and now holds sv, otherwise 0
        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax            // cmpxchg takes its comparand from %eax
        lock
        cmpxchgl  %edx, (%rdi)          // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andq      $1, %rax              // clear every bit above bit 0
        ret

        DEBUG_INFO __kmp_compare_and_store32
904
905
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// 64-bit atomic compare-and-swap that reports success or failure.
//
// inputs:
// 	%rdi	p
// 	%rsi	cv	(expected value)
// 	%rdx	sv	(replacement value)
//
// output:	%rax = 1 if *p equalled cv and now holds sv, otherwise 0
        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax            // cmpxchg takes its comparand from %rax
        lock
        cmpxchgq  %rdx, (%rdi)          // ZF = 1 when the store took place
        sete      %al                   // %al = ZF
        andq      $1, %rax              // clear every bit above bit 0
        ret

        DEBUG_INFO __kmp_compare_and_store64
928
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Byte-wide atomic compare-and-swap that returns the value observed in *p.
//
// inputs:
// 	%rdi	p
// 	%sil	cv	(expected value)
// 	%dl	sv	(replacement value)
//
// output:	%al = cv when the swap happened, otherwise the value found in *p
        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al             // cmpxchg takes its comparand from %al
        lock
        cmpxchgb  %dl, (%rdi)           // on mismatch %al is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
950
951
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// 16-bit atomic compare-and-swap that returns the value observed in *p.
//
// inputs:
// 	%rdi	p
// 	%si	cv	(expected value)
// 	%dx	sv	(replacement value)
//
// output:	%ax = cv when the swap happened, otherwise the value found in *p
        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax              // cmpxchg takes its comparand from %ax
        lock
        cmpxchgw  %dx, (%rdi)           // on mismatch %ax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
973
974
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// 32-bit atomic compare-and-swap that returns the value observed in *p.
//
// inputs:
// 	%rdi	p
// 	%esi	cv	(expected value)
// 	%edx	sv	(replacement value)
//
// output:	%eax = cv when the swap happened, otherwise the value found in *p
        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax            // cmpxchg takes its comparand from %eax
        lock
        cmpxchgl  %edx, (%rdi)          // on mismatch %eax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
996
997
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// 64-bit atomic compare-and-swap that returns the value observed in *p.
//
// inputs:
// 	%rdi	p
// 	%rsi	cv	(expected value)
// 	%rdx	sv	(replacement value)
//
// output:	%rax = cv when the swap happened, otherwise the value found in *p
        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax            // cmpxchg takes its comparand from %rax
        lock
        cmpxchgq  %rdx, (%rdi)          // on mismatch %rax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
1018
1019# endif /* !KMP_ASM_INTRINS */
1020
1021
1022# if !KMP_MIC
1023
1024# if !KMP_ASM_INTRINS
1025
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically replaces the 32-bit value at *addr with data and returns the
// value that was displaced. The bits travel through %eax so that an integer
// xchg can perform the swap.
//
// inputs:
// 	%rdi	addr
// 	%xmm0	data (low 4 bytes)
//
// output:	%xmm0 (low 4 bytes) = previous contents of *addr
// scratch:	%eax
        .text
        PROC  __kmp_xchg_real32

        movd      %xmm0, %eax           // %eax = bit pattern of data

        lock
        xchgl     %eax, (%rdi)          // swap: *addr = data, %eax = previous bits

        movd      %eax, %xmm0           // hand the previous bits back in %xmm0

        ret

        DEBUG_INFO __kmp_xchg_real32
1050
1051
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically replaces the 64-bit value at *addr with data and returns the
// value that was displaced. The bits travel through %rax so that an integer
// xchg can perform the swap.
//
// inputs:
// 	%rdi	addr
// 	%xmm0	data (low 8 bytes)
//
// output:	%xmm0 (low 8 bytes) = previous contents of *addr
// scratch:	%rax
        .text
        PROC  __kmp_xchg_real64

        movd      %xmm0, %rax           // %rax = bit pattern of data
                                        // (movd spelling kept for the 64-bit move)

        lock
        xchgq     %rax, (%rdi)          // swap: *addr = data, %rax = previous bits

        movd      %rax, %xmm0           // hand the previous bits back in %xmm0
        ret

        DEBUG_INFO __kmp_xchg_real64
1074
1075
# endif /* !KMP_ASM_INTRINS */

# endif /* !KMP_MIC */
1079
1080//------------------------------------------------------------------------
1081// int
1082// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1083//                         int gtid, int tid,
1084//                         int argc, void *p_argv[]
1085// #if OMPT_SUPPORT
1086//                         ,
1087//                         void **exit_frame_ptr
1088// #endif
1089//                       ) {
1090// #if OMPT_SUPPORT
1091//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1092// #endif
1093//
1094//   (*pkfn)( & gtid, & tid, argv[0], ... );
1095//   return 1;
1096// }
1097//
1098// note: at call to pkfn must have %rsp 128-byte aligned for compiler
1099//
1100// parameters:
1101//      %rdi:  	pkfn
1102//	%esi:	gtid
1103//	%edx:	tid
1104//	%ecx:	argc
1105//	%r8:	p_argv
1106//	%r9:	&exit_frame
1107//
1108// locals:
1109//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1110//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1111//
1112// reg temps:
1113//	%rax:	used all over the place
1114//	%rdx:	used in stack pointer alignment calculation
1115//	%r11:	used to traverse p_argv array
1116//	%rsi:	used as temporary for stack parameters
1117//		used as temporary for number of pkfn parms to push
1118//	%rbx:	used to hold pkfn address, and zero constant, callee-save
1119//
1120// return:	%eax 	(always 1/TRUE)
__gtid = -16
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
	PROC  __kmp_invoke_microtask

// Frame layout once the three pushes below have executed:
//	  0(%rbp):	caller's %rbp
//	 -8(%rbp):	saved %rbx
//	__gtid(%rbp):	pushed %rsi (gtid in its low 32 bits)
//	__tid(%rbp):	pushed %rdx (tid in its low 32 bits)

	pushq 	%rbp		// save base pointer
	KMP_CFI_DEF_OFFSET 16
	KMP_CFI_OFFSET rbp,-16
	movq 	%rsp,%rbp	// establish the base pointer for this routine.
	KMP_CFI_REGISTER rbp

#if OMPT_SUPPORT
	movq	%rbp, (%r9)	// save exit_frame
#endif

	pushq 	%rbx		// %rbx is callee-saved register
	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn

	// argc is a 32-bit int passed in %ecx; the calling convention leaves
	// bits 63:32 of %rcx unspecified.  Everything below works on the full
	// 64-bit register, so widen the value once here.
	movslq	%ecx, %rcx	// sign-extend argc into %rcx

	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
	movq	$0, %rbx	// constant for cmovs later
	subq	$4, %rax	// subtract four args passed in registers to pkfn
#if KMP_MIC
	js	KMP_LABEL(kmp_0)	// jump to movq
	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
KMP_LABEL(kmp_0):
	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
KMP_LABEL(kmp_0_exit):
#else
	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
#endif // KMP_MIC

	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8

	// Pre-pad %rsp so that, after the stack parameters have been pushed,
	// it lands exactly on a 128-byte boundary.
	movq 	%rsp, %rdx	//
	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
				// without align, stack ptr would be this
	movq 	%rdx, %rax	// Save to %rax

	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
	subq 	%rax, %rdx	// Amount to subtract from %rsp
	subq 	%rdx, %rsp	// Prepare the stack ptr --
				// now %rsp will align to 128-byte boundary at call site

				// setup pkfn parameter reg and stack
	movq	%rcx, %rax	// argc -> %rax
	cmpq	$0, %rsi
	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
	shlq	$3, %rcx	// argc*8 -> %rcx
	movq 	%r8, %rdx	// p_argv -> %rdx
	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx

	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx

KMP_LABEL(kmp_invoke_push_parms):
	// push nth - 7th parms to pkfn on stack
	subq	$8, %rdx	// decrement p_argv pointer to previous parm
	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
	subl	$1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
//		if the name of the label that is an operand of this jecxz starts with a dot (".");
//	   Apple's linker does not support 1-byte length relocation;
//         Resolution: replace all .labelX entries with L_labelX.

	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
	jmp	KMP_LABEL(kmp_invoke_push_parms)
	ALIGN 3
KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
				// order here is important to avoid trashing
				// registers used for both input and output parms!
	movq	%rdi, %rbx	// pkfn -> %rbx
	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
	// Check if argc is 0
	cmpq $0, %rax
	je KMP_LABEL(kmp_no_args) // Jump ahead

	movq	%r8, %r11	// p_argv -> %r11

#if KMP_MIC
	cmpq	$4, %rax	// argc >= 4?
	jns	KMP_LABEL(kmp_4)	// jump to movq
	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
KMP_LABEL(kmp_4):
	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
KMP_LABEL(kmp_4_exit):

	cmpq	$3, %rax	// argc >= 3?
	jns	KMP_LABEL(kmp_3)	// jump to movq
	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
KMP_LABEL(kmp_3):
	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
KMP_LABEL(kmp_3_exit):

	cmpq	$2, %rax	// argc >= 2?
	jns	KMP_LABEL(kmp_2)	// jump to movq
	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
KMP_LABEL(kmp_2):
	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
KMP_LABEL(kmp_2_exit):

	cmpq	$1, %rax	// argc >= 1?
	jns	KMP_LABEL(kmp_1)	// jump to movq
	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
KMP_LABEL(kmp_1):
	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
KMP_LABEL(kmp_1_exit):
#else
	// NOTE(review): cmov with a memory source reads the operand even when
	// the condition is false, so p_argv[1..3] are loaded for any argc >= 1.
	// Presumably callers always hand over readable memory there -- confirm.
	cmpq	$4, %rax	// argc >= 4?
	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)

	cmpq	$3, %rax	// argc >= 3?
	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)

	cmpq	$2, %rax	// argc >= 2?
	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)

	cmpq	$1, %rax	// argc >= 1?
	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // KMP_MIC

KMP_LABEL(kmp_no_args):
	call	*%rbx		// call (*pkfn)();
	movq	$1, %rax	// move 1 into return register;

	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
	movq 	%rbp, %rsp	// restore stack pointer
	popq 	%rbp		// restore frame pointer
	KMP_CFI_DEF rsp,8
	ret

	DEBUG_INFO __kmp_invoke_microtask
1259// -- End  __kmp_invoke_microtask
1260
1261// kmp_uint64
1262// __kmp_hardware_timestamp(void)
1263        .text
1264	PROC  __kmp_hardware_timestamp
1265	rdtsc
1266	shlq    $32, %rdx
1267	orq     %rdx, %rax
1268	ret
1269
1270	DEBUG_INFO __kmp_hardware_timestamp
1271// -- End  __kmp_hardware_timestamp
1272
1273//------------------------------------------------------------------------
1274// FUNCTION __kmp_bsr32
1275//
1276// int
1277// __kmp_bsr32( int );
1278        .text
1279        PROC  __kmp_bsr32
1280
1281        bsr    %edi,%eax
1282        ret
1283
1284        DEBUG_INFO __kmp_bsr32
1285
1286// -----------------------------------------------------------------------
1287#endif /* KMP_ARCH_X86_64 */
1288
1289// '
1290#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
1291
1292//------------------------------------------------------------------------
1293// int
1294// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1295//                         int gtid, int tid,
1296//                         int argc, void *p_argv[]
1297// #if OMPT_SUPPORT
1298//                         ,
1299//                         void **exit_frame_ptr
1300// #endif
1301//                       ) {
1302// #if OMPT_SUPPORT
1303//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1304// #endif
1305//
1306//   (*pkfn)( & gtid, & tid, argv[0], ... );
1307//
1308// // FIXME: This is done at call-site and can be removed here.
1309// #if OMPT_SUPPORT
1310//   *exit_frame_ptr = 0;
1311// #endif
1312//
1313//   return 1;
1314// }
1315//
1316// parameters:
1317//	x0:	pkfn
1318//	w1:	gtid
1319//	w2:	tid
1320//	w3:	argc
1321//	x4:	p_argv
1322//	x5:	&exit_frame
1323//
1324// locals:
1325//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1326//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1327//
1328// reg temps:
1329//	 x8:	used to hold pkfn address
1330//	 w9:	used as temporary for number of pkfn parms
1331//	x10:	used to traverse p_argv array
1332//	x11:	used as temporary for stack placement calculation
1333//	x12:	used as temporary for stack parameters
1334//	x19:	used to preserve exit_frame_ptr, callee-save
1335//
1336// return:	w0	(always 1/TRUE)
1337//
1338
__gtid = 4
__tid = 8

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	PROC __kmp_invoke_microtask
	PACBTI_C

	// Save fp/lr; with OMPT also save x19 (x20 goes along so the push
	// stays a 16-byte pair).  x29 then marks the base of the local area:
	// gtid lives at [x29 - __gtid] and tid at [x29 - __tid].
	stp	x29, x30, [sp, #-16]!
# if OMPT_SUPPORT
	stp	x19, x20, [sp, #-16]!
# endif
	mov	x29, sp

	// Reserve 16 * (1 + argc/2) bytes: the topmost 16 hold gtid/tid, the
	// rest receives the pkfn arguments that do not fit into x2-x7.
	mov	w9, #1
	add	w9, w9, w3, lsr #1
	sub	sp, sp, w9, uxtw #4
	mov	x11, sp			// x11 walks the outgoing stack-argument area

	str	w1, [x29, #-__gtid]
	str	w2, [x29, #-__tid]
	mov	x8, x0			// pkfn; x0 is about to be reused
	mov	x10, x4			// x10 walks p_argv
	mov	w9, w3			// w9 counts the arguments still to place
# if OMPT_SUPPORT
	mov	x19, x5			// keep exit_frame_ptr across the call
	str	x29, [x19]
# endif

	// First two pkfn arguments: &gtid and &tid.
	sub	x0, x29, #__gtid
	sub	x1, x29, #__tid

	// Up to six p_argv entries travel in x2-x7; leave as soon as argc is
	// exhausted.
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x2, [x10]

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x3, [x10, #8]!

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x4, [x10, #8]!

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x5, [x10, #8]!

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x6, [x10, #8]!

	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x7, [x10, #8]!

	// Whatever is left is copied, in order, to the stack area at x11.
KMP_LABEL(kmp_copy_stack_parm):
	sub	w9, w9, #1
	cbz	w9, KMP_LABEL(kmp_call_pkfn)
	ldr	x12, [x10, #8]!
	str	x12, [x11], #8
	b	KMP_LABEL(kmp_copy_stack_parm)
KMP_LABEL(kmp_call_pkfn):
	blr	x8
	mov	w0, #1			// return value is always 1
	mov	sp, x29			// drop the dynamically sized area
# if OMPT_SUPPORT
	str	xzr, [x19]		// clear *exit_frame_ptr
	ldp	x19, x20, [sp], #16
# endif
	ldp	x29, x30, [sp], #16
	PACBTI_RET
	ret

	DEBUG_INFO __kmp_invoke_microtask
1414// -- End  __kmp_invoke_microtask
1415
1416#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
1417
1418#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
1419
1420//------------------------------------------------------------------------
1421// int
1422// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1423//                         int gtid, int tid,
1424//                         int argc, void *p_argv[]
1425// #if OMPT_SUPPORT
1426//                         ,
1427//                         void **exit_frame_ptr
1428// #endif
1429//                       ) {
1430// #if OMPT_SUPPORT
1431//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1432// #endif
1433//
1434//   (*pkfn)( & gtid, & tid, argv[0], ... );
1435//
1436// // FIXME: This is done at call-site and can be removed here.
1437// #if OMPT_SUPPORT
1438//   *exit_frame_ptr = 0;
1439// #endif
1440//
1441//   return 1;
1442// }
1443//
1444// parameters:
1445//	r0:	pkfn
1446//	r1:	gtid
1447//	r2:	tid
1448//	r3:	argc
1449//	r4(stack):	p_argv
1450//	r5(stack):	&exit_frame
1451//
1452// locals:
1453//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1454//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1455//
1456// reg temps:
1457//	 r4:	used to hold pkfn address
1458//	 r5:	used as temporary for number of pkfn parms
1459//	 r6:	used to traverse p_argv array
1460//	 r7:	frame pointer (in some configurations)
1461//	 r8:	used as temporary for stack placement calculation
1462//	 	and as pointer to base of callee saved area
//	r12:	used as temporary for stack parameters
1464//	r10:	used to preserve exit_frame_ptr, callee-save
1465//	r11:	frame pointer (in some configurations)
1466//
1467// return:	r0	(always 1/TRUE)
1468//
1469
__gtid = 4
__tid = 8

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	PROC __kmp_invoke_microtask

	// Ten registers are pushed (40 bytes); r3 is included purely as
	// padding so the stack stays aligned for the call to pkfn.
	push	{r3-r11,lr}
	// The 5th and 6th arguments sit just above the pushed registers.
	ldr	r4, [sp, #10*4]		// p_argv
# if OMPT_SUPPORT
	ldr	r5, [sp, #11*4]		// &exit_frame
# endif

	// FP is the frame-pointer register for this configuration and FPOFF
	// is the offset of its save slot within the block pushed above.
# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
# define FP r7
# define FPOFF 4*4
#else
# define FP r11
# define FPOFF 8*4
#endif
	add	FP, sp, #FPOFF
# if OMPT_SUPPORT
	mov	r10, r5			// keep exit_frame_ptr across the call
	str	FP, [r10]
# endif
	mov	r8, sp			// r8 = base of the callee-saved area

	// Stack reservation, in units of 8 bytes: 1 + argc/2 units.
	// Only 4*(argc-2) bytes are strictly required for the arguments
	// (two of them travel in r2/r3), but reserving roughly 4*argc avoids
	// special-casing argc < 2, and rounding argc/2 down is harmless
	// because of that slack.  The additional unit holds gtid and tid.
	mov	r5, #1
	add	r5, r5, r3, lsr #1
	sub	sp, sp, r5, lsl #3

	str	r1, [r8, #-__gtid]
	str	r2, [r8, #-__tid]
	mov	r5, r3			// r5 counts the arguments still to place
	mov	r6, r4			// r6 walks p_argv
	mov	r4, r0			// pkfn; r0 is about to be reused

	// First two pkfn arguments: &gtid and &tid inside our frame.
	sub	r0, r8, #__gtid
	sub	r1, r8, #__tid

	mov	r8, sp			// r8 now walks the outgoing stack area

	// p_argv[0] and p_argv[1] go into r2 and r3 when they exist.
	cmp	r5, #0
	beq	KMP_LABEL(kmp_call_pkfn)
	ldr	r2, [r6]

	subs	r5, r5, #1
	beq	KMP_LABEL(kmp_call_pkfn)
	ldr	r3, [r6, #4]!

	// Remaining p_argv entries are copied, in order, onto the stack.
KMP_LABEL(kmp_copy_stack_parm):
	subs	r5, r5, #1
	beq	KMP_LABEL(kmp_call_pkfn)
	ldr	r12, [r6, #4]!
	str	r12, [r8], #4
	b	KMP_LABEL(kmp_copy_stack_parm)
KMP_LABEL(kmp_call_pkfn):
	blx	r4
	mov	r0, #1			// return value is always 1

	// Recover the sp value that was current right after the initial push.
	sub	r4, FP, #FPOFF
	mov	sp, r4
# undef FP
# undef FPOFF

# if OMPT_SUPPORT
	mov	r1, #0
	str	r1, [r10]		// clear *exit_frame_ptr
# endif
	pop	{r3-r11,pc}

	DEBUG_INFO __kmp_invoke_microtask
1559// -- End  __kmp_invoke_microtask
1560
1561#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
1562
1563#if KMP_ARCH_PPC64
1564
1565//------------------------------------------------------------------------
1566// int
1567// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1568//                         int gtid, int tid,
1569//                         int argc, void *p_argv[]
1570// #if OMPT_SUPPORT
1571//                         ,
1572//                         void **exit_frame_ptr
1573// #endif
1574//                       ) {
1575// #if OMPT_SUPPORT
1576//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1577// #endif
1578//
1579//   (*pkfn)( & gtid, & tid, argv[0], ... );
1580//
1581// // FIXME: This is done at call-site and can be removed here.
1582// #if OMPT_SUPPORT
1583//   *exit_frame_ptr = 0;
1584// #endif
1585//
1586//   return 1;
1587// }
1588//
1589// parameters:
1590//	r3:	pkfn
1591//	r4:	gtid
1592//	r5:	tid
1593//	r6:	argc
1594//	r7:	p_argv
1595//	r8:	&exit_frame
1596//
1597// return:	r3	(always 1/TRUE)
1598//
1599	.text
1600# if KMP_ARCH_PPC64_ELFv2
1601	.abiversion 2
1602# endif
1603	.globl	__kmp_invoke_microtask
1604
1605# if KMP_ARCH_PPC64_ELFv2
1606	.p2align	4
1607# else
1608	.p2align	2
1609# endif
1610
1611	.type	__kmp_invoke_microtask,@function
1612
1613# if KMP_ARCH_PPC64_ELFv2
1614__kmp_invoke_microtask:
1615.Lfunc_begin0:
1616.Lfunc_gep0:
1617	addis 2, 12, .TOC.-.Lfunc_gep0@ha
1618	addi 2, 2, .TOC.-.Lfunc_gep0@l
1619.Lfunc_lep0:
1620	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1621# else
1622	.section	.opd,"aw",@progbits
1623__kmp_invoke_microtask:
1624	.p2align	3
1625	.quad	.Lfunc_begin0
1626	.quad	.TOC.@tocbase
1627	.quad	0
1628	.text
1629.Lfunc_begin0:
1630# endif
1631
1632// -- Begin __kmp_invoke_microtask
1633// mark_begin;
1634
1635// We need to allocate a stack frame large enough to hold all of the parameters
1636// on the stack for the microtask plus what this function needs. That's 48
1637// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1638// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1639// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1640// to save r30 to hold a copy of r8.
1641
1642	.cfi_startproc
1643	mflr 0
1644	std 31, -8(1)
1645	std 0, 16(1)
1646
1647// This is unusual because normally we'd set r31 equal to r1 after the stack
1648// frame is established. In this case, however, we need to dynamically compute
1649// the stack frame size, and so we keep a direct copy of r1 to access our
1650// register save areas and restore the r1 value before returning.
1651	mr 31, 1
1652	.cfi_def_cfa_register r31
1653	.cfi_offset r31, -8
1654	.cfi_offset lr, 16
1655
1656// Compute the size necessary for the local stack frame.
1657# if KMP_ARCH_PPC64_ELFv2
1658	li 12, 72
1659# else
1660	li 12, 88
1661# endif
1662	sldi 0, 6, 3
1663	add 12, 0, 12
1664	neg 12, 12
1665
1666// We need to make sure that the stack frame stays aligned (to 16 bytes).
1667	li 0, -16
1668	and 12, 0, 12
1669
1670// Establish the local stack frame.
1671	stdux 1, 1, 12
1672
1673# if OMPT_SUPPORT
1674	.cfi_offset r30, -16
1675	std 30, -16(31)
1676	std 1, 0(8)
1677	mr 30, 8
1678# endif
1679
1680// Store gtid and tid to the stack because they're passed by reference to the microtask.
1681	stw 4, -20(31)
1682	stw 5, -24(31)
1683
1684	mr 12, 6
1685	mr 4, 7
1686
1687	cmpwi 0, 12, 1
1688	blt	 0, .Lcall
1689
1690	ld 5, 0(4)
1691
1692	cmpwi 0, 12, 2
1693	blt	 0, .Lcall
1694
1695	ld 6, 8(4)
1696
1697	cmpwi 0, 12, 3
1698	blt	 0, .Lcall
1699
1700	ld 7, 16(4)
1701
1702	cmpwi 0, 12, 4
1703	blt	 0, .Lcall
1704
1705	ld 8, 24(4)
1706
1707	cmpwi 0, 12, 5
1708	blt	 0, .Lcall
1709
1710	ld 9, 32(4)
1711
1712	cmpwi 0, 12, 6
1713	blt	 0, .Lcall
1714
1715	ld 10, 40(4)
1716
1717	cmpwi 0, 12, 7
1718	blt	 0, .Lcall
1719
1720// There are more than 6 microtask parameters, so we need to store the
1721// remainder to the stack.
1722	addi 12, 12, -6
1723	mtctr 12
1724
1725// These are set to 8 bytes before the first desired store address (we're using
1726// pre-increment loads and stores in the loop below). The parameter save area
1727// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1728// 32 + 8*8 == 96 bytes above r1 for ELFv2.
1729	addi 4, 4, 40
1730# if KMP_ARCH_PPC64_ELFv2
1731	addi 12, 1, 88
1732# else
1733	addi 12, 1, 104
1734# endif
1735
1736.Lnext:
1737	ldu 0, 8(4)
1738	stdu 0, 8(12)
1739	bdnz .Lnext
1740
1741.Lcall:
1742# if KMP_ARCH_PPC64_ELFv2
1743	std 2, 24(1)
1744	mr 12, 3
1745#else
1746	std 2, 40(1)
1747// For ELFv1, we need to load the actual function address from the function descriptor.
1748	ld 12, 0(3)
1749	ld 2, 8(3)
1750	ld 11, 16(3)
1751#endif
1752
1753	addi 3, 31, -20
1754	addi 4, 31, -24
1755
1756	mtctr 12
1757	bctrl
1758# if KMP_ARCH_PPC64_ELFv2
1759	ld 2, 24(1)
1760# else
1761	ld 2, 40(1)
1762# endif
1763
1764# if OMPT_SUPPORT
1765	li 3, 0
1766	std 3, 0(30)
1767# endif
1768
1769	li 3, 1
1770
1771# if OMPT_SUPPORT
1772	ld 30, -16(31)
1773# endif
1774
1775	mr 1, 31
1776	ld 0, 16(1)
1777	ld 31, -8(1)
1778	mtlr 0
1779	blr
1780
1781	.long	0
1782	.quad	0
1783.Lfunc_end0:
1784	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1785	.cfi_endproc
1786
1787// -- End  __kmp_invoke_microtask
1788
1789#endif /* KMP_ARCH_PPC64 */
1790
1791#if KMP_ARCH_RISCV64
1792
1793//------------------------------------------------------------------------
1794//
1795// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1796//
1797// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1798//                            void *p_argv[]
1799// #if OMPT_SUPPORT
1800//                            ,
1801//                            void **exit_frame_ptr
1802// #endif
1803//                            ) {
1804// #if OMPT_SUPPORT
1805//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1806// #endif
1807//
1808//   (*pkfn)(&gtid, &tid, argv[0], ...);
1809//
1810//   return 1;
1811// }
1812//
1813// Parameters:
1814//   a0: pkfn
1815//   a1: gtid
1816//   a2: tid
1817//   a3: argc
1818//   a4: p_argv
1819//   a5: exit_frame_ptr
1820//
1821// Locals:
1822//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1823//   __tid: tid param pushed on stack so can pass &tid to pkfn
1824//
1825// Temp. registers:
1826//
1827//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1828//  t1: used as temporary for stack placement calculation
1829//  t2: used as temporary for stack arguments
1830//  t3: used as temporary for number of remaining pkfn parms
1831//  t4: used to traverse p_argv array
1832//
1833// return: a0 (always 1/TRUE)
1834//
1835
__gtid = -20
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	.globl	__kmp_invoke_microtask
	.p2align	1
	.type	__kmp_invoke_microtask,@function
__kmp_invoke_microtask:
	.cfi_startproc

	// Fixed 16-byte part of the frame: ra at fp-8, caller's fp at fp-16.
	addi	sp, sp, -16
	sd	ra, 8(sp)
	sd	fp, 0(sp)
	addi	fp, sp, 16
	.cfi_def_cfa	fp, 0
	.cfi_offset	ra, -8
	.cfi_offset	fp, -16

	// Dynamic part of the frame:
	//
	// - 8 bytes so that 'gtid' and 'tid' have an address and can be
	//   passed by reference
	// - 8 bytes for every pkfn argument that does not fit in a register.
	//   pkfn receives two + 'argc' arguments (&gtid and &tid come first)
	//   and a0-a7 provide eight slots, which leaves max(0, argc - 6)
	//   arguments for the stack
	//
	// Total: max(0, argc - 6)*8 + 8 bytes.

	// max(0, x) is computed branch-free as x - (x & (x >> 31)),
	// with x := argc - 6.
	// Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
	addi	t0, a3, -6
	srai	t1, t0, 31
	and	t1, t0, t1
	sub	t0, t0, t1

	addi	t0, t0, 1		// one more slot for gtid/tid

	slli	t0, t0, 3		// slots -> bytes
	sub	sp, sp, t0

	// Keep sp on a 16-byte boundary
	andi	sp, sp, -16

	mv	t0, a0			// pkfn; a0 is about to be reused
	mv	t3, a3			// t3 counts the arguments still to place
	mv	t4, a4			// t4 = p_argv

#if OMPT_SUPPORT
	// Record our frame pointer in *exit_frame_ptr
	sd	fp, 0(a5)
#endif

	// Set up the pkfn arguments, registers a0-a7 first

	sw	a1, __gtid(fp)
	sw	a2, __tid(fp)

	addi	a0, fp, __gtid		// &gtid
	addi	a1, fp, __tid		// &tid

	beqz	t3, .L_kmp_call_pkfn
	ld	a2, 0(t4)

	addi	t3, t3, -1
	beqz	t3, .L_kmp_call_pkfn
	ld	a3, 8(t4)

	addi	t3, t3, -1
	beqz	t3, .L_kmp_call_pkfn
	ld	a4, 16(t4)

	addi	t3, t3, -1
	beqz	t3, .L_kmp_call_pkfn
	ld	a5, 24(t4)

	addi	t3, t3, -1
	beqz	t3, .L_kmp_call_pkfn
	ld	a6, 32(t4)

	addi	t3, t3, -1
	beqz	t3, .L_kmp_call_pkfn
	ld	a7, 40(t4)

	// Anything beyond the sixth p_argv entry is copied to the stack,
	// starting at sp
	addi	t4, t4, 48
	mv	t1, sp
	j .L_kmp_copy_check
.L_kmp_copy_parm:
	ld	t2, 0(t4)
	sd	t2, 0(t1)
	addi	t4, t4, 8
	addi	t1, t1, 8
.L_kmp_copy_check:
	addi	t3, t3, -1
	bnez	t3, .L_kmp_copy_parm

.L_kmp_call_pkfn:
	// Invoke pkfn
	jalr	t0

	// Return value is always 1
	li	a0, 1

	// Unwind: sp is recomputed from fp, so the dynamic area needs no
	// bookkeeping
	addi	sp, fp, -16
	ld	fp, 0(sp)
	ld	ra, 8(sp)
	addi	sp, sp, 16
	ret
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
	.cfi_endproc
1953
1954// -- End  __kmp_invoke_microtask
1955
1956#endif /* KMP_ARCH_RISCV64 */
1957
1958#if KMP_ARCH_LOONGARCH64
1959
1960//------------------------------------------------------------------------
1961//
1962// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1963//
1964// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1965//                            void *p_argv[]
1966// #if OMPT_SUPPORT
1967//                            ,
1968//                            void **exit_frame_ptr
1969// #endif
1970//                            ) {
1971// #if OMPT_SUPPORT
1972//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1973// #endif
1974//
1975//   (*pkfn)(&gtid, &tid, argv[0], ...);
1976//
1977//   return 1;
1978// }
1979//
1980// Parameters:
1981//   a0: pkfn
1982//   a1: gtid
1983//   a2: tid
1984//   a3: argc
1985//   a4: p_argv
1986//   a5: exit_frame_ptr
1987//
1988// Locals:
1989//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1990//   __tid: tid param pushed on stack so can pass &tid to pkfn
1991//
1992// Temp registers:
1993//
1994//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1995//  t1: used as temporary for stack placement calculation
1996//  t2: used as temporary for stack arguments
1997//  t3: used as temporary for number of remaining pkfn parms
1998//  t4: used to traverse p_argv array
1999//
2000// return: a0 (always 1/TRUE)
2001//
2002
// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	.globl	__kmp_invoke_microtask
	.p2align	2
	.type	__kmp_invoke_microtask,@function
__kmp_invoke_microtask:
	.cfi_startproc

	// Fixed 16-byte part of the frame: ra at fp-8, caller's fp at fp-16.
	addi.d	$sp, $sp, -16
	st.d	$ra, $sp, 8
	st.d	$fp, $sp, 0
	addi.d	$fp, $sp, 16
	.cfi_def_cfa	22, 0
	.cfi_offset	1, -8
	.cfi_offset	22, -16

	// Dynamic part of the frame:
	//
	// - 8 bytes so that 'gtid' and 'tid' have an address and can be
	//   passed by reference
	// - 8 bytes for every pkfn argument that does not fit in a register.
	//   pkfn receives two + 'argc' arguments (&gtid and &tid come first)
	//   and a0-a7 provide eight slots, which leaves max(0, argc - 6)
	//   arguments for the stack
	//
	// Total: max(0, argc - 6)*8 + 8 bytes.

	addi.d  $t0, $a3, -6		// x = argc - 6
	slt  $t1, $t0, $zero		// t1 = (x < 0)
	masknez  $t0, $t0, $t1		// x = t1 ? 0 : x, i.e. max(0, x)
	addi.d  $t0, $t0, 1		// one more slot for gtid/tid
	slli.d	$t0, $t0, 3		// slots -> bytes
	sub.d	$sp, $sp, $t0

	// Keep sp on a 16-byte boundary (clear its low four bits)
	bstrins.d $sp, $zero, 3, 0

	move	$t0, $a0		// pkfn; a0 is about to be reused
	move	$t3, $a3		// t3 counts the arguments still to place
	move	$t4, $a4		// t4 = p_argv

#if OMPT_SUPPORT
	// Record our frame pointer in *exit_frame_ptr
	st.d	$fp, $a5, 0
#endif

	// Set up the pkfn arguments, registers a0-a7 first

	st.w	$a1, $fp, -20		// gtid
	st.w	$a2, $fp, -24		// tid

	addi.d	$a0, $fp, -20		// &gtid
	addi.d	$a1, $fp, -24		// &tid

	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a2, $t4, 0

	addi.d	$t3, $t3, -1
	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a3, $t4, 8

	addi.d	$t3, $t3, -1
	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a4, $t4, 16

	addi.d	$t3, $t3, -1
	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a5, $t4, 24

	addi.d	$t3, $t3, -1
	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a6, $t4, 32

	addi.d	$t3, $t3, -1
	beqz	$t3, .L_kmp_call_pkfn
	ld.d	$a7, $t4, 40

	// Anything beyond the sixth p_argv entry is copied to the stack,
	// starting at sp
	addi.d	$t4, $t4, 48
	move	$t1, $sp
	b .L_kmp_copy_check
.L_kmp_copy_parm:
	ld.d	$t2, $t4, 0
	st.d	$t2, $t1, 0
	addi.d	$t4, $t4, 8
	addi.d	$t1, $t1, 8
.L_kmp_copy_check:
	addi.d	$t3, $t3, -1
	bnez	$t3, .L_kmp_copy_parm

.L_kmp_call_pkfn:
	// Invoke pkfn
	jirl	$ra, $t0, 0

	// Return value is always 1
	ori	$a0, $zero, 1

	// Unwind: sp is recomputed from fp, so the dynamic area needs no
	// bookkeeping
	addi.d	$sp, $fp, -16
	ld.d	$fp, $sp, 0
	ld.d	$ra, $sp, 8
	addi.d	$sp, $sp, 16
	jr $ra
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
	.cfi_endproc
2111
2112// -- End  __kmp_invoke_microtask
2113
2114#endif /* KMP_ARCH_LOONGARCH64 */
2115
2116#if KMP_ARCH_VE
2117
2118//------------------------------------------------------------------------
2119//
2120// typedef void (*microtask_t)(int *gtid, int *tid, ...);
2121//
2122// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
2123//                            void *p_argv[]
2124// #if OMPT_SUPPORT
2125//                            ,
2126//                            void **exit_frame_ptr
2127// #endif
2128//                            ) {
2129// #if OMPT_SUPPORT
2130//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
2131// #endif
2132//
2133//   (*pkfn)(&gtid, &tid, argv[0], ...);
2134//
2135//   return 1;
2136// }
2137//
2138// Parameters:
2139//   s0: pkfn
2140//   s1: gtid
2141//   s2: tid
2142//   s3: argc
2143//   s4: p_argv
2144//   s5: exit_frame_ptr
2145//
2146// Locals:
2147//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
2148//   __tid: tid param pushed on stack so can pass &tid to pkfn
2149//
2150// Temp. registers:
2151//
2152//  s34: used to calculate the dynamic stack size
2153//  s35: used as temporary for stack placement calculation
2154//  s36: used as temporary for stack arguments
2155//  s37: used as temporary for number of remaining pkfn parms
2156//  s38: used to traverse p_argv array
2157//
2158// return: s0 (always 1/TRUE)
2159//
2160
__gtid = -4
__tid = -8

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	.globl	__kmp_invoke_microtask
	// Function entry points are aligned to 8 bytes.
	.p2align	3
	.type	__kmp_invoke_microtask,@function
__kmp_invoke_microtask:
	.cfi_startproc

	// Save fp and lr.  On VE both go into the caller's stack frame,
	// at 0(%sp) and 8(%sp).
	st	%fp, 0(, %sp)
	st	%lr, 8(, %sp)
	or	%fp, 0, %sp
	.cfi_def_cfa	%fp, 0
	.cfi_offset	%lr, 8
	.cfi_offset	%fp, 0

	// Size of the frame built below:
	//
	// - 8 bytes so that 'gtid' and 'tid' have an address and can be
	//   passed by reference
	// - 8 bytes for every pkfn argument.  pkfn receives two + 'argc'
	//   arguments (&gtid and &tid come first), i.e. (argc + 2) * 8 bytes
	// - 176 bytes for the RSA and the other fixed slots
	//
	// Total: (argc + 2) * 8 + 8 + 176 bytes.
	//
	// |------------------------------|
	// | return address of callee     | 8(%fp)
	// |------------------------------|
	// | frame pointer of callee      | 0(%fp)
	// |------------------------------| <------------------ %fp
	// | __tid / __gtid               | -8(%fp) / -4(%fp)
	// |------------------------------|
	// | argc+2 for arguments         | 176(%sp)
	// |------------------------------|
	// | RSA                          |
	// |------------------------------|
	// | return address               |
	// |------------------------------|
	// | frame pointer                |
	// |------------------------------| <------------------ %sp

	adds.w.sx	%s34, 2, %s3	// argc + 2
	sll	%s34, %s34, 3		// * 8
	lea	%s34, 184(, %s34)	// + 8 + 176
	subs.l	%sp, %sp, %s34

	// Keep sp on a 16-byte boundary.
	and	%sp, -16, %sp

	// pkfn moves to s12; s0 is about to be reused.
	or	%s12, 0, %s0

	// If the new sp fell below the stack limit (%sl), ask the host to
	// extend the stack.
	brge.l	%sp, %sl, .L_kmp_stack_ok
	ld	%s61, 24(, %tp)
	lea	%s63, 0x13b
	shm.l	%s63, 0(%s61)
	shm.l	%sl, 8(%s61)
	shm.l	%sp, 16(%s61)
	monc

.L_kmp_stack_ok:
	lea	%s35, 176(, %sp)	// s35 = start of the outgoing argument area
	adds.w.sx	%s37, 0, %s3	// s37 = argc, sign-extended
	or	%s38, 0, %s4		// s38 = p_argv

#if OMPT_SUPPORT
	// Record our frame pointer in *exit_frame_ptr.
	st	%fp, 0(%s5)
#endif

	// Set up the pkfn arguments.  The first eight travel in s0-s7, and
	// each one is also written to its stack slot because of varargs.

	stl	%s1, __gtid(%fp)
	stl	%s2, __tid(%fp)

	adds.l	%s0, __gtid, %fp	// &gtid
	st	%s0, 0(, %s35)
	adds.l	%s1, __tid, %fp		// &tid
	st	%s1, 8(, %s35)

	breq.l	0, %s37, .L_kmp_call_pkfn
	ld	%s2, 0(, %s38)
	st	%s2, 16(, %s35)

	breq.l	1, %s37, .L_kmp_call_pkfn
	ld	%s3, 8(, %s38)
	st	%s3, 24(, %s35)

	breq.l	2, %s37, .L_kmp_call_pkfn
	ld	%s4, 16(, %s38)
	st	%s4, 32(, %s35)

	breq.l	3, %s37, .L_kmp_call_pkfn
	ld	%s5, 24(, %s38)
	st	%s5, 40(, %s35)

	breq.l	4, %s37, .L_kmp_call_pkfn
	ld	%s6, 32(, %s38)
	st	%s6, 48(, %s35)

	breq.l	5, %s37, .L_kmp_call_pkfn
	ld	%s7, 40(, %s38)
	st	%s7, 56(, %s35)

	breq.l	6, %s37, .L_kmp_call_pkfn

	// Anything beyond the sixth p_argv entry exists on the stack only.
	adds.l	%s37, -6, %s37		// entries still to copy
	lea	%s38, 48(, %s38)	// &p_argv[6]
	lea	%s35, 64(, %s35)	// stack slot of pkfn argument 8
.L_kmp_copy_parm:
	ld	%s36, 0(, %s38)
	st	%s36, 0(, %s35)
	adds.l	%s37, -1, %s37
	adds.l	%s38, 8, %s38
	adds.l	%s35, 8, %s35
	brne.l	0, %s37, .L_kmp_copy_parm

.L_kmp_call_pkfn:
	// Invoke pkfn.
	bsic	%lr, (, %s12)

	// Return value is always 1.
	lea	%s0, 1

	// Unwind: sp comes back from fp, then lr and fp are reloaded from
	// the slots written on entry.
	or	%sp, 0, %fp
	ld	%lr, 8(, %sp)
	ld	%fp, 0(, %sp)
	b.l.t	(, %lr)
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
	.cfi_endproc
2303
2304// -- End  __kmp_invoke_microtask
2305
2306#endif /* KMP_ARCH_VE */
2307
#if KMP_ARCH_S390X

//------------------------------------------------------------------------
//
// typedef void (*microtask_t)(int *gtid, int *tid, ...);
//
// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
//                            void *p_argv[]
// #if OMPT_SUPPORT
//                            ,
//                            void **exit_frame_ptr
// #endif
//                            ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)(&gtid, &tid, argv[0], ...);
//
//   return 1;
// }
//
// Parameters:
//   r2: pkfn
//   r3: gtid
//   r4: tid
//   r5: argc
//   r6: p_argv
//   SP+160: exit_frame_ptr (read only when OMPT_SUPPORT is enabled)
//
// Locals:
//   __gtid: gtid param stored on the stack so &gtid can be passed to pkfn
//   __tid: tid param stored on the stack so &tid can be passed to pkfn
//
// Frame layout after allocation (offsets from the new r15), with
// k = max(0, argc - 2):
//
//   0:           back chain (the stack pointer on entry)
//   8 .. 159:    remainder of the 160-byte area below the stack parameters
//   160 ..:      stack parameters for pkfn (argv[3], argv[4], ...)
//   160 + 8*k:   doubleword copy of gtid (the int itself is at +4)
//   168 + 8*k:   doubleword copy of tid  (the int itself is at +4)
//
// Temp. registers:
//
//  r0: used to fetch argv slots
//  r7: used as temporary for number of remaining pkfn parms
//  r8: argv
//  r9: pkfn
//  r10: stack size
//  r11: stack pointer on entry (used as frame pointer / CFA base)
//  r12: r15 + 8*k, base used to address the gtid/tid copies
//  r13: byte offset of the current argv / stack parameter slot
//
// return: r2 (always 1/TRUE)
//

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	.globl	__kmp_invoke_microtask
	.p2align	1
	.type	__kmp_invoke_microtask,@function
__kmp_invoke_microtask:
	.cfi_startproc

	// Save r6-r15 at offsets 48..127 of the area addressed by the incoming
	// r15. r15 is stored as well so that the .cfi_offset for %r15 below
	// (CFA-40 == entry r15 + 120) describes a slot that really holds the
	// entry stack pointer; storing only r6-r14 would leave it unwritten.
	stmg	%r6,%r15,48(%r15)
	.cfi_offset %r6, -112
	.cfi_offset %r7, -104
	.cfi_offset %r8, -96
	.cfi_offset %r9, -88
	.cfi_offset %r10, -80
	.cfi_offset %r11, -72
	.cfi_offset %r12, -64
	.cfi_offset %r13, -56
	.cfi_offset %r14, -48
	.cfi_offset %r15, -40
	lgr	%r11,%r15		// r11 = entry SP, stays fixed below
	.cfi_def_cfa %r11, 160

	// Compute the dynamic frame size:
	//
	// - 160 bytes at the bottom of the frame, below the stack parameters
	// - max(0, argc - 2) 8-byte slots for stack parameters. r2-r6 carry
	//   &gtid, &tid and argv[0..2], so only argc - 3 slots are filled and
	//   one slot stays unused when argc >= 3
	// - 16 bytes for the doubleword copies of 'gtid' and 'tid', so they can
	//   be passed by reference
	//
	// The total number of bytes is then 176 + max(0, argc - 2)*8, which
	// keeps the stack pointer a multiple of 8.

	lgr	%r10,%r5
	aghi	%r10,-2
	jnm	0f			// argc - 2 >= 0: keep it
	lghi	%r10,0			// otherwise clamp to 0
0:
	sllg	%r10,%r10,3		// r10 = max(0, argc - 2) * 8
	lgr	%r12,%r10
	aghi	%r10,176
	sgr	%r15,%r10		// allocate the frame
	agr	%r12,%r15		// r12 = r15 + max(0, argc - 2) * 8
	stg	%r11,0(%r15)		// back chain

	lgr	%r9,%r2			// pkfn

#if OMPT_SUPPORT
	// Save frame pointer into exit_frame
	lg	%r8,160(%r11)		// exit_frame_ptr (stack parameter)
	stg	%r11,0(%r8)
#endif

	// Prepare arguments for the pkfn function (first 5 using r2-r6 registers)

	// gtid and tid are stored as doublewords; the pointers handed to pkfn
	// are 4 bytes into each slot, i.e. at the low-order word of the value.
	stg	%r3,160(%r12)
	la	%r2,164(%r12)		// &gtid
	stg	%r4,168(%r12)
	la	%r3,172(%r12)		// &tid
	lgr	%r8,%r6			// argv

	// If argc > 0
	ltgr	%r7,%r5
	jz	1f

	lg	%r4,0(%r8)		// argv[0]
	aghi	%r7,-1
	jz	1f

	// If argc > 1
	lg	%r5,8(%r8)		// argv[1]
	aghi	%r7,-1
	jz	1f

	// If argc > 2
	lg	%r6,16(%r8)		// argv[2]
	aghi	%r7,-1
	jz	1f

	// argc > 3: copy the remaining argc - 3 arguments to the stack
	// parameter area starting at 160(r15).
	lghi	%r13,0			// Byte offset of slot n
2:
	lg	%r0,24(%r13,%r8)	// argv[3+n]
	stg	%r0,160(%r13,%r15)	// stack parameter n
	aghi	%r13,8			// Next
	aghi	%r7,-1
	jnz	2b

1:
	basr	%r14,%r9		// Call pkfn

	// Restore stack and return

	lgr	%r15,%r11		// drop the frame; r15 = entry SP again
	lmg	%r6,%r14,48(%r15)
	lghi	%r2,1
	br	%r14
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
	.cfi_endproc

// -- End  __kmp_invoke_microtask

#endif /* KMP_ARCH_S390X */
2460
#if KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32
// Data definitions for targets with 4-byte pointers:
//   .gomp_critical_user_        - storage declared through the COMMON macro
//   __kmp_unnamed_critical_addr - 4-byte word initialized with the address of
//                                 .gomp_critical_user_
// NOTE(review): COMMON is defined outside this section; it is presumably a
// .comm-style directive taking (name, size, alignment) - confirm against its
// definition.
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
    .data
    COMMON .gomp_critical_user_, 32, 3
    .data
    .align 4
    .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
    .4byte .gomp_critical_user_
#ifdef __ELF__
    // Record the object size (one 4-byte pointer) in the ELF symbol table.
    .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),4
#endif
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 */
2476
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||                   \
    KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||                 \
    KMP_ARCH_S390X
// Data definitions for targets with 8-byte pointers:
//   .gomp_critical_user_        - storage declared through the COMMON macro
//   __kmp_unnamed_critical_addr - 8-byte word initialized with the address of
//                                 .gomp_critical_user_
// NOTE(review): COMMON is defined outside this section; it is presumably a
// .comm-style directive taking (name, size, alignment) - confirm against its
// definition.
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
    .data
    COMMON .gomp_critical_user_, 32, 3
    .data
    .align 8
    .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
    .8byte .gomp_critical_user_
#ifdef __ELF__
    // Record the object size (one 8-byte pointer) in the ELF symbol table.
    .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
#endif
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
          KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||
          KMP_ARCH_S390X */
2496
#if KMP_OS_LINUX
// Emit an empty .note.GNU-stack section with no flags. On Linux toolchains
// this marks the object as not requiring an executable stack.
// The ARM/AArch64 branch spells the section type as %progbits instead of
// @progbits ('@' starts a comment in the 32-bit ARM assembler syntax).
// WebAssembly gets no such section.
# if KMP_ARCH_ARM || KMP_ARCH_AARCH64
.section .note.GNU-stack,"",%progbits
# elif !KMP_ARCH_WASM
.section .note.GNU-stack,"",@progbits
# endif
#endif
2504
#if KMP_ARCH_WASM
// WebAssembly data definitions. Three zero-initialized 4-byte objects are
// defined directly (no COMMON macro is used here), followed by
// __kmp_unnamed_critical_addr, a 4-byte word holding the address of
// .gomp_critical_user_.
// NOTE(review): the other targets in this file reserve 32 bytes for
// .gomp_critical_user_ while this section reserves 4 - confirm that is
// intended for this target.
.data
.global .gomp_critical_user_
.global .gomp_critical_user_.var
.global .gomp_critical_user_.reduction.var
.global __kmp_unnamed_critical_addr
.gomp_critical_user_:
.zero 4
.size .gomp_critical_user_, 4
.gomp_critical_user_.var:
.zero 4
.size .gomp_critical_user_.var, 4
.gomp_critical_user_.reduction.var:
.zero 4
.size .gomp_critical_user_.reduction.var, 4
__kmp_unnamed_critical_addr:
    .4byte .gomp_critical_user_
    .size __kmp_unnamed_critical_addr, 4
#endif
2524
#if KMP_OS_LINUX && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
// Expand the GNU_PROPERTY_BTI_PAC macro, which is defined outside this
// section. NOTE(review): judging by its name it presumably emits the GNU
// property note advertising BTI/PAC support - confirm against its definition.
GNU_PROPERTY_BTI_PAC
#endif
2528