xref: /linux/tools/arch/x86/lib/memset_64.S (revision 7d7d1bf1d1dabe435ef50efb051724b8664749cb)
1*7d7d1bf1SArnaldo Carvalho de Melo/* Copyright 2002 Andi Kleen, SuSE Labs */
2*7d7d1bf1SArnaldo Carvalho de Melo
3*7d7d1bf1SArnaldo Carvalho de Melo#include <linux/linkage.h>
4*7d7d1bf1SArnaldo Carvalho de Melo#include <asm/cpufeatures.h>
5*7d7d1bf1SArnaldo Carvalho de Melo#include <asm/alternative-asm.h>
6*7d7d1bf1SArnaldo Carvalho de Melo
7*7d7d1bf1SArnaldo Carvalho de Melo.weak memset
8*7d7d1bf1SArnaldo Carvalho de Melo
/*
 * memset / __memset - ISO C memset, "fast string" variant.
 *
 * The bulk of the block is written as whole quadwords with REP STOSQ and
 * the remaining 0-7 bytes are written with REP STOSB.
 *
 * In:
 *   rdi   destination
 *   rsi   fill value (only the low byte is used)
 *   rdx   count (bytes)
 * Out:
 *   rax   original destination
 *
 * Registers written here: rax, rcx, rdx, rsi, rdi, r9 and the flags.
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Select the implementation. The default instruction jumps to
	 * memset_orig. The X86_FEATURE_REP_GOOD replacement is empty, so
	 * execution falls through into the fast-string code below. The
	 * X86_FEATURE_ERMS replacement jumps to memset_erms, which relies on
	 * the enhanced REP MOVSB/STOSB feature and is the recommended choice
	 * whenever it is available.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	/* rcx = count / 8 (quadwords), edx = count % 8 (tail bytes) */
	movq	%rdx, %rcx
	andl	$7, %edx
	shrq	$3, %rcx

	movq	%rdi, %r9		/* rep stos advances rdi; keep dst for the return */

	/* rax = fill byte replicated into all eight byte positions */
	movzbl	%sil, %esi
	movabs	$0x0101010101010101, %rax
	imulq	%rsi, %rax

	rep stosq			/* bulk: rcx quadwords */
	movl	%edx, %ecx
	rep stosb			/* tail: the remaining 0-7 bytes */

	movq	%r9, %rax		/* return the original destination */
	ret
ENDPROC(memset)
ENDPROC(__memset)
46*7d7d1bf1SArnaldo Carvalho de Melo
/*
 * memset_erms - ISO C memset built on enhanced REP STOSB.
 *
 * Used in place of the fast-string variant; the whole block is written by a
 * single byte-granular REP STOSB, which also makes this the shortest of the
 * implementations in this file.
 *
 * In:
 *   rdi   destination
 *   rsi   fill value (only the low byte is used)
 *   rdx   count (bytes)
 * Out:
 *   rax   original destination
 *
 * Registers written here: rax, rcx, rdi and r9.
 */
ENTRY(memset_erms)
	movq	%rdx, %rcx		/* rcx = number of bytes to store */
	movb	%sil, %al		/* al  = fill byte */
	movq	%rdi, %r9		/* rep stosb advances rdi; keep dst for the return */
	rep stosb
	movq	%r9, %rax		/* return the original destination */
	ret
ENDPROC(memset_erms)
66*7d7d1bf1SArnaldo Carvalho de Melo
/*
 * memset_orig - memset written with plain stores (no REP string
 * instructions).
 *
 * In:
 *   rdi   destination
 *   rsi   fill value (only the low byte is used)
 *   rdx   count (bytes)
 * Out:
 *   rax   original destination
 *
 * Registers written here: rax, rcx, rdx, rdi, r8, r9, r10 and the flags.
 *
 * Outline:
 *   1. Replicate the fill byte across rax.
 *   2. If dst is not 8-byte aligned and more than 7 bytes are requested,
 *      do one unaligned 8-byte store and step dst up to the next boundary.
 *   3. Store 64 bytes per iteration, then 8 bytes per iteration, then
 *      single bytes.
 */
ENTRY(memset_orig)
	/* rax = fill byte replicated into all eight byte positions */
	movzbl	%sil, %ecx
	movabs	$0x0101010101010101, %rax
	imulq	%rcx, %rax

	movq	%rdi, %r10		/* rdi is advanced below; keep dst for the return */

	/* r9 = dst & 7; non-zero means dst is not 8-byte aligned */
	movl	%edi, %r9d
	andl	$7, %r9d
	jnz	.Lfix_alignment
.Laligned:

	/* rcx = number of complete 64-byte chunks */
	movq	%rdx, %rcx
	shrq	$6, %rcx
	jz	.Ltail

	.p2align 4
.Lfill_64:
	movq	%rax, (%rdi)
	movq	%rax, 8(%rdi)
	movq	%rax, 16(%rdi)
	movq	%rax, 24(%rdi)
	movq	%rax, 32(%rdi)
	movq	%rax, 40(%rdi)
	movq	%rax, 48(%rdi)
	movq	%rax, 56(%rdi)
	leaq	64(%rdi), %rdi
	decq	%rcx
	jnz	.Lfill_64

	/*
	 * The remainder is handled with small loops; they should be faster
	 * than hard-to-predict jump tables.
	 */
	.p2align 4
.Ltail:
	movl	%edx, %ecx
	andl	$0x38, %ecx		/* 63 & ~7: leftover bytes that form whole quadwords */
	jz	.Ltail_bytes
	shrl	$3, %ecx		/* byte count -> quadword count */
	.p2align 4
.Lfill_8:
	movq	%rax, (%rdi)
	leaq	8(%rdi), %rdi
	decl	%ecx
	jnz	.Lfill_8

.Ltail_bytes:
	andl	$7, %edx		/* edx = last 0-7 bytes */
	jz	.Ldone
	.p2align 4
.Lfill_1:
	movb	%al, (%rdi)
	leaq	1(%rdi), %rdi
	decl	%edx
	jnz	.Lfill_1

.Ldone:
	movq	%r10, %rax		/* return the original destination */
	ret

	/*
	 * dst is misaligned (r9 = dst & 7, in the range 1-7). With 7 bytes or
	 * fewer the byte loop does all the work. Otherwise one unaligned
	 * 8-byte store covers the bytes up to the next 8-byte boundary, and
	 * dst/count are adjusted by r8 = 8 - r9 so the main path continues
	 * from that boundary. Bytes of that store lying past the boundary are
	 * simply written again with the same value.
	 */
.Lfix_alignment:
	cmpq	$7, %rdx
	jbe	.Ltail_bytes
	movq	%rax, (%rdi)		/* unaligned store */
	movq	$8, %r8
	subq	%r9, %r8		/* r8 = bytes up to the next 8-byte boundary */
	addq	%r8, %rdi
	subq	%r8, %rdx
	jmp	.Laligned
.Lfinal:				/* not referenced anywhere in this function */
ENDPROC(memset_orig)
139