xref: /linux/tools/arch/x86/lib/memset_64.S (revision efe80f9c9063228136bc3824f7ac6b4ff2e273b4)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */
37d7d1bf1SArnaldo Carvalho de Melo
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* Everything below is emitted into .noinstr.text ("ax": allocatable, executable). */
.section .noinstr.text, "ax"
1031d2e6b5SArnaldo Carvalho de Melo
/*
 * ISO C memset - set a memory block to a byte value. This function uses the
 * fast string operation ('rep stosb') to get better performance than the
 * original function. The code is simpler and shorter than the original
 * function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 *
 * The FSRS alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep stosb' itself is small enough to replace the call, but all
 * the register moves blow up the code. And two of them are "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
/*
 * __memset - fill %rdx bytes starting at %rdi with the byte in %sil.
 *
 * In:    rdi = destination, rsi = value (only %sil is read), rdx = count
 * Out:   rax = original destination
 * Clobb: rcx, rdi, r9 on the 'rep stosb' path below; the memset_orig
 *        path has its own clobber set.
 */
SYM_FUNC_START(__memset)
	/*
	 * Either the "jmp memset_orig" stays in place, or (FSRS alternative)
	 * it is replaced by nothing and execution falls through to the
	 * 'rep stosb' sequence.
	 */
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq %rdi,%r9		/* keep dst for the return value; stosb moves %rdi */
	movb %sil,%al		/* stosb stores %al */
	movq %rdx,%rcx		/* rep count */
	rep stosb		/* NOTE(review): assumes DF is clear (forward fill) */
	movq %r9,%rax		/* return the original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

/* Presumably makes memset an alias of __memset; see linux/linkage.h. */
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)
457be2e319SMark Rutland
/*
 * memset_orig - store-loop memset, reached through the "jmp memset_orig"
 * at the top of __memset.
 *
 * In:    rdi = destination, rsi = value (only %sil is read), rdx = count
 * Out:   rax = original destination
 * Clobb: rcx, rdx, rdi, r8 (unaligned path only), r9, r10, flags
 *
 * Every loop below puts its dec first and its jnz last: the movq/movb/leaq
 * in between do not modify flags, so the jnz still tests the dec result.
 * Do not replace the leaq pointer bumps with addq.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq	%rdi,%r10		/* saved for the return value */

	/* expand byte value: replicate it into all eight bytes of %rax */
	movzbl	%sil,%ecx
	movabs	$0x0101010101010101,%rax
	imulq	%rcx,%rax

	/* align dst: %r9d = dst & 7, non-zero means dst is not 8-byte aligned */
	movl	%edi,%r9d
	andl	$7,%r9d
	jnz	.Lbad_alignment
.Lafter_bad_alignment:

	movq	%rdx,%rcx
	shrq	$6,%rcx			/* %rcx = number of whole 64-byte chunks */
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	decq	%rcx
	movq	%rax,(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)
	movq	%rax,24(%rdi)
	movq	%rax,32(%rdi)
	movq	%rax,40(%rdi)
	movq	%rax,48(%rdi)
	movq	%rax,56(%rdi)
	leaq	64(%rdi),%rdi
	jnz	.Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl	$63&(~7),%ecx		/* bytes covered by whole qwords in count % 64 */
	jz	.Lhandle_7
	shrl	$3,%ecx			/* bytes -> qword count */
	.p2align 4
.Lloop_8:
	decl	%ecx
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8

.Lhandle_7:
	andl	$7,%edx			/* 0..7 trailing bytes */
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl	%edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz	.Lloop_1

.Lende:
	movq	%r10,%rax		/* return the original destination */
	RET

.Lbad_alignment:
	/* Fewer than 8 bytes in total: the byte loop alone handles them. */
	cmpq	$7,%rdx
	jbe	.Lhandle_7
	/*
	 * Write 8 bytes at the unaligned dst, then advance dst by
	 * 8 - (dst & 7) to the next 8-byte boundary and shrink the count by
	 * the same amount. The aligned stores that follow overlap the end of
	 * this store and rewrite those bytes with the same value.
	 */
	movq	%rax,(%rdi)		/* unaligned store */
	movq	$8,%r8
	subq	%r9,%r8
	addq	%r8,%rdi
	subq	%r8,%rdx
	jmp	.Lafter_bad_alignment
SYM_FUNC_END(memset_orig)
118