/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

.weak memset

/*
 * memset / __memset - ISO C memset: fill a memory block with one byte value.
 *
 * The body below is the "fast string" variant: whole quadwords are written
 * with REP STOSQ and the remaining 0-7 bytes with REP STOSB.  It is shorter
 * and simpler than memset_orig.
 *
 * In:
 *	rdi	destination
 *	rsi	value (char; only %sil is used)
 *	rdx	count (bytes)
 * Out:
 *	rax	original destination
 * Writes:
 *	rax, rcx, rdx, rsi, rdi, r9, flags
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Pick the variant by CPU feature:
	 *   - enhanced REP MOVSB/STOSB (X86_FEATURE_ERMS): preferred whenever
	 *     present, jump to memset_erms;
	 *   - else fast string instructions (X86_FEATURE_REP_GOOD): no jump,
	 *     run the code that follows;
	 *   - else jump to memset_orig.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq	%rdi, %r9		/* keep dst for the return value */
	movq	%rdx, %rcx
	andl	$7, %edx		/* edx = count % 8 (trailing bytes) */
	shrq	$3, %rcx		/* rcx = count / 8 (whole quadwords) */
	/* replicate the fill byte into all eight bytes of rax */
	movzbl	%sil, %esi
	movabs	$0x0101010101010101, %rax
	imulq	%rsi, %rax
	rep stosq
	movl	%edx, %ecx		/* now store the trailing bytes */
	rep stosb
	movq	%r9, %rax
	ret
ENDPROC(memset)
ENDPROC(__memset)

/*
 * memset_erms - ISO C memset using enhanced REP STOSB.
 *
 * Takes the place of the fast string variant on CPUs with the enhanced
 * REP MOVSB/STOSB feature; the whole count is handed to one REP STOSB, so
 * it is shorter and simpler still.
 *
 * In:
 *	rdi	destination
 *	rsi	value (char; only %sil is used)
 *	rdx	count (bytes)
 * Out:
 *	rax	original destination
 * Writes:
 *	rax, rcx, rdi, r9
 */
ENTRY(memset_erms)
	movq	%rdi, %r9		/* keep dst for the return value */
	movb	%sil, %al		/* STOSB stores %al */
	movq	%rdx, %rcx		/* byte count */
	rep stosb
	movq	%r9, %rax
	ret
ENDPROC(memset_erms)

/*
 * memset_orig - memset with explicit store loops (no REP string ops).
 *
 * Steps: replicate the byte into a quadword pattern, bring dst up to an
 * 8-byte boundary, fill 64 bytes per iteration, then 8 bytes per iteration,
 * then single bytes.
 *
 * In:
 *	rdi	destination
 *	rsi	value (char; only %sil is used)
 *	rdx	count (bytes)
 * Out:
 *	rax	original destination
 * Writes:
 *	rax, rcx, rdx, rdi, r8 (misaligned dst only), r9, r10, flags
 */
ENTRY(memset_orig)
	movq	%rdi, %r10		/* keep dst for the return value */

	/* replicate the fill byte into all eight bytes of rax */
	movzbl	%sil, %ecx
	movabs	$0x0101010101010101, %rax
	imulq	%rcx, %rax

	/* r9d = dst & 7; non-zero means dst is not 8-byte aligned */
	movl	%edi, %r9d
	andl	$7, %r9d
	jnz	.Lmisaligned_dst
.Ldst_aligned:

	movq	%rdx, %rcx
	shrq	$6, %rcx		/* rcx = number of 64-byte chunks */
	jz	.Ltail_qwords

	.p2align 4
.Lfill_64:
	/*
	 * The counter is decremented first; the stores and the leaq leave
	 * the flags alone, so the jnz at the bottom still tests this decq.
	 */
	decq	%rcx
	movq	%rax, (%rdi)
	movq	%rax, 8(%rdi)
	movq	%rax, 16(%rdi)
	movq	%rax, 24(%rdi)
	movq	%rax, 32(%rdi)
	movq	%rax, 40(%rdi)
	movq	%rax, 48(%rdi)
	movq	%rax, 56(%rdi)
	leaq	64(%rdi), %rdi
	jnz	.Lfill_64

	/*
	 * The tail is handled with loops, which should be faster than
	 * hard to predict jump tables.
	 */
	.p2align 4
.Ltail_qwords:
	movl	%edx, %ecx
	andl	$63&(~7), %ecx		/* bytes left in whole quadwords */
	jz	.Ltail_bytes
	shrl	$3, %ecx		/* ecx = number of quadwords */
	.p2align 4
.Lfill_8:
	decl	%ecx
	movq	%rax, (%rdi)
	leaq	8(%rdi), %rdi		/* leaq keeps the flags from decl */
	jnz	.Lfill_8

.Ltail_bytes:
	andl	$7, %edx		/* edx = final 0-7 bytes */
	jz	.Ldone
	.p2align 4
.Lfill_1:
	decl	%edx
	movb	%al, (%rdi)
	leaq	1(%rdi), %rdi		/* leaq keeps the flags from decl */
	jnz	.Lfill_1

.Ldone:
	movq	%r10, %rax
	ret

.Lmisaligned_dst:
	/*
	 * dst is not 8-byte aligned.  With 7 bytes or fewer (unsigned
	 * compare) go straight to the byte loop.  Otherwise write one
	 * unaligned quadword at dst and step dst forward to the next 8-byte
	 * boundary; the bytes of that quadword lying past the boundary are
	 * simply written again by the stores that follow.
	 */
	cmpq	$7, %rdx
	jbe	.Ltail_bytes
	movq	%rax, (%rdi)		/* unaligned store */
	movq	$8, %r8
	subq	%r9, %r8		/* r8 = 8 - (dst & 7) */
	addq	%r8, %rdi
	subq	%r8, %rdx
	jmp	.Ldst_aligned
.Lfinal:	/* not referenced by any code shown in this file */
ENDPROC(memset_orig)