/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions to get better performance than the original function.
 * The code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e_e:
	.previous

.weak memset

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle the tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * It is recommended to use it when possible.
	 *
	 * If the enhanced REP MOVSB/STOSB feature is not available, use the
	 * fast string instructions.
	 *
	 * If neither is available, fall back to the original memset function.
	 *
	 * In the .altinstructions section, the ERMS entry is placed after the
	 * REP_GOOD entry to implement the right patch order.
	 */
	.section .altinstructions,"a"
	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
	.previous
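
/*
 * Reference sketch (illustrative only, not part of the kernel build): a
 * user-space C rendering of the strategy used by the .Lmemset_c fast-string
 * replacement above -- replicate the byte into a 64-bit pattern, store it in
 * 8-byte chunks, then finish the remaining bytes one at a time. The function
 * name memset_sketch and its exact structure are assumptions made for this
 * sketch, not kernel interfaces.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void *memset_sketch(void *dst, int c, size_t n)
 *	{
 *		unsigned char *p = dst;
 *		// Multiplying the zero-extended byte by 0x0101010101010101
 *		// copies it into all eight byte lanes, e.g. 0xAB becomes
 *		// 0xABABABABABABABAB (the "expand byte value" step above).
 *		uint64_t pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
 *
 *		while (n >= 8) {		// 8-byte stores, cf. rep stosq
 *			memcpy(p, &pattern, 8);	// memcpy avoids unaligned-access UB
 *			p += 8;
 *			n -= 8;
 *		}
 *		while (n--)			// byte tail, cf. rep stosb
 *			*p++ = (unsigned char)c;
 *		return dst;
 *	}
 */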