/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx			/* count of 8-byte words */
	andl $7, %edx			/* 0..7 trailing bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler
 * than memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

.weak memcpy

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs to keep the loop in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte		/* ZF still from subl: count was 1 */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
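
/*
 * The tail handling above avoids a per-length branch tree for the 4..31
 * byte cases by issuing two copies that may overlap, one anchored at the
 * start of the range and one at its end.  A rough C sketch of the 16..31
 * byte case (the names src/dst/len are illustrative only; unaligned 8-byte
 * accesses are assumed to be cheap, as they are on x86-64):
 *
 *	u64 a = *(const u64 *)(src);
 *	u64 b = *(const u64 *)(src + 8);
 *	u64 c = *(const u64 *)(src + len - 16);
 *	u64 d = *(const u64 *)(src + len - 8);
 *	*(u64 *)(dst)            = a;
 *	*(u64 *)(dst + 8)        = b;
 *	*(u64 *)(dst + len - 16) = c;
 *	*(u64 *)(dst + len - 8)  = d;
 *
 * For len < 32 the two pairs overlap, but the overlapping destination
 * bytes are written with identical values taken from the same source
 * offsets, so the copy is still correct.
 */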

	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * If the feature is supported, memcpy_c_e() is the first choice.
	 * If enhanced REP MOVSB copy is not available, use the fast string
	 * copy memcpy_c() when possible; it is faster and simpler than the
	 * original memcpy().
	 * Otherwise, the original unrolled memcpy() is used.
	 * In the .altinstructions section, the ERMS entry is placed after
	 * the REP_GOOD entry so that the patches are applied in the right
	 * order.
	 *
	 * Replace only the beginning: memcpy is itself used while applying
	 * the alternatives, so it would be silly to overwrite it with NOPs -
	 * a reboot would be the only outcome...
	 */
	.section .altinstructions, "a"
	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous
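
/*
 * Net effect of the two entries above, as a rough sketch (the feature
 * tests below are illustrative shorthand; the actual patching happens
 * when the alternatives are applied at boot):
 *
 *	if (boot_cpu_has(X86_FEATURE_ERMS))
 *		the start of memcpy is patched with .Lmemcpy_c_e (REP MOVSB);
 *	else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
 *		it is patched with .Lmemcpy_c (REP MOVSQ + REP MOVSB);
 *	else
 *		the unrolled memcpy above runs unpatched.
 *
 * The ERMS entry is listed second so that, when both features are
 * present, its replacement is the one left in place.
 */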