/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
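/*
 * Illustrative C model of the REP_GOOD path above (a reader aid only; it
 * is not assembled or built here, and the helper name is made up for the
 * sketch): the byte count is split into 8-byte quadwords copied by
 * REP MOVSQ and a 0..7 byte tail copied by REP MOVSB, while the original
 * destination is preserved in %rax as the return value.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void *memcpy_rep_good_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *		size_t qwords = len >> 3;		// shrq $3, %rcx
 *		size_t tail   = len & 7;		// andl $7, %edx
 *
 *		while (qwords--) {			// rep movsq
 *			uint64_t v;
 *			__builtin_memcpy(&v, s, sizeof(v));
 *			__builtin_memcpy(d, &v, sizeof(v));
 *			s += sizeof(v);
 *			d += sizeof(v);
 *		}
 *		while (tail--)				// rep movsb
 *			*d++ = *s++;
 *		return dst;				// original dst, as in %rax
 *	}
 */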
/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return non-zero for any failure */
.L_memcpy_mcsafe_fail:
	mov $1, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif
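/*
 * Illustrative caller pattern for memcpy_mcsafe() above (a sketch only;
 * the wrapper name and the -EIO choice are made up for the example, and
 * the C prototype shown is assumed, not defined in this file). The routine
 * returns zero when the whole buffer was copied and a non-zero value when
 * a machine check was taken on a source read, so callers are expected to
 * check the result:
 *
 *	#include <linux/errno.h>
 *	#include <linux/string.h>	// assumed home of the prototype
 *
 *	// int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
 *
 *	static int copy_from_pmem_example(void *dst, const void *src, size_t len)
 *	{
 *		if (memcpy_mcsafe(dst, src, len))
 *			return -EIO;	// a source read hit a machine check
 *		return 0;		// whole buffer copied
 *	}
 */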