/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the copy with
 * REP MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx		/* qword count */
	andl $7, %edx		/* remaining byte count */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)