/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file was rewritten from the memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
.weak memmove

ENTRY(memmove)
ENTRY(__memmove)

	/* Copies of 32 bytes or more are handled by the loops below */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb 1f

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

.Lmemmove_begin_forward:
	/* With ERMS, a single rep movsb handles the whole forward copy. */
	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS

	/*
	 * The movsq instruction has a high startup latency, so small
	 * sizes are handled with general-purpose registers instead.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * rep movsq is only a win when src and dest share the same
	 * low-byte alignment.
	 */
	cmpb %dil, %sil
	je 4f
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop iteration.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle the forward copy with rep movsq.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle the backward copy with rep movsq (direction flag set).
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Prepare for the backward copy.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Point src and dest at the tail of the buffers.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop iteration.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Move src and dest back to the head of the remaining bytes.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
	/*
	 * Tail copies: all loads are issued before the stores, so the
	 * overlapping head/tail moves below are safe.
	 */
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move 16 to 31 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move 4 to 7 bytes of data.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move 2 to 3 bytes of data.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move a single byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	retq
ENDPROC(__memmove)
ENDPROC(memmove)
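
/*
 * For reference only: a rough C sketch of the overlap test and copy-direction
 * choice implemented above. The function name memmove_sketch and the plain
 * byte loops are inventions for illustration, not the original memmove_64.c;
 * the #if 0 guard keeps this block out of the assembled object.
 */
#if 0
#include <stddef.h>

static void *memmove_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;	/* illustrative byte-wise copy */

	if (s >= d || s + count <= d) {
		/* No harmful overlap: copy from the lowest address upward. */
		while (count--)
			*d++ = *s++;
	} else {
		/* dest overlaps the tail of src: copy from the top downward. */
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	}
	return dest;
}
#endif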