/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
        .section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
        movq %rdi, %rax

        movl %edx, %ecx
        shrl $3, %ecx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
.Lmemcpy_e:
        .previous

ENTRY(__memcpy)
ENTRY(memcpy)
        CFI_STARTPROC
        movq %rdi, %rax

        /*
         * Use a 32-bit CMP here to avoid long NOP padding.
         */
        cmp $0x20, %edx
        jb .Lhandle_tail

        /*
         * Check whether a memory false dependence could occur,
         * then jump to the corresponding copy mode.
         */
        cmp %dil, %sil
        jl .Lcopy_backward
        subl $0x20, %edx
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
        jae .Lcopy_forward_loop
        addq $0x20, %rdx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate the copy position at the tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
        /*
         * At most 3 ALU operations can execute in one cycle,
         * so append the alignment NOPs within the same 16-byte chunk.
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop

        /*
         * Calculate the copy position back at the head.
         */
        addq $0x20, %rdx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpq $16, %rdx
        jb .Lless_16bytes

        /*
         * Move the remaining 16 to 31 bytes.
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpq $8, %rdx
        jb .Lless_8bytes
        /*
         * Move the remaining 8 to 15 bytes.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpq $4, %rdx
        jb .Lless_3bytes

        /*
         * Move the remaining 4 to 7 bytes.
         */
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        cmpl $0, %edx
        je .Lend
        /*
         * Move the remaining 1 to 3 bytes.
         */
.Lloop_1:
        movb (%rsi), %r8b
        movb %r8b, (%rdi)
        incq %rdi
        incq %rsi
        decl %edx
        jnz .Lloop_1

.Lend:
        retq
        CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
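
/*
 * For reference, a rough C-level sketch of the unrolled copy strategy
 * implemented above.  This whole block is a comment, not part of the
 * build; the function name, the fixed-size memcpy() calls standing in
 * for the 8-byte register moves, and the low-byte heuristic casts are
 * illustrative assumptions, not the kernel's API:
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     static void *memcpy_sketch(void *dest, const void *src, size_t n)
 *     {
 *         unsigned char *d = dest;
 *         const unsigned char *s = src;
 *
 *         if (n >= 32) {
 *             // Signed compare of the low address bytes, as the asm
 *             // does with %sil/%dil: copy backward when a store-to-load
 *             // false dependence looks likely.
 *             if ((int8_t)(uintptr_t)src >= (int8_t)(uintptr_t)dest) {
 *                 do {    // forward, 4 x 8 bytes per pass
 *                     memcpy(d, s, 32);
 *                     d += 32; s += 32; n -= 32;
 *                 } while (n >= 32);
 *             } else {
 *                 // backward: 32-byte blocks from the tail down,
 *                 // then handle the remaining head bytes below
 *                 unsigned char *de = d + n;
 *                 const unsigned char *se = s + n;
 *                 do {
 *                     de -= 32; se -= 32; n -= 32;
 *                     memcpy(de, se, 32);
 *                 } while (n >= 32);
 *             }
 *         }
 *
 *         if (n >= 16) {          // 16..31 bytes: overlapping 8-byte moves
 *             uint64_t a, b, c, e;
 *             memcpy(&a, s, 8);          memcpy(&b, s + 8, 8);
 *             memcpy(&c, s + n - 16, 8); memcpy(&e, s + n - 8, 8);
 *             memcpy(d, &a, 8);          memcpy(d + 8, &b, 8);
 *             memcpy(d + n - 16, &c, 8); memcpy(d + n - 8, &e, 8);
 *         } else if (n >= 8) {    // 8..15 bytes
 *             uint64_t a, b;
 *             memcpy(&a, s, 8); memcpy(&b, s + n - 8, 8);
 *             memcpy(d, &a, 8); memcpy(d + n - 8, &b, 8);
 *         } else if (n >= 4) {    // 4..7 bytes
 *             uint32_t a, b;
 *             memcpy(&a, s, 4); memcpy(&b, s + n - 4, 4);
 *             memcpy(d, &a, 4); memcpy(d + n - 4, &b, 4);
 *         } else {                // 0..3 bytes, one at a time
 *             while (n) {
 *                 *d++ = *s++;
 *                 n--;
 *             }
 *         }
 *         return dest;
 *     }
 */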

        /*
         * Some CPUs run faster using the string copy instructions.
         * It is also a lot simpler.
         *
         * Use this variant when possible:
         */

        .section .altinstructions, "a"
        .align 8
        .quad memcpy
        .quad .Lmemcpy_c
        .word X86_FEATURE_REP_GOOD

        /*
         * Replace only the beginning: memcpy itself is used to apply the
         * alternatives, so it would be silly to let it overwrite itself
         * with NOPs - a reboot would be the only outcome...
         */
        .byte .Lmemcpy_e - .Lmemcpy_c
        .byte .Lmemcpy_e - .Lmemcpy_c
        .previous
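
        /*
         * Layout note: the altinstructions entry above packs, in order,
         * the address of the code to be patched, the address of the
         * replacement, the CPU feature bit required for patching, the
         * number of bytes to patch, and the length of the replacement.
         * A rough C view of that record (the struct name and field names
         * here are illustrative, using the kernel's u8/u16 fixed-width
         * types; see struct alt_instr in <asm/alternative.h> for the
         * authoritative definition):
         *
         *     struct alt_entry_sketch {
         *         u8  *instr;            // .quad memcpy
         *         u8  *replacement;      // .quad .Lmemcpy_c
         *         u16  cpuid;            // .word X86_FEATURE_REP_GOOD
         *         u8   instrlen;         // .byte .Lmemcpy_e - .Lmemcpy_c
         *         u8   replacementlen;   // .byte .Lmemcpy_e - .Lmemcpy_c
         *     };
         *
         * Both length bytes are set to the replacement's size, so only the
         * start of memcpy is patched, as the comment above explains.
         */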