/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/export.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence (stores aliasing
	 * later loads) could occur, then jump to the corresponding
	 * copy mode. Comparing only the low address bytes is a cheap
	 * heuristic for this.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes. movq and leaq leave the flags
	 * untouched, so the jae below still tests the subq above: the
	 * loop exits once fewer than 32 bytes remain.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Advance both pointers to the tail so the copy can run
	 * backward.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations execute in one cycle, so the
	 * .p2align below pads with NOPs to keep the loop head within
	 * the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Rewind both pointers to the head for the remaining tail
	 * bytes.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes: two 8-byte words from the head and two
	 * from the tail; the stores may overlap in the middle.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes: one 8-byte word from the head and one
	 * from the tail.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes: one 4-byte word from the head and one
	 * from the tail.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.
	 */
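	/*
	 * Note: movzbl does not modify the flags, so the jz below
	 * still tests the subl above; ZF set means the original
	 * count was exactly 1.
	 */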
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)
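/*
 * For illustration only, and not part of the original file: a C-level
 * sketch of the overlapping head/tail trick memcpy_orig uses for
 * 8..15-byte copies. The helper name is hypothetical; __builtin_memcpy
 * stands in for the unaligned 8-byte loads and stores the assembly
 * performs directly.
 *
 *	static void copy_8_to_15(unsigned char *dst,
 *				 const unsigned char *src, size_t len)
 *	{
 *		unsigned long head, tail;
 *
 *		__builtin_memcpy(&head, src, 8);
 *		__builtin_memcpy(&tail, src + len - 8, 8);
 *		__builtin_memcpy(dst, &head, 8);
 *		__builtin_memcpy(dst + len - 8, &tail, 8);
 *	}
 *
 * With 8 <= len <= 15 the two stores cover the whole range and simply
 * overlap in the middle, so no byte loop is needed. The 16..31-byte
 * and 4..7-byte paths above use the same idea with two words per end
 * and 4-byte words, respectively.
 */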