xref: /linux/tools/arch/x86/lib/memcpy_64.S (revision afc74ce7b484da5c5698d8eb2472a58c547cbc2b)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/* Copyright 2002 Andi Kleen */
3
4#include <linux/linkage.h>
5#include <asm/errno.h>
6#include <asm/cpufeatures.h>
7#include <asm/alternative-asm.h>
8#include <asm/export.h>
9
10.pushsection .noinstr.text, "ax"
11
12/*
13 * We build a jump to memcpy_orig by default which gets NOPped out on
14 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
15 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
16 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
17 */
18
19.weak memcpy
20
/*
 * memcpy - copy a block of memory.
 *
 * Input:
 *  rdi - destination
 *  rsi - source
 *  rdx - number of bytes to copy
 *
 * Output:
 *  rax - the destination pointer exactly as it was passed in
 *
 * The entry instruction is an alternative: the default is a jump to
 * memcpy_orig, which is NOPped out on CPUs that set
 * X86_FEATURE_REP_GOOD and replaced by a jump to memcpy_erms on CPUs
 * that set X86_FEATURE_ERMS.  The code below is the fall-through
 * path taken once that jump has been NOPped out.
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_LOCAL(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	/*
	 * Copy count / 8 quadwords with REP MOVSQ, then the remaining
	 * count % 8 bytes with REP MOVSB.
	 */
	movq	%rdx, %rcx
	movq	%rdi, %rax		/* save return value before rdi advances */
	andl	$7, %edx		/* edx = count % 8 */
	shrq	$3, %rcx		/* rcx = count / 8 */
	rep movsq
	movl	%edx, %ecx		/* leftover byte count */
	rep movsb
	retq
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)
49
/*
 * memcpy_erms() - memcpy built on the enhanced fast string feature.
 *
 * The whole copy is one REP MOVSB, which makes this both simpler and
 * faster than memcpy; prefer it whenever it can be used.
 *
 * Input:  rdi destination, rsi source, rdx count
 * Output: rax original destination
 */
SYM_FUNC_START(memcpy_erms)
	movq	%rdx, %rcx		/* byte count for REP MOVSB */
	movq	%rdi, %rax		/* return value: original destination */
	rep movsb
	retq
SYM_FUNC_END(memcpy_erms)
60
/*
 * memcpy_orig - copy using explicit register loads and stores only
 * (no string instructions).
 *
 * Input:  rdi destination, rsi source, rdx count
 * Output: rax original destination
 * Modifies: rcx, rdx, rsi, rdi, r8-r11 and the flags.
 *
 * Counts below 0x20 go straight to .Lhandle_tail.  Larger counts are
 * first consumed in 32-byte blocks by either the forward or the
 * backward loop; whatever is left (0..31 bytes) is then finished by
 * .Lhandle_tail, which copies each size class with a pair of possibly
 * overlapping windows anchored at the start and the end of the range.
 */
61SYM_FUNC_START(memcpy_orig)
62	movq %rdi, %rax
63
	/* Fewer than 32 bytes: skip the block loops entirely. */
64	cmpq $0x20, %rdx
65	jb .Lhandle_tail
66
67	/*
68	 * We check whether memory false dependence could occur,
69	 * then jump to corresponding copy mode.
70	 */
	/*
	 * Only the low bytes of the two pointers are compared, as signed
	 * values; the result merely selects the copy direction.
	 */
71	cmp  %dil, %sil
72	jl .Lcopy_backward
	/*
	 * Bias the count by -32 so that the subq at the top of the loop
	 * borrows (CF set) once fewer than 32 bytes would remain after
	 * the block being copied.
	 */
73	subq $0x20, %rdx
74.Lcopy_forward_loop:
75	subq $0x20,	%rdx
76
77	/*
78	 * Move in blocks of 4x8 bytes:
79	 */
80	movq 0*8(%rsi),	%r8
81	movq 1*8(%rsi),	%r9
82	movq 2*8(%rsi),	%r10
83	movq 3*8(%rsi),	%r11
84	leaq 4*8(%rsi),	%rsi
85
86	movq %r8,	0*8(%rdi)
87	movq %r9,	1*8(%rdi)
88	movq %r10,	2*8(%rdi)
89	movq %r11,	3*8(%rdi)
90	leaq 4*8(%rdi),	%rdi
	/*
	 * movq and leaq do not touch the flags, so jae still tests the
	 * carry produced by the subq at the top of the loop.
	 */
91	jae  .Lcopy_forward_loop
	/*
	 * Undo the bias: edx = bytes still to copy (0..31).  The 32-bit
	 * write also clears the upper half of rdx.
	 */
92	addl $0x20,	%edx
93	jmp  .Lhandle_tail
94
95.Lcopy_backward:
96	/*
97	 * Calculate copy position to tail.
98	 */
	/*
	 * rsi/rdi are moved to one past the end of each buffer, then the
	 * count gets the same -32 bias as in the forward case.
	 */
99	addq %rdx,	%rsi
100	addq %rdx,	%rdi
101	subq $0x20,	%rdx
102	/*
103	 * At most 3 ALU operations in one cycle,
104	 * so append NOPS in the same 16 bytes trunk.
105	 */
106	.p2align 4
107.Lcopy_backward_loop:
108	subq $0x20,	%rdx
109	movq -1*8(%rsi),	%r8
110	movq -2*8(%rsi),	%r9
111	movq -3*8(%rsi),	%r10
112	movq -4*8(%rsi),	%r11
113	leaq -4*8(%rsi),	%rsi
114	movq %r8,		-1*8(%rdi)
115	movq %r9,		-2*8(%rdi)
116	movq %r10,		-3*8(%rdi)
117	movq %r11,		-4*8(%rdi)
118	leaq -4*8(%rdi),	%rdi
	/* As in the forward loop, jae tests the carry from the subq above. */
119	jae  .Lcopy_backward_loop
120
121	/*
122	 * Calculate copy position to head.
123	 */
	/*
	 * edx = bytes left at the front of the buffers (0..31); stepping
	 * rsi/rdi back by that amount returns them to the buffer starts
	 * so that .Lhandle_tail can copy the uncopied head.
	 */
124	addl $0x20,	%edx
125	subq %rdx,	%rsi
126	subq %rdx,	%rdi
	/* From here on edx holds the remaining byte count, 0..31. */
127.Lhandle_tail:
128	cmpl $16,	%edx
129	jb   .Lless_16bytes
130
131	/*
132	 * Move data from 16 bytes to 31 bytes.
133	 */
	/*
	 * First 16 and last 16 bytes; the two windows overlap when the
	 * count is below 32.  All four loads are issued before any store.
	 */
134	movq 0*8(%rsi), %r8
135	movq 1*8(%rsi),	%r9
136	movq -2*8(%rsi, %rdx),	%r10
137	movq -1*8(%rsi, %rdx),	%r11
138	movq %r8,	0*8(%rdi)
139	movq %r9,	1*8(%rdi)
140	movq %r10,	-2*8(%rdi, %rdx)
141	movq %r11,	-1*8(%rdi, %rdx)
142	retq
143	.p2align 4
144.Lless_16bytes:
145	cmpl $8,	%edx
146	jb   .Lless_8bytes
147	/*
148	 * Move data from 8 bytes to 15 bytes.
149	 */
	/* First 8 and last 8 bytes, both loaded before either store. */
150	movq 0*8(%rsi),	%r8
151	movq -1*8(%rsi, %rdx),	%r9
152	movq %r8,	0*8(%rdi)
153	movq %r9,	-1*8(%rdi, %rdx)
154	retq
155	.p2align 4
156.Lless_8bytes:
157	cmpl $4,	%edx
158	jb   .Lless_3bytes
159
160	/*
161	 * Move data from 4 bytes to 7 bytes.
162	 */
	/* First 4 and last 4 bytes, both loaded before either store. */
163	movl (%rsi), %ecx
164	movl -4(%rsi, %rdx), %r8d
165	movl %ecx, (%rdi)
166	movl %r8d, -4(%rdi, %rdx)
167	retq
168	.p2align 4
169.Lless_3bytes:
	/*
	 * The count is 0..3 here.  subl leaves edx = count - 1, setting CF
	 * when the count was 0 and ZF when it was 1.
	 */
170	subl $1, %edx
171	jb .Lend
172	/*
173	 * Move data from 1 bytes to 3 bytes.
174	 */
	/*
	 * movzbl does not change the flags, so jz still tests the ZF from
	 * the subl above.  For 2 or 3 bytes, byte 1 and the last byte
	 * (index edx) are copied here and byte 0 at .Lstore_1byte.
	 */
175	movzbl (%rsi), %ecx
176	jz .Lstore_1byte
177	movzbq 1(%rsi), %r8
178	movzbq (%rsi, %rdx), %r9
179	movb %r8b, 1(%rdi)
180	movb %r9b, (%rdi, %rdx)
181.Lstore_1byte:
182	movb %cl, (%rdi)
183
184.Lend:
185	retq
186SYM_FUNC_END(memcpy_orig)
187
188.popsection
189