xref: /linux/arch/x86/lib/memcpy_64.S (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those which set REP_GOOD). On CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are
 * changed to a jmp to memcpy_erms, which does the copy with REP; MOVSB.
 */
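/*
 * The patching is done by the kernel's alternatives mechanism: the
 * ALTERNATIVE_2 macro below emits the default "jmp memcpy_orig" inline
 * and records the replacement instructions in a separate section, and
 * the live text is rewritten at boot according to the CPU feature flags.
 */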

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

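	/*
	 * REP_GOOD path (the jump above has been NOPped out): copy
	 * count/8 quadwords with REP MOVSQ, then the remaining count%8
	 * bytes with REP MOVSB. %rax is loaded with the original
	 * destination, which is the return value.
	 */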
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
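	/*
	 * REP MOVSB takes its byte count in %rcx, so the count is copied
	 * there first; %rax is preloaded with the destination to serve as
	 * the return value.
	 */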
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail
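	/* Copies shorter than 32 (0x20) bytes go straight to the tail handler. */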

	/*
	 * Check whether a memory false dependence could occur, then jump
	 * to the corresponding copy mode.
	 */
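	/*
	 * Note that only the low byte of each pointer is compared here
	 * (cmp %dil, %sil with a signed branch); this is a cheap heuristic
	 * for picking a copy direction, not an overlap check.
	 */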
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
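	/*
	 * %rdx is biased down by 0x20 before the loop so that the in-loop
	 * subtraction doubles as the exit test: jae keeps looping while the
	 * subtraction did not borrow, and the addl afterwards restores the
	 * 0..0x1f byte tail count for .Lhandle_tail.
	 */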
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
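	/*
	 * Same biased-count scheme as the forward loop, but the pointers
	 * now sit past the end of the buffers and the copy walks downward.
	 */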
	/*
	 * At most 3 ALU operations can issue in one cycle, so append NOPs
	 * within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
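	/* At this point 0 <= %edx <= 0x1f bytes remain to be copied. */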
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy from 16 to 31 bytes.
	 */
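	/*
	 * The first two and the last two quadwords of the range are copied;
	 * for counts below 32 the two pairs may overlap in the destination,
	 * but all loads are done before any store and the overlapping bytes
	 * receive the same data (memcpy need not handle overlapping
	 * buffers), so the result is still correct. The 8..15 and 4..7 byte
	 * cases below use the same head/tail trick.
	 */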
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy from 8 to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy from 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy from 1 to 3 bytes.
	 */
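	/*
	 * The subl above set the flags for both branches: CF if the count
	 * was 0 (handled by the jb), ZF if it was 1. movzbl does not modify
	 * the flags, so the jz below still tests that subtraction. %rdx now
	 * holds count - 1, the offset of the last byte.
	 */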
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)