xref: /linux/arch/x86/lib/copy_page_64.S (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d)
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

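/*
 * Simple variant: copy the 4 KiB page as 4096/8 = 512 quadwords with a
 * single "rep movsq" (%rdi = destination, %rsi = source).  This is the
 * code that the alternatives entry at the bottom of this file patches
 * in on CPUs that advertise X86_FEATURE_REP_GOOD, i.e. a fast
 * string-copy implementation.
 */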
	ALIGN
copy_page_c:
	CFI_STARTPROC
	movl $4096/8,%ecx
	rep movsq
	ret
	CFI_ENDPROC
ENDPROC(copy_page_c)

/* Don't use streaming stores; an ordinary store is better when the
   target page ends up in the cache. */

/* Could vary the prefetch distance based on SMP/UP */

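/*
 * Default variant: an unrolled loop that moves one 64-byte cache line
 * per iteration through eight general-purpose registers, prefetching
 * ahead of the reads.
 */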
ENTRY(copy_page)
	CFI_STARTPROC
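	/*
	 * The loop body uses eight data registers (%rax, %rbx, %rdx,
	 * %r8-%r12); %rbx and %r12 are callee-saved in the x86-64 ABI,
	 * so spill them to the stack first.
	 */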
	subq	$2*8,%rsp
	CFI_ADJUST_CFA_OFFSET 2*8
	movq	%rbx,(%rsp)
	CFI_REL_OFFSET rbx, 0
	movq	%r12,1*8(%rsp)
	CFI_REL_OFFSET r12, 1*8

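	/*
	 * Main loop: (4096/64) - 5 = 59 iterations, one cache line each.
	 * The final five lines are copied by .Loop2 below without
	 * prefetching; the last prefetch here therefore touches the line
	 * at byte 58*64 + 5*64 = 4032 and never reads past the end of
	 * the source page.
	 */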
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:
	dec     %rcx

	movq        (%rsi), %rax
	movq      8 (%rsi), %rbx
	movq     16 (%rsi), %rdx
	movq     24 (%rsi), %r8
	movq     32 (%rsi), %r9
	movq     40 (%rsi), %r10
	movq     48 (%rsi), %r11
	movq     56 (%rsi), %r12

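	/* Prefetch five cache lines (320 bytes) ahead of the current
	   read position, into all cache levels (the t0 hint). */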
	prefetcht0 5*64(%rsi)

	movq     %rax,    (%rdi)
	movq     %rbx,  8 (%rdi)
	movq     %rdx, 16 (%rdi)
	movq     %r8,  24 (%rdi)
	movq     %r9,  32 (%rdi)
	movq     %r10, 40 (%rdi)
	movq     %r11, 48 (%rdi)
	movq     %r12, 56 (%rdi)

	leaq    64 (%rsi), %rsi
	leaq    64 (%rdi), %rdi

	jnz     .Loop64

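	/* Tail: the final five cache lines (320 bytes), same unrolled
	   body but with no prefetch. */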
	movl	$5,%ecx
	.p2align 4
.Loop2:
	decl   %ecx

	movq        (%rsi), %rax
	movq      8 (%rsi), %rbx
	movq     16 (%rsi), %rdx
	movq     24 (%rsi), %r8
	movq     32 (%rsi), %r9
	movq     40 (%rsi), %r10
	movq     48 (%rsi), %r11
	movq     56 (%rsi), %r12

	movq     %rax,    (%rdi)
	movq     %rbx,  8 (%rdi)
	movq     %rdx, 16 (%rdi)
	movq     %r8,  24 (%rdi)
	movq     %r9,  32 (%rdi)
	movq     %r10, 40 (%rdi)
	movq     %r11, 48 (%rdi)
	movq     %r12, 56 (%rdi)

	leaq	64(%rdi),%rdi
	leaq	64(%rsi),%rsi

	jnz	.Loop2

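	/* Restore the callee-saved registers and release the frame. */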
	movq	(%rsp),%rbx
	CFI_RESTORE rbx
	movq	1*8(%rsp),%r12
	CFI_RESTORE r12
	addq	$2*8,%rsp
	CFI_ADJUST_CFA_OFFSET -2*8
	ret
.Lcopy_page_end:
	CFI_ENDPROC
ENDPROC(copy_page)

	/* Some CPUs run faster using the string copy instructions.
	   That variant is also a lot simpler; use it when possible. */

#include <asm/cpufeature.h>

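	/*
	 * Alternatives patching: at boot, if the CPU advertises
	 * X86_FEATURE_REP_GOOD (a fast "rep" string implementation), the
	 * first two bytes of copy_page are overwritten with a short jump
	 * (opcode 0xeb plus an 8-bit displacement) to copy_page_c above.
	 * The entry records the length of the original code
	 * (.Lcopy_page_end - copy_page) and of the replacement (2b - 1b,
	 * i.e. two bytes).
	 */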
	.section .altinstr_replacement,"ax"
1:	.byte 0xeb					/* jmp <disp8> */
	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
2:
	.previous
	.section .altinstructions,"a"
	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
		.Lcopy_page_end-copy_page, 2b-1b
	.previous