/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

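/*
 * Arguments per the PPC64 ELF ABI: r3 = dest, r4 = src, r5 = byte count.
 *
 * The mtcrf below latches the low four bits of the count into cr7 so the
 * tail code can branch on them individually.  Counts below 16 take
 * .Lshort_copy; otherwise the destination is aligned to 8 bytes first and
 * the bulk is then moved 16 bytes per loop iteration.
 */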
	.align	7
_GLOBAL(memcpy)
	mtcrf	0x01,r5
	cmpldi	cr1,r5,16
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
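# Destination is 8-byte aligned here; if the source is too, use the
# simple doubleword loop, otherwise take the shift-and-merge path.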
.Ldst_aligned:
	andi.	r0,r4,7
	addi	r3,r3,-16
	bne	.Lsrc_unaligned
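# Both pointers 8-byte aligned: copy 16 bytes per loop iteration, keeping
# one load in flight ahead of the stores (r8/r9 alternate).  cr7 bit 0
# (the 8s bit of the count) selects the odd-doubleword preamble.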
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beqlr
	addi	r3,r3,16
	ld	r9,8(r4)
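# Store the final 1-7 bytes, held at the most-significant (big-endian
# first) end of r9, testing the 4/2/1 bits of the count in cr7.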
.Ldo_tail:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	blr

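# Source not 8-byte aligned (destination is).  Round the source pointer
# down to an 8-byte boundary and rebuild each destination doubleword
# from two adjacent aligned loads shifted and OR-ed together; as a
# rough C sketch (illustrative only), with off = original src & 7:
#
#	dst[i] = (src[i] << 8*off) | (src[i+1] >> (64 - 8*off));
#
# r10 holds the left-shift count 8*off and r11 the right-shift count
# 64 - 8*off; the loop below is software-pipelined around this merge.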
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

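# cr7 bit 0 (the 8s bit of the count) gives the parity of the number
# of doublewords; pick the matching preamble.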
	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

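# Main loop: two rebuilt doublewords (16 bytes) stored per iteration.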
	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

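# Loop done: store the doublewords still in flight, return if nothing
# is left, otherwise set up r9 and finish in .Ldo_tail.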
3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beqlr
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

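# Destination not 8-byte aligned: copy the 1-7 leading bytes (byte,
# halfword, word as the bits of r6 dictate), then rejoin the aligned
# path with the remaining count re-latched into cr7.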
.Ldst_unaligned:
	mtcrf	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	mtcrf	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

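# Fewer than 16 bytes in total: copy 8-, 4-, 2- and 1-byte pieces as
# the low bits of the count (in cr7) dictate.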
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	blr