xref: /linux/arch/powerpc/kernel/vector.S (revision 4f58e6dceb0e44ca8f21568ed81e1df24e55964c)
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr

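/*
 * load_vr_state above and store_vr_state below operate on the kernel's
 * struct thread_vr_state.  Roughly, at the C level (a sketch, assuming
 * the usual definition in asm/processor.h; VRSTATE_VSCR is the
 * asm-offsets offset of the vscr field):
 *
 *	struct thread_vr_state {
 *		vector128	vr[32] __attribute__((aligned(16)));
 *		vector128	vscr __attribute__((aligned(16)));
 *	};
 *
 *	void load_vr_state(struct thread_vr_state *vr);
 *	void store_vr_state(struct thread_vr_state *vr);
 */
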
/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr

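/*
 * Illustrative C-level use of the pair above (a sketch only; it assumes
 * the caller has already enabled MSR_VEC, e.g. via enable_kernel_altivec(),
 * and runs with preemption disabled):
 *
 *	struct thread_vr_state *vr = &current->thread.vr_state;
 *
 *	store_vr_state(vr);	writes v0-v31 and VSCR out to memory
 *	...
 *	load_vr_state(vr);	reads them back in
 */
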
/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enables VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up on every
 * context switch (i.e. there is no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a boolean
	 * to optimise userspace context save/restore. Whenever we take an
	 * AltiVec unavailable exception we must set VRSAVE to something
	 * non-zero. Set it to all 1s. See also the programming note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
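	/*
	 * In C the check above is roughly (a sketch; mfspr/mtspr are the
	 * usual SPR accessors from asm/reg.h):
	 *
	 *	if (mfspr(SPRN_VRSAVE) == 0)
	 *		mtspr(SPRN_VRSAVE, -1);
	 */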
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
	/* Don't care if r4 overflows; this is desired behaviour */
	lbz	r4,THREAD_LOAD_VEC(r5)
	addi	r4,r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
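	/*
	 * THREAD_LOAD_VEC is a single byte (hence the note above about
	 * overflow being fine): it is only a heuristic hint consumed on
	 * the C side (e.g. by restore_math()) when deciding whether to
	 * restore vector state eagerly, so letting it wrap is harmless.
	 */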
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr

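/*
 * In C terms, the bookkeeping done by load_up_altivec is roughly
 * (a sketch only, using the thread_struct fields behind the asm-offsets
 * constants above):
 *
 *	current->thread.load_vec++;
 *	current->thread.used_vr = 1;
 *	load_vr_state(&current->thread.vr_state);
 *
 * with MSR_VEC also set in the interrupted context's saved MSR so the
 * task can keep using VMX after the exception returns.
 */
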
/*
 * save_altivec(tsk)
 * Save the task's vector registers to its thread_struct
 * (or to its vr_save_area, if one is set).
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr

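/*
 * Equivalent C, roughly (a sketch only; vr_save_area and vr_state are
 * the thread_struct fields behind THREAD_VRSAVEAREA and THREAD_VRSTATE):
 *
 *	void save_altivec(struct task_struct *tsk)
 *	{
 *		struct thread_vr_state *dst = tsk->thread.vr_save_area;
 *
 *		if (!dst)
 *			dst = &tsk->thread.vr_state;
 *		store_vr_state(dst);
 *	}
 */
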
#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Disable VSX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Reuses the FP and VMX load paths, but first checks whether those
 * register sets have already been loaded.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VSX registers if they haven't been done yet */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_exception_return

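/*
 * The flow above, roughly, in C (a sketch only; r12 holds the saved MSR
 * of the interrupted context):
 *
 *	if (!(regs->msr & MSR_FP))
 *		load_up_fpu();
 *	if (!(regs->msr & MSR_VEC))
 *		load_up_altivec();
 *	current->thread.used_vsr = 1;
 *	regs->msr |= MSR_VSX;
 */
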
#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr

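/*
 * Calling convention for fpenable/fpdisable, as used by the emulation
 * routines below: the caller copies LR into r12, calls fpenable, does
 * its work and then branches to fpdisable.  fpenable leaves the saved
 * MSR in r10 and the saved FPSCR in fr31, and opens a 64-byte frame
 * with fr0, fr1 and fr31 stashed at offsets 24, 16 and 8; offsets
 * 32-56 are free for callers that need to save fr2-fr5.
 */
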
/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

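/*
 * vaddfp and the sibling routines below (vsubfp, vmaddfp, vnmsubfp,
 * vrefp) all walk the four single-precision elements of their operands.
 * In C, vaddfp is roughly (a sketch only; r3 is the destination, r4 and
 * r5 the sources):
 *
 *	void vaddfp(float *d, const float *a, const float *b)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			d[i] = a[i] + b[i];
 *	}
 */
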
/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable

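/*
 * The Newton-Raphson step above refines an estimate r of 1/sqrt(s) as
 * r' = r + 0.5 * r * (1 - s * r * r); each step roughly doubles the
 * number of correct bits, so two steps on the frsqrte estimate are
 * enough for single precision.  Per element, in C, the loop body is
 * approximately (a sketch; frsqrte() stands in for the hardware
 * estimate instruction):
 *
 *	float s = src[i];
 *	float r = frsqrte(s);
 *
 *	r = r + 0.5f * r * (1.0f - s * r * r);
 *	r = r + 0.5f * r * (1.0f - s * r * r);
 *	dst[i] = r;
 */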