xref: /linux/arch/powerpc/kernel/vector.S (revision 071bf69a0220253a44acb8b2a27f7a262b9a46bf)
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/reg.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* void do_load_up_transact_altivec(struct thread_struct *thread)
 *
 * This is similar to load_up_altivec but for the transactional version of the
 * vector regs.  It doesn't mess with the task MSR or valid flags.
 * Furthermore, VEC laziness is not supported with TM currently.
 */
_GLOBAL(do_load_up_transact_altivec)
	mfmsr	r6
	oris	r5,r6,MSR_VEC@h
	MTMSRD(r5)
	isync

	li	r4,1
	stw	r4,THREAD_USED_VR(r3)

	li	r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR
	lvx	v0,r10,r3
	mtvscr	v0
	addi	r10,r3,THREAD_TRANSACT_VRSTATE
	REST_32VRS(0,r4,r10)

	blr
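
	/*
	 * In C terms this is roughly (a sketch; transact_vr is the
	 * pre-rename TM field name assumed to sit behind
	 * THREAD_TRANSACT_VRSTATE):
	 *
	 *	thread->used_vr = 1;
	 *	load_vr_state(&thread->transact_vr);
	 */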
#endif

/*
 * Load state from memory into VMX registers including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(load_vr_state)
	li	r4,VRSTATE_VSCR
	lvx	v0,r4,r3
	mtvscr	v0
	REST_32VRS(0,r4,r3)
	blr
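
/*
 * For reference, the offsets above assume a save area laid out like
 * struct thread_vr_state in asm/processor.h, roughly:
 *
 *	struct thread_vr_state {
 *		vector128	vr[32];		// v0..v31, 16 bytes each
 *		vector128	vscr;		// VSCR kept in a 16-byte slot
 *	};
 *
 * VRSTATE_VSCR is the byte offset of vscr, and SAVE_32VRS/REST_32VRS
 * walk vr[] from offset 0.
 */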

/*
 * Store VMX state into memory, including VSCR.
 * Assumes the caller has enabled VMX in the MSR.
 */
_GLOBAL(store_vr_state)
	SAVE_32VRS(0, r4, r3)
	mfvscr	v0
	li	r4, VRSTATE_VSCR
	stvx	v0, r4, r3
	blr

/*
 * Disable VMX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Enable VMX for use in the kernel on return.
 * On SMP we know the VMX is free, since we give it up every
 * switch (i.e. no lazy save of the vector registers).
 *
 * Note that on 32-bit this can only use registers that will be
 * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
 */
_GLOBAL(load_up_altivec)
	mfmsr	r5			/* grab the current MSR */
	oris	r5,r5,MSR_VEC@h
	MTMSRD(r5)			/* enable use of AltiVec now */
	isync

	/*
	 * While userspace in general ignores VRSAVE, glibc uses it as a boolean
	 * to optimise userspace context save/restore. Whenever we take an
	 * altivec unavailable exception we must set VRSAVE to something
	 * non-zero. Set it to all 1s. See also the programming note in the ISA.
	 */
	mfspr	r4,SPRN_VRSAVE
	cmpwi	0,r4,0
	bne+	1f
	li	r4,-1
	mtspr	SPRN_VRSAVE,r4
1:
	/* enable use of VMX after return */
#ifdef CONFIG_PPC32
	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
	oris	r9,r9,MSR_VEC@h
#else
	ld	r4,PACACURRENT(r13)
	addi	r5,r4,THREAD		/* Get THREAD */
	oris	r12,r12,MSR_VEC@h
	std	r12,_MSR(r1)
#endif
	/* Don't care if r4 overflows, this is desired behaviour */
	lbz	r4,THREAD_LOAD_VEC(r5)
	addi	r4,r4,1
	stb	r4,THREAD_LOAD_VEC(r5)
	addi	r6,r5,THREAD_VRSTATE
	li	r4,1
	li	r10,VRSTATE_VSCR
	stw	r4,THREAD_USED_VR(r5)
	lvx	v0,r10,r6
	mtvscr	v0
	REST_32VRS(0,r4,r6)
	/* restore registers and return */
	blr
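
	/*
	 * The body above is roughly, in C:
	 *
	 *	current->thread.load_vec++;	// wrap on overflow is fine
	 *	current->thread.used_vr = 1;
	 *	load_vr_state(&current->thread.vr_state);
	 *
	 * plus setting MSR_VEC in the MSR image restored on return to
	 * the interrupted context.
	 */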

/*
 * save_altivec(tsk)
 * Save the given task's vector registers to its thread_struct.
 */
_GLOBAL(save_altivec)
	addi	r3,r3,THREAD		/* want THREAD of task */
	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
	PPC_LL	r5,PT_REGS(r3)
	PPC_LCMPI	0,r7,0
	bne	2f
	addi	r7,r3,THREAD_VRSTATE
2:	SAVE_32VRS(0,r4,r7)
	mfvscr	v0
	li	r4,VRSTATE_VSCR
	stvx	v0,r4,r7
	blr
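
	/*
	 * Roughly equivalent C (field names inferred from the
	 * asm-offsets used above):
	 *
	 *	struct thread_vr_state *p = tsk->thread.vr_save_area;
	 *	if (!p)
	 *		p = &tsk->thread.vr_state;
	 *	store_vr_state(p);
	 */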

#ifdef CONFIG_VSX

#ifdef CONFIG_PPC32
#error This asm code isn't ready for 32-bit kernels
#endif

/*
 * load_up_vsx(unused, unused, tsk)
 * Disable VSX for the task which had it previously,
 * and save its vector registers in its thread_struct.
 * Reuse the FP and VMX load paths, but first check whether that
 * state has already been loaded.
 */
_GLOBAL(load_up_vsx)
/* Load FP and VMX registers if they haven't been loaded yet */
	andi.	r5,r12,MSR_FP
	beql+	load_up_fpu		/* skip if already loaded */
	andis.	r5,r12,MSR_VEC@h
	beql+	load_up_altivec		/* skip if already loaded */
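	/*
	 * Note the beql+ idiom: andi./andis. set CR0 EQ when the
	 * MSR_FP/MSR_VEC bit in r12 is clear, and beql is branch-and-link
	 * if equal, so each helper is called only when its state has not
	 * been loaded yet and returns here through the link register.
	 */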

	ld	r4,PACACURRENT(r13)
	addi	r4,r4,THREAD		/* Get THREAD */
	li	r6,1
	stw	r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */
	/* enable use of VSX after return */
	oris	r12,r12,MSR_VSX@h
	std	r12,_MSR(r1)
	b	fast_exception_return

#endif /* CONFIG_VSX */


/*
 * The routines below are in assembler so we can closely control the
 * usage of floating-point registers.  These routines must be called
 * with preempt disabled.
 */
#ifdef CONFIG_PPC32
	.data
fpzero:
	.long	0
fpone:
	.long	0x3f800000	/* 1.0 in single-precision FP */
fphalf:
	.long	0x3f000000	/* 0.5 in single-precision FP */

#define LDCONST(fr, name)	\
	lis	r11,name@ha;	\
	lfs	fr,name@l(r11)
#else

	.section ".toc","aw"
fpzero:
	.tc	FD_0_0[TC],0
fpone:
	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
fphalf:
	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */

#define LDCONST(fr, name)	\
	lfd	fr,name@toc(r2)
#endif
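
/*
 * Usage, as in the routines below: LDCONST(fr1, fpone) loads the
 * constant 1.0 into fr1 (from a .data word via r11 on 32-bit, from
 * the TOC via r2 on 64-bit).
 */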

	.text
/*
 * Internal routine to enable floating point and set FPSCR to 0.
 * Don't call it from C; it doesn't use the normal calling convention.
 */
fpenable:
#ifdef CONFIG_PPC32
	stwu	r1,-64(r1)
#else
	stdu	r1,-64(r1)
#endif
	mfmsr	r10
	ori	r11,r10,MSR_FP
	mtmsr	r11
	isync
	stfd	fr0,24(r1)
	stfd	fr1,16(r1)
	stfd	fr31,8(r1)
	LDCONST(fr1, fpzero)
	mffs	fr31
	MTFSF_L(fr1)
	blr

fpdisable:
	mtlr	r12
	MTFSF_L(fr31)
	lfd	fr31,8(r1)
	lfd	fr1,16(r1)
	lfd	fr0,24(r1)
	mtmsr	r10
	isync
	addi	r1,r1,64
	blr
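
/*
 * Stack frame shared by fpenable/fpdisable and the routines between
 * them (offsets from the r1 established in fpenable):
 *
 *	 8(r1)		fr31	(holds the caller's FPSCR while we run)
 *	16(r1)		fr1
 *	24(r1)		fr0
 *	32(r1)		fr2	(saved by vmaddfp, vnmsubfp, vrsqrtefp)
 *	40..56(r1)	fr3-fr5	(saved by vrsqrtefp only)
 */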

/*
 * Vector add, floating point.
 */
_GLOBAL(vaddfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fadds	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
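
	/*
	 * In C terms the loop above is just:
	 *
	 *	float *dst = (float *)r3, *a = (float *)r4, *b = (float *)r5;
	 *	for (int i = 0; i < 4; i++)
	 *		dst[i] = a[i] + b[i];	// single precision
	 *
	 * vsubfp below is identical but with fsubs in place of fadds.
	 */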

/*
 * Vector subtract, floating point.
 */
_GLOBAL(vsubfp)
	mflr	r12
	bl	fpenable
	li	r0,4
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	lfsx	fr1,r5,r6
	fsubs	fr0,fr0,fr1
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable

/*
 * Vector multiply and add, floating point.
 */
_GLOBAL(vmaddfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fmadds	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
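
	/*
	 * Per element this computes dst[i] = a[i] * c[i] + b[i]
	 * (a = r4, b = r5, c = r6), matching vmaddfp's vA*vC + vB;
	 * fmadds frD,frA,frC,frB evaluates frA*frC + frB.
	 */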

/*
 * Vector negative multiply and subtract, floating point.
 */
_GLOBAL(vnmsubfp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	li	r0,4
	mtctr	r0
	li	r7,0
1:	lfsx	fr0,r4,r7
	lfsx	fr1,r5,r7
	lfsx	fr2,r6,r7
	fnmsubs	fr0,fr0,fr2,fr1
	stfsx	fr0,r3,r7
	addi	r7,r7,4
	bdnz	1b
	lfd	fr2,32(r1)
	b	fpdisable
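
	/*
	 * As above but a negated multiply-subtract: per element,
	 * dst[i] = -(a[i] * c[i] - b[i]), matching fnmsubs.
	 */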

/*
 * Vector reciprocal estimate.  We just compute 1.0/x.
 * r3 -> destination, r4 -> source.
 */
_GLOBAL(vrefp)
	mflr	r12
	bl	fpenable
	li	r0,4
	LDCONST(fr1, fpone)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	fdivs	fr0,fr1,fr0
	stfsx	fr0,r3,r6
	addi	r6,r6,4
	bdnz	1b
	b	fpdisable
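
	/*
	 * Note this returns an exact single-precision 1.0/x, stricter
	 * than the estimate the hardware vrefp instruction is allowed
	 * to produce.
	 */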

/*
 * Vector reciprocal square-root estimate, floating point.
 * We use the frsqrte instruction for the initial estimate followed
 * by 2 iterations of Newton-Raphson to get sufficient accuracy.
 * r3 -> destination, r4 -> source.
 */
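/*
 * Derivation of the refinement step used below: applying
 * Newton-Raphson to f(r) = 1/r^2 - s, whose positive root is
 * r = 1/sqrt(s), gives
 *
 *	r' = r - f(r)/f'(r) = r + 0.5 * r * (1 - s * r * r)
 *
 * Each step roughly doubles the number of correct bits in the
 * initial frsqrte estimate.
 */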
_GLOBAL(vrsqrtefp)
	mflr	r12
	bl	fpenable
	stfd	fr2,32(r1)
	stfd	fr3,40(r1)
	stfd	fr4,48(r1)
	stfd	fr5,56(r1)
	li	r0,4
	LDCONST(fr4, fpone)
	LDCONST(fr5, fphalf)
	mtctr	r0
	li	r6,0
1:	lfsx	fr0,r4,r6
	frsqrte	fr1,fr0		/* r = frsqrte(s) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	fmuls	fr3,fr1,fr0	/* r * s */
	fmuls	fr2,fr1,fr5	/* r * 0.5 */
	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
	stfsx	fr1,r3,r6
	addi	r6,r6,4
	bdnz	1b
	lfd	fr5,56(r1)
	lfd	fr4,48(r1)
	lfd	fr3,40(r1)
	lfd	fr2,32(r1)
	b	fpdisable
351