xref: /linux/arch/alpha/lib/ev6-stxcpy.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1/*
2 * arch/alpha/lib/ev6-stxcpy.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Copy a null-terminated string from SRC to DST.
6 *
7 * This is an internal routine used by strcpy, stpcpy, and strcat.
8 * As such, it uses special linkage conventions to make implementation
9 * of these public functions more efficient.
10 *
11 * On input:
12 *	t9 = return address
13 *	a0 = DST
14 *	a1 = SRC
15 *
16 * On output:
17 *	t12 = bitmask (with one bit set) indicating the last byte written
18 *	a0  = unaligned address of the last *word* written
19 *
20 * Furthermore, v0, a3-a5, and t11 are untouched.
21 *
22 * Much of the information about 21264 scheduling/coding comes from:
23 *	Compiler Writer's Guide for the Alpha 21264
24 *	abbreviated as 'CWG' in other comments here
25 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
26 * Scheduling notation:
27 *	E	- either cluster
28 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
29 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
30 * Try not to change the actual algorithm if possible for consistency.
31 */
32
33#include <asm/regdef.h>
34
35	.set noat
36	.set noreorder
37
38	.text
39
40/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
41   doesn't like putting the entry point for a procedure somewhere in the
42   middle of the procedure descriptor.  Work around this by putting the
43   aligned copy in its own procedure descriptor */
44
45
46	.ent stxcpy_aligned
47	.align 4
48stxcpy_aligned:
49	.frame sp, 0, t9
50	.prologue 0
51
	/* Copy path for a SRC and DST that share the same offset within a
	   quadword.  Reached only by branch from __stxcpy, which has already
	   loaded the first source word into t1 and advanced a1 past it.
	   Returns through t9 with t12 holding the single-bit byte mask of
	   the null that ended the copy.  */
52	/* On entry to this basic block:
53	   t0 == the first destination word for masking back in
54	   t1 == the first source word.  */
55
56	/* Create the 1st output word and detect 0's in the 1st input word.  */
57	lda	t2, -1		# E : build a mask against false zero
58	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
59	mskqh	t1, a1, t3	# U : t3 = src bytes from the start offset up
60	ornot	t1, t2, t2	# E : src with bytes below the start forced to 0xff (stall)
61
62	mskql	t0, a1, t0	# U : assemble the first output word
63	cmpbge	zero, t2, t8	# E : bits set iff null found
64	or	t0, t3, t1	# E : t1 = kept dst bytes | src bytes (stall)
65	bne	t8, $a_eos	# U : null already in the first word (stall)
66
67	/* On entry to this basic block:
68	   t0 == the first destination word for masking back in
69	   t1 == a source word not containing a null.  */
	/* On the first pass t1 is the merged first output word built just
	   above; on later passes it is the word loaded by the loop.  */
70	/* Nops here to separate store quads from load quads */
71
72$a_loop:
73	stq_u	t1, 0(a0)	# L : store the null-free word
74	addq	a0, 8, a0	# E : step dst to the next word
75	nop
76	nop
77
78	ldq_u	t1, 0(a1)	# L : Latency=3
79	addq	a1, 8, a1	# E : step src to the next word
80	cmpbge	zero, t1, t8	# E : (3 cycle stall)
81	beq	t8, $a_loop	# U : keep going while no null byte (stall for t8)
82
83	/* Take care of the final (partial) word store.
84	   On entry to this basic block we have:
85	   t1 == the source word containing the null
86	   t8 == the cmpbge mask that found it.  */
87$a_eos:
88	negq	t8, t6		# E : find low bit set
89	and	t8, t6, t12	# E : t12 = lowest set bit of t8 = byte of the null (stall)
90	/* For the sake of the cache, don't read a destination word
91	   if we're not going to need it.  */
92	and	t12, 0x80, t6	# E : null in the top byte means a full-word store (stall)
93	bne	t6, 1f		# U : (stall)
94
95	/* We're doing a partial word store and so need to combine
96	   our source and original destination words.  */
97	ldq_u	t0, 0(a0)	# L : Latency=3
98	subq	t12, 1, t6	# E : t6 = byte mask of the bytes below the null
99	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
100	or	t12, t6, t8	# E : t8 = byte mask of the bytes <= null (stall)
101
102	zap	t0, t8, t0	# E : clear dst bytes <= null
103	or	t0, t1, t1	# E : merge src string bytes with kept dst bytes (stall)
104	nop
105	nop
106
1071:	stq_u	t1, 0(a0)	# L : store the final word
108	ret	(t9)		# L0 : Latency=3
109	nop
110	nop
111
112	.end stxcpy_aligned
113
114	.align 4
115	.ent __stxcpy
116	.globl __stxcpy
117__stxcpy:
118	.frame sp, 0, t9
119	.prologue 0
120
	/* Entry point; the register contract is given in the comment at the
	   top of this file.  Dispatch on whether SRC and DST share the same
	   offset within a quadword: if so, branch to stxcpy_aligned with the
	   first word(s) preloaded, otherwise go to $unaligned.  */
121	/* Are source and destination co-aligned?  */
122	xor	a0, a1, t0	# E :
123	unop			# E :
124	and	t0, 7, t0	# E : low three bits differ iff not co-aligned (stall)
125	bne	t0, $unaligned	# U : (stall)
126
127	/* We are co-aligned; take care of a partial first word.  */
128	ldq_u	t1, 0(a1)		# L : load first src word
129	and	a0, 7, t0		# E : take care not to load a word ...
130	addq	a1, 8, a1		# E : step src past the first word
131	beq	t0, stxcpy_aligned	# U : ... if it is not needed (stall)
132
133	ldq_u	t0, 0(a0)	# L : dst starts mid-word: fetch it for merging
134	br	stxcpy_aligned	# L0 : Latency=3
135	nop
136	nop
137
138
139/* The source and destination are not co-aligned.  Align the destination
140   and cope.  We have to be very careful about not reading too much and
141   causing a SEGV.  */
142
143	.align 4
144$u_head:
145	/* We know just enough now to be able to assemble the first
146	   full source word.  We can still find a zero at the end of it
147	   that prevents us from outputting the whole thing.
148
149	   On entry to this basic block:
150	   t0 == the first dest word, for masking back in, if needed else 0
151	   t1 == the low bits of the first source word
152	   t6 == bytemask that is -1 in dest word bytes */
153
154	ldq_u	t2, 8(a1)	# L : second source word
155	addq	a1, 8, a1	# E :
156	extql	t1, a1, t1	# U : (stall on a1)
157	extqh	t2, a1, t4	# U : (stall on a1)
158
159	mskql	t0, a0, t0	# U : keep dst bytes below the dst offset
160	or	t1, t4, t1	# E : first source word realigned to dst
161	mskqh	t1, a0, t1	# U : drop src bytes below the dst offset (stall on t1)
162	or	t0, t1, t1	# E : t1 = first output word (stall on t1)
163
164	or	t1, t6, t6	# E : force the kept dst bytes nonzero for the test
165	cmpbge	zero, t6, t8	# E : null among the src bytes of this word? (stall)
166	lda	t6, -1		# E : for masking just below
167	bne	t8, $u_final	# U : (stall)
168
169	mskql	t6, a1, t6		# U : mask out the bits we have
170	or	t6, t2, t2		# E :   already extracted before (stall)
171	cmpbge	zero, t2, t8		# E :   testing eos (stall)
172	bne	t8, $u_late_head_exit	# U : (stall)
173
174	/* Finally, we've got all the stupid leading edge cases taken care
175	   of and we can set up to enter the main loop.  */
176
177	stq_u	t1, 0(a0)	# L : store first output word
178	addq	a0, 8, a0	# E :
179	extql	t2, a1, t0	# U : position ho-bits of lo word
180	ldq_u	t2, 8(a1)	# L : read next high-order source word
181
182	addq	a1, 8, a1	# E :
183	cmpbge	zero, t2, t8	# E : (stall for t2)
184	nop			# E :
185	bne	t8, $u_eos	# U : (stall)
186
187	/* Unaligned copy main loop.  In order to avoid reading too much,
188	   the loop is structured to detect zeros in aligned source words.
189	   This has, unfortunately, effectively pulled half of a loop
190	   iteration out into the head and half into the tail, but it does
191	   prevent nastiness from accumulating in the very thing we want
192	   to run as fast as possible.
193
194	   On entry to this basic block:
195	   t0 == the shifted high-order bits from the previous source word
196	   t2 == the unshifted current source word
197
198	   We further know that t2 does not contain a null terminator.  */
199
200	.align 3
201$u_loop:
202	extqh	t2, a1, t1	# U : extract high bits for current word
203	addq	a1, 8, a1	# E : (stall)
204	extql	t2, a1, t3	# U : extract low bits for next time (stall)
205	addq	a0, 8, a0	# E : advance dst; the store below uses -8(a0)
206
207	or	t0, t1, t1	# E : current dst word now complete
208	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
209	stq_u	t1, -8(a0)	# L : save the current word (stall)
210	mov	t3, t0		# E : carry the low bits into the next iteration
211
212	cmpbge	zero, t2, t8	# E : test new word for eos
213	beq	t8, $u_loop	# U : (stall)
214	nop
215	nop
216
217	/* We've found a zero somewhere in the source word we just read.
218	   If it resides in the lower half, we have one (probably partial)
219	   word to write out, and if it resides in the upper half, we
220	   have one full and one partial word left to write out.
221
222	   On entry to this basic block:
223	   t0 == the shifted high-order bits from the previous source word
224	   t2 == the unshifted current source word.  */
225$u_eos:
226	extqh	t2, a1, t1	# U :
227	or	t0, t1, t1	# E : first (partial) source word complete (stall)
228	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
229	bne	t8, $u_final	# U : (stall)
230
231$u_late_head_exit:
232	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
233	addq	a0, 8, a0	# E :
234	extql	t2, a1, t1	# U : remaining bytes form the last output word
235	cmpbge	zero, t1, t8	# E : locate the null within it (stall)
236
237	/* Take care of a final (probably partial) result word.
238	   On entry to this basic block:
239	   t1 == assembled source word
240	   t8 == cmpbge mask that found the null.  */
241$u_final:
242	negq	t8, t6		# E : isolate low bit set
243	and	t6, t8, t12	# E : t12 = byte mask of the null (stall)
244	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
245	bne	t6, 1f		# U : (stall)
246
247	ldq_u	t0, 0(a0)	# L : dst word to merge with
248	subq	t12, 1, t6	# E : t6 = byte mask of the bytes below the null
249	or	t6, t12, t8	# E : t8 = byte mask of the bytes <= null (stall)
250	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)
251
252	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
253	or	t0, t1, t1	# E : (stall)
254	nop
255	nop
256
2571:	stq_u	t1, 0(a0)	# L : store the final word
258	ret	(t9)		# L0 : Latency=3
259	nop
260	nop
261
262	/* Unaligned copy entry point.  */
263	.align 4
264$unaligned:
265
266	ldq_u	t1, 0(a1)	# L : load first source word
267	and	a0, 7, t4	# E : find dest misalignment
268	and	a1, 7, t5	# E : find src misalignment
269	/* Conditionally load the first destination word and a bytemask
270	   with 0xff indicating that the destination byte is sacrosanct.  */
271	mov	zero, t0	# E : default: no dest word needed
272
273	mov	zero, t6	# E : default: no sacrosanct dest bytes
274	beq	t4, 1f		# U : dest is aligned, keep the defaults
275	ldq_u	t0, 0(a0)	# L :
276	lda	t6, -1		# E :
277
278	mskql	t6, a0, t6	# U : t6 = 0xff in bytes below the dest offset
279	nop
280	nop
281	nop
2821:
283	subq	a1, t4, a1	# E : sub dest misalignment from src addr
284	/* If source misalignment is larger than dest misalignment, we need
285	   extra startup checks to avoid SEGV.  */
	/* t12 is only scratch for the comparison here; every exit path
	   sets it to the null byte mask before returning.  */
286	cmplt	t4, t5, t12	# E :
287	beq	t12, $u_head	# U :
288	lda	t2, -1		# E : mask out leading garbage in source
289
290	mskqh	t2, t5, t2	# U : t2 = 0xff in bytes from the src offset up
291	ornot	t1, t2, t3	# E : leading garbage forced to 0xff (stall)
292	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
293	beq	t8, $u_head	# U : (stall)
294
295	/* At this point we've found a zero in the first partial word of
296	   the source.  We need to isolate the valid source data and mask
297	   it into the original destination data.  (Incidentally, we know
298	   that we'll need at least one byte of that original dest word.) */
299
300	ldq_u	t0, 0(a0)	# L :
301	negq	t8, t6		# E : build bitmask of bytes <= zero
302	and	t6, t8, t12	# E : t12 = byte mask of the null, src position (stall)
303	and	a1, 7, t5	# E : a1 was rebased above: src minus dest misalignment
304
305	subq	t12, 1, t6	# E :
306	or	t6, t12, t8	# E : (stall)
307	srl	t12, t5, t12	# U : adjust final null return value
308	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)
309
310	and	t1, t2, t1	# E : to source validity mask
311	extql	t2, a1, t2	# U : shift the validity mask down to dest position
312	extql	t1, a1, t1	# U : shift the source bytes likewise (stall)
313	andnot	t0, t2, t0	# E : zero place for source to reside (stall)
314
315	or	t0, t1, t1	# E : and put it there
316	stq_u	t1, 0(a0)	# L : (stall)
317	ret	(t9)		# L0 : Latency=3
318	nop
319
320	.end __stxcpy
321
322