xref: /linux/arch/sparc/lib/NGmemcpy.S (revision 68a052239fc4b351e961f698b824f7654a346091)
1/* SPDX-License-Identifier: GPL-2.0 */
2/* NGmemcpy.S: Niagara optimized memcpy.
3 *
4 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
5 */
6
7#ifdef __KERNEL__
8#include <linux/linkage.h>
9#include <asm/asi.h>
10#include <asm/thread_info.h>
/* Kernel build: %g7 is free as a scratch register here, and the
 * fault-return path must re-select the user secondary ASI (ASI_AIUS)
 * because the block-copy loops rewrite %asi.
 */
11#define GLOBAL_SPARE	%g7
12#define RESTORE_ASI(TMP)	\
13	wr	%g0, ASI_AIUS, %asi
14#else
/* Userland/test build: use %g5 and the primary no-fault ASI. */
15#define GLOBAL_SPARE	%g5
16#define RESTORE_ASI(TMP)	\
17	wr	%g0, ASI_PNF, %asi
18#endif
19
/* Register-window save area for the save instruction at entry:
 * 128 bytes on 64-bit V9, 64 bytes otherwise.
 */
20#ifdef __sparc_v9__
21#define SAVE_AMOUNT	128
22#else
23#define SAVE_AMOUNT	64
24#endif
25
/* ASI used by STORE_INIT for cache-line-initializing block stores. */
26#ifndef STORE_ASI
27#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
28#endif
29
/* EX_LD/EX_ST wrap a load/store (x) with an exception-table entry whose
 * fixup target is y (one of the NG_ret_* stubs).  They default to the
 * bare instruction so the file also builds without fault handling.
 */
30#ifndef EX_LD
31#define EX_LD(x,y)	x
32#endif
33
34#ifndef EX_ST
35#define EX_ST(x,y)	x
36#endif
37
38#ifndef LOAD
39#ifndef MEMCPY_DEBUG
40#define LOAD(type,addr,dest)	type [addr], dest
41#else
/* Debug variant: force the alternate-space form with explicit ASI 0x80. */
42#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
43#endif
44#endif
45
/* 16-byte "twin" load: one ldda fills dest0 and the next-numbered
 * register (dest1 documents that pairing).
 */
46#ifndef LOAD_TWIN
47#define LOAD_TWIN(addr_reg,dest0,dest1)	\
48	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
49#endif
50
51#ifndef STORE
52#define STORE(type,src,addr)	type src, [addr]
53#endif
54
/* Cache-line initializing store through %asi; falls back to a plain
 * stx when simulating Niagara behaviour on other CPUs.
 */
55#ifndef STORE_INIT
56#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
57#define STORE_INIT(src,addr)	stxa src, [addr] %asi
58#else
59#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
60#endif
61#endif
62
63#ifndef FUNC_NAME
64#define FUNC_NAME	NGmemcpy
65#endif
66
67#ifndef PREAMBLE
68#define PREAMBLE
69#endif
70
71#ifndef XCC
72#define XCC xcc
73#endif
74
/* Declare %g2/%g3 as scratch so the assembler does not warn about
 * use of the application registers.
 */
75	.register	%g2,#scratch
76	.register	%g3,#scratch
77
77
78	.text
79#ifndef EX_RETVAL
80#define EX_RETVAL(x)	x
/* Common tail for all exception-fixup stubs below: restore the user
 * secondary ASI (the copy loops switched %asi to STORE_ASI) and return
 * to the caller.  The stub has already placed the not-copied byte
 * count in %i0, which restore shifts into the caller's %o0.
 */
81__restore_asi:
82	wr	%g0, ASI_AIUS, %asi
83	ret
84	 restore
/* Fixup for the dst-alignment byte loop (ldub/stb sites): at the fault
 * point %i4 has already been decremented for the current, not-yet-copied
 * byte, so remaining = %i2 + %i4 + 1 — exactly what this stub's name
 * says.  The previous code computed %i2 + %i5, but %i5 holds no defined
 * value on that path, yielding a bogus not-copied count.  Mirror the
 * style of NG_ret_i2_plus_g1_plus_1 below.
 */
85ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
86	ba,pt	%xcc, __restore_asi
87	 add	%i2, %i4, %i0
88ENDPROC(NG_ret_i2_plus_i4_plus_1)
/* Exception-fixup stubs.  Each is the exception-table target for a
 * group of EX_LD/EX_ST sites in the copy loops; its name encodes the
 * expression that reconstructs the number of bytes NOT yet copied
 * from the loop state live at those sites.  The result goes to %i0
 * and __restore_asi returns it to the caller (copy_{to,from}_user
 * convention: return value = bytes remaining).
 *
 * %g1 variants: %g1 is the block-copy loop counter; the minus_N forms
 * credit the N bytes of the current 64-byte line already stored.
 */
89ENTRY(NG_ret_i2_plus_g1)
90	ba,pt	%xcc, __restore_asi
91	 add	%i2, %g1, %i0
92ENDPROC(NG_ret_i2_plus_g1)
93ENTRY(NG_ret_i2_plus_g1_minus_8)
94	sub	%g1, 8, %g1
95	ba,pt	%xcc, __restore_asi
96	 add	%i2, %g1, %i0
97ENDPROC(NG_ret_i2_plus_g1_minus_8)
98ENTRY(NG_ret_i2_plus_g1_minus_16)
99	sub	%g1, 16, %g1
100	ba,pt	%xcc, __restore_asi
101	 add	%i2, %g1, %i0
102ENDPROC(NG_ret_i2_plus_g1_minus_16)
103ENTRY(NG_ret_i2_plus_g1_minus_24)
104	sub	%g1, 24, %g1
105	ba,pt	%xcc, __restore_asi
106	 add	%i2, %g1, %i0
107ENDPROC(NG_ret_i2_plus_g1_minus_24)
108ENTRY(NG_ret_i2_plus_g1_minus_32)
109	sub	%g1, 32, %g1
110	ba,pt	%xcc, __restore_asi
111	 add	%i2, %g1, %i0
112ENDPROC(NG_ret_i2_plus_g1_minus_32)
113ENTRY(NG_ret_i2_plus_g1_minus_40)
114	sub	%g1, 40, %g1
115	ba,pt	%xcc, __restore_asi
116	 add	%i2, %g1, %i0
117ENDPROC(NG_ret_i2_plus_g1_minus_40)
118ENTRY(NG_ret_i2_plus_g1_minus_48)
119	sub	%g1, 48, %g1
120	ba,pt	%xcc, __restore_asi
121	 add	%i2, %g1, %i0
122ENDPROC(NG_ret_i2_plus_g1_minus_48)
123ENTRY(NG_ret_i2_plus_g1_minus_56)
124	sub	%g1, 56, %g1
125	ba,pt	%xcc, __restore_asi
126	 add	%i2, %g1, %i0
127ENDPROC(NG_ret_i2_plus_g1_minus_56)
/* %i4 variants: %i4 counts remaining 16-byte units in the medium-copy
 * loop; the plus_N forms add back bytes of the current unit.
 */
128ENTRY(NG_ret_i2_plus_i4_plus_16)
129        add     %i4, 16, %i4
130	ba,pt	%xcc, __restore_asi
131	 add	%i2, %i4, %i0
132ENDPROC(NG_ret_i2_plus_i4_plus_16)
133ENTRY(NG_ret_i2_plus_i4_plus_8)
134	add	%i4, 8, %i4
135	ba,pt	%xcc, __restore_asi
136	 add	%i2, %i4, %i0
137ENDPROC(NG_ret_i2_plus_i4_plus_8)
/* Constant variants: used where the tail copy has already subtracted
 * the unit size from %i2 before the faulting access.
 */
138ENTRY(NG_ret_i2_plus_8)
139	ba,pt	%xcc, __restore_asi
140	 add	%i2, 8, %i0
141ENDPROC(NG_ret_i2_plus_8)
142ENTRY(NG_ret_i2_plus_4)
143	ba,pt	%xcc, __restore_asi
144	 add	%i2, 4, %i0
145ENDPROC(NG_ret_i2_plus_4)
146ENTRY(NG_ret_i2_plus_1)
147	ba,pt	%xcc, __restore_asi
148	 add	%i2, 1, %i0
149ENDPROC(NG_ret_i2_plus_1)
150ENTRY(NG_ret_i2_plus_g1_plus_1)
151	add	%g1, 1, %g1
152	ba,pt	%xcc, __restore_asi
153	 add	%i2, %g1, %i0
154ENDPROC(NG_ret_i2_plus_g1_plus_1)
155ENTRY(NG_ret_i2)
156	ba,pt	%xcc, __restore_asi
157	 mov	%i2, %i0
158ENDPROC(NG_ret_i2)
/* Shift-and-merge tail: low 3 bits of %i2 are the sub-word remainder,
 * %i4 the remaining 8-byte units.
 */
159ENTRY(NG_ret_i2_and_7_plus_i4)
160	and	%i2, 7, %i2
161	ba,pt	%xcc, __restore_asi
162	 add	%i2, %i4, %i0
163ENDPROC(NG_ret_i2_and_7_plus_i4)
/* As NG_ret_i2_and_7_plus_i4, but the faulting store had not yet
 * covered the current 8-byte unit, so credit it back into %i4.
 * Fix: ENDPROC previously named NG_ret_i2_and_7_plus_i4, mis-applying
 * .size/.type to the preceding stub and leaving this symbol untyped.
 */
164ENTRY(NG_ret_i2_and_7_plus_i4_plus_8)
165	and	%i2, 7, %i2
166	add	%i4, 8, %i4
167	ba,pt	%xcc, __restore_asi
168	 add	%i2, %i4, %i0
169ENDPROC(NG_ret_i2_and_7_plus_i4_plus_8)
170#endif
171
172	.align		64
173
/* FUNC_NAME (default NGmemcpy):
 *	%i0 = dst, %i1 = src, %i2 = len
 * Returns the original dst (via EX_RETVAL) in the caller's %o0; the
 * NG_ret_* stubs instead return bytes-not-copied if a wrapped access
 * faults.  len >= 128 goes through 64-byte cache-line-initializing
 * block stores fed by 16-byte twin loads, with three inner loops for
 * src 16-byte aligned (50), 8-byte aligned (10), and unaligned
 * shift/merge (8/9).  Shorter lengths use the word/byte paths at
 * 70/75/80/90.
 */
174	.globl	FUNC_NAME
175	.type	FUNC_NAME,#function
176FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
177	PREAMBLE
178	save		%sp, -SAVE_AMOUNT, %sp
	/* Sanity trap (software trap 5) if len >= 2^31. */
179	srlx		%i2, 31, %g2
180	cmp		%g2, 0
181	tne		%xcc, 5
182	mov		%i0, %o0
183	cmp		%i2, 0
184	be,pn		%XCC, 85f
	/* %i3 accumulates dst|src (and later |len) for alignment tests. */
185	 or		%o0, %i1, %i3
186	cmp		%i2, 16
187	blu,a,pn	%XCC, 80f
188	 or		%i3, %i2, %i3
189
190	/* 2 blocks (128 bytes) is the minimum we can do the block
191	 * copy with.  We need to ensure that we'll iterate at least
192	 * once in the block copy loop.  At worst we'll need to align
193	 * the destination to a 64-byte boundary which can chew up
194	 * to (64 - 1) bytes from the length before we perform the
195	 * block copy loop.
196	 */
197	cmp		%i2, (2 * 64)
198	blu,pt		%XCC, 70f
199	 andcc		%i3, 0x7, %g0
200
201	/* %o0:	dst
202	 * %i1:	src
203	 * %i2:	len  (known to be >= 128)
204	 *
205	 * The block copy loops will use %i4/%i5,%g2/%g3 as
206	 * temporaries while copying the data.
207	 */
208
209	LOAD(prefetch, %i1, #one_read)
210	wr		%g0, STORE_ASI, %asi
211
212	/* Align destination on 64-byte boundary.  */
213	andcc		%o0, (64 - 1), %i4
214	be,pt		%XCC, 2f
215	 sub		%i4, 64, %i4
216	sub		%g0, %i4, %i4	! bytes to align dst
217	sub		%i2, %i4, %i2
2181:	subcc		%i4, 1, %i4
219	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
220	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
221	add		%i1, 1, %i1
222	bne,pt		%XCC, 1b
223	add		%o0, 1, %o0
224
225	/* If the source is on a 16-byte boundary we can do
226	 * the direct block copy loop.  If it is 8-byte aligned
227	 * we can do the 16-byte loads offset by -8 bytes and the
228	 * init stores offset by one register.
229	 *
230	 * If the source is not even 8-byte aligned, we need to do
231	 * shifting and masking (basically integer faligndata).
232	 *
233	 * The careful bit with init stores is that if we store
234	 * to any part of the cache line we have to store the whole
235	 * cacheline else we can end up with corrupt L2 cache line
236	 * contents.  Since the loop works on 64-bytes of 64-byte
237	 * aligned store data at a time, this is easy to ensure.
238	 */
2392:
240	andcc		%i1, (16 - 1), %i4
241	andn		%i2, (64 - 1), %g1	! block copy loop iterator
242	be,pt		%XCC, 50f
243	 sub		%i2, %g1, %i2		! final sub-block copy bytes
244
245	cmp		%i4, 8
246	be,pt		%XCC, 10f
247	 sub		%i1, %i4, %i1		! back src up to 16-byte boundary
248
249	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	/* GLOBAL_SPARE = (src & 7) * 8 (post-shift), %i5 = 64 - that
	 * (pre-shift); %o4/%o5/%o7/%i3 = 16/32/48/64 twin-load offsets.
	 */
250	and		%i4, 0x7, GLOBAL_SPARE
251	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
252	mov		64, %i5
253	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
254	sub		%i5, GLOBAL_SPARE, %i5
255	mov		16, %o4
256	mov		32, %o5
257	mov		48, %o7
258	mov		64, %i3
259
	/* Condition codes are still from "cmp %i4, 8" above: src%16 in
	 * 9..15 takes loop 9 (odd word phase), 1..7 takes loop 8.
	 */
260	bg,pn	   	%XCC, 9f
261	 nop
262
/* WORD1 = (WORD1 << POST_SHIFT) | (WORD2 >> PRE_SHIFT);
 * WORD2 = (WORD2 << POST_SHIFT) | (WORD3 >> PRE_SHIFT)
 * — integer faligndata across the unaligned source boundary.
 */
263#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
264	sllx		WORD1, POST_SHIFT, WORD1; \
265	srlx		WORD2, PRE_SHIFT, TMP; \
266	sllx		WORD2, POST_SHIFT, WORD2; \
267	or		WORD1, TMP, WORD1; \
268	srlx		WORD3, PRE_SHIFT, TMP; \
269	or		WORD2, TMP, WORD2;
270
2718:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
272	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
273	LOAD(prefetch, %i1 + %i3, #one_read)
274
275	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
276	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
277
278	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
279	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
280
281	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
282	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
283
284	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
285	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
286
287	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
288	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
289
290	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
291	add		%i1, 64, %i1
292	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
293
294	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
295	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
296
297	subcc		%g1, 64, %g1
298	bne,pt		%XCC, 8b
299	 add		%o0, 64, %o0
300
	/* Undo the earlier src backup before the tail copy. */
301	ba,pt		%XCC, 60f
302	 add		%i1, %i4, %i1
303
/* Same as loop 8 but shifted one register in phase (src%16 > 8). */
3049:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
305	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
306	LOAD(prefetch, %i1 + %i3, #one_read)
307
308	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
309	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
310
311	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
312	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
313
314	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
315	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
316
317	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
318	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
319
320	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
321	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
322
323	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
324	add		%i1, 64, %i1
325	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
326
327	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
328	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
329
330	subcc		%g1, 64, %g1
331	bne,pt		%XCC, 9b
332	 add		%o0, 64, %o0
333
334	ba,pt		%XCC, 60f
335	 add		%i1, %i4, %i1
336
33710:	/* Destination is 64-byte aligned, source was only 8-byte
338	 * aligned but it has been subtracted by 8 and we perform
339	 * one twin load ahead, then add 8 back into source when
340	 * we finish the loop.
341	 */
342	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
343	mov	16, %o7
344	mov	32, %g2
345	mov	48, %g3
346	mov	64, %o1
3471:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
348	LOAD(prefetch, %i1 + %o1, #one_read)
349	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
350	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
351	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
352	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
353	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
354	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
355	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
356	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
357	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
358	add		%i1, 64, %i1
359	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
360	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
361	subcc		%g1, 64, %g1
362	bne,pt		%XCC, 1b
363	 add		%o0, 64, %o0
364
365	ba,pt		%XCC, 60f
366	 add		%i1, 0x8, %i1	! re-add the 8 subtracted at 2:
367
36850:	/* Destination is 64-byte aligned, and source is 16-byte
369	 * aligned.
370	 */
371	mov	16, %o7
372	mov	32, %g2
373	mov	48, %g3
374	mov	64, %o1
3751:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
376	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
377	LOAD(prefetch, %i1 + %o1, #one_read)
378	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
379	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
380	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
381	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
382	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
383	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
384	add	%i1, 64, %i1
385	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
386	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
387	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
388	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
389	subcc	%g1, 64, %g1
390	bne,pt	%XCC, 1b
391	 add	%o0, 64, %o0
392	/* fall through */
393
39460:
	/* Full sync: make the block-init stores complete before %asi is
	 * switched back and normal stores resume.
	 */
395	membar		#Sync
396
397	/* %i2 contains any final bytes still needed to be copied
398	 * over. If anything is left, we copy it one byte at a time.
399	 */
400	RESTORE_ASI(%i3)
401	brz,pt		%i2, 85f
402	 sub		%o0, %i1, %i3	! %i3 = dst - src for store addressing
403	ba,a,pt		%XCC, 90f
404	 nop
405
406	.align		64
40770: /* 16 < len <= 64 */
	/* NOTE(review): the entry checks route 16 <= len < 128 here;
	 * the label comment above looks stale — confirm against history.
	 */
408	bne,pn		%XCC, 75f	! dst|src not 8-byte aligned
409	 sub		%o0, %i1, %i3
410
/* Copy 16 bytes per iteration with two ldx/stx pairs. */
41172:
412	andn		%i2, 0xf, %i4
413	and		%i2, 0xf, %i2
4141:	subcc		%i4, 0x10, %i4
415	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4_plus_16)
416	add		%i1, 0x08, %i1
417	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4_plus_16)
418	sub		%i1, 0x08, %i1
419	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4_plus_16)
420	add		%i1, 0x8, %i1
421	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_plus_8)
422	bgu,pt		%XCC, 1b
423	 add		%i1, 0x8, %i1
/* Tail: one optional 8-byte, one optional 4-byte, then bytes via 90. */
42473:	andcc		%i2, 0x8, %g0
425	be,pt		%XCC, 1f
426	 nop
427	sub		%i2, 0x8, %i2
428	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
429	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
430	add		%i1, 0x8, %i1
4311:	andcc		%i2, 0x4, %g0
432	be,pt		%XCC, 1f
433	 nop
434	sub		%i2, 0x4, %i2
435	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
436	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
437	add		%i1, 0x4, %i1
4381:	cmp		%i2, 0
439	be,pt		%XCC, 85f
440	 nop
441	ba,pt		%xcc, 90f
442	 nop
443
/* dst/src not mutually 8-byte aligned: byte-copy until dst is 8-byte
 * aligned, then either the word loops above or the shift/merge at 8.
 */
44475:
445	andcc		%o0, 0x7, %g1
446	sub		%g1, 0x8, %g1
447	be,pn		%icc, 2f
448	 sub		%g0, %g1, %g1	! %g1 = bytes to align dst
449	sub		%i2, %g1, %i2
450
4511:	subcc		%g1, 1, %g1
452	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
453	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
454	bgu,pt		%icc, 1b
455	 add		%i1, 1, %i1
456
4572:	add		%i1, %i3, %o0
458	andcc		%i1, 0x7, %g1
459	bne,pt		%icc, 8f
460	 sll		%g1, 3, %g1	! %g1 = (src & 7) * 8 bit shift
461
462	cmp		%i2, 16
463	bgeu,pt		%icc, 72b
464	 nop
465	ba,a,pt		%xcc, 73b
466
/* src still unaligned after dst alignment: 8-byte shift/merge loop.
 * %g1 = left shift, %i3 = 64 - %g1, %g2 carries the shifted prev word.
 */
4678:	mov		64, %i3
468	andn		%i1, 0x7, %i1	! round src down to 8-byte boundary
469	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
470	sub		%i3, %g1, %i3
471	andn		%i2, 0x7, %i4
472	sllx		%g2, %g1, %g2
4731:	add		%i1, 0x8, %i1
474	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
475	subcc		%i4, 0x8, %i4
476	srlx		%g3, %i3, %i5
477	or		%i5, %g2, %i5
478	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4_plus_8)
479	add		%o0, 0x8, %o0
480	bgu,pt		%icc, 1b
481	 sllx		%g3, %g1, %g2
482
483	srl		%g1, 3, %g1	! back to a byte count
484	andcc		%i2, 0x7, %i2
485	be,pn		%icc, 85f
486	 add		%i1, %g1, %i1
487	ba,pt		%xcc, 90f
488	 sub		%o0, %i1, %i3
489
490	.align		64
49180: /* 0 < len <= 16 */
492	andcc		%i3, 0x3, %g0
493	bne,pn		%XCC, 90f	! not 4-byte aligned: bytes
494	 sub		%o0, %i1, %i3
495
4961:
497	subcc		%i2, 4, %i2
498	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
499	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
500	bgu,pt		%XCC, 1b
501	 add		%i1, 4, %i1
502
/* Success return: original dst back in the caller's %o0. */
50385:	ret
504	 restore	EX_RETVAL(%i0), %g0, %o0
505
506	.align		32
/* Byte-at-a-time tail copy; %i3 = dst - src. */
50790:
508	subcc		%i2, 1, %i2
509	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
510	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
511	bgu,pt		%XCC, 90b
512	 add		%i1, 1, %i1
513	ret
514	 restore	EX_RETVAL(%i0), %g0, %o0
515
516	.size		FUNC_NAME, .-FUNC_NAME
517