xref: /titanic_41/usr/src/uts/sun4v/cpu/niagara_copy.s (revision 4ef135ebdb1da7fd227e3b45fabfa88b85fc5083)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/clock.h>
35#include <sys/asi.h>
36#include <sys/fsr.h>
37#include <sys/privregs.h>
38#include <sys/machasi.h>
39#include <sys/niagaraasi.h>
40
41#if !defined(lint)
42#include "assym.h"
43#endif	/* lint */
44
45
46/*
47 * Pseudo-code to aid in understanding the control flow of the
48 * bcopy/kcopy routine.
49 *
50 *	! WARNING : <Register usage convention>
51 *	! In kcopy(), %o5 holds the previous error handler ORed with the
52 *	! LOFAULT_SET flag (low bits). In bcopy(), %o5 is zero.
53 *	! %o5 is not available for any other use.
54 *
55 * kcopy():
56 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
57 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
58 *	curthread->t_lofault = .copyerr;
59 *	Call bcopy();
60 *
61 * bcopy():
62 * 	if (length < 128)
63 * 		goto regular_copy;
64 *
65 * 	if (!use_hw_bcopy)
66 * 		goto regular_copy;
67 *
68 * 	blockcopy;
69 *	restore t_lofault handler if came from kcopy();
70 *
71 *	regular_copy;
72 *	restore t_lofault handler if came from kcopy();
73 *
74 * In lofault handler:
75 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
76 *	return (errno)
77 *
78 */
79
80/*
81 * For this number of bytes or fewer we always copy byte-for-byte
82 */
83#define	SMALL_LIMIT	7
84
85/*
86 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
87 * handler was set
88 */
89#define	LOFAULT_SET 2
90
91/*
92 * This define is to align data for the unaligned source cases.
93 * The data1, data2 and data3 is merged into data1 and data2.
94 * The data3 is preserved for next merge.
95 */
/*
 * Effect, written as C-like expressions on 64-bit registers:
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 *	data2 = (data2 << lshift) | (data3 >> rshift);
 * tmp is overwritten by the macro; data3 is only read.  The block copy
 * loops in this file invoke it with rshift = 64 - lshift.
 */
96#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
97	sllx	data1, lshift, data1				;\
98	srlx	data2, rshift, tmp				;\
99	or	data1, tmp, data1				;\
100	sllx	data2, lshift, data2				;\
101	srlx	data3, rshift, tmp				;\
102	or	data2, tmp, data2
103/*
104 * This macro is to align the data. Basically it merges
105 * data1 and data2 to form double word.
106 */
/*
 * Effect, written as a C-like expression on 64-bit registers:
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 * tmp is overwritten by the macro; data2 is only read.
 */
107#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
108	sllx	data1, lshift, data1				;\
109	srlx	data2, rshift, tmp				;\
110	or	data1, tmp, data1
111
112/*
113 * Copy a block of storage, returning an error code if `from' or
114 * `to' takes a kernel pagefault which cannot be resolved.
115 * Returns errno value on pagefault error, 0 if all ok
116 */
117
118
119
#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{
	return (0);
}

#else	/* lint */

	.seg	".text"
	.align	4

	/*
	 * kcopy(from, to, count)
	 *
	 * Takes a register window, remembers the thread's current t_lofault
	 * in %o5 (tagged with LOFAULT_SET), installs .copyerr as the new
	 * t_lofault and then continues in the common copy code at .do_copy.
	 * The common code returns 0; the fault path below returns the errno
	 * value that arrives in %g1.
	 */
	ENTRY(kcopy)

	save	%sp, -SA(MINFRAME), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = handler in force on entry
	sethi	%hi(.copyerr), %l7
	or	%l7, %lo(.copyerr), %l7		! %l7 = address of our handler
	or	%o5, LOFAULT_SET, %o5		! tag: "t_lofault was replaced"
	membar	#Sync				! sync error barrier
	b	.do_copy			! join the common copy code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! delay slot: install handler

	/*
	 * Fault path, reached through t_lofault while the copy was running.
	 * %g1 carries the errno value.  kcopy always tags %o5 with
	 * LOFAULT_SET, so the tag is stripped before the previous handler
	 * is written back.
	 */
.copyerr:
	andn	%o5, LOFAULT_SET, %o5		! drop the tag bit
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! previous t_lofault is back
	ret
	restore	%g1, %g0, %o0			! hand errno to the caller

	SET_SIZE(kcopy)
#endif	/* lint */
158
159
160/*
161 * Copy a block of storage - must not overlap (from + len <= to).
162 */
163#if defined(lint)
164
165/* ARGSUSED */
166void
167bcopy(const void *from, void *to, size_t count)
168{}
169
170#else	/* lint */
171
	/*
	 * bcopy(from, to, count)
	 *
	 * Register roles after the save below:
	 *	%i0 - from (source) address
	 *	%i1 - to (destination) address
	 *	%i2 - byte count
	 *	%o5 - zero when entered as bcopy; kcopy branches to .do_copy
	 *	      with the previous t_lofault ORed with LOFAULT_SET here
	 *
	 * Both exit sequences (.blkexit and .cpdone) write the saved
	 * t_lofault back when %o5 is non-zero and return with 0 in the
	 * caller's %o0.
	 */
172	ENTRY(bcopy)
173
174	save	%sp, -SA(MINFRAME), %sp
175	clr	%o5			! flag LOFAULT_SET is not set for bcopy
176
	/*
	 * Dispatch on size and configuration:
	 *	count < 12			-> .bytecp
	 *	count < 128			-> .bcb_punt
	 *	use_hw_bcopy == 0		-> .bcb_punt
	 *	|to - from| < 256		-> .bcb_punt
	 *	otherwise			-> .do_blockcopy
	 * The "cmp %i2, 128" executes in the delay slot of the first branch.
	 */
177.do_copy:
178	cmp	%i2, 12			! for small counts
179	blu	%ncc, .bytecp		! just copy bytes
180	  .empty
181
182	cmp	%i2, 128		! for less than 128 bytes
183	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
184	  nop
185
186	set	use_hw_bcopy, %o2
187	ld	[%o2], %o2
188	tst	%o2
189	bz	.bcb_punt
190	  nop
191
	/* %i3 = |to - from|; the neg only executes when the difference is negative */
192	subcc	%i1, %i0, %i3
193	bneg,a,pn %ncc, 1f
194	neg	%i3
1951:
196	/*
197	 * Compare against 256 since we should be checking block addresses
198	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
199	 * src = dest + (64 * 3) + 63.
200	 */
201	cmp	%i3, 256
202	blu,pn	%ncc, .bcb_punt
203	  nop
204
205	/*
206	 * Copy that reach here have at least 2 blocks of data to copy.
207	 */
	/*
	 * NOTE(review): the only size check on this path is count >= 128,
	 * and up to 63 bytes are consumed below to align the destination,
	 * so a single full 64-byte block can remain -- confirm the wording
	 * of the comment above.
	 */
208.do_blockcopy:
209	! Swap src/dst since the code below is memcpy code
210	! and memcpy/bcopy have different calling sequences
211	mov	%i1, %i5
212	mov	%i0, %i1
213	mov	%i5, %i0
214
	/*
	 * From here to .blkexit: %i0 = destination, %i1 = source.
	 *
	 * The branch below is taken when the destination is already
	 * 64-byte aligned.  Its delay slot always executes, but on the
	 * taken path the value left in %i3 is unused because .chksrc
	 * recomputes %i3.  On the fall-through path %i3 becomes
	 * 64 - (dst & 0x3f), the number of bytes copied singly by the
	 * loop at 1: to reach a 64-byte boundary.
	 */
215	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
216	bz	%xcc, .chksrc		! dst is already double aligned
217	sub	%i3, 0x40, %i3
218	neg	%i3			! bytes till dst 64 bytes aligned
219	sub	%i2, %i3, %i2		! update i2 with new count
220
2211:	ldub	[%i1], %i4
222	stb	%i4, [%i0]
223	inc	%i1
224	deccc	%i3
225	bgu	%xcc, 1b
226	inc	%i0
227
228	! Now Destination is block (64 bytes) aligned
229.chksrc:
230	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
231	sub	%i2, %i3, %i2		! Residue bytes in %i2
232
233	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
234
	/*
	 * %o2 = src & 0xf selects one of four copy loops:
	 *	0	-> .blkcpy		(no shifting needed)
	 *	1..7	-> .cpy_lower_double	(shift/merge)
	 *	8	-> falls through to loop0 (no shifting needed)
	 *	9..15	-> .cpy_upper_double	(shift/merge)
	 */
235	andcc	%i1, 0xf, %o2		! is src quadword aligned
236	bz,pn	%xcc, .blkcpy		! src offset in %o2
237	nop
238	cmp	%o2, 0x8
239	bg	.cpy_upper_double
240	nop
241	bl	.cpy_lower_double
242	nop
243
244	! Falls through when source offset is equal to 8 i.e.
245	! source is double word aligned.
246	! In this case no shift/merge of data is required
247	sub	%i1, %o2, %i1		! align the src at 16 bytes.
248	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
249	prefetch [%l0+0x0], #one_read
250	ldda	[%i1+0x0]%asi, %l2
	/*
	 * Each iteration stores 64 bytes.  Every store pair takes the odd
	 * register of one ldda and the even register of the following ldda.
	 * The ldda at offset 0x40 at the bottom of the loop supplies the
	 * register pair the next iteration starts with.
	 */
251loop0:
252	ldda	[%i1+0x10]%asi, %l4
253	prefetch [%l0+0x40], #one_read
254
255	stxa	%l3, [%i0+0x0]%asi
256	stxa	%l4, [%i0+0x8]%asi
257
258	ldda	[%i1+0x20]%asi, %l2
259	stxa	%l5, [%i0+0x10]%asi
260	stxa	%l2, [%i0+0x18]%asi
261
262	ldda	[%i1+0x30]%asi, %l4
263	stxa	%l3, [%i0+0x20]%asi
264	stxa	%l4, [%i0+0x28]%asi
265
266	ldda	[%i1+0x40]%asi, %l2
267	stxa	%l5, [%i0+0x30]%asi
268	stxa	%l2, [%i0+0x38]%asi
269
270	add	%l0, 0x40, %l0
271	add	%i1, 0x40, %i1
272	subcc	%i3, 0x40, %i3
273	bgu,pt	%xcc, loop0
274	add	%i0, 0x40, %i0
275	ba	.blkdone
276	add	%i1, %o2, %i1		! increment the source by src offset
277					! the src offset was stored in %o2
278
	/*
	 * Source offset within its 16-byte quadword is 1..7.
	 * %o0 = 8 * offset (left shift, in bits), %o1 = 64 - %o0.
	 */
279.cpy_lower_double:
280	sub	%i1, %o2, %i1		! align the src at 16 bytes.
281	sll	%o2, 3, %o0		! %o0 left shift
282	mov	0x40, %o1
283	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
284	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
285	prefetch [%l0+0x0], #one_read
286	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
287					! complete data
288loop1:
289	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
290	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
291							! into %l2 and %l3
292	prefetch [%l0+0x40], #one_read
293	stxa	%l2, [%i0+0x0]%asi
294	stxa	%l3, [%i0+0x8]%asi
295
296	ldda	[%i1+0x20]%asi, %l2
297	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
298	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
299	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
300
301	! Repeat the same for next 32 bytes.
302
303	ldda	[%i1+0x30]%asi, %l4
304	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
305	stxa	%l2, [%i0+0x20]%asi
306	stxa	%l3, [%i0+0x28]%asi
307
308	ldda	[%i1+0x40]%asi, %l2
309	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
310	stxa	%l4, [%i0+0x30]%asi
311	stxa	%l5, [%i0+0x38]%asi
312
313	add	%l0, 0x40, %l0
314	add	%i1, 0x40, %i1
315	subcc	%i3, 0x40, %i3
316	bgu,pt	%xcc, loop1
317	add	%i0, 0x40, %i0
318	ba	.blkdone
319	add	%i1, %o2, %i1		! increment the source by src offset
320					! the src offset was stored in %o2
321
	/*
	 * Source offset within its 16-byte quadword is 9..15.
	 * %o0 = 8 * (offset - 8) (left shift, in bits), %o1 = 64 - %o0.
	 */
322.cpy_upper_double:
323	sub	%i1, %o2, %i1		! align the src at 16 bytes.
324	mov	0x8, %o0
325	sub	%o2, %o0, %o0
326	sll	%o0, 3, %o0		! %o0 left shift
327	mov	0x40, %o1
328	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
329	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
330	prefetch [%l0+0x0], #one_read
331	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
332					! no data in %l2
333loop2:
334	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
335					! partial
336	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
337							! into %l3 and %l4
338	prefetch [%l0+0x40], #one_read
339	stxa	%l3, [%i0+0x0]%asi
340	stxa	%l4, [%i0+0x8]%asi
341
342	ldda	[%i1+0x20]%asi, %l2
343	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
344	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
345	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
346
347	! Repeat the same for next 32 bytes.
348
349	ldda	[%i1+0x30]%asi, %l4
350	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
351	stxa	%l3, [%i0+0x20]%asi
352	stxa	%l4, [%i0+0x28]%asi
353
354	ldda	[%i1+0x40]%asi, %l2
355	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
356	stxa	%l5, [%i0+0x30]%asi
357	stxa	%l2, [%i0+0x38]%asi
358
359	add	%l0, 0x40, %l0
360	add	%i1, 0x40, %i1
361	subcc	%i3, 0x40, %i3
362	bgu,pt	%xcc, loop2
363	add	%i0, 0x40, %i0
364	ba	.blkdone
365	add	%i1, %o2, %i1		! increment the source by src offset
366					! the src offset was stored in %o2
367
368
369	! Both Source and Destination are block aligned.
370	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
	/*
	 * This label is reached when (src & 0xf) == 0, so the check that
	 * leads here establishes 16-byte alignment of the source; the
	 * destination was aligned to 64 bytes above.
	 */
371.blkcpy:
372	prefetch [%i1+0x0], #one_read
3731:
374	ldda	[%i1+0x0]%asi, %l0
375	ldda	[%i1+0x10]%asi, %l2
376	prefetch [%i1+0x40], #one_read
377
378	stxa	%l0, [%i0+0x0]%asi
379	ldda	[%i1+0x20]%asi, %l4
380	ldda	[%i1+0x30]%asi, %l6
381
382	stxa	%l1, [%i0+0x8]%asi
383	stxa	%l2, [%i0+0x10]%asi
384	stxa	%l3, [%i0+0x18]%asi
385	stxa	%l4, [%i0+0x20]%asi
386	stxa	%l5, [%i0+0x28]%asi
387	stxa	%l6, [%i0+0x30]%asi
388	stxa	%l7, [%i0+0x38]%asi
389
390	add	%i1, 0x40, %i1
391	subcc	%i3, 0x40, %i3
392	bgu,pt	%xcc, 1b
393	add	%i0, 0x40, %i0
394
	/*
	 * All four block loops end up here.  %i2 holds the residue computed
	 * at .chksrc (bytes beyond the last full 64-byte block); it is
	 * copied one byte at a time.
	 */
395.blkdone:
396	tst	%i2
397	bz,pt	%xcc, .blkexit
398	nop
399
400.residue:
401	ldub	[%i1], %i4
402	stb	%i4, [%i0]
403	inc	%i1
404	deccc	%i2
405	bgu	%xcc, .residue
406	inc	%i0
407
408.blkexit:
409	membar	#Sync				! sync error barrier
410	! Restore t_lofault handler, if came here from kcopy().
411	tst	%o5
412	bz	%ncc, 1f
413	andn	%o5, LOFAULT_SET, %o5
414	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4151:
416	ret
417	restore	%g0, 0, %o0
418
	/*
	 * Non-block path.  Every branch to .bcb_punt is taken before the
	 * src/dst swap at .do_blockcopy, so here %i0 = source and
	 * %i1 = destination.
	 */
419.bcb_punt:
420	!
421	! use aligned transfers where possible
422	!
423	xor	%i0, %i1, %o4		! xor from and to address
424	btst	7, %o4			! if lower three bits zero
425	bz	.aldoubcp		! can align on double boundary
426	.empty	! assembler complaints about label
427
428	xor	%i0, %i1, %o4		! xor from and to address
429	btst	3, %o4			! if lower two bits zero
430	bz	.alwordcp		! can align on word boundary
431	btst	3, %i0			! delay slot, from address unaligned?
432	!
433	! use aligned reads and writes where possible
434	! this differs from wordcp in that it copes
435	! with odd alignment between source and destnation
436	! using word reads and writes with the proper shifts
437	! in between to align transfers to and from memory
438	! i0 - src address, i1 - dest address, i2 - count
439	! i3, i4 - tmps for used generating complete word
440	! i5 (word to write)
441	! l0 size in bits of upper part of source word (US)
442	! l1 size in bits of lower part of source word (LS = 32 - US)
443	! l2 size in bits of upper part of destination word (UD)
444	! l3 size in bits of lower part of destination word (LD = 32 - UD)
445	! l4 number of bytes leftover after aligned transfers complete
446	! l5 the number 32
447	!
	/*
	 * The two bz instructions below consume condition codes produced by
	 * btst instructions sitting in the preceding delay slots; the
	 * mov/clr instructions in between are not cc-setting forms.
	 */
448	mov	32, %l5			! load an oft-needed constant
449	bz	.align_dst_only
450	btst	3, %i1			! is destnation address aligned?
451	clr	%i4			! clear registers used in either case
452	bz	.align_src_only
453	clr	%l0
454	!
455	! both source and destination addresses are unaligned
456	!
4571:					! align source
458	ldub	[%i0], %i3		! read a byte from source address
459	add	%i0, 1, %i0		! increment source address
460	or	%i4, %i3, %i4		! or in with previous bytes (if any)
461	btst	3, %i0			! is source aligned?
462	add	%l0, 8, %l0		! increment size of upper source (US)
463	bnz,a	1b
464	sll	%i4, 8, %i4		! make room for next byte
465
466	sub	%l5, %l0, %l1		! generate shift left count (LS)
467	sll	%i4, %l1, %i4		! prepare to get rest
468	ld	[%i0], %i3		! read a word
469	add	%i0, 4, %i0		! increment source address
470	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
471	or	%i4, %i5, %i5		! merge
472	mov	24, %l3			! align destination
4731:
474	srl	%i5, %l3, %i4		! prepare to write a single byte
475	stb	%i4, [%i1]		! write a byte
476	add	%i1, 1, %i1		! increment destination address
477	sub	%i2, 1, %i2		! decrement count
478	btst	3, %i1			! is destination aligned?
479	bnz,a	1b
480	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
481	sub	%l5, %l3, %l2		! generate shift left count (UD)
482	sll	%i5, %l2, %i5		! move leftover into upper bytes
483	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
484	bgu	%ncc, .more_needed	! need more to fill than we have
485	nop
486
487	sll	%i3, %l1, %i3		! clear upper used byte(s)
488	srl	%i3, %l1, %i3
489	! get the odd bytes between alignments
490	sub	%l0, %l2, %l0		! regenerate shift count
491	sub	%l5, %l0, %l1		! generate new shift left count (LS)
492	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
493	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
494	srl	%i3, %l0, %i4
495	or	%i5, %i4, %i5
496	st	%i5, [%i1]		! write a word
497	subcc	%i2, 4, %i2		! decrement count
498	bz	%ncc, .unalign_out
499	add	%i1, 4, %i1		! increment destination address
500
501	b	2f
502	sll	%i3, %l1, %i5		! get leftover into upper bits
503.more_needed:
504	sll	%i3, %l0, %i3		! save remaining byte(s)
505	srl	%i3, %l0, %i3
506	sub	%l2, %l0, %l1		! regenerate shift count
507	sub	%l5, %l1, %l0		! generate new shift left count
508	sll	%i3, %l1, %i4		! move to fill empty space
509	b	3f
510	or	%i5, %i4, %i5		! merge to complete word
511	!
512	! the source address is aligned and destination is not
513	!
514.align_dst_only:
515	ld	[%i0], %i4		! read a word
516	add	%i0, 4, %i0		! increment source address
517	mov	24, %l0			! initial shift alignment count
5181:
519	srl	%i4, %l0, %i3		! prepare to write a single byte
520	stb	%i3, [%i1]		! write a byte
521	add	%i1, 1, %i1		! increment destination address
522	sub	%i2, 1, %i2		! decrement count
523	btst	3, %i1			! is destination aligned?
524	bnz,a	1b
525	sub	%l0, 8, %l0		! delay slot, decrement shift count
526.xfer:
527	sub	%l5, %l0, %l1		! generate shift left count
528	sll	%i4, %l1, %i5		! get leftover
5293:
530	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
531	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
	/*
	 * Main shifted-word loop: one aligned source word is read and one
	 * aligned destination word is written per pass.  %i5 carries the
	 * leftover source bits from one pass into the next.
	 */
5322:
533	ld	[%i0], %i3		! read a source word
534	add	%i0, 4, %i0		! increment source address
535	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
536	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
537	st	%i5, [%i1]		! write a destination word
538	subcc	%i2, 4, %i2		! decrement count
539	bz	%ncc, .unalign_out	! check if done
540	add	%i1, 4, %i1		! increment destination address
541	b	2b			! loop
542	sll	%i3, %l1, %i5		! get leftover
543.unalign_out:
544	tst	%l4			! any bytes leftover?
545	bz	%ncc, .cpdone
546	.empty				! allow next instruction in delay slot
5471:
548	sub	%l0, 8, %l0		! decrement shift
549	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
550	stb	%i4, [%i1]		! write a byte
551	subcc	%l4, 1, %l4		! decrement count
552	bz	%ncc, .cpdone		! done?
553	add	%i1, 1, %i1		! increment destination
554	tst	%l0			! any more previously read bytes
555	bnz	%ncc, 1b		! we have leftover bytes
556	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
557	b	.dbytecp		! let dbytecp do the rest
558	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
559	!
560	! the destination address is aligned and the source is not
561	!
562.align_src_only:
563	ldub	[%i0], %i3		! read a byte from source address
564	add	%i0, 1, %i0		! increment source address
565	or	%i4, %i3, %i4		! or in with previous bytes (if any)
566	btst	3, %i0			! is source aligned?
567	add	%l0, 8, %l0		! increment shift count (US)
568	bnz,a	.align_src_only
569	sll	%i4, 8, %i4		! make room for next byte
570	b,a	.xfer
571	!
572	! if from address unaligned for double-word moves,
573	! move bytes till it is, if count is < 56 it could take
574	! longer to align the thing than to do the transfer
575	! in word size chunks right away
576	!
577.aldoubcp:
578	cmp	%i2, 56			! if count < 56, use wordcp, it takes
579	blu,a	%ncc, .alwordcp		! longer to align doubles than words
580	mov	3, %o0			! mask for word alignment
581	call	.alignit		! copy bytes until aligned
582	mov	7, %o0			! mask for double alignment
583	!
584	! source and destination are now double-word aligned
585	! i3 has aligned count returned by alignit
586	!
587	and	%i2, 7, %i2		! unaligned leftover count
588	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
5895:
590	ldx	[%i0+%i1], %o4		! read from address
591	stx	%o4, [%i1]		! write at destination address
592	subcc	%i3, 8, %i3		! dec count
593	bgu	%ncc, 5b
594	add	%i1, 8, %i1		! delay slot, inc to address
	/*
	 * With fewer than 4 bytes left the branch to .dbytecp is taken; its
	 * delay slot is the first instruction of .wordcp ("and %i2, 3, %i2"),
	 * which leaves a count below 4 unchanged.  Otherwise control falls
	 * into .wordcp with %i3 already counted down by the loop above, so
	 * the word loop there runs a single pass (its subcc borrows and the
	 * bgu is not taken).
	 */
595	cmp	%i2, 4			! see if we can copy a word
596	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
597	.empty
598	!
599	! for leftover bytes we fall into wordcp, if needed
600	!
601.wordcp:
602	and	%i2, 3, %i2		! unaligned leftover count
6035:
604	ld	[%i0+%i1], %o4		! read from address
605	st	%o4, [%i1]		! write at destination address
606	subcc	%i3, 4, %i3		! dec count
607	bgu	%ncc, 5b
608	add	%i1, 4, %i1		! delay slot, inc to address
609	b,a	.dbytecp
610
611	! we come here to align copies on word boundaries
612.alwordcp:
613	call	.alignit		! go word-align it
614	mov	3, %o0			! bits that must be zero to be aligned
615	b	.wordcp
616	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
617
618	!
619	! byte copy, works with any alignment
620	!
621.bytecp:
622	b	.dbytecp
623	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
624
625	!
626	! differenced byte copy, works with any alignment
627	! assumes dest in %i1 and (source - dest) in %i0
628	!
	/*
	 * Entry is at .dbytecp.  The annulled ldub in the branch delay slot
	 * only executes when the branch back to 1: is taken, i.e. while the
	 * decremented count has not gone below zero.
	 */
6291:
630	stb	%o4, [%i1]		! write to address
631	inc	%i1			! inc to address
632.dbytecp:
633	deccc	%i2			! dec count
634	bgeu,a	%ncc, 1b		! loop till done
635	ldub	[%i0+%i1], %o4		! read from address
636.cpdone:
637	membar	#Sync				! sync error barrier
638	! Restore t_lofault handler, if came here from kcopy().
639	tst	%o5
640	bz	%ncc, 1f
641	andn	%o5, LOFAULT_SET, %o5
642	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
6431:
644	ret
645	restore %g0, 0, %o0		! return (0)
646
647/*
648 * Common code used to align transfers on word and doubleword
649 * boundaries.  Aligns source and destination and returns a count
650 * of aligned bytes to transfer in %i3
651 */
/*
 * Entry is at .alignit, reached with "call" so the return is via retl.
 *	%o0 - alignment mask (3 or 7 at the call sites above)
 *	%i0 - source, %i1 - destination, %i2 - count (all updated)
 * Bytes are copied singly while (%i0 & %o0) is non-zero; only the source
 * address is tested.  On return %i3 = %i2 with the mask bits cleared.
 */
6521:
653	inc	%i0			! inc from
654	stb	%o4, [%i1]		! write a byte
655	inc	%i1			! inc to
656	dec	%i2			! dec count
657.alignit:
658	btst	%o0, %i0		! %o0 is bit mask to check for alignment
659	bnz,a	1b
660	ldub	[%i0], %o4		! read next byte
661
662	retl
663	andn	%i2, %o0, %i3		! return size of aligned bytes
664	SET_SIZE(bcopy)
665
666#endif	/* lint */
667
668/*
669 * Block copy with possibly overlapped operands.
670 */
671
#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{
}

#else	/* lint */

	/*
	 * ovbcopy(from, to, count)
	 *
	 * Leaf routine; no register window is taken.
	 *	%o0 - from address
	 *	%o1 - to address
	 *	%o2 - byte count
	 *	%o3 - scratch: |from - to| first, then the byte being moved
	 *
	 * A zero count returns immediately.  If count <= |from - to| the
	 * two regions cannot overlap and control is transferred to bcopy
	 * with the argument registers untouched.  Otherwise the bytes are
	 * moved one at a time: last-to-first when from < to, first-to-last
	 * when from >= to.
	 */
	ENTRY(ovbcopy)
	tst	%o2			! zero-length request?
	bleu	%ncc, 3f		! yes - nothing to do
	nop
	subcc	%o0, %o1, %o3		! %o3 = from - to
	bneg,a	%ncc, 1f		! negative?
	neg	%o3			! (only when taken) make it positive
1:
	cmp	%o2, %o3		! count against abs(from - to)
	bleu	%ncc, bcopy		! count <= distance: no overlap
	.empty				! the cmp below is the delay slot
	cmp	%o0, %o1		! which region starts lower?
	bgeu	%ncc, .ov_fwd		! from >= to: ascending order is safe
	nop
	!
	! from < to: copy from the last byte down to the first.
	!
.ov_bkwd:
	subcc	%o2, 1, %o2		! %o2 = offset of the byte to move
	ldub	[%o0 + %o2], %o3	! fetch it from the source
	bgu	%ncc, .ov_bkwd		! keep going until offset 0 is done
	stb	%o3, [%o1 + %o2]	! delay slot: store it

	retl
	nop
	!
	! from >= to: copy from the first byte up to the last.
	!
.ov_fwd:
	ldub	[%o0], %o3		! fetch next source byte
	stb	%o3, [%o1]		! store it
	add	%o0, 1, %o0		! advance from
	subcc	%o2, 1, %o2		! one fewer to go
	bgu	%ncc, .ov_fwd		! loop while bytes remain
	add	%o1, 1, %o1		! delay slot: advance to
3:
	retl
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */
724
725/*
726 * hwblkpagecopy()
727 *
728 * Copies exactly one page.  This routine assumes the caller (ppcopy)
729 * has already disabled kernel preemption and has checked
730 * use_hw_bcopy.
731 */
732#ifdef lint
733/*ARGSUSED*/
734void
735hwblkpagecopy(const void *src, void *dst)
736{ }
737#else /* lint */
738	ENTRY(hwblkpagecopy)
739	save	%sp, -SA(MINFRAME + 4*64), %sp
740
741	! %i0 - source address (arg)
742	! %i1 - destination address (arg)
743	! %i2 - length of region (not arg)
744
745	set	PAGESIZE, %i2
746
747	/*
748	 * Copying exactly one page and PAGESIZE is in multiple of 0x80.
749	 */
	/*
	 * Each pass of the loop at 1: moves 0x80 bytes as two 64-byte
	 * halves.  For each half, four ldda instructions fill %l0-%l7 and
	 * eight stxa instructions write those registers out.  Prefetches
	 * are issued 0x80 and 0xc0 bytes ahead of the current source
	 * address.  %i2 counts down from PAGESIZE in steps of 0x80.
	 * No alignment adjustment is performed on either address here.
	 */
750	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
751	prefetch [%i0+0x0], #one_read
752	prefetch [%i0+0x40], #one_read
7531:
754	prefetch [%i0+0x80], #one_read
755	prefetch [%i0+0xc0], #one_read
756	ldda	[%i0+0x0]%asi, %l0
757	ldda	[%i0+0x10]%asi, %l2
758	ldda	[%i0+0x20]%asi, %l4
759	ldda	[%i0+0x30]%asi, %l6
760	stxa	%l0, [%i1+0x0]%asi
761	stxa	%l1, [%i1+0x8]%asi
762	stxa	%l2, [%i1+0x10]%asi
763	stxa	%l3, [%i1+0x18]%asi
764	stxa	%l4, [%i1+0x20]%asi
765	stxa	%l5, [%i1+0x28]%asi
766	stxa	%l6, [%i1+0x30]%asi
767	stxa	%l7, [%i1+0x38]%asi
768	ldda	[%i0+0x40]%asi, %l0
769	ldda	[%i0+0x50]%asi, %l2
770	ldda	[%i0+0x60]%asi, %l4
771	ldda	[%i0+0x70]%asi, %l6
772	stxa	%l0, [%i1+0x40]%asi
773	stxa	%l1, [%i1+0x48]%asi
774	stxa	%l2, [%i1+0x50]%asi
775	stxa	%l3, [%i1+0x58]%asi
776	stxa	%l4, [%i1+0x60]%asi
777	stxa	%l5, [%i1+0x68]%asi
778	stxa	%l6, [%i1+0x70]%asi
779	stxa	%l7, [%i1+0x78]%asi
780
781	add	%i0, 0x80, %i0
782	subcc	%i2, 0x80, %i2
783	bgu,pt	%xcc, 1b
784	add	%i1, 0x80, %i1
785
	/*
	 * The membar is issued once, after the whole page has been stored.
	 * The restore leaves 0 in the caller's %o0 even though the lint
	 * prototype above declares the routine void.
	 */
786	membar #Sync
787	ret
788	restore	%g0, 0, %o0
789	SET_SIZE(hwblkpagecopy)
790#endif	/* lint */
791
792
793/*
794 * Transfer data to and from user space -
795 * Note that these routines can cause faults
796 * It is assumed that the kernel has nothing at
797 * less than KERNELBASE in the virtual address space.
798 *
799 * Note that copyin(9F) and copyout(9F) are part of the
800 * DDI/DKI which specifies that they return '-1' on "errors."
801 *
802 * Sigh.
803 *
804 * So there's two extremely similar routines - xcopyin() and xcopyout()
805 * which return the errno that we've faithfully computed.  This
806 * allows other callers (e.g. uiomove(9F)) to work correctly.
807 * Given that these are used pretty heavily, we expand the calling
808 * sequences inline for all flavours (rather than making wrappers).
809 *
810 * There are also stub routines for xcopyout_little and xcopyin_little,
811 * which currently are intended to handle requests of <= 16 bytes from
812 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
813 * is left as an exercise...
814 */
815
816/*
817 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
818 *
819 * General theory of operation:
820 *
821 * None of the copyops routines grab a window until it's decided that
822 * we need to do a HW block copy operation. This saves a window
823 * spill/fill when we're called during socket ops. The typical IO
824 * path won't cause spill/fill traps.
825 *
826 * This code uses a set of 4 limits for the maximum size that will
827 * be copied given a particular input/output address alignment.
828 * the default limits are:
829 *
830 * single byte aligned - 256 (hw_copy_limit_1)
831 * two byte aligned - 512 (hw_copy_limit_2)
832 * four byte aligned - 1024 (hw_copy_limit_4)
833 * eight byte aligned - 1024 (hw_copy_limit_8)
834 *
835 * If the value for a particular limit is zero, the copy will be done
836 * via the copy loops rather than block store/quad load instructions.
837 *
838 * Flow:
839 *
840 * If count == zero return zero.
841 *
842 * Store the previous lo_fault handler into %g6.
843 * Place our secondary lofault handler into %g5.
844 * Place the address of our nowindow fault handler into %o3.
845 * Place the address of the windowed fault handler into %o4.
846 * --> We'll use this handler if we end up grabbing a window
847 * --> before we use block initializing store and quad load ASIs
848 *
849 * If count is less than or equal to SMALL_LIMIT (7) we
850 * always do a byte for byte copy.
851 *
852 * If count is > SMALL_LIMIT, we check the alignment of the input
853 * and output pointers. Based on the alignment we check count
854 * against a limit based on detected alignment.  If we exceed the
855 * alignment value we copy via block initializing store and quad
856 * load instructions.
857 *
858 * If we don't exceed one of the limits, we store -count in %o3,
859 * we store the number of chunks (8, 4, 2 or 1 byte) operated
860 * on in our basic copy loop in %o2. Following this we branch
861 * to the appropriate copy loop and copy that many chunks.
862 * Since we've been adding the chunk size to %o3 each time through
863 * as well as decrementing %o2, we can tell if any data is
864 * left to be copied by examining %o3. If that is zero, we're
865 * done and can go home. If not, we figure out what the largest
866 * chunk size left to be copied is and branch to that copy loop
867 * unless there's only one byte left. We load that as we're
868 * branching to code that stores it just before we return.
869 *
870 * Fault handlers are invoked if we reference memory that has no
871 * current mapping.  All forms share the same copyio_fault handler.
872 * This routine handles fixing up the stack and general housecleaning.
873 * Each copy operation has a simple fault handler that is then called
874 * to do the work specific to the individual operation.  The handler
875 * for copyOP and xcopyOP are found at the end of individual function.
876 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
877 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
878 */
879
880/*
881 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
882 */
883
884#if defined(lint)
885
886/*ARGSUSED*/
887int
888copyout(const void *kaddr, void *uaddr, size_t count)
889{ return (0); }
890
891#else	/* lint */
892
893/*
894 * We save the arguments in the following registers in case of a fault:
895 * 	kaddr - %g2
896 * 	uaddr - %g3
897 * 	count - %g4
898 */
899#define	SAVE_SRC	%g2
900#define	SAVE_DST	%g3
901#define	SAVE_COUNT	%g4
902
903#define	REAL_LOFAULT		%g5
904#define	SAVED_LOFAULT		%g6
905
/*
 * copyio_fault - generic first-level fault handler for
 * (x)copyin/(x)copyout once a register window has been taken.
 *
 * Expected on entry:
 *	SAVED_LOFAULT	- t_lofault value to reinstate
 *	REAL_LOFAULT	- the 'real' lofault handler to continue in
 *	SAVE_SRC, SAVE_DST, SAVE_COUNT - the original input parameters
 *
 * The previous t_lofault is written back, the register window is
 * released, and the original input parameters are reloaded into
 * %o0-%o2 before control passes to REAL_LOFAULT.  Because the inputs
 * are back in place, the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.  Keeping the real
 * handler in REAL_LOFAULT lets every flavor of the copy operations,
 * including the _noerr versions, share this code.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! old t_lofault is back

	restore	SAVE_SRC, 0, %o0	! drop the window; %o0 = original src

	mov	SAVE_COUNT, %o2		! %o2 = original count
	jmp	REAL_LOFAULT
	  mov	SAVE_DST, %o1		! delay slot: %o1 = original dst
	SET_SIZE(copyio_fault)
928
929	ENTRY(copyio_fault_nowindow)
930	membar	#Sync
931	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
932
933	mov	SAVE_SRC, %o0
934	mov	SAVE_DST, %o1
935	jmp	REAL_LOFAULT
936	  mov	SAVE_COUNT, %o2
937	SET_SIZE(copyio_fault_nowindow)
938
939	ENTRY(copyout)
940	sethi	%hi(.copyout_err), REAL_LOFAULT
941	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
942
943.do_copyout:
944	!
945	! Check the length and bail if zero.
946	!
947	tst	%o2
948	bnz,pt	%ncc, 1f
949	  nop
950	retl
951	  clr	%o0
9521:
953	sethi	%hi(copyio_fault), %o4
954	or	%o4, %lo(copyio_fault), %o4
955	sethi	%hi(copyio_fault_nowindow), %o3
956	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
957	or	%o3, %lo(copyio_fault_nowindow), %o3
958	membar	#Sync
959	stn	%o3, [THREAD_REG + T_LOFAULT]
960
961	mov	%o0, SAVE_SRC
962	mov	%o1, SAVE_DST
963	mov	%o2, SAVE_COUNT
964
965	!
966	! Check to see if we're more than SMALL_LIMIT (7 bytes).
967	! Run in leaf mode, using the %o regs as our input regs.
968	!
969	subcc	%o2, SMALL_LIMIT, %o3
970	bgu,a,pt %ncc, .dco_ns
971	or	%o0, %o1, %o3
972	!
973	! What was previously ".small_copyout"
974	! Do full differenced copy.
975	!
976.dcobcp:
977	sub	%g0, %o2, %o3		! negate count
978	add	%o0, %o2, %o0		! make %o0 point at the end
979	add	%o1, %o2, %o1		! make %o1 point at the end
980	ba,pt	%ncc, .dcocl
981	ldub	[%o0 + %o3], %o4	! load first byte
982	!
983	! %o0 and %o2 point at the end and remain pointing at the end
984	! of their buffers. We pull things out by adding %o3 (which is
985	! the negation of the length) to the buffer end which gives us
986	! the curent location in the buffers. By incrementing %o3 we walk
987	! through both buffers without having to bump each buffer's
988	! pointer. A very fast 4 instruction loop.
989	!
990	.align 16
991.dcocl:
992	stba	%o4, [%o1 + %o3]ASI_USER
993	inccc	%o3
994	bl,a,pt	%ncc, .dcocl
995	ldub	[%o0 + %o3], %o4
996	!
997	! We're done. Go home.
998	!
999	membar	#Sync
1000	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1001	retl
1002	clr	%o0
1003	!
1004	! Try aligned copies from here.
1005	!
1006.dco_ns:
1007	! %o0 = kernel addr (to be copied from)
1008	! %o1 = user addr (to be copied to)
1009	! %o2 = length
1010	! %o3 = %o1 | %o2 (used for alignment checking)
1011	! %o4 is alternate lo_fault
1012	! %o5 is original lo_fault
1013	!
1014	! See if we're single byte aligned. If we are, check the
1015	! limit for single byte copies. If we're smaller or equal,
1016	! bounce to the byte for byte copy loop. Otherwise do it in
1017	! HW (if enabled).
1018	!
1019	btst	1, %o3
1020	bz,pt	%icc, .dcoh8
1021	btst	7, %o3
1022	!
1023	! Single byte aligned. Do we do it via HW or via
1024	! byte for byte? Do a quick no memory reference
1025	! check to pick up small copies.
1026	!
1027	sethi	%hi(hw_copy_limit_1), %o3
1028	!
1029	! Big enough that we need to check the HW limit for
1030	! this size copy.
1031	!
1032	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1033	!
1034	! Is HW copy on? If not, do everything byte for byte.
1035	!
1036	tst	%o3
1037	bz,pn	%icc, .dcobcp
1038	subcc	%o3, %o2, %o3
1039	!
1040	! If we're less than or equal to the single byte copy limit,
1041	! bop to the copy loop.
1042	!
1043	bge,pt	%ncc, .dcobcp
1044	nop
1045	!
1046	! We're big enough and copy is on. Do it with HW.
1047	!
1048	ba,pt	%ncc, .big_copyout
1049	nop
1050.dcoh8:
1051	!
1052	! 8 byte aligned?
1053	!
1054	bnz,a	%ncc, .dcoh4
1055	btst	3, %o3
1056	!
1057	! See if we're in the "small range".
1058	! If so, go off and do the copy.
1059	! If not, load the hard limit. %o3 is
1060	! available for reuse.
1061	!
1062	sethi	%hi(hw_copy_limit_8), %o3
1063	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1064	!
1065	! If it's zero, there's no HW bcopy.
1066	! Bop off to the aligned copy.
1067	!
1068	tst	%o3
1069	bz,pn	%icc, .dcos8
1070	subcc	%o3, %o2, %o3
1071	!
1072	! We're negative if our size is larger than hw_copy_limit_8.
1073	!
1074	bge,pt	%ncc, .dcos8
1075	nop
1076	!
1077	! HW assist is on and we're large enough. Do it.
1078	!
1079	ba,pt	%ncc, .big_copyout
1080	nop
1081.dcos8:
1082	!
1083	! Housekeeping for copy loops. Uses same idea as in the byte for
1084	! byte copy loop above.
1085	!
1086	add	%o0, %o2, %o0
1087	add	%o1, %o2, %o1
1088	sub	%g0, %o2, %o3
1089	ba,pt	%ncc, .dodebc
1090	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1091	!
1092	! 4 byte aligned?
1093	!
1094.dcoh4:
1095	bnz,pn	%ncc, .dcoh2
1096	!
1097	! See if we're in the "small range".
1098	! If so, go off an do the copy.
1099	! If not, load the hard limit. %o3 is
1100	! available for reuse.
1101	!
1102	sethi	%hi(hw_copy_limit_4), %o3
1103	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1104	!
1105	! If it's zero, there's no HW bcopy.
1106	! Bop off to the aligned copy.
1107	!
1108	tst	%o3
1109	bz,pn	%icc, .dcos4
1110	subcc	%o3, %o2, %o3
1111	!
1112	! We're negative if our size is larger than hw_copy_limit_4.
1113	!
1114	bge,pt	%ncc, .dcos4
1115	nop
1116	!
1117	! HW assist is on and we're large enough. Do it.
1118	!
1119	ba,pt	%ncc, .big_copyout
1120	nop
1121.dcos4:
1122	add	%o0, %o2, %o0
1123	add	%o1, %o2, %o1
1124	sub	%g0, %o2, %o3
1125	ba,pt	%ncc, .dodfbc
1126	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1127	!
1128	! We must be 2 byte aligned. Off we go.
1129	! The check for small copies was done in the
1130	! delay at .dcoh4
1131	!
1132.dcoh2:
1133	ble	%ncc, .dcos2
1134	sethi	%hi(hw_copy_limit_2), %o3
1135	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1136	tst	%o3
1137	bz,pn	%icc, .dcos2
1138	subcc	%o3, %o2, %o3
1139	bge,pt	%ncc, .dcos2
1140	nop
1141	!
1142	! HW is on and we're big enough. Do it.
1143	!
1144	ba,pt	%ncc, .big_copyout
1145	nop
1146.dcos2:
1147	add	%o0, %o2, %o0
1148	add	%o1, %o2, %o1
1149	sub	%g0, %o2, %o3
1150	ba,pt	%ncc, .dodtbc
1151	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
1152.small_copyout:
1153	!
1154	! Why are we doing this AGAIN? There are certain conditions in
1155	! big_copyout that will cause us to forego the HW assisted copies
1156	! and bounce back to a non-HW assisted copy. This dispatches those
1157	! copies. Note that we branch around this in the main line code.
1158	!
1159	! We make no check for limits or HW enablement here. We've
1160	! already been told that we're a poster child so just go off
1161	! and do it.
1162	!
1163	or	%o0, %o1, %o3
1164	btst	1, %o3
1165	bnz	%icc, .dcobcp		! Most likely
1166	btst	7, %o3
1167	bz	%icc, .dcos8
1168	btst	3, %o3
1169	bz	%icc, .dcos4
1170	nop
1171	ba,pt	%ncc, .dcos2
1172	nop
1173	.align 32
1174.dodebc:
1175	ldx	[%o0 + %o3], %o4
1176	deccc	%o2
1177	stxa	%o4, [%o1 + %o3]ASI_USER
1178	bg,pt	%ncc, .dodebc
1179	addcc	%o3, 8, %o3
1180	!
1181	! End of copy loop. Check to see if we're done. Most
1182	! eight byte aligned copies end here.
1183	!
1184	bz,pt	%ncc, .dcofh
1185	nop
1186	!
1187	! Something is left - do it byte for byte.
1188	!
1189	ba,pt	%ncc, .dcocl
1190	ldub	[%o0 + %o3], %o4	! load next byte
1191	!
1192	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
1193	!
1194	.align 32
1195.dodfbc:
1196	lduw	[%o0 + %o3], %o4
1197	deccc	%o2
1198	sta	%o4, [%o1 + %o3]ASI_USER
1199	bg,pt	%ncc, .dodfbc
1200	addcc	%o3, 4, %o3
1201	!
1202	! End of copy loop. Check to see if we're done. Most
1203	! four byte aligned copies end here.
1204	!
1205	bz,pt	%ncc, .dcofh
1206	nop
1207	!
1208	! Something is left. Do it byte for byte.
1209	!
1210	ba,pt	%ncc, .dcocl
1211	ldub	[%o0 + %o3], %o4	! load next byte
1212	!
1213	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
1214	! copy.
1215	!
1216	.align 32
1217.dodtbc:
1218	lduh	[%o0 + %o3], %o4
1219	deccc	%o2
1220	stha	%o4, [%o1 + %o3]ASI_USER
1221	bg,pt	%ncc, .dodtbc
1222	addcc	%o3, 2, %o3
1223	!
1224	! End of copy loop. Anything left?
1225	!
1226	bz,pt	%ncc, .dcofh
1227	nop
1228	!
1229	! Deal with the last byte
1230	!
1231	ldub	[%o0 + %o3], %o4
1232	stba	%o4, [%o1 + %o3]ASI_USER
1233.dcofh:
1234	membar	#Sync
1235	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1236	retl
1237	clr	%o0
1238
1239.big_copyout:
1240	!
1241	! We're going to go off and do a block copy.
1242	! Switch fault handlers and grab a window. We
1243	! don't do a membar #Sync since we've done only
1244	! kernel data to this point.
1245	!
1246	stn	%o4, [THREAD_REG + T_LOFAULT]
1247	save	%sp, -SA(MINFRAME), %sp
1248
1249	! Copyouts that reach here are larger than 256 bytes. The
1250	! hw_copy_limit_1 is set to 256. Never set this limit to less
1251	! than 128 bytes.
1252.do_block_copyout:
1253
1254	! Swap src/dst since the code below is memcpy code
1255	! and memcpy/bcopy have different calling sequences
1256	mov	%i1, %i5
1257	mov	%i0, %i1
1258	mov	%i5, %i0
1259
1260	andcc	%i0, 7, %i3		! is dst double aligned
1261	bz	%ncc, copyout_blkcpy
1262	sub	%i3, 8, %i3
1263	neg	%i3			! bytes till double aligned
1264	sub	%i2, %i3, %i2		! update %i2 with new count
1265
1266	! Align Destination on double-word boundary
1267
12681:	ldub	[%i1], %i4
1269	inc	%i1
1270	stba	%i4, [%i0]ASI_USER
1271	deccc	%i3
1272	bgu	%ncc, 1b
1273	  inc	%i0
1274
1275copyout_blkcpy:
1276	andcc	%i0, 63, %i3
1277	bz,pn	%ncc, copyout_blalign	! now block aligned
1278	sub	%i3, 64, %i3
1279	neg	%i3			! bytes till block aligned
1280	sub	%i2, %i3, %i2		! update %i2 with new count
1281
1282	! Copy %i3 bytes till dst is block (64 byte) aligned. use
1283	! double word copies.
1284
1285	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
1286	bz	%ncc, .co_dbcopy	! %g1 has source offset (last 3-bits)
1287	sll	%g1, 3, %l1		! left shift
1288	mov	0x40, %l2
1289	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
1290
1291	! Now use double word copies to align destination.
1292.co_double:
1293	sub	%i1, %g1, %i1		! align the src at 8 bytes.
1294	ldx	[%i1], %o2
12952:
1296	ldx	[%i1+8], %o4
1297	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
1298	stxa	%o2, [%i0]ASI_USER
1299	mov	%o4, %o2
1300	add	%i1, 0x8, %i1
1301	subcc	%i3, 0x8, %i3
1302	bgu,pt	%ncc, 2b
1303	add	%i0, 0x8, %i0
1304	ba	copyout_blalign
1305	add	%i1, %g1, %i1
1306
1307	! Both source and destination are double aligned.
1308	! No shift and merge of data required in this case.
1309.co_dbcopy:
1310	ldx	[%i1], %o2
1311	stxa	%o2, [%i0]ASI_USER
1312	add	%i1, 0x8, %i1
1313	subcc	%i3, 0x8, %i3
1314	bgu,pt	%ncc, .co_dbcopy
1315	add	%i0, 0x8, %i0
1316
1317copyout_blalign:
1318	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
1319	sub	%i2, %i3, %i2		! Residue bytes in %i2
1320
1321	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
1322
1323	andcc	%i1, 0xf, %o2		! is src quadword aligned
1324	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
1325	nop
1326	cmp	%o2, 0x8
1327	bg	.co_upper_double
1328	nop
1329	bl	.co_lower_double
1330	nop
1331
1332	! Falls through when source offset is equal to 8 i.e.
1333	! source is double word aligned.
1334	! In this case no shift/merge of data is required
1335
1336	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1337	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1338	prefetch [%l0+0x0], #one_read
1339	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1340.co_loop0:
1341	add	%i1, 0x10, %i1
1342	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1343	prefetch [%l0+0x40], #one_read
1344
1345	stxa	%l3, [%i0+0x0]%asi
1346	stxa	%l4, [%i0+0x8]%asi
1347
1348	add	%i1, 0x10, %i1
1349	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1350
1351	stxa	%l5, [%i0+0x10]%asi
1352	stxa	%l2, [%i0+0x18]%asi
1353
1354	add	%i1, 0x10, %i1
1355	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1356
1357	stxa	%l3, [%i0+0x20]%asi
1358	stxa	%l4, [%i0+0x28]%asi
1359
1360	add	%i1, 0x10, %i1
1361	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1362
1363	stxa	%l5, [%i0+0x30]%asi
1364	stxa	%l2, [%i0+0x38]%asi
1365
1366	add	%l0, 0x40, %l0
1367	subcc	%i3, 0x40, %i3
1368	bgu,pt	%xcc, .co_loop0
1369	add	%i0, 0x40, %i0
1370	ba	.co_blkdone
1371	add	%i1, %o2, %i1		! increment the source by src offset
1372					! the src offset was stored in %o2
1373
1374.co_lower_double:
1375
1376	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1377	sll	%o2, 3, %o0		! %o0 left shift
1378	mov	0x40, %o1
1379	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
1380	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1381	prefetch [%l0+0x0], #one_read
1382	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
1383					! complete data
1384.co_loop1:
1385	add	%i1, 0x10, %i1
1386	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
1387							! for this read.
1388	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
1389							! into %l2 and %l3
1390	prefetch [%l0+0x40], #one_read
1391
1392	stxa	%l2, [%i0+0x0]%asi
1393	stxa	%l3, [%i0+0x8]%asi
1394
1395	add	%i1, 0x10, %i1
1396	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1397	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
1398							! %l4 from previous read
1399							! into %l4 and %l5
1400	stxa	%l4, [%i0+0x10]%asi
1401	stxa	%l5, [%i0+0x18]%asi
1402
1403	! Repeat the same for next 32 bytes.
1404
1405	add	%i1, 0x10, %i1
1406	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1407	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
1408
1409	stxa	%l2, [%i0+0x20]%asi
1410	stxa	%l3, [%i0+0x28]%asi
1411
1412	add	%i1, 0x10, %i1
1413	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1414	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
1415
1416	stxa	%l4, [%i0+0x30]%asi
1417	stxa	%l5, [%i0+0x38]%asi
1418
1419	add	%l0, 0x40, %l0
1420	subcc	%i3, 0x40, %i3
1421	bgu,pt	%xcc, .co_loop1
1422	add	%i0, 0x40, %i0
1423	ba	.co_blkdone
1424	add	%i1, %o2, %i1		! increment the source by src offset
1425					! the src offset was stored in %o2
1426
1427.co_upper_double:
1428
1429	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1430	sub	%o2, 0x8, %o0
1431	sll	%o0, 3, %o0		! %o0 left shift
1432	mov	0x40, %o1
1433	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
1434	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1435	prefetch [%l0+0x0], #one_read
1436	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
1437							! for this read and
1438							! no data in %l2
1439.co_loop2:
1440	add	%i1, 0x10, %i1
1441	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
1442							! and %l5 has partial
1443	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
1444							! into %l3 and %l4
1445	prefetch [%l0+0x40], #one_read
1446
1447	stxa	%l3, [%i0+0x0]%asi
1448	stxa	%l4, [%i0+0x8]%asi
1449
1450	add	%i1, 0x10, %i1
1451	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1452	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
1453							! %l5 from previous read
1454							! into %l5 and %l2
1455
1456	stxa	%l5, [%i0+0x10]%asi
1457	stxa	%l2, [%i0+0x18]%asi
1458
1459	! Repeat the same for next 32 bytes.
1460
1461	add	%i1, 0x10, %i1
1462	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1463	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1464
1465	stxa	%l3, [%i0+0x20]%asi
1466	stxa	%l4, [%i0+0x28]%asi
1467
1468	add	%i1, 0x10, %i1
1469	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1470	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1471
1472	stxa	%l5, [%i0+0x30]%asi
1473	stxa	%l2, [%i0+0x38]%asi
1474
1475	add	%l0, 0x40, %l0
1476	subcc	%i3, 0x40, %i3
1477	bgu,pt	%xcc, .co_loop2
1478	add	%i0, 0x40, %i0
1479	ba	.co_blkdone
1480	add	%i1, %o2, %i1		! increment the source by src offset
1481					! the src offset was stored in %o2
1482
1483
1484	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
1485.co_blkcpy:
1486
1487	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
1488	prefetch [%o0+0x0], #one_read
14891:
1490	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
1491	add	%i1, 0x10, %i1
1492	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1493	add	%i1, 0x10, %i1
1494
1495	prefetch [%o0+0x40], #one_read
1496
1497	stxa	%l0, [%i0+0x0]%asi
1498
1499	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1500	add	%i1, 0x10, %i1
1501	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
1502	add	%i1, 0x10, %i1
1503
1504	stxa	%l1, [%i0+0x8]%asi
1505	stxa	%l2, [%i0+0x10]%asi
1506	stxa	%l3, [%i0+0x18]%asi
1507	stxa	%l4, [%i0+0x20]%asi
1508	stxa	%l5, [%i0+0x28]%asi
1509	stxa	%l6, [%i0+0x30]%asi
1510	stxa	%l7, [%i0+0x38]%asi
1511
1512	add	%o0, 0x40, %o0
1513	subcc	%i3, 0x40, %i3
1514	bgu,pt	%xcc, 1b
1515	add	%i0, 0x40, %i0
1516
1517.co_blkdone:
1518	membar	#Sync
1519
1520	! Copy as much rest of the data as double word copy.
1521.co_dwcp:
1522	cmp	%i2, 0x8		! Not enough bytes to copy as double
1523	blu	%ncc, .co_dbdone
1524	nop
1525
1526	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
1527	sub	%i2, %i3, %i2		! Residue bytes in %i2
1528
1529	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
1530	bz	%ncc, .co_cpy_db
1531	nop
1532
1533	sll	%g1, 3, %l0		! left shift
1534	mov	0x40, %l1
1535	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
1536
1537.co_cpy_wd:
1538	sub	%i1, %g1, %i1		! align the src at 8 bytes.
1539	ldx	[%i1], %o2
15403:
1541	ldx	[%i1+8], %o4
1542	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
1543	stxa	%o2, [%i0]ASI_USER
1544	mov	%o4, %o2
1545	add	%i1, 0x8, %i1
1546	subcc	%i3, 0x8, %i3
1547	bgu,pt	%ncc, 3b
1548	add	%i0, 0x8, %i0
1549	ba	.co_dbdone
1550	add	%i1, %g1, %i1
1551
1552.co_cpy_db:
1553	ldx	[%i1], %o2
1554	stxa	%o2, [%i0]ASI_USER
1555	add	%i1, 0x8, %i1
1556	subcc	%i3, 0x8, %i3
1557	bgu,pt	%ncc, .co_cpy_db
1558	add	%i0, 0x8, %i0
1559
1560.co_dbdone:
1561	tst	%i2
1562	bz,pt	%xcc, .copyout_exit
1563	nop
1564
1565	! Copy the residue as byte copy
1566.co_residue:
1567	ldub	[%i1], %i4
1568	stba	%i4, [%i0]ASI_USER
1569	inc	%i1
1570	deccc	%i2
1571	bgu	%xcc, .co_residue
1572	inc	%i0
1573
1574.copyout_exit:
1575	membar	#Sync
1576	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1577	ret
1578	restore	%g0, 0, %o0
1579
1580.copyout_err:
1581	ldn	[THREAD_REG + T_COPYOPS], %o4
1582	brz	%o4, 2f
1583	nop
1584	ldn	[%o4 + CP_COPYOUT], %g2
1585	jmp	%g2
1586	nop
15872:
1588	retl
1589	mov	-1, %o0
1590	SET_SIZE(copyout)
1591
1592#endif	/* lint */
1593
1594
1595#ifdef	lint
1596
/*
 * Lint-only stub for xcopyout(): supplies the C prototype and always
 * returns 0. The assembly implementation is in the non-lint branch below.
 */
1597/*ARGSUSED*/
1598int
1599xcopyout(const void *kaddr, void *uaddr, size_t count)
1600{ return (0); }
1601
1602#else	/* lint */
1603
/*
 * xcopyout: loads the address of .xcopyout_err into REAL_LOFAULT and
 * joins the shared copy-out code at .do_copyout (defined before this
 * section of the file).
 *
 * .xcopyout_err: if curthread's T_COPYOPS pointer is non-NULL, jump to
 * the routine stored at offset CP_XCOPYOUT in it; otherwise return the
 * value held in %g1.
 * NOTE(review): presumably %g1 carries the error code left by the fault
 * path before it transfers here - confirm against the fault handler.
 */
1604	ENTRY(xcopyout)
1605	sethi	%hi(.xcopyout_err), REAL_LOFAULT
1606	b	.do_copyout
1607	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT	! delay slot: finish address
1608.xcopyout_err:
1609	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = thread's copyops pointer
1610	brz	%o4, 2f				! none installed: return %g1
1611	nop
1612	ldn	[%o4 + CP_XCOPYOUT], %g2	! %g2 = copyops xcopyout entry
1613	jmp	%g2				! hand off; no return to here
1614	nop
16152:
1616	retl
1617	mov	%g1, %o0			! delay slot: return value = %g1
1618	SET_SIZE(xcopyout)
1619
1620#endif	/* lint */
1621
1622#ifdef	lint
1623
/*
 * Lint-only stub for xcopyout_little(): supplies the C prototype and
 * always returns 0. The assembly implementation is in the non-lint
 * branch below.
 */
1624/*ARGSUSED*/
1625int
1626xcopyout_little(const void *kaddr, void *uaddr, size_t count)
1627{ return (0); }
1628
1629#else	/* lint */
1630
/*
 * xcopyout_little: byte-at-a-time copy of %o2 bytes from the kernel
 * buffer at %o0 to the user buffer at %o1, storing through ASI_AIUSL.
 * The source is read from its last byte backwards while the destination
 * is written from its first byte forwards, so the byte order is reversed.
 * Returns 0 in %o0 on completion; a zero count skips the loop.
 * t_lofault is pointed at .little_err for the duration and restored
 * from %o5 on exit (.little_err is defined outside this section).
 */
1631	ENTRY(xcopyout_little)
1632	sethi	%hi(.little_err), %o4
1633	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
1634	or	%o4, %lo(.little_err), %o4
1635	membar	#Sync			! sync error barrier
1636	stn	%o4, [THREAD_REG + T_LOFAULT]
1637
1638	subcc	%g0, %o2, %o3		! %o3 = -count; sets Z when count == 0
1639	add	%o0, %o2, %o0
1640	bz,pn	%ncc, 2f		! check for zero bytes
1641	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
1642	add	%o0, %o4, %o0		! start w/last byte
1643	add	%o1, %o2, %o1		! %o1 = end of destination
1644	ldub	[%o0+%o3], %o4		! %o0 + %o3 = src + count - 1
1645
	! Each pass %o3 goes up by 1 and %o0 down by 2, so the source
	! address (%o0 + %o3) steps back one byte while the destination
	! address (%o1 + %o3) steps forward one byte. The loop ends when
	! inccc carries out, i.e. when %o3 reaches zero.
16461:	stba	%o4, [%o1+%o3]ASI_AIUSL
1647	inccc	%o3
1648	sub	%o0, 2, %o0		! get next byte
1649	bcc,a,pt %ncc, 1b
1650	  ldub	[%o0+%o3], %o4		! annulled: runs only when looping
1651
16522:	membar	#Sync			! sync error barrier
1653	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1654	retl
1655	mov	%g0, %o0		! return (0)
1656	SET_SIZE(xcopyout_little)
1657
1658#endif	/* lint */
1659
1660/*
1661 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
1662 */
1663
1664#if defined(lint)
1665
/*
 * Lint-only stub for copyin(): supplies the C prototype and always
 * returns 0. The assembly implementation is in the non-lint branch below.
 */
1666/*ARGSUSED*/
1667int
1668copyin(const void *uaddr, void *kaddr, size_t count)
1669{ return (0); }
1670
1671#else	/* lint */
1672
/*
 * copyin: copy %o2 bytes from the user address in %o0 to the kernel
 * address in %o1 (lint prototype: copyin(uaddr, kaddr, count)).
 * A zero count returns 0 immediately; the byte loop below also returns
 * 0 in %o0 when it finishes. REAL_LOFAULT is loaded with the address of
 * .copyin_err; xcopyin joins at .do_copyin with .xcopyin_err instead.
 * NOTE(review): how copyio_fault / copyio_fault_nowindow get from a
 * fault to REAL_LOFAULT is defined outside this section - confirm there.
 */
1673	ENTRY(copyin)
1674	sethi	%hi(.copyin_err), REAL_LOFAULT
1675	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
1676
1677.do_copyin:
1678	!
1679	! Check the length and bail if zero.
1680	!
1681	tst	%o2
1682	bnz,pt	%ncc, 1f
1683	  nop
1684	retl
1685	  clr	%o0
16861:
	! %o4 = copyio_fault (installed later by .big_copyin),
	! %o3 = copyio_fault_nowindow (installed now). The previous
	! t_lofault is kept in SAVED_LOFAULT so every exit can restore it.
1687	sethi	%hi(copyio_fault), %o4
1688	or	%o4, %lo(copyio_fault), %o4
1689	sethi	%hi(copyio_fault_nowindow), %o3
1690	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
1691	or	%o3, %lo(copyio_fault_nowindow), %o3
1692	membar	#Sync
1693	stn	%o3, [THREAD_REG + T_LOFAULT]
1694
	! Keep copies of the original arguments.
	! NOTE(review): presumably for the fault/copyops path - confirm.
1695	mov	%o0, SAVE_SRC
1696	mov	%o1, SAVE_DST
1697	mov	%o2, SAVE_COUNT
1698
1699	!
1700	! Check to see if we're more than SMALL_LIMIT.
1701	!
1702	subcc	%o2, SMALL_LIMIT, %o3
1703	bgu,a,pt %ncc, .dci_ns
1704	or	%o0, %o1, %o3		! annulled: %o3 = src | dst only if taken
1705	!
1706	! What was previously ".small_copyin"
1707	!
1708.dcibcp:
1709	sub	%g0, %o2, %o3		! setup for copy loop
1710	add	%o0, %o2, %o0
1711	add	%o1, %o2, %o1
1712	ba,pt	%ncc, .dcicl
1713	lduba	[%o0 + %o3]ASI_USER, %o4	! delay slot: load first byte
1714	!
1715	! %o0 and %o1 point at the end and remain pointing at the end
1716	! of their buffers. We pull things out by adding %o3 (which is
1717	! the negation of the length) to the buffer end which gives us
1718	! the current location in the buffers. By incrementing %o3 we walk
1719	! through both buffers without having to bump each buffer's
1720	! pointer. A very fast 4 instruction loop.
1721	!
1722	.align 16
1723.dcicl:
1724	stb	%o4, [%o1 + %o3]
1725	inccc	%o3
1726	bl,a,pt %ncc, .dcicl
1727	lduba	[%o0 + %o3]ASI_USER, %o4	! annulled: next byte only if looping
1728	!
1729	! We're done. Go home.
1730	!
1731	membar	#Sync
1732	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1733	retl
1734	clr	%o0
1735	!
1736	! Try aligned copies from here.
1737	!
1738.dci_ns:
	! Alignment and size dispatch for copyin requests larger than
	! SMALL_LIMIT. On entry %o3 = %o0 | %o1 (source | destination),
	! %o2 = byte count and %o4 still holds the copyio_fault address
	! that .big_copyin installs. Each alignment class compares the
	! count with its hw_copy_limit_N: a zero limit, or a count that
	! does not exceed the limit, selects the matching non-block loop;
	! otherwise control goes to .big_copyin.
1739	!
1740	! See if we're single byte aligned. If we are, check the
1741	! limit for single byte copies. If we're smaller, or equal,
1742	! bounce to the byte for byte copy loop. Otherwise do it in
1743	! HW (if enabled).
1744	!
1745	btst	1, %o3
1746	bz,a,pt	%icc, .dcih8
1747	btst	7, %o3			! annulled: runs only when branching to .dcih8
1748	!
1749	! We're single byte aligned.
1750	!
1751	sethi	%hi(hw_copy_limit_1), %o3
1752	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1753	!
1754	! Is HW copy on? If not do everything byte for byte.
1755	!
1756	tst	%o3
1757	bz,pn	%icc, .dcibcp
1758	subcc	%o3, %o2, %o3		! delay slot: %o3 = limit - count
1759	!
1760	! Are we bigger than the HW limit? If not
1761	! go to byte for byte.
1762	!
1763	bge,pt	%ncc, .dcibcp
1764	nop
1765	!
1766	! We're big enough and copy is on. Do it with HW.
1767	!
1768	ba,pt	%ncc, .big_copyin
1769	nop
1770.dcih8:
1771	!
1772	! 8 byte aligned?
1773	!
1774	bnz,a	%ncc, .dcih4
1775	btst	3, %o3			! annulled: runs only when branching to .dcih4
1776	!
1777	! We're eight byte aligned.
1778	!
1779	sethi	%hi(hw_copy_limit_8), %o3
1780	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1781	!
1782	! Is HW assist on? If not, do it with the aligned copy.
1783	!
1784	tst	%o3
1785	bz,pn	%icc, .dcis8
1786	subcc	%o3, %o2, %o3		! delay slot: %o3 = limit - count
1787	bge	%ncc, .dcis8
1788	nop
1789	ba,pt	%ncc, .big_copyin
1790	nop
1791.dcis8:
1792	!
1793	! Housekeeping for copy loops. Uses same idea as in the byte for
1794	! byte copy loop above.
1795	!
1796	add	%o0, %o2, %o0
1797	add	%o1, %o2, %o1
1798	sub	%g0, %o2, %o3
1799	ba,pt	%ncc, .didebc
1800	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1801	!
1802	! 4 byte aligned?
1803	!
1804.dcih4:
1805	bnz	%ncc, .dcih2
1806	sethi	%hi(hw_copy_limit_4), %o3	! delay slot (not annulled)
1807	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1808	!
1809	! Is HW assist on? If not, do it with the aligned copy.
1810	!
1811	tst	%o3
1812	bz,pn	%icc, .dcis4
1813	subcc	%o3, %o2, %o3		! delay slot: %o3 = limit - count
1814	!
1815	! We're negative if our size is larger than hw_copy_limit_4.
1816	!
1817	bge	%ncc, .dcis4
1818	nop
1819	ba,pt	%ncc, .big_copyin
1820	nop
1821.dcis4:
1822	!
1823	! Housekeeping for copy loops. Uses same idea as in the byte
1824	! for byte copy loop above.
1825	!
1826	add	%o0, %o2, %o0
1827	add	%o1, %o2, %o1
1828	sub	%g0, %o2, %o3
1829	ba,pt	%ncc, .didfbc
1830	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1831.dcih2:
1832	!
1833	! We're two byte aligned. Check for "smallness"
1834	! done in delay at .dcih4
1835	!
	! NOTE(review): the delay slot at .dcih4 is a sethi, which does
	! not write the condition codes, so the codes tested here are
	! still those of the "btst 3" that led to .dcih4 (non-zero
	! result, carry cleared by the logical op). With those codes this
	! bleu looks as if it can never be taken - confirm.
1836	bleu,pt	%ncc, .dcis2
1837	sethi	%hi(hw_copy_limit_2), %o3
1838	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1839	!
1840	! Is HW assist on? If not, do it with the aligned copy.
1841	!
1842	tst	%o3
1843	bz,pn	%icc, .dcis2
1844	subcc	%o3, %o2, %o3		! delay slot: %o3 = limit - count
1845	!
1846	! Are we larger than the HW limit?
1847	!
1848	bge	%ncc, .dcis2
1849	nop
1850	!
1851	! HW assist is on and we're large enough to use it.
1852	!
1853	ba,pt	%ncc, .big_copyin
1854	nop
1855	!
1856	! Housekeeping for copy loops. Uses same idea as in the byte
1857	! for byte copy loop above.
1858	!
1859.dcis2:
1860	add	%o0, %o2, %o0
1861	add	%o1, %o2, %o1
1862	sub	%g0, %o2, %o3
1863	ba,pt	%ncc, .didtbc
1864	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
1865	!
1866.small_copyin:
1867	!
1868	! Why are we doing this AGAIN? There are certain conditions in
1869	! big copyin that will cause us to forgo the HW assisted copies
1870	! and bounce back to a non-hw assisted copy. This dispatches
1871	! those copies. Note that we branch around this in the main line
1872	! code.
1873	!
1874	! We make no check for limits or HW enablement here. We've
1875	! already been told that we're a poster child so just go off
1876	! and do it.
1877	!
	! Dispatch on the low bits of (%o0 | %o1): bit 0 set selects the
	! byte loop, low three bits clear the 8-byte loop, low two bits
	! clear the 4-byte loop, anything else the 2-byte loop. The btst
	! in each (non-annulled) delay slot sets the condition codes for
	! the branch that follows it.
1878	or	%o0, %o1, %o3
1879	btst	1, %o3
1880	bnz	%icc, .dcibcp		! Most likely
1881	btst	7, %o3
1882	bz	%icc, .dcis8
1883	btst	3, %o3
1884	bz	%icc, .dcis4
1885	nop
1886	ba,pt	%ncc, .dcis2
1887	nop
1888	!
1889	! Eight byte aligned copies. A steal from the original .small_copyin
1890	! with modifications. %o2 is number of 8 byte chunks to copy. When
1891	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
1892	! to copy.
1893	!
	! Shared setup for all three loops below (done at .dcis8, .dcis4
	! and .dcis2): %o0 and %o1 point at the ends of the source and
	! destination, and %o3 is the negative offset of the current
	! position. The addcc in each loop's delay slot advances %o3 and
	! leaves the condition codes for the "are we done" test.
1894	.align 32
1895.didebc:
1896	ldxa	[%o0 + %o3]ASI_USER, %o4
1897	deccc	%o2
1898	stx	%o4, [%o1 + %o3]
1899	bg,pt	%ncc, .didebc
1900	addcc	%o3, 8, %o3
1901	!
1902	! End of copy loop. Most 8 byte aligned copies end here.
1903	!
1904	bz,pt	%ncc, .dcifh
1905	nop
1906	!
1907	! Something is left. Do it byte for byte.
1908	!
1909	ba,pt	%ncc, .dcicl
1910	lduba	[%o0 + %o3]ASI_USER, %o4	! delay slot: preload byte for .dcicl
1911	!
1912	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
1913	!
1914	.align 32
1915.didfbc:
1916	lduwa	[%o0 + %o3]ASI_USER, %o4
1917	deccc	%o2
1918	st	%o4, [%o1 + %o3]
1919	bg,pt	%ncc, .didfbc
1920	addcc	%o3, 4, %o3
1921	!
1922	! End of copy loop. Most 4 byte aligned copies end here.
1923	!
1924	bz,pt	%ncc, .dcifh
1925	nop
1926	!
1927	! Something is left. Do it byte for byte.
1928	!
1929	ba,pt	%ncc, .dcicl
1930	lduba	[%o0 + %o3]ASI_USER, %o4	! delay slot: preload byte for .dcicl
1931	!
1932	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
1933	! copy.
1934	!
1935	.align 32
1936.didtbc:
1937	lduha	[%o0 + %o3]ASI_USER, %o4
1938	deccc	%o2
1939	sth	%o4, [%o1 + %o3]
1940	bg,pt	%ncc, .didtbc
1941	addcc	%o3, 2, %o3
1942	!
1943	! End of copy loop. Most 2 byte aligned copies end here.
1944	!
1945	bz,pt	%ncc, .dcifh
1946	nop
1947	!
1948	! Deal with the last byte
1949	!
1950	lduba	[%o0 + %o3]ASI_USER, %o4
1951	stb	%o4, [%o1 + %o3]
	! Common exit for the aligned loops: restore t_lofault, return 0.
1952.dcifh:
1953	membar	#Sync
1954	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1955	retl
1956	clr	%o0
1957
1958.big_copyin:
1959	!
1960	! We're going off to do a block copy.
1961	! Switch fault handlers and grab a window. We
1962	! don't do a membar #Sync since we've done only
1963	! kernel data to this point.
1964	!
	! %o4 holds the copyio_fault address loaded at .do_copyin. After
	! the save, the caller's %o0-%o2 are visible as %i0-%i2.
1965	stn	%o4, [THREAD_REG + T_LOFAULT]
1966	save	%sp, -SA(MINFRAME), %sp
1967
1968	! Copyins that reach here are larger than 256 bytes. The
1969	! hw_copy_limit_1 is set to 256. Never set this limit to less
1970	! than 128 bytes.
1971.do_blockcopyin:
1972
1973	! Swap src/dst since the code below is memcpy code
1974	! and memcpy/bcopy have different calling sequences
	! Afterwards %i0 = kernel destination, %i1 = user source,
	! %i2 = byte count.
1975	mov	%i1, %i5
1976	mov	%i0, %i1
1977	mov	%i5, %i0
1978
1979	andcc	%i0, 7, %i3		! is dst double aligned
1980	bz	%ncc, copyin_blkcpy
1981	sub	%i3, 8, %i3		! delay slot; %i3 is recomputed at copyin_blkcpy
1982	neg	%i3			! bytes till double aligned
1983	sub	%i2, %i3, %i2		! update %i2 with new count
1984
1985	! Align Destination on double-word boundary
1986
19871:	lduba	[%i1]ASI_USER, %i4
1988	inc	%i1
1989	stb	%i4, [%i0]
1990	deccc	%i3
1991	bgu	%ncc, 1b
1992	  inc	%i0
1993
1994copyin_blkcpy:
1995	andcc	%i0, 63, %i3
1996	bz,pn	%ncc, copyin_blalign	! now block aligned
1997	sub	%i3, 64, %i3		! delay slot; %i3 is recomputed at copyin_blalign
1998	neg	%i3			! bytes till block aligned
1999	sub	%i2, %i3, %i2		! update %i2 with new count
2000
2001	! Copy %i3 bytes till dst is block (64 byte) aligned. use
2002	! double word copies.
2003
2004	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
2005	bz	%ncc, .ci_dbcopy	! %g1 has source offset (last 3-bits)
2006	sll	%g1, 3, %l1		! left shift
2007	mov	0x40, %l2
2008	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
2009
2010	! Now use double word copies to align destination.
	! The source is rounded down to an 8 byte boundary. Each pass
	! loads the next source doubleword into %o4, combines it with the
	! previous one (%o2) via ALIGN_DATA_EW, stores the result and
	! carries %o4 forward as the new %o2.
	! NOTE(review): ALIGN_DATA_EW is defined outside this section;
	! presumably a shift-and-merge using %l1/%l2 with %o3 as scratch.
2011.ci_double:
2012	sub	%i1, %g1, %i1		! align the src at 8 bytes.
2013	ldxa	[%i1]ASI_USER, %o2
20142:
2015	add	%i1, 0x8, %i1
2016	ldxa	[%i1]ASI_USER, %o4
2017	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
2018	stx	%o2, [%i0]
2019	mov	%o4, %o2
2020	subcc	%i3, 0x8, %i3
2021	bgu,pt	%ncc, 2b
2022	add	%i0, 0x8, %i0
2023	ba	copyin_blalign
2024	add	%i1, %g1, %i1		! delay slot: put the source offset back
2025
2026	! Both source and destination are double aligned.
2027	! No shift and merge of data required in this case.
2028.ci_dbcopy:
2029	ldxa	[%i1]ASI_USER, %o2
2030	stx	%o2, [%i0]
2031	add	%i1, 0x8, %i1
2032	subcc	%i3, 0x8, %i3
2033	bgu,pt	%ncc, .ci_dbcopy
2034	add	%i0, 0x8, %i0
2035
2036copyin_blalign:
	! Destination (%i0) is 64 byte aligned here. Split the remaining
	! count into whole 64 byte blocks (%i3) and a residue (%i2), then
	! pick a block loop from the source's offset within a 16 byte
	! quadword (%o2): 0 -> .ci_blkcpy, 1-7 -> .ci_lower_double,
	! 9-15 -> .ci_upper_double, exactly 8 -> fall through to the
	! loop below. Loads come from user space through
	! ASI_BLK_INIT_QUAD_LDD_AIUS; stores go through %asi, which is
	! set to ASI_BLK_INIT_ST_QUAD_LDD_P.
2037	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2038	sub	%i2, %i3, %i2		! Residue bytes in %i2
2039
2040	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2041
2042	andcc	%i1, 0xf, %o2		! is src quadword aligned
2043	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
2044	nop
2045	cmp	%o2, 0x8
2046	bg	.ci_upper_double
2047	nop
2048	bl	.ci_lower_double
2049	nop
2050
2051	! Falls through when source offset is equal to 8 i.e.
2052	! source is double word aligned.
2053	! In this case no shift/merge of data is required
2054
	! Each ldda fills a register pair (%l2/%l3 or %l4/%l5). With a
	! source offset of 8 the wanted data is the second register of
	! one pair followed by the first register of the next, which is
	! the order the stores below use.
2055	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2056	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2057	prefetch [%l0+0x0], #one_read
2058	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2059.ci_loop0:
2060	add	%i1, 0x10, %i1
2061	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2062
2063	prefetch [%l0+0x40], #one_read
2064
2065	stxa	%l3, [%i0+0x0]%asi
2066	stxa	%l4, [%i0+0x8]%asi
2067
2068	add	%i1, 0x10, %i1
2069	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2070
2071	stxa	%l5, [%i0+0x10]%asi
2072	stxa	%l2, [%i0+0x18]%asi
2073
2074	add	%i1, 0x10, %i1
2075	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2076
2077	stxa	%l3, [%i0+0x20]%asi
2078	stxa	%l4, [%i0+0x28]%asi
2079
2080	add	%i1, 0x10, %i1
2081	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2082
2083	stxa	%l5, [%i0+0x30]%asi
2084	stxa	%l2, [%i0+0x38]%asi
2085
2086	add	%l0, 0x40, %l0		! advance the prefetch base one block
2087	subcc	%i3, 0x40, %i3		! one 64 byte block done
2088	bgu,pt	%xcc, .ci_loop0
2089	add	%i0, 0x40, %i0
2090	ba	.ci_blkdone
2091	add	%i1, %o2, %i1		! increment the source by src offset
2092					! the src offset was stored in %o2
2093
2094.ci_lower_double:
	! Block loop for a source offset of 1-7 bytes within its
	! quadword (%o2). The source is rounded down to 16 bytes and
	! each 64 byte destination block is built from five 16 byte
	! loads, merged with ALIGN_DATA using left shift %o0 = offset * 8
	! and right shift %o1 = 64 - %o0.
	! NOTE(review): ALIGN_DATA is defined outside this section;
	! presumably a shift-and-merge of three registers into the first
	! two, with %l6 as scratch - confirm against the macro.
2095
2096	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2097	sll	%o2, 3, %o0		! %o0 left shift
2098	mov	0x40, %o1
2099	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2100	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2101	prefetch [%l0+0x0], #one_read
2102	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
2103							! and %l3 has complete
2104							! data
2105.ci_loop1:
2106	add	%i1, 0x10, %i1
2107	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
2108							! for this read.
2109	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2110							! into %l2 and %l3
2111
2112	prefetch [%l0+0x40], #one_read
2113
2114	stxa	%l2, [%i0+0x0]%asi
2115	stxa	%l3, [%i0+0x8]%asi
2116
2117	add	%i1, 0x10, %i1
2118	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2119	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2120							! %l4 from previous read
2121							! into %l4 and %l5
2122	stxa	%l4, [%i0+0x10]%asi
2123	stxa	%l5, [%i0+0x18]%asi
2124
2125	! Repeat the same for next 32 bytes.
2126
2127	add	%i1, 0x10, %i1
2128	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2129	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2130
2131	stxa	%l2, [%i0+0x20]%asi
2132	stxa	%l3, [%i0+0x28]%asi
2133
2134	add	%i1, 0x10, %i1
2135	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2136	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2137
2138	stxa	%l4, [%i0+0x30]%asi
2139	stxa	%l5, [%i0+0x38]%asi
2140
2141	add	%l0, 0x40, %l0		! advance the prefetch base one block
2142	subcc	%i3, 0x40, %i3		! one 64 byte block done
2143	bgu,pt	%xcc, .ci_loop1
2144	add	%i0, 0x40, %i0
2145	ba	.ci_blkdone
2146	add	%i1, %o2, %i1		! increment the source by src offset
2147					! the src offset was stored in %o2
2148
2149.ci_upper_double:
	! Block loop for a source offset of 9-15 bytes within its
	! quadword (%o2). Same scheme as .ci_lower_double, but the left
	! shift is (offset - 8) * 8 and the merge window starts at the
	! second register of each loaded pair.
	! NOTE(review): ALIGN_DATA is defined outside this section;
	! presumably a shift-and-merge of three registers into the first
	! two, with %l6 as scratch - confirm against the macro.
2150
2151	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2152	sub	%o2, 0x8, %o0
2153	sll	%o0, 3, %o0		! %o0 left shift
2154	mov	0x40, %o1
2155	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2156	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2157	prefetch [%l0+0x0], #one_read
2158	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
2159							! for this read and
2160							! no data in %l2
2161.ci_loop2:
2162	add	%i1, 0x10, %i1
2163	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
2164							! and %l5 has partial
2165	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2166							! into %l3 and %l4
2167	prefetch [%l0+0x40], #one_read
2168
2169	stxa	%l3, [%i0+0x0]%asi
2170	stxa	%l4, [%i0+0x8]%asi
2171
2172	add	%i1, 0x10, %i1
2173	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2174	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2175							! %l5 from previous read
2176							! into %l5 and %l2
2177
2178	stxa	%l5, [%i0+0x10]%asi
2179	stxa	%l2, [%i0+0x18]%asi
2180
2181	! Repeat the same for next 32 bytes.
2182
2183	add	%i1, 0x10, %i1
2184	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2185	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2186
2187	stxa	%l3, [%i0+0x20]%asi
2188	stxa	%l4, [%i0+0x28]%asi
2189
2190	add	%i1, 0x10, %i1
2191	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2192	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2193
2194	stxa	%l5, [%i0+0x30]%asi
2195	stxa	%l2, [%i0+0x38]%asi
2196
2197	add	%l0, 0x40, %l0		! advance the prefetch base one block
2198	subcc	%i3, 0x40, %i3		! one 64 byte block done
2199	bgu,pt	%xcc, .ci_loop2
2200	add	%i0, 0x40, %i0
2201	ba	.ci_blkdone
2202	add	%i1, %o2, %i1		! increment the source by src offset
2203					! the src offset was stored in %o2
2204
2205
2206	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
	! Reached when the source is 16 byte aligned, so no merging is
	! needed: four 16 byte user loads fill %l0-%l7 and eight stores
	! through %asi write one 64 byte destination block per pass.
	! %i3 is the number of block bytes left; the code falls through
	! to .ci_blkdone when the loop ends.
2207.ci_blkcpy:
2208
2209	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2210	prefetch [%o0+0x0], #one_read
22111:
2212	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
2213	add	%i1, 0x10, %i1
2214	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2215	add	%i1, 0x10, %i1
2216
2217	prefetch [%o0+0x40], #one_read
2218
2219	stxa	%l0, [%i0+0x0]%asi
2220
2221	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2222	add	%i1, 0x10, %i1
2223	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
2224	add	%i1, 0x10, %i1
2225
2226	stxa	%l1, [%i0+0x8]%asi
2227	stxa	%l2, [%i0+0x10]%asi
2228	stxa	%l3, [%i0+0x18]%asi
2229	stxa	%l4, [%i0+0x20]%asi
2230	stxa	%l5, [%i0+0x28]%asi
2231	stxa	%l6, [%i0+0x30]%asi
2232	stxa	%l7, [%i0+0x38]%asi
2233
2234	add	%o0, 0x40, %o0		! advance the prefetch base one block
2235	subcc	%i3, 0x40, %i3		! one 64 byte block done
2236	bgu,pt	%xcc, 1b
2237	add	%i0, 0x40, %i0
2238
2239.ci_blkdone:
	! Block loops are finished; %i2 holds the residue (fewer than 64
	! bytes). Copy whole doublewords first, then any last bytes.
2240	membar	#Sync
2241
2242	! Copy as much rest of the data as double word copy.
2243.ci_dwcp:
2244	cmp	%i2, 0x8		! Not enough bytes to copy as double
2245	blu	%ncc, .ci_dbdone
2246	nop
2247
2248	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
2249	sub	%i2, %i3, %i2		! Residue bytes in %i2
2250
2251	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
2252	bz	%ncc, .ci_cpy_db
2253	nop
2254
2255	sll	%g1, 3, %l0		! left shift
2256	mov	0x40, %l1
2257	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
2258
	! Unaligned source: round it down to 8 bytes and merge each pair
	! of neighbouring doublewords with ALIGN_DATA_EW, carrying the
	! newer one forward in %o2 (same scheme as .ci_double).
	! NOTE(review): ALIGN_DATA_EW is defined outside this section.
2259.ci_cpy_dbwd:
2260	sub	%i1, %g1, %i1		! align the src at 8 bytes.
2261	ldxa	[%i1]ASI_USER, %o2
22623:
2263	add	%i1, 0x8, %i1
2264	ldxa	[%i1]ASI_USER, %o4
2265	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
2266	stx	%o2, [%i0]
2267	mov	%o4, %o2
2268	subcc	%i3, 0x8, %i3
2269	bgu,pt	%ncc, 3b
2270	add	%i0, 0x8, %i0
2271	ba	.ci_dbdone
2272	add	%i1, %g1, %i1		! delay slot: put the source offset back
2273
	! Source is 8 byte aligned: plain doubleword copy.
2274.ci_cpy_db:
2275	ldxa	[%i1]ASI_USER, %o2
2276	stx	%o2, [%i0]
2277	add	%i1, 0x8, %i1
2278	subcc	%i3, 0x8, %i3
2279	bgu,pt	%ncc, .ci_cpy_db
2280	add	%i0, 0x8, %i0
2281
2282.ci_dbdone:
2283	tst	%i2
2284	bz,pt	%xcc, .copyin_exit
2285	nop
2286
2287	! Copy the residue as byte copy
2288.ci_residue:
2289	lduba	[%i1]ASI_USER, %i4
2290	stb	%i4, [%i0]
2291	inc	%i1
2292	deccc	%i2
2293	bgu	%xcc, .ci_residue
2294	inc	%i0
2295
2296.copyin_exit:
2297	membar	#Sync
2298	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2299	ret
2300	restore	%g0, 0, %o0
2301.copyin_err:
2302	ldn	[THREAD_REG + T_COPYOPS], %o4
2303	brz	%o4, 2f
2304	nop
2305	ldn	[%o4 + CP_COPYIN], %g2
2306	jmp	%g2
2307	nop
23082:
2309	retl
2310	mov	-1, %o0
2311	SET_SIZE(copyin)
2312
2313#endif	/* lint */
2314
2315#ifdef	lint
2316
2317/*ARGSUSED*/
2318int
2319xcopyin(const void *uaddr, void *kaddr, size_t count)
2320{ return (0); }
2321
2322#else	/* lint */
2323
	!
	! xcopyin(uaddr, kaddr, count)
	!
	! Entry stub: builds the address of .xcopyin_err in REAL_LOFAULT and
	! branches to the shared .do_copyin code (defined outside this block),
	! which performs the copy.
	!
	! .xcopyin_err is the error continuation.  It is presumably reached
	! through the handler that .do_copyin installs from REAL_LOFAULT
	! (TODO confirm against .do_copyin).  If the current thread has a
	! copyops vector, control is transferred to its CP_XCOPYIN entry;
	! otherwise the value in %g1 is returned (presumably the errno left
	! by the trap handler - confirm).
	!
2324	ENTRY(xcopyin)
2325	sethi	%hi(.xcopyin_err), REAL_LOFAULT
2326	b	.do_copyin
2327	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT	! delay slot: complete the address
2328.xcopyin_err:
2329	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = curthread copyops vector
2330	brz	%o4, 2f				! none installed: return the error
2331	nop
2332	ldn	[%o4 + CP_XCOPYIN], %g2		! %g2 = copyops xcopyin entry
2333	jmp	%g2				! tail-jump, arguments left as they are
2334	nop
23352:
2336	retl
2337	mov	%g1, %o0			! delay slot: return value = %g1
2338	SET_SIZE(xcopyin)
2339
2340#endif	/* lint */
2341
2342#ifdef	lint
2343
2344/*ARGSUSED*/
2345int
2346xcopyin_little(const void *uaddr, void *kaddr, size_t count)
2347{ return (0); }
2348
2349#else	/* lint */
2350
	!
	! xcopyin_little(uaddr, kaddr, count)
	!
	! Copies count bytes from uaddr (read with ASI_AIUSL) to kaddr one
	! byte at a time, reversing the byte order: kaddr[i] receives
	! uaddr[count - 1 - i].  Returns 0 on success.  On a fault control
	! reaches .little_err, which returns the value found in %g1.
	!
	! Register use:
	!	%o0	source pointer, biased so that %o0 + %o3 walks backwards
	!	%o1	kaddr + count
	!	%o3	index running from -count up to 0
	!	%o4	handler address, then count - 1, then the data byte
	!	%o5	saved t_lofault, restored on both exit paths
	!
2351	ENTRY(xcopyin_little)
2352	sethi	%hi(.little_err), %o4
2353	ldn	[THREAD_REG + T_LOFAULT], %o5
2354	or	%o4, %lo(.little_err), %o4
2355	membar	#Sync				! sync error barrier
2356	stn	%o4, [THREAD_REG + T_LOFAULT]
2357
2358	subcc	%g0, %o2, %o3		! %o3 = -count, sets Z when count == 0
2359	add	%o0, %o2, %o0		! add does not touch the condition codes
2360	bz,pn	%ncc, 2f		! check for zero bytes
2361	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
2362	add	%o0, %o4, %o0		! start w/last byte
2363	add	%o1, %o2, %o1		! %o1 + %o3 now addresses kaddr[0]
2364	lduba	[%o0+%o3]ASI_AIUSL, %o4	! first load: uaddr[count - 1]
2365
23661:	stb	%o4, [%o1+%o3]
2367	inccc	%o3			! carry is set when %o3 wraps from -1 to 0
2368	sub	%o0, 2, %o0		! get next byte
2369	bcc,a,pt %ncc, 1b		! loop while the index has not reached 0
2370	  lduba	[%o0+%o3]ASI_AIUSL, %o4	! annulled delay slot: only when looping
2371
23722:	membar	#Sync				! sync error barrier
2373	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2374	retl
2375	mov	%g0, %o0		! return (0)
2376
2377.little_err:
2378	membar	#Sync				! sync error barrier
2379	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2380	retl
2381	mov	%g1, %o0		! delay slot: return value = %g1
2382	SET_SIZE(xcopyin_little)
2383
2384#endif	/* lint */
2385
2386
2387/*
2388 * Copy a block of storage - must not overlap (from + len <= to).
2389 * No fault handler installed (to be called under on_fault())
2390 */
2391#if defined(lint)
2392
2393/* ARGSUSED */
2394void
2395copyin_noerr(const void *ufrom, void *kto, size_t count)
2396{}
2397
2398#else	/* lint */
2399
	!
	! copyin_noerr(ufrom, kto, count)
	!
	! Entry stub: passes .copyio_noerr in REAL_LOFAULT to the shared
	! .do_copyin code (defined outside this block).
	!
	! .copyio_noerr is also used by copyout_noerr.  It simply jumps to
	! the address held in SAVED_LOFAULT; presumably that is the handler
	! that was active when the routine was entered, saved by the shared
	! copy code (TODO confirm against .do_copyin / .do_copyout).
	!
2400	ENTRY(copyin_noerr)
2401	sethi	%hi(.copyio_noerr), REAL_LOFAULT
2402	b	.do_copyin
2403	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: complete the address
2404.copyio_noerr:
2405	jmp	SAVED_LOFAULT
2406	  nop
2407	SET_SIZE(copyin_noerr)
2408
2409#endif /* lint */
2410
2411/*
2412 * Copy a block of storage - must not overlap (from + len <= to).
2413 * No fault handler installed (to be called under on_fault())
2414 */
2415
2416#if defined(lint)
2417
2418/* ARGSUSED */
2419void
2420copyout_noerr(const void *kfrom, void *uto, size_t count)
2421{}
2422
2423#else	/* lint */
2424
	!
	! copyout_noerr(kfrom, uto, count)
	!
	! Entry stub: passes .copyio_noerr (defined under copyin_noerr) in
	! REAL_LOFAULT to the shared .do_copyout code, which is defined
	! outside this block.
	!
2425	ENTRY(copyout_noerr)
2426	sethi	%hi(.copyio_noerr), REAL_LOFAULT
2427	b	.do_copyout
2428	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: complete the address
2429	SET_SIZE(copyout_noerr)
2430
2431#endif /* lint */
2432
2433#if defined(lint)
2434
/*
 * Tunables for the copy/zero routines.  use_hw_bzero is tested by bzero
 * before it takes the block-store path.  use_hw_bcopy and the
 * hw_copy_limit_N values are not referenced in this part of the file;
 * presumably they gate the block-copy paths by size and alignment (confirm).
 */
2435int use_hw_bcopy = 1;
2436int use_hw_bzero = 1;
2437uint_t hw_copy_limit_1 = 0x100;
2438uint_t hw_copy_limit_2 = 0x200;
2439uint_t hw_copy_limit_4 = 0x400;
2440uint_t hw_copy_limit_8 = 0x400;
2441
2442#else /* !lint */
2443
	!
	! Assembly definitions of the tunables; each is one 32-bit word and
	! the initial values match the lint declarations.
	!
2444	.align	4
2445	DGDEF(use_hw_bcopy)
2446	.word	1
2447	DGDEF(use_hw_bzero)
2448	.word	1
2449	DGDEF(hw_copy_limit_1)
2450	.word	0x100
2451	DGDEF(hw_copy_limit_2)
2452	.word	0x200
2453	DGDEF(hw_copy_limit_4)
2454	.word	0x400
2455	DGDEF(hw_copy_limit_8)
2456	.word	0x400
2457
	! Switch back to the text section, 64-byte aligned, for the code below.
2458	.align	64
2459	.section ".text"
2460#endif /* !lint */
2461
2462/*
2463 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
2464 * at least 256 bytes in length using Niagara's block stores/quad store.
2465 * If the criteria for using this routine are not met then it calls bzero
2466 * and returns 1.  Otherwise 0 is returned indicating success.
2467 * Caller is responsible for ensuring use_hw_bzero is true and that
2468 * kpreempt_disable() has been called.
2469 */
2470#ifdef lint
2471/*ARGSUSED*/
2472int
2473hwblkclr(void *addr, size_t len)
2474{
2475	return(0);
2476}
2477#else /* lint */
2478	! %i0 - start address
2479	! %i1 - length of region (multiple of 64)
	!
	! NOTE(review): %asi is set to ASI_BLK_INIT_ST_QUAD_LDD_P below and is
	! not restored by this routine; confirm callers do not rely on %asi.
2480
2481	ENTRY(hwblkclr)
2482	save	%sp, -SA(MINFRAME), %sp
2483
2484	! Must be block-aligned
2485	andcc	%i0, 0x3f, %g0
2486	bnz,pn	%ncc, 1f
2487	  nop
2488
2489	! ... and must be 256 bytes or more
2490	cmp	%i1, 0x100
2491	blu,pn	%ncc, 1f
2492	  nop
2493
2494	! ... and length must be a multiple of 64
2495	andcc	%i1, 0x3f, %g0
2496	bz,pn	%ncc, .pz_doblock
2497	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi	! delay slot: runs on both paths
2498
24991:	! punt, call bzero but notify the caller that bzero was used
2500	mov	%i0, %o0
2501	call	bzero
2502	  mov	%i1, %o1
2503	ret
2504	restore	%g0, 1, %o0	! return (1) - did not use block operations
2505
2506	! Already verified that there are at least 256 bytes to set
	!
	! Each pass clears 256 bytes (four 64-byte blocks).  The first
	! doubleword of every block is stored before the rest of the blocks
	! are filled in; presumably this starts the block-init allocation of
	! all four lines early (confirm against the Niagara documentation).
2507.pz_doblock:
2508	stxa	%g0, [%i0+0x0]%asi
2509	stxa	%g0, [%i0+0x40]%asi
2510	stxa	%g0, [%i0+0x80]%asi
2511	stxa	%g0, [%i0+0xc0]%asi
2512
2513	stxa	%g0, [%i0+0x8]%asi
2514	stxa	%g0, [%i0+0x10]%asi
2515	stxa	%g0, [%i0+0x18]%asi
2516	stxa	%g0, [%i0+0x20]%asi
2517	stxa	%g0, [%i0+0x28]%asi
2518	stxa	%g0, [%i0+0x30]%asi
2519	stxa	%g0, [%i0+0x38]%asi
2520
2521	stxa	%g0, [%i0+0x48]%asi
2522	stxa	%g0, [%i0+0x50]%asi
2523	stxa	%g0, [%i0+0x58]%asi
2524	stxa	%g0, [%i0+0x60]%asi
2525	stxa	%g0, [%i0+0x68]%asi
2526	stxa	%g0, [%i0+0x70]%asi
2527	stxa	%g0, [%i0+0x78]%asi
2528
2529	stxa	%g0, [%i0+0x88]%asi
2530	stxa	%g0, [%i0+0x90]%asi
2531	stxa	%g0, [%i0+0x98]%asi
2532	stxa	%g0, [%i0+0xa0]%asi
2533	stxa	%g0, [%i0+0xa8]%asi
2534	stxa	%g0, [%i0+0xb0]%asi
2535	stxa	%g0, [%i0+0xb8]%asi
2536
2537	stxa	%g0, [%i0+0xc8]%asi
2538	stxa	%g0, [%i0+0xd0]%asi
2539	stxa	%g0, [%i0+0xd8]%asi
2540	stxa	%g0, [%i0+0xe0]%asi
2541	stxa	%g0, [%i0+0xe8]%asi
2542	stxa	%g0, [%i0+0xf0]%asi
2543	stxa	%g0, [%i0+0xf8]%asi
2544
2545	sub	%i1, 0x100, %i1
2546	cmp	%i1, 0x100
2547	bgu,pt	%ncc, .pz_doblock	! loop only while more than 256 bytes remain
2548	add	%i0, 0x100, %i0		! delay slot: advance past the cleared bytes
2549
25502:
2551	! Check if at least 64 bytes remain to set
2552	cmp	%i1,0x40
2553	blu	%ncc, .pz_finish
2554	nop
2555
	! Clear the remaining bytes one 64-byte block per pass.
25563:
2557	stxa	%g0, [%i0+0x0]%asi
2558	stxa	%g0, [%i0+0x8]%asi
2559	stxa	%g0, [%i0+0x10]%asi
2560	stxa	%g0, [%i0+0x18]%asi
2561	stxa	%g0, [%i0+0x20]%asi
2562	stxa	%g0, [%i0+0x28]%asi
2563	stxa	%g0, [%i0+0x30]%asi
2564	stxa	%g0, [%i0+0x38]%asi
2565
2566	subcc	%i1, 0x40, %i1
2567	bgu,pt	%ncc, 3b
2568	add	%i0, 0x40, %i0
2569
2570.pz_finish:
2571	membar	#Sync
2572	ret
2573	restore	%g0, 0, %o0		! return (0) - block operations were used
2574	SET_SIZE(hwblkclr)
2575#endif	/* lint */
2576
2577#ifdef	lint
2578/* Copy 32 bytes of data from src to dst using physical addresses */
2579/*ARGSUSED*/
2580void
2581hw_pa_bcopy32(uint64_t src, uint64_t dst)
2582{}
2583#else	/*!lint */
2584
2585	/*
2586	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
2587	 * using physical addresses.
2588	 *
2589	 * PSTATE.IE is cleared for the duration of the copy and the
2590	 * original %pstate is written back in the delay slot of the return.
2591	 * All four doublewords are loaded through ASI_MEM before any is
2592	 * stored.  Modifies %o0-%o5, %g1 and %g2.
2593	 */
2589	ENTRY_NP(hw_pa_bcopy32)
2590	rdpr    %pstate, %g1		! %g1 = original %pstate
2591	andn    %g1, PSTATE_IE, %g2
2592	wrpr    %g0, %g2, %pstate	! same state with PSTATE_IE cleared
2593
2594	ldxa    [%o0]ASI_MEM, %o2
2595	add     %o0, 8, %o0
2596	ldxa    [%o0]ASI_MEM, %o3
2597	add     %o0, 8, %o0
2598	ldxa    [%o0]ASI_MEM, %o4
2599	add     %o0, 8, %o0
2600	ldxa    [%o0]ASI_MEM, %o5
2601	stxa    %o2, [%o1]ASI_MEM
2602	add     %o1, 8, %o1
2603	stxa    %o3, [%o1]ASI_MEM
2604	add     %o1, 8, %o1
2605	stxa    %o4, [%o1]ASI_MEM
2606	add     %o1, 8, %o1
2607	stxa    %o5, [%o1]ASI_MEM
2608
2609	membar	#Sync
2610	retl
2611	  wrpr    %g0, %g1, %pstate	! delay slot: restore original %pstate
2612	SET_SIZE(hw_pa_bcopy32)
2613#endif /* lint */
2614
2615/*
2616 * Zero a block of storage.
2617 *
2618 * uzero is used by the kernel to zero a block in user address space.
2619 */
2620
2621/*
2622 * Control flow of the bzero/kzero/uzero routine.
2623 *
2624 *	For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
2625 *
2626 *	For stores of fewer than 15 bytes, align the address on a 4-byte
2627 *	boundary.  Then store as many 4-byte chunks as possible, followed
2628 *	by the trailing bytes.
2629 *	For sizes of 15 bytes or more, align the address on an 8-byte boundary.
2630 *	if (remaining count >= 128 && use_hw_bzero) {
2631 *		store 8-byte chunks until the address is block (64-byte) aligned
2632 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
2633 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
2634 *	}
2635 *	Store as many 8-byte chunks as possible, followed by the trailing bytes.
2636 */
2637
2638#if defined(lint)
2639
2640/* ARGSUSED */
2641int
2642kzero(void *addr, size_t count)
2643{ return(0); }
2644
2645/* ARGSUSED */
2646void
2647uzero(void *addr, size_t count)
2648{}
2649
2650#else	/* lint */
2651
	!
	! uzero(addr, count) and kzero(addr, count)
	!
	! Both entry points select the ASI used by the store loops, arrange
	! for .zeroerr to be the t_lofault handler, and then join the common
	! .do_zero code that is defined under bzero.
	!
	!	%o0	addr
	!	%o1	count
	!	%o5	previous t_lofault; kzero also ORs in LOFAULT_SET
	!	%o2	scratch (address of .zeroerr)
	!
2652	ENTRY(uzero)
2653	!
2654	! Set a new lo_fault handler only if we came in with one
2655	! already specified.
2656	!
2657	wr	%g0, ASI_USER, %asi
2658	ldn	[THREAD_REG + T_LOFAULT], %o5
2659	tst	%o5
2660	bz,pt	%ncc, .do_zero		! no previous handler: install nothing
2661	sethi	%hi(.zeroerr), %o2	! delay slot: runs on both paths
2662	or	%o2, %lo(.zeroerr), %o2
2663	membar	#Sync
2664	ba,pt	%ncc, .do_zero
2665	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
2666
2667	ENTRY(kzero)
2668	!
2669	! Always set a lo_fault handler
2670	!
2671	wr	%g0, ASI_P, %asi
2672	ldn	[THREAD_REG + T_LOFAULT], %o5
2673	sethi	%hi(.zeroerr), %o2
2674	or	%o5, LOFAULT_SET, %o5	! mark that a handler was installed
2675	or	%o2, %lo(.zeroerr), %o2
2676	membar	#Sync
2677	ba,pt	%ncc, .do_zero
2678	stn	%o2, [THREAD_REG + T_LOFAULT]	! delay slot: install .zeroerr
2679
2680/*
2681 * We got here because of a fault during kzero or if
2682 * uzero or bzero was called with t_lofault non-zero.
2683 * Otherwise we've already run screaming from the room.
2684 * Errno value is in %g1. Note that we're here iff
2685 * we did set t_lofault.
2686 */
2687.zeroerr:
2688	!
2689	! Undo asi register setting. Just set it to be the
2690        ! kernel default without checking.
2691	!
2692	wr	%g0, ASI_P, %asi
2693
2694	!
2695	! We did set t_lofault. It may well have been zero coming in.
2696	!
26971:
2698	tst	%o5
2699	membar #Sync
2700	bne,pn	%ncc, 3f		! decided by the tst above
2701	andncc	%o5, LOFAULT_SET, %o5	! delay slot: strip flag, set cc for 3f
27022:
2703	!
2704	! Old handler was zero. Just return the error.
2705	!
2706	retl				! return
2707	mov	%g1, %o0		! error code from %g1
27083:
2709	!
2710	! We're here because %o5 was non-zero. It was non-zero
2711	! because either LOFAULT_SET was present, a previous fault
2712	! handler was present or both. In all cases we need to reset
2713	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
2714	! before we either simply return the error or we invoke the
2715	! previously specified handler.
2716	!
2717	be	%ncc, 2b		! tests the andncc result: no old handler
2718	stn	%o5, [THREAD_REG + T_LOFAULT]	! delay slot: runs on both paths
2719	jmp	%o5			! goto real handler
2720	  nop
2721	SET_SIZE(kzero)
2722	SET_SIZE(uzero)
2723
2724#endif	/* lint */
2725
2726/*
2727 * Zero a block of storage.
2728 */
2729
2730#if defined(lint)
2731
2732/* ARGSUSED */
2733void
2734bzero(void *addr, size_t count)
2735{}
2736
2737#else	/* lint */
2738
	!
	! bzero(addr, count)
	!
	! Zeroes count bytes starting at addr using ASI_P.  The code from
	! .do_zero onwards is shared with uzero and kzero, which enter it
	! with %asi and %o5 already set up.
	!
	!	%o0	current address
	!	%o1	bytes remaining
	!	%o2	scratch
	!	%o3	scratch (alignment count, doubleword count, old %asi)
	!	%o4	scratch (bytes to clear with block stores, old %asi)
	!	%o5	previous t_lofault, possibly with LOFAULT_SET (kzero)
	!
	! Returns 0 in %o0.
	!
2739	ENTRY(bzero)
2740	wr	%g0, ASI_P, %asi
2741
2742	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
2743	tst	%o5
2744	bz,pt	%ncc, .do_zero		! no previous handler: install nothing
2745	sethi	%hi(.zeroerr), %o2	! delay slot: runs on both paths
2746	or	%o2, %lo(.zeroerr), %o2
2747	membar	#Sync				! sync error barrier
2748	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
2749
2750.do_zero:
2751	cmp	%o1, 7
2752	blu,pn	%ncc, .byteclr		! fewer than 7 bytes: byte loop only
2753	nop
2754
2755	cmp	%o1, 15
2756	blu,pn	%ncc, .wdalign		! fewer than 15 bytes: word path
2757	nop
2758
2759	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
2760	bz,pt	%ncc, .blkalign		! already double aligned
2761	sub	%o3, 8, %o3		! -(bytes till double aligned)
2762	add	%o1, %o3, %o1		! update o1 with new count
2763
	! Clear single bytes until %o3 counts up to zero.
27641:
2765	stba	%g0, [%o0]%asi
2766	inccc	%o3
2767	bl,pt	%ncc, 1b
2768	inc	%o0
2769
2770	! Now address is double aligned
2771.blkalign:
2772	cmp	%o1, 0x80		! check if there are 128 bytes to set
2773	blu,pn	%ncc, .bzero_small
2774	mov	%o1, %o3		! delay slot: %o3 = count for .bzero_small
2775
2776	sethi	%hi(use_hw_bzero), %o2
2777	ld	[%o2 + %lo(use_hw_bzero)], %o2
2778	tst	%o2
2779	bz	%ncc, .bzero_small	! block stores disabled by the tunable
2780	mov	%o1, %o3		! delay slot: %o3 = count for .bzero_small
2781
	! Select the block-init ASI that corresponds to the ASI in use:
	! ASI_BLK_INIT_ST_QUAD_LDD_P when %asi was ASI_P, otherwise
	! ASI_BLK_INIT_QUAD_LDD_AIUS (the annulled delay slot only runs
	! when the branch is taken).
2782	rd	%asi, %o3
2783	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2784	cmp	%o3, ASI_P
2785	bne,a	%ncc, .algnblk
2786	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
2787
2788.algnblk:
2789	andcc	%o0, 0x3f, %o3		! is block aligned?
2790	bz,pt	%ncc, .bzero_blk
2791	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
2792	add	%o1, %o3, %o1		! o1 is the remainder
2793
2794	! Clear -(%o3) bytes till block aligned
27951:
2796	stxa	%g0, [%o0]%asi
2797	addcc	%o3, 8, %o3
2798	bl,pt	%ncc, 1b
2799	add	%o0, 8, %o0
2800
2801.bzero_blk:
2802	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
2803	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
2804
2805	cmp	%o4, 0x100		! 256 bytes or more
2806	blu,pn	%ncc, 3f
2807	nop
2808
	! Clear 256 bytes (four 64-byte blocks) per pass.  The first
	! doubleword of each block is stored before the blocks are filled.
28092:
2810	stxa	%g0, [%o0+0x0]%asi
2811	stxa	%g0, [%o0+0x40]%asi
2812	stxa	%g0, [%o0+0x80]%asi
2813	stxa	%g0, [%o0+0xc0]%asi
2814
2815	stxa	%g0, [%o0+0x8]%asi
2816	stxa	%g0, [%o0+0x10]%asi
2817	stxa	%g0, [%o0+0x18]%asi
2818	stxa	%g0, [%o0+0x20]%asi
2819	stxa	%g0, [%o0+0x28]%asi
2820	stxa	%g0, [%o0+0x30]%asi
2821	stxa	%g0, [%o0+0x38]%asi
2822
2823	stxa	%g0, [%o0+0x48]%asi
2824	stxa	%g0, [%o0+0x50]%asi
2825	stxa	%g0, [%o0+0x58]%asi
2826	stxa	%g0, [%o0+0x60]%asi
2827	stxa	%g0, [%o0+0x68]%asi
2828	stxa	%g0, [%o0+0x70]%asi
2829	stxa	%g0, [%o0+0x78]%asi
2830
2831	stxa	%g0, [%o0+0x88]%asi
2832	stxa	%g0, [%o0+0x90]%asi
2833	stxa	%g0, [%o0+0x98]%asi
2834	stxa	%g0, [%o0+0xa0]%asi
2835	stxa	%g0, [%o0+0xa8]%asi
2836	stxa	%g0, [%o0+0xb0]%asi
2837	stxa	%g0, [%o0+0xb8]%asi
2838
2839	stxa	%g0, [%o0+0xc8]%asi
2840	stxa	%g0, [%o0+0xd0]%asi
2841	stxa	%g0, [%o0+0xd8]%asi
2842	stxa	%g0, [%o0+0xe0]%asi
2843	stxa	%g0, [%o0+0xe8]%asi
2844	stxa	%g0, [%o0+0xf0]%asi
2845	stxa	%g0, [%o0+0xf8]%asi
2846
2847	sub	%o4, 0x100, %o4
2848	cmp	%o4, 0x100
2849	bgu,pt	%ncc, 2b		! loop only while more than 256 bytes remain
2850	add	%o0, 0x100, %o0		! delay slot: advance past the cleared bytes
2851
28523:
2853	! ... check if 64 bytes to set
2854	cmp	%o4, 0x40
2855	blu	%ncc, .bzero_blk_done
2856	nop
2857
	! Clear one 64-byte block, then go back to 3 to re-check the count.
28584:
2859	stxa	%g0, [%o0+0x0]%asi
2860	stxa	%g0, [%o0+0x8]%asi
2861	stxa	%g0, [%o0+0x10]%asi
2862	stxa	%g0, [%o0+0x18]%asi
2863	stxa	%g0, [%o0+0x20]%asi
2864	stxa	%g0, [%o0+0x28]%asi
2865	stxa	%g0, [%o0+0x30]%asi
2866	stxa	%g0, [%o0+0x38]%asi
2867
2868	subcc	%o4, 0x40, %o4
2869	bgu,pt	%ncc, 3b
2870	add	%o0, 0x40, %o0
2871
2872.bzero_blk_done:
2873	membar	#Sync
2874	!
2875	! Undo asi register setting.
2876	!
	! ASI_P is restored when the kernel block ASI was in use; otherwise
	! the annulled delay slot switches back to ASI_USER.
2877	rd	%asi, %o4
2878	wr	%g0, ASI_P, %asi
2879	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
2880	bne,a	%ncc, .bzero_small
2881	wr	%g0, ASI_USER, %asi
2882
2883.bzero_small:
2884	! Set the remaining doubles
2885	subcc	%o3, 8, %o3		! Can we store any doubles?
2886	blu,pn	%ncc, .byteclr
2887	and	%o1, 7, %o1		! calc bytes left after doubles
2888
2889.dbclr:
2890	stxa	%g0, [%o0]%asi		! Clear the doubles
2891	subcc	%o3, 8, %o3
2892	bgeu,pt	%ncc, .dbclr
2893	add	%o0, 8, %o0
2894
2895	ba	.byteclr
2896	nop
2897
	! Word path: clear single bytes until the address is 4-byte aligned.
	! The andn in the delay slot runs on every pass, so %o3 holds the
	! word-multiple part of the current count when .wdclr is entered.
2898.wdalign:
2899	andcc	%o0, 3, %o3		! is add aligned on a word boundary
2900	bz,pn	%ncc, .wdclr
2901	andn	%o1, 3, %o3		! create word sized count in %o3
2902
2903	dec	%o1			! decrement count
2904	stba	%g0, [%o0]%asi		! clear a byte
2905	ba	.wdalign
2906	inc	%o0			! next byte
2907
2908.wdclr:
2909	sta	%g0, [%o0]%asi		! 4-byte clearing loop
2910	subcc	%o3, 4, %o3
2911	bnz,pt	%ncc, .wdclr
2912	inc	4, %o0
2913
2914	and	%o1, 3, %o1		! leftover count, if any
2915
2916.byteclr:
2917	! Set the leftover bytes
2918	brz	%o1, .bzero_exit
2919	nop
2920
29217:
2922	deccc	%o1			! byte clearing loop
2923	stba	%g0, [%o0]%asi
2924	bgu,pt	%ncc, 7b
2925	inc	%o0
2926
2927.bzero_exit:
2928	!
2929	! We're just concerned with whether t_lofault was set
2930	! when we came in. We end up here from kzero(), uzero()
2931	! or bzero(). kzero() *always* sets a lofault handler.
2932	! It ors LOFAULT_SET into %o5 to indicate it has done
2933	! this even if the value of %o5 is otherwise zero.
2934	! bzero() and uzero() set a lofault handler *only* if one
2935	! was previously set. Accordingly we need to examine
2936	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
2937	! before resetting the error handler.
2938	!
2939	tst	%o5
2940	bz	%ncc, 1f
2941	andn	%o5, LOFAULT_SET, %o5	! delay slot: strip the flag bit
2942	membar	#Sync				! sync error barrier
2943	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
29441:
2945	retl
2946	clr	%o0			! return (0)
2947
2948	SET_SIZE(bzero)
2949#endif	/* lint */
2950