xref: /titanic_52/usr/src/uts/sun4v/cpu/niagara_copy.s (revision fd9cb95cbb2f626355a60efb9d02c5f0a33c10e6)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/clock.h>
35#include <sys/asi.h>
36#include <sys/fsr.h>
37#include <sys/privregs.h>
38#include <sys/machasi.h>
39#include <sys/niagaraasi.h>
40
41#if !defined(lint)
42#include "assym.h"
43#endif	/* lint */
44
45
46/*
47 * Pseudo-code to aid in understanding the control flow of the
48 * bcopy/kcopy routine.
49 *
50 * On entry to kcopy:
51 *	%l7 = curthread->t_lofault;
52 *	curthread->t_lofault = .copyerr;
53 *	%o5 = %l7;			! save existing handler in %o5
54 *	Call bcopy();
55 *
56 * On entry to bcopy:
57 *
58 * 	if (length < 128)
59 * 		goto_regular_copy;
60 *
61 * 	if (!use_hw_bcopy)
62 * 		goto_regular_copy;
63 *
64 * 	do_blockcopy_here;
65 *
66 * In lofault handler:
67 *	curthread->t_lofault = %o5;	! restore old t_lofault
68 *	return (errno)
69 *
70 */
71
72/*
73 * For counts less than or equal to this number of bytes we always copy byte-for-byte
74 */
75#define	SMALL_LIMIT	7
76
77/*
78 * Size of stack frame in order to accommodate a 64-byte aligned
79 * floating-point register save area and 2 32-bit temp locations.
80 */
81#define	HWCOPYFRAMESIZE	((64 * 5) + (2 * 4))
82
83/*
84 * LOFAULT_SET : Flag set by kzero to indicate that lo_fault handler was set
85 */
86#define	LOFAULT_SET 2
87
88/*
89 * This define is to align data for the unaligned source cases.
90 * The data1, data2 and data3 is merged into data1 and data2.
91 * The data3 is preserved for next merge.
92 */
93#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
94	sllx	data1, lshift, data1				;\
95	srlx	data2, rshift, tmp				;\
96	or	data1, tmp, data1				;\
97	sllx	data2, lshift, data2				;\
98	srlx	data3, rshift, tmp				;\
99	or	data2, tmp, data2
100/*
101 * This macro is to align the data. Basically it merges
102 * data1 and data2 to form double word.
103 */
104#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
105	sllx	data1, lshift, data1				;\
106	srlx	data2, rshift, tmp				;\
107	or	data1, tmp, data1
108
109/*
110 * Copy a block of storage, returning an error code if `from' or
111 * `to' takes a kernel pagefault which cannot be resolved.
112 * Returns errno value on pagefault error, 0 if all ok
113 */
114
115
116
117#if defined(lint)
118
119/* ARGSUSED */
120int
121kcopy(const void *from, void *to, size_t count)
122{ return(0); }
123
124#else	/* lint */
125
126	.seg	".text"
127	.align	4
128
/*
 * kcopy(from, to, count)
 *
 * Installs .copyerr as curthread->t_lofault and then runs the bcopy body
 * at .do_copy in its own register window.  The previous handler is kept
 * in %o5 with LOFAULT_SET or'ed in; the bcopy exit paths (.blkexit and
 * .cpdone) test that flag and put the previous handler back before
 * returning 0, and .copyerr does the same before returning the errno
 * found in %g1.  %o5 is used rather than a local register because the
 * block copy loops load into %l0-%l7.
 */
129	ENTRY(kcopy)
130
131	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
132	set	.copyerr, %o5		! copyerr is lofault value
133	ldn	[THREAD_REG + T_LOFAULT], %l7	! save existing handler
134	membar	#Sync			! sync error barrier (see copy.s)
135	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
136	b	.do_copy		! common code
137	  or	%l7, LOFAULT_SET, %o5	! delay: old handler + "restore me" flag
138
139/*
140 * We got here because of a fault during kcopy.
141 * Errno value is in %g1.
142 */
143.copyerr:
	! kcopy always or's LOFAULT_SET into %o5; strip it so that the
	! value written back is the handler that was originally installed.
	! (Harmless if the flag has already been stripped on an exit path.)
	andn	%o5, LOFAULT_SET, %o5
144	membar	#Sync			! sync error barrier
145	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
146	ret
147	restore	%g1, 0, %o0
148
149	SET_SIZE(kcopy)
150#endif	/* lint */
151
152
153/*
154 * Copy a block of storage - must not overlap (from + len <= to).
155 *
156 * Counts below 12 are copied a byte at a time.  Counts of 128 or more
157 * take the block-initializing-store/quad-load path when use_hw_bcopy is
 * nonzero and |to - from| >= 256; every other case goes through the
 * word/doubleword loops at .bcb_punt.
 *
 * %o5 is zero when entered through bcopy and holds the saved t_lofault
 * value with LOFAULT_SET or'ed in when entered through kcopy; nothing
 * between .do_copy and the exits may modify it.
158 */
159#if defined(lint)
160
161/* ARGSUSED */
162void
163bcopy(const void *from, void *to, size_t count)
164{}
165
166#else	/* lint */
167
168	ENTRY(bcopy)
169
170	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	clr	%o5			! LOFAULT_SET clear: no handler to restore
171
172.do_copy:
173	cmp	%i2, 12			! for small counts
174	blu	%ncc, .bytecp		! just copy bytes
175	  .empty
176
177	cmp	%i2, 128		! for less than 128 bytes
178	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
179	  nop
180
181	set	use_hw_bcopy, %o2
182	ld	[%o2], %o2
183	tst	%o2
184	bz	.bcb_punt
185	  nop
186
187	subcc	%i1, %i0, %i3
188	bneg,a,pn %ncc, 1f
189	neg	%i3
1901:
191	/*
192	 * Compare against 256 since we should be checking block addresses
193	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
194	 * src = dest + (64 * 3) + 63.
195	 */
196	cmp	%i3, 256
197	blu,pn	%ncc, .bcb_punt
198	  nop
199
200	/*
201	 * Copy that reach here have at least 2 blocks of data to copy.
202	 */
203.do_blockcopy:
204	! Swap src/dst since the code below is memcpy code
205	! and memcpy/bcopy have different calling sequences
206	mov	%i1, %i5
207	mov	%i0, %i1
208	mov	%i5, %i0
209
210	andcc	%i0, 0x3f, %i3		! is dst 64-byte (block) aligned?
211	bz	%xcc, .chksrc		! dst is already block aligned
212	sub	%i3, 0x40, %i3
213	neg	%i3			! bytes till dst 64 bytes aligned
214	sub	%i2, %i3, %i2		! update i2 with new count
215
2161:	ldub	[%i1], %i4
217	stb	%i4, [%i0]
218	inc	%i1
219	deccc	%i3
220	bgu	%xcc, 1b
221	inc	%i0
222
223	! Now Destination is block (64 bytes) aligned
224.chksrc:
225	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
226	sub	%i2, %i3, %i2		! Residue bytes in %i2
227
228	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
229
230	andcc	%i1, 0xf, %o2		! is src quadword aligned
231	bz,pn	%xcc, .blkcpy		! src offset in %o2
232	nop
233	cmp	%o2, 0x8
234	bg	.cpy_upper_double
235	nop
236	bl	.cpy_lower_double
237	nop
238
239	! Falls through when source offset is equal to 8 i.e.
240	! source is double word aligned.
241	! In this case no shift/merge of data is required
242	sub	%i1, %o2, %i1		! align the src at 16 bytes.
243	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
244	prefetch [%l0+0x0], #one_read
245	ldda	[%i1+0x0]%asi, %l2
246loop0:
247	ldda	[%i1+0x10]%asi, %l4
248	prefetch [%l0+0x40], #one_read
249
250	stxa	%l3, [%i0+0x0]%asi
251	stxa	%l4, [%i0+0x8]%asi
252
253	ldda	[%i1+0x20]%asi, %l2
254	stxa	%l5, [%i0+0x10]%asi
255	stxa	%l2, [%i0+0x18]%asi
256
257	ldda	[%i1+0x30]%asi, %l4
258	stxa	%l3, [%i0+0x20]%asi
259	stxa	%l4, [%i0+0x28]%asi
260
261	ldda	[%i1+0x40]%asi, %l2
262	stxa	%l5, [%i0+0x30]%asi
263	stxa	%l2, [%i0+0x38]%asi
264
265	add	%l0, 0x40, %l0
266	add	%i1, 0x40, %i1
267	subcc	%i3, 0x40, %i3
268	bgu,pt	%xcc, loop0
269	add	%i0, 0x40, %i0
270	ba	.blkdone
271	add	%i1, %o2, %i1		! increment the source by src offset
272					! the src offset was stored in %o2
273
274.cpy_lower_double:
275	sub	%i1, %o2, %i1		! align the src at 16 bytes.
276	sll	%o2, 3, %o0		! %o0 left shift
277	mov	0x40, %o1
278	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
279	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
280	prefetch [%l0+0x0], #one_read
281	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
282					! complete data
283loop1:
284	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
285	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
286							! into %l2 and %l3
287	prefetch [%l0+0x40], #one_read
288	stxa	%l2, [%i0+0x0]%asi
289	stxa	%l3, [%i0+0x8]%asi
290
291	ldda	[%i1+0x20]%asi, %l2
292	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
293	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
294	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
295
296	! Repeat the same for next 32 bytes.
297
298	ldda	[%i1+0x30]%asi, %l4
299	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
300	stxa	%l2, [%i0+0x20]%asi
301	stxa	%l3, [%i0+0x28]%asi
302
303	ldda	[%i1+0x40]%asi, %l2
304	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
305	stxa	%l4, [%i0+0x30]%asi
306	stxa	%l5, [%i0+0x38]%asi
307
308	add	%l0, 0x40, %l0
309	add	%i1, 0x40, %i1
310	subcc	%i3, 0x40, %i3
311	bgu,pt	%xcc, loop1
312	add	%i0, 0x40, %i0
313	ba	.blkdone
314	add	%i1, %o2, %i1		! increment the source by src offset
315					! the src offset was stored in %o2
316
317.cpy_upper_double:
318	sub	%i1, %o2, %i1		! align the src at 16 bytes.
319	mov	0x8, %o0
320	sub	%o2, %o0, %o0
321	sll	%o0, 3, %o0		! %o0 left shift
322	mov	0x40, %o1
323	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
324	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
325	prefetch [%l0+0x0], #one_read
326	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
327					! no data in %l2
328loop2:
329	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
330					! partial
331	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
332							! into %l3 and %l4
333	prefetch [%l0+0x40], #one_read
334	stxa	%l3, [%i0+0x0]%asi
335	stxa	%l4, [%i0+0x8]%asi
336
337	ldda	[%i1+0x20]%asi, %l2
338	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
339	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
340	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
341
342	! Repeat the same for next 32 bytes.
343
344	ldda	[%i1+0x30]%asi, %l4
345	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
346	stxa	%l3, [%i0+0x20]%asi
347	stxa	%l4, [%i0+0x28]%asi
348
349	ldda	[%i1+0x40]%asi, %l2
350	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
351	stxa	%l5, [%i0+0x30]%asi
352	stxa	%l2, [%i0+0x38]%asi
353
354	add	%l0, 0x40, %l0
355	add	%i1, 0x40, %i1
356	subcc	%i3, 0x40, %i3
357	bgu,pt	%xcc, loop2
358	add	%i0, 0x40, %i0
359	ba	.blkdone
360	add	%i1, %o2, %i1		! increment the source by src offset
361					! the src offset was stored in %o2
362
363
364	! Destination is block aligned and source is quadword aligned.
365	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
366.blkcpy:
367	prefetch [%i1+0x0], #one_read
3681:
369	ldda	[%i1+0x0]%asi, %l0
370	ldda	[%i1+0x10]%asi, %l2
371	prefetch [%i1+0x40], #one_read
372
373	stxa	%l0, [%i0+0x0]%asi
374	ldda	[%i1+0x20]%asi, %l4
375	ldda	[%i1+0x30]%asi, %l6
376
377	stxa	%l1, [%i0+0x8]%asi
378	stxa	%l2, [%i0+0x10]%asi
379	stxa	%l3, [%i0+0x18]%asi
380	stxa	%l4, [%i0+0x20]%asi
381	stxa	%l5, [%i0+0x28]%asi
382	stxa	%l6, [%i0+0x30]%asi
383	stxa	%l7, [%i0+0x38]%asi
384
385	add	%i1, 0x40, %i1
386	subcc	%i3, 0x40, %i3
387	bgu,pt	%xcc, 1b
388	add	%i0, 0x40, %i0
389
390.blkdone:
391	membar	#Sync
392	tst	%i2
393	bz,pt	%xcc, .blkexit
394	nop
395
396.residue:
397	ldub	[%i1], %i4
398	stb	%i4, [%i0]
399	inc	%i1
400	deccc	%i2
401	bgu	%xcc, .residue
402	inc	%i0
403
404.blkexit:
	! Restore t_lofault if we were entered through kcopy.
	btst	LOFAULT_SET, %o5
	bz	%ncc, .bc_blkret	! plain bcopy: t_lofault was never touched
	andn	%o5, LOFAULT_SET, %o5	! delay slot: recover the saved handler
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_blkret:
405	ret
406	restore	%g0, 0, %o0
407
408.bcb_punt:
409	!
410	! use aligned transfers where possible
411	!
412	xor	%i0, %i1, %o4		! xor from and to address
413	btst	7, %o4			! if lower three bits zero
414	bz	.aldoubcp		! can align on double boundary
415	.empty	! assembler complaints about label
416
417	xor	%i0, %i1, %o4		! xor from and to address
418	btst	3, %o4			! if lower two bits zero
419	bz	.alwordcp		! can align on word boundary
420	btst	3, %i0			! delay slot, from address unaligned?
421	!
422	! use aligned reads and writes where possible
423	! this differs from wordcp in that it copes
424	! with odd alignment between source and destination
425	! using word reads and writes with the proper shifts
426	! in between to align transfers to and from memory
427	! i0 - src address, i1 - dest address, i2 - count
428	! i3, i4 - tmps for used generating complete word
429	! i5 (word to write)
430	! l0 size in bits of upper part of source word (US)
431	! l1 size in bits of lower part of source word (LS = 32 - US)
432	! l2 size in bits of upper part of destination word (UD)
433	! l3 size in bits of lower part of destination word (LD = 32 - UD)
434	! l4 number of bytes leftover after aligned transfers complete
435	! l5 the number 32
436	!
437	mov	32, %l5			! load an oft-needed constant
438	bz	.align_dst_only
439	btst	3, %i1			! is destination address aligned?
440	clr	%i4			! clear registers used in either case
441	bz	.align_src_only
442	clr	%l0
443	!
444	! both source and destination addresses are unaligned
445	!
4461:					! align source
447	ldub	[%i0], %i3		! read a byte from source address
448	add	%i0, 1, %i0		! increment source address
449	or	%i4, %i3, %i4		! or in with previous bytes (if any)
450	btst	3, %i0			! is source aligned?
451	add	%l0, 8, %l0		! increment size of upper source (US)
452	bnz,a	1b
453	sll	%i4, 8, %i4		! make room for next byte
454
455	sub	%l5, %l0, %l1		! generate shift left count (LS)
456	sll	%i4, %l1, %i4		! prepare to get rest
457	ld	[%i0], %i3		! read a word
458	add	%i0, 4, %i0		! increment source address
459	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
460	or	%i4, %i5, %i5		! merge
461	mov	24, %l3			! align destination
4621:
463	srl	%i5, %l3, %i4		! prepare to write a single byte
464	stb	%i4, [%i1]		! write a byte
465	add	%i1, 1, %i1		! increment destination address
466	sub	%i2, 1, %i2		! decrement count
467	btst	3, %i1			! is destination aligned?
468	bnz,a	1b
469	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
470	sub	%l5, %l3, %l2		! generate shift left count (UD)
471	sll	%i5, %l2, %i5		! move leftover into upper bytes
472	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
473	bgu	%ncc, .more_needed	! need more to fill than we have
474	nop
475
476	sll	%i3, %l1, %i3		! clear upper used byte(s)
477	srl	%i3, %l1, %i3
478	! get the odd bytes between alignments
479	sub	%l0, %l2, %l0		! regenerate shift count
480	sub	%l5, %l0, %l1		! generate new shift left count (LS)
481	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
482	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
483	srl	%i3, %l0, %i4
484	or	%i5, %i4, %i5
485	st	%i5, [%i1]		! write a word
486	subcc	%i2, 4, %i2		! decrement count
487	bz	%ncc, .unalign_out
488	add	%i1, 4, %i1		! increment destination address
489
490	b	2f
491	sll	%i3, %l1, %i5		! get leftover into upper bits
492.more_needed:
493	sll	%i3, %l0, %i3		! save remaining byte(s)
494	srl	%i3, %l0, %i3
495	sub	%l2, %l0, %l1		! regenerate shift count
496	sub	%l5, %l1, %l0		! generate new shift left count
497	sll	%i3, %l1, %i4		! move to fill empty space
498	b	3f
499	or	%i5, %i4, %i5		! merge to complete word
500	!
501	! the source address is aligned and destination is not
502	!
503.align_dst_only:
504	ld	[%i0], %i4		! read a word
505	add	%i0, 4, %i0		! increment source address
506	mov	24, %l0			! initial shift alignment count
5071:
508	srl	%i4, %l0, %i3		! prepare to write a single byte
509	stb	%i3, [%i1]		! write a byte
510	add	%i1, 1, %i1		! increment destination address
511	sub	%i2, 1, %i2		! decrement count
512	btst	3, %i1			! is destination aligned?
513	bnz,a	1b
514	sub	%l0, 8, %l0		! delay slot, decrement shift count
515.xfer:
516	sub	%l5, %l0, %l1		! generate shift left count
517	sll	%i4, %l1, %i5		! get leftover
5183:
519	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
520	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
5212:
522	ld	[%i0], %i3		! read a source word
523	add	%i0, 4, %i0		! increment source address
524	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
525	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
526	st	%i5, [%i1]		! write a destination word
527	subcc	%i2, 4, %i2		! decrement count
528	bz	%ncc, .unalign_out	! check if done
529	add	%i1, 4, %i1		! increment destination address
530	b	2b			! loop
531	sll	%i3, %l1, %i5		! get leftover
532.unalign_out:
533	tst	%l4			! any bytes leftover?
534	bz	%ncc, .cpdone
535	.empty				! allow next instruction in delay slot
5361:
537	sub	%l0, 8, %l0		! decrement shift
538	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
539	stb	%i4, [%i1]		! write a byte
540	subcc	%l4, 1, %l4		! decrement count
541	bz	%ncc, .cpdone		! done?
542	add	%i1, 1, %i1		! increment destination
543	tst	%l0			! any more previously read bytes
544	bnz	%ncc, 1b		! we have leftover bytes
545	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
546	b	.dbytecp		! let dbytecp do the rest
547	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
548	!
549	! the destination address is aligned and the source is not
550	!
551.align_src_only:
552	ldub	[%i0], %i3		! read a byte from source address
553	add	%i0, 1, %i0		! increment source address
554	or	%i4, %i3, %i4		! or in with previous bytes (if any)
555	btst	3, %i0			! is source aligned?
556	add	%l0, 8, %l0		! increment shift count (US)
557	bnz,a	.align_src_only
558	sll	%i4, 8, %i4		! make room for next byte
559	b,a	.xfer
560	!
561	! if from address unaligned for double-word moves,
562	! move bytes till it is, if count is < 56 it could take
563	! longer to align the thing than to do the transfer
564	! in word size chunks right away
565	!
566.aldoubcp:
567	cmp	%i2, 56			! if count < 56, use wordcp, it takes
568	blu,a	%ncc, .alwordcp		! longer to align doubles than words
569	mov	3, %o0			! mask for word alignment
570	call	.alignit		! copy bytes until aligned
571	mov	7, %o0			! mask for double alignment
572	!
573	! source and destination are now double-word aligned
574	! i3 has aligned count returned by alignit
575	!
576	and	%i2, 7, %i2		! unaligned leftover count
577	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
5785:
579	ldx	[%i0+%i1], %o4		! read from address
580	stx	%o4, [%i1]		! write at destination address
581	subcc	%i3, 8, %i3		! dec count
582	bgu	%ncc, 5b
583	add	%i1, 8, %i1		! delay slot, inc to address
584	cmp	%i2, 4			! see if we can copy a word
585	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
586	.empty
587	!
588	! for leftover bytes we fall into wordcp, if needed
589	!
590.wordcp:
591	and	%i2, 3, %i2		! unaligned leftover count
5925:
593	ld	[%i0+%i1], %o4		! read from address
594	st	%o4, [%i1]		! write at destination address
595	subcc	%i3, 4, %i3		! dec count
596	bgu	%ncc, 5b
597	add	%i1, 4, %i1		! delay slot, inc to address
598	b,a	.dbytecp
599
600	! we come here to align copies on word boundaries
601.alwordcp:
602	call	.alignit		! go word-align it
603	mov	3, %o0			! bits that must be zero to be aligned
604	b	.wordcp
605	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
606
607	!
608	! byte copy, works with any alignment
609	!
610.bytecp:
611	b	.dbytecp
612	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
613
614	!
615	! differenced byte copy, works with any alignment
616	! assumes dest in %i1 and (source - dest) in %i0
617	!
6181:
619	stb	%o4, [%i1]		! write to address
620	inc	%i1			! inc to address
621.dbytecp:
622	deccc	%i2			! dec count
623	bgeu,a	%ncc, 1b		! loop till done
624	ldub	[%i0+%i1], %o4		! read from address
625.cpdone:
626	membar	#Sync			! sync error barrier
	! Restore t_lofault if we were entered through kcopy.
	btst	LOFAULT_SET, %o5
	bz	%ncc, .bc_cpret		! plain bcopy: t_lofault was never touched
	andn	%o5, LOFAULT_SET, %o5	! delay slot: recover the saved handler
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_cpret:
627	ret
628	restore %g0, 0, %o0		! return (0)
629
630/*
631 * Common code used to align transfers on word and doubleword
632 * boundaries.  Aligns source and destination and returns a count
633 * of aligned bytes to transfer in %i3
634 */
6351:
636	inc	%i0			! inc from
637	stb	%o4, [%i1]		! write a byte
638	inc	%i1			! inc to
639	dec	%i2			! dec count
640.alignit:
641	btst	%o0, %i0		! %o0 is bit mask to check for alignment
642	bnz,a	1b
643	ldub	[%i0], %o4		! read next byte
644
645	retl
646	andn	%i2, %o0, %i3		! return size of aligned bytes
647	SET_SIZE(bcopy)
648
649#endif	/* lint */
650
651/*
652 * Block copy with possibly overlapped operands.
653 */
654
655#if defined(lint)
656
657/*ARGSUSED*/
658void
659ovbcopy(const void *from, void *to, size_t count)
660{}
661
662#else	/* lint */
663
/*
 * ovbcopy(from, to, count) - runs as a leaf; no register window is taken.
 *   %o0 = from, %o1 = to, %o2 = count, %o3 = scratch.
 *
 * A zero count returns immediately.  When count <= |from - to| the two
 * regions cannot overlap and control is transferred to bcopy with the
 * arguments still in %o0-%o2.  Otherwise the bytes are moved one at a
 * time: in ascending order at .ov_fwd when from >= to, in descending
 * order at .ov_bkwd when from < to, so that every source byte is read
 * before the copy overwrites it.
 */
664	ENTRY(ovbcopy)
665	tst	%o2			! check count
666	bgu,a	%ncc, 1f		! nothing to do or bad arguments
667	subcc	%o0, %o1, %o3		! difference of from and to address
668
669	retl				! return
670	nop
6711:
/*
 * Condition codes here come from the annulled-delay subcc above
 * (from - to).  neg executes only when the branch is taken, leaving
 * |from - to| in %o3.
 */
672	bneg,a	%ncc, 2f
673	neg	%o3			! if < 0, make it positive
6742:	cmp	%o2, %o3		! cmp size and abs(from - to)
675	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
676	.empty				!   no overlap
/*
 * The cmp below sits in the delay slot of the branch to bcopy; it only
 * changes condition codes, so it is harmless when the branch is taken.
 */
677	cmp	%o0, %o1		! compare from and to addresses
678	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
679	nop
680	!
681	! Copy forwards.
682	!
683.ov_fwd:
684	ldub	[%o0], %o3		! read from address
685	inc	%o0			! inc from address
686	stb	%o3, [%o1]		! write to address
687	deccc	%o2			! dec count
688	bgu	%ncc, .ov_fwd		! loop till done
689	inc	%o1			! inc to address
690
691	retl				! return
692	nop
693	!
694	! Copy backwards.
695	!
/*
 * %o2 serves both as the remaining count and as the index of the byte
 * being moved: it is decremented first, so the first byte copied is
 * from[count - 1] and the last is from[0].
 */
696.ov_bkwd:
697	deccc	%o2			! dec count
698	ldub	[%o0 + %o2], %o3	! get byte at end of src
699	bgu	%ncc, .ov_bkwd		! loop till done
700	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
701
702	retl				! return
703	nop
704	SET_SIZE(ovbcopy)
705
706#endif	/* lint */
707
708/*
709 * hwblkpagecopy()
710 *
711 * Copies exactly one page.  This routine assumes the caller (ppcopy)
712 * has already disabled kernel preemption and has checked
713 * use_hw_bcopy.
714 */
715#ifdef lint
716/*ARGSUSED*/
717void
718hwblkpagecopy(const void *src, void *dst)
719{ }
720#else /* lint */
/*
 * Takes its own register window because the copy loop uses all of
 * %l0-%l7 as data registers.  No alignment checks are made on src or
 * dst here.  NOTE(review): presumably both are page aligned, as the
 * header says "exactly one page" -- confirm against the caller.
 */
721	ENTRY(hwblkpagecopy)
722	save	%sp, -SA(MINFRAME + 4*64), %sp
723
724	! %i0 - source address (arg)
725	! %i1 - destination address (arg)
726	! %i2 - length of region (not arg)
727
728	set	PAGESIZE, %i2
729
730	/*
731	 * Copying exactly one page and PAGESIZE is in multiple of 0x80.
732	 */
733	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
734	prefetch [%i0+0x0], #one_read
735	prefetch [%i0+0x40], #one_read
/*
 * Each pass moves 0x80 bytes as two 64-byte halves.  A half is four
 * ldda loads at 0x10 strides, each filling an even/odd register pair,
 * followed by eight 8-byte stxa stores; loads and stores both go
 * through %asi.  The prefetches at the top of the loop target the
 * 0x80 bytes that the following pass will read.
 */
7361:
737	prefetch [%i0+0x80], #one_read
738	prefetch [%i0+0xc0], #one_read
739	ldda	[%i0+0x0]%asi, %l0
740	ldda	[%i0+0x10]%asi, %l2
741	ldda	[%i0+0x20]%asi, %l4
742	ldda	[%i0+0x30]%asi, %l6
743	stxa	%l0, [%i1+0x0]%asi
744	stxa	%l1, [%i1+0x8]%asi
745	stxa	%l2, [%i1+0x10]%asi
746	stxa	%l3, [%i1+0x18]%asi
747	stxa	%l4, [%i1+0x20]%asi
748	stxa	%l5, [%i1+0x28]%asi
749	stxa	%l6, [%i1+0x30]%asi
750	stxa	%l7, [%i1+0x38]%asi
751	ldda	[%i0+0x40]%asi, %l0
752	ldda	[%i0+0x50]%asi, %l2
753	ldda	[%i0+0x60]%asi, %l4
754	ldda	[%i0+0x70]%asi, %l6
755	stxa	%l0, [%i1+0x40]%asi
756	stxa	%l1, [%i1+0x48]%asi
757	stxa	%l2, [%i1+0x50]%asi
758	stxa	%l3, [%i1+0x58]%asi
759	stxa	%l4, [%i1+0x60]%asi
760	stxa	%l5, [%i1+0x68]%asi
761	stxa	%l6, [%i1+0x70]%asi
762	stxa	%l7, [%i1+0x78]%asi
763
764	add	%i0, 0x80, %i0
765	subcc	%i2, 0x80, %i2
766	bgu,pt	%xcc, 1b
767	add	%i1, 0x80, %i1
768
/*
 * Barrier after the last store, then return with %o0 = 0 in the
 * caller's window (the C prototype is void).
 */
769	membar #Sync
770	ret
771	restore	%g0, 0, %o0
772	SET_SIZE(hwblkpagecopy)
773#endif	/* lint */
774
775
776/*
777 * Transfer data to and from user space -
778 * Note that these routines can cause faults
779 * It is assumed that the kernel has nothing at
780 * less than KERNELBASE in the virtual address space.
781 *
782 * Note that copyin(9F) and copyout(9F) are part of the
783 * DDI/DKI which specifies that they return '-1' on "errors."
784 *
785 * Sigh.
786 *
787 * So there's two extremely similar routines - xcopyin() and xcopyout()
788 * which return the errno that we've faithfully computed.  This
789 * allows other callers (e.g. uiomove(9F)) to work correctly.
790 * Given that these are used pretty heavily, we expand the calling
791 * sequences inline for all flavours (rather than making wrappers).
792 *
793 * There are also stub routines for xcopyout_little and xcopyin_little,
794 * which currently are intended to handle requests of <= 16 bytes from
795 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
796 * is left as an exercise...
797 */
798
799/*
800 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
801 *
802 * General theory of operation:
803 *
804 * None of the copyops routines grab a window until it's decided that
805 * we need to do a HW block copy operation. This saves a window
806 * spill/fill when we're called during socket ops. The typical IO
807 * path won't cause spill/fill traps.
808 *
809 * This code uses a set of 4 limits for the maximum size that will
810 * be copied given a particular input/output address alignment.
811 * the default limits are:
812 *
813 * single byte aligned - 256 (hw_copy_limit_1)
814 * two byte aligned - 512 (hw_copy_limit_2)
815 * four byte aligned - 1024 (hw_copy_limit_4)
816 * eight byte aligned - 1024 (hw_copy_limit_8)
817 *
818 * If the value for a particular limit is zero, the copy will be done
819 * via the copy loops rather than block store/quad load instructions.
820 *
821 * Flow:
822 *
823 * If count == zero return zero.
824 *
825 * Store the previous lo_fault handler into %g6.
826 * Place our secondary lofault handler into %g5.
827 * Place the address of our nowindow fault handler into %o3.
828 * Place the address of the windowed fault handler into %o4.
829 * --> We'll use this handler if we end up grabbing a window
830 * --> before we use block initializing store and quad load ASIs
831 *
832 * If count is less than or equal to SMALL_LIMIT (7) we
833 * always do a byte for byte copy.
834 *
835 * If count is > SMALL_LIMIT, we check the alignment of the input
836 * and output pointers. Based on the alignment we check count
837 * against a limit based on detected alignment.  If we exceed the
838 * alignment value we copy via block initializing store and quad
839 * load instructions.
840 *
841 * If we don't exceed one of the limits, we store -count in %o3,
842 * we store the number of chunks (8, 4, 2 or 1 byte) operated
843 * on in our basic copy loop in %o2. Following this we branch
844 * to the appropriate copy loop and copy that many chunks.
845 * Since we've been adding the chunk size to %o3 each time through
846 * as well as decrementing %o2, we can tell if any data is
847 * left to be copied by examining %o3. If that is zero, we're
848 * done and can go home. If not, we figure out what the largest
849 * chunk size left to be copied is and branch to that copy loop
850 * unless there's only one byte left. We load that as we're
851 * branching to code that stores it just before we return.
852 *
853 * Fault handlers are invoked if we reference memory that has no
854 * current mapping.  All forms share the same copyio_fault handler.
855 * This routine handles fixing up the stack and general housecleaning.
856 * Each copy operation has a simple fault handler that is then called
857 * to do the work specific to the individual operation.  The handler
858 * for copyOP and xcopyOP are found at the end of each individual function.
859 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
860 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
861 */
862
863/*
864 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
865 */
866
867#if defined(lint)
868
869/*ARGSUSED*/
870int
871copyout(const void *kaddr, void *uaddr, size_t count)
872{ return (0); }
873
874#else	/* lint */
875
876/*
877 * We save the arguments in the following registers in case of a fault:
878 * 	kaddr - %g2
879 * 	uaddr - %g3
880 * 	count - %g4
881 */
882#define	SAVE_SRC	%g2
883#define	SAVE_DST	%g3
884#define	SAVE_COUNT	%g4
885
886#define	REAL_LOFAULT		%g5
887#define	SAVED_LOFAULT		%g6
888
889/*
890 * Generic copyio fault handler.  This is the first line of defense when a
891 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
892 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
893 * This allows us to share common code for all the flavors of the copy
894 * operations, including the _noerr versions.
895 *
896 * Note that this function will restore the original input parameters before
897 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
898 * member of the t_copyop structure, if needed.
899 */
/*
 * Register contract (all globals, so they are readable from any window):
 *   SAVED_LOFAULT (%g6) - t_lofault value to put back
 *   REAL_LOFAULT  (%g5) - operation-specific handler to jump to
 *   SAVE_SRC/SAVE_DST/SAVE_COUNT (%g2/%g3/%g4) - original arguments
 *
 * This variant executes one restore before reloading %o0-%o2, i.e. it is
 * meant for faults taken after the copy code has done a save.  Control
 * leaves through jmp, not call, so REAL_LOFAULT returns straight to the
 * caller of the copy routine.
 */
900	ENTRY(copyio_fault)
901	membar	#Sync
902	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
903
904	restore
905
906	mov	SAVE_SRC, %o0
907	mov	SAVE_DST, %o1
908	jmp	REAL_LOFAULT
909	  mov	SAVE_COUNT, %o2
910	SET_SIZE(copyio_fault)
911
/*
 * Leaf-mode counterpart of copyio_fault: identical except that no
 * restore is executed, for faults taken while the copy code is still
 * running in its caller's register window.  Puts SAVED_LOFAULT back in
 * t_lofault, reloads the original arguments from SAVE_SRC, SAVE_DST and
 * SAVE_COUNT into %o0-%o2, and jumps to REAL_LOFAULT.
 */
912	ENTRY(copyio_fault_nowindow)
913	membar	#Sync
914	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
915
916	mov	SAVE_SRC, %o0
917	mov	SAVE_DST, %o1
918	jmp	REAL_LOFAULT
919	  mov	SAVE_COUNT, %o2
920	SET_SIZE(copyio_fault_nowindow)
921
922	ENTRY(copyout)
923	sethi	%hi(.copyout_err), REAL_LOFAULT
924	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
925
926.do_copyout:
927	!
928	! Check the length and bail if zero.
929	!
930	tst	%o2
931	bnz,pt	%ncc, 1f
932	  nop
933	retl
934	  clr	%o0
9351:
936	sethi	%hi(copyio_fault), %o4
937	or	%o4, %lo(copyio_fault), %o4
938	sethi	%hi(copyio_fault_nowindow), %o3
939	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
940	or	%o3, %lo(copyio_fault_nowindow), %o3
941	membar	#Sync
942	stn	%o3, [THREAD_REG + T_LOFAULT]
943
944	mov	%o0, SAVE_SRC
945	mov	%o1, SAVE_DST
946	mov	%o2, SAVE_COUNT
947
948	!
949	! Check to see if we're more than SMALL_LIMIT (7 bytes).
950	! Run in leaf mode, using the %o regs as our input regs.
951	!
952	subcc	%o2, SMALL_LIMIT, %o3
953	bgu,a,pt %ncc, .dco_ns
954	or	%o0, %o1, %o3
955	!
956	! What was previously ".small_copyout"
957	! Do full differenced copy.
958	!
959.dcobcp:
960	sub	%g0, %o2, %o3		! negate count
961	add	%o0, %o2, %o0		! make %o0 point at the end
962	add	%o1, %o2, %o1		! make %o1 point at the end
963	ba,pt	%ncc, .dcocl
964	ldub	[%o0 + %o3], %o4	! load first byte
965	!
966	! %o0 and %o1 point at the end and remain pointing at the end
967	! of their buffers. We pull things out by adding %o3 (which is
968	! the negation of the length) to the buffer end which gives us
969	! the current location in the buffers. By incrementing %o3 we walk
970	! through both buffers without having to bump each buffer's
971	! pointer. A very fast 4 instruction loop.
972	!
973	.align 16
974.dcocl:
975	stba	%o4, [%o1 + %o3]ASI_USER
976	inccc	%o3
977	bl,a,pt	%ncc, .dcocl
978	ldub	[%o0 + %o3], %o4
979	!
980	! We're done. Go home.
981	!
982	membar	#Sync
983	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
984	retl
985	clr	%o0
986	!
987	! Try aligned copies from here.
988	!
989.dco_ns:
990	! %o0 = kernel addr (to be copied from)
991	! %o1 = user addr (to be copied to)
992	! %o2 = length
993	! %o3 = %o0 | %o1 (used for alignment checking)
994	! %o4 is alternate lo_fault
995	! %g6 (SAVED_LOFAULT) is original lo_fault
996	!
997	! See if we're single byte aligned. If we are, check the
998	! limit for single byte copies. If we're smaller or equal,
999	! bounce to the byte for byte copy loop. Otherwise do it in
1000	! HW (if enabled).
1001	!
1002	btst	1, %o3
1003	bz,pt	%icc, .dcoh8
1004	btst	7, %o3
1005	!
1006	! Single byte aligned. Do we do it via HW or via
1007	! byte for byte? Do a quick no memory reference
1008	! check to pick up small copies.
1009	!
1010	sethi	%hi(hw_copy_limit_1), %o3
1011	!
1012	! Big enough that we need to check the HW limit for
1013	! this size copy.
1014	!
1015	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1016	!
1017	! Is HW copy on? If not, do everything byte for byte.
1018	!
1019	tst	%o3
1020	bz,pn	%icc, .dcobcp
1021	subcc	%o3, %o2, %o3
1022	!
1023	! If we're less than or equal to the single byte copy limit,
1024	! bop to the copy loop.
1025	!
1026	bge,pt	%ncc, .dcobcp
1027	nop
1028	!
1029	! We're big enough and copy is on. Do it with HW.
1030	!
1031	ba,pt	%ncc, .big_copyout
1032	nop
1033.dcoh8:
1034	!
1035	! 8 byte aligned?
1036	!
1037	bnz,a	%ncc, .dcoh4
1038	btst	3, %o3
1039	!
1040	! See if we're in the "small range".
1041	! If so, go off and do the copy.
1042	! If not, load the hard limit. %o3 is
1043	! available for reuse.
1044	!
1045	sethi	%hi(hw_copy_limit_8), %o3
1046	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1047	!
1048	! If it's zero, there's no HW bcopy.
1049	! Bop off to the aligned copy.
1050	!
1051	tst	%o3
1052	bz,pn	%icc, .dcos8
1053	subcc	%o3, %o2, %o3
1054	!
1055	! We're negative if our size is larger than hw_copy_limit_8.
1056	!
1057	bge,pt	%ncc, .dcos8
1058	nop
1059	!
1060	! HW assist is on and we're large enough. Do it.
1061	!
1062	ba,pt	%ncc, .big_copyout
1063	nop
1064.dcos8:
1065	!
1066	! Housekeeping for copy loops. Uses same idea as in the byte for
1067	! byte copy loop above.
1068	!
1069	add	%o0, %o2, %o0
1070	add	%o1, %o2, %o1
1071	sub	%g0, %o2, %o3
1072	ba,pt	%ncc, .dodebc
1073	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1074	!
1075	! 4 byte aligned?
1076	!
1077.dcoh4:
1078	bnz,pn	%ncc, .dcoh2
1079	!
1080	! See if we're in the "small range".
1081	! If so, go off and do the copy.
1082	! If not, load the hard limit. %o3 is
1083	! available for reuse.
1084	!
1085	sethi	%hi(hw_copy_limit_4), %o3
1086	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1087	!
1088	! If it's zero, there's no HW bcopy.
1089	! Bop off to the aligned copy.
1090	!
1091	tst	%o3
1092	bz,pn	%icc, .dcos4
1093	subcc	%o3, %o2, %o3
1094	!
1095	! We're negative if our size is larger than hw_copy_limit_4.
1096	!
1097	bge,pt	%ncc, .dcos4
1098	nop
1099	!
1100	! HW assist is on and we're large enough. Do it.
1101	!
1102	ba,pt	%ncc, .big_copyout
1103	nop
1104.dcos4:
1105	add	%o0, %o2, %o0
1106	add	%o1, %o2, %o1
1107	sub	%g0, %o2, %o3
1108	ba,pt	%ncc, .dodfbc
1109	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1110	!
1111	! We must be 2 byte aligned. Off we go.
1112	! The check for small copies was done in the
1113	! delay at .dcoh4
1114	!
1115.dcoh2:
1116	ble	%ncc, .dcos2
1117	sethi	%hi(hw_copy_limit_2), %o3
1118	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1119	tst	%o3
1120	bz,pn	%icc, .dcos2
1121	subcc	%o3, %o2, %o3
1122	bge,pt	%ncc, .dcos2
1123	nop
1124	!
1125	! HW is on and we're big enough. Do it.
1126	!
1127	ba,pt	%ncc, .big_copyout
1128	nop
1129.dcos2:
1130	add	%o0, %o2, %o0
1131	add	%o1, %o2, %o1
1132	sub	%g0, %o2, %o3
1133	ba,pt	%ncc, .dodtbc
1134	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
1135.small_copyout:
1136	!
1137	! Why are we doing this AGAIN? There are certain conditions in
1138	! big_copyout that will cause us to forego the HW assisted copies
1139	! and bounce back to a non-HW assisted copy. This dispatches those
1140	! copies. Note that we branch around this in the main line code.
1141	!
1142	! We make no check for limits or HW enablement here. We've
1143	! already been told that we're a poster child so just go off
1144	! and do it.
1145	!
1146	or	%o0, %o1, %o3
1147	btst	1, %o3
1148	bnz	%icc, .dcobcp		! Most likely
1149	btst	7, %o3
1150	bz	%icc, .dcos8
1151	btst	3, %o3
1152	bz	%icc, .dcos4
1153	nop
1154	ba,pt	%ncc, .dcos2
1155	nop
1156	.align 32
1157.dodebc:
1158	ldx	[%o0 + %o3], %o4
1159	deccc	%o2
1160	stxa	%o4, [%o1 + %o3]ASI_USER
1161	bg,pt	%ncc, .dodebc
1162	addcc	%o3, 8, %o3
1163	!
1164	! End of copy loop. Check to see if we're done. Most
1165	! eight byte aligned copies end here.
1166	!
1167	bz,pt	%ncc, .dcofh
1168	nop
1169	!
1170	! Something is left - do it byte for byte.
1171	!
1172	ba,pt	%ncc, .dcocl
1173	ldub	[%o0 + %o3], %o4	! load next byte
1174	!
1175	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
1176	!
1177	.align 32
1178.dodfbc:
1179	lduw	[%o0 + %o3], %o4
1180	deccc	%o2
1181	sta	%o4, [%o1 + %o3]ASI_USER
1182	bg,pt	%ncc, .dodfbc
1183	addcc	%o3, 4, %o3
1184	!
1185	! End of copy loop. Check to see if we're done. Most
1186	! four byte aligned copies end here.
1187	!
1188	bz,pt	%ncc, .dcofh
1189	nop
1190	!
1191	! Something is left. Do it byte for byte.
1192	!
1193	ba,pt	%ncc, .dcocl
1194	ldub	[%o0 + %o3], %o4	! load next byte
1195	!
1196	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
1197	! copy.
1198	!
1199	.align 32
1200.dodtbc:
1201	lduh	[%o0 + %o3], %o4
1202	deccc	%o2
1203	stha	%o4, [%o1 + %o3]ASI_USER
1204	bg,pt	%ncc, .dodtbc
1205	addcc	%o3, 2, %o3
1206	!
1207	! End of copy loop. Anything left?
1208	!
1209	bz,pt	%ncc, .dcofh
1210	nop
1211	!
1212	! Deal with the last byte
1213	!
1214	ldub	[%o0 + %o3], %o4
1215	stba	%o4, [%o1 + %o3]ASI_USER
1216.dcofh:
1217	membar	#Sync
1218	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1219	retl
1220	clr	%o0
1221
1222.big_copyout:
1223	!
1224	! We're going to go off and do a block copy.
1225	! Switch fault handlers and grab a window. We
1226	! don't do a membar #Sync since we've done only
1227	! kernel data to this point.
1228	!
1229	stn	%o4, [THREAD_REG + T_LOFAULT]
1230	save	%sp, -SA(MINFRAME), %sp
1231
1232	! Copyouts that reach here are larger than 256 bytes. The
1233	! hw_copy_limit_1 is set to 256. Never set this limit to less
1234	! than 128 bytes.
1235.do_block_copyout:
1236
1237	! Swap src/dst since the code below is memcpy code
1238	! and memcpy/bcopy have different calling sequences
1239	mov	%i1, %i5
1240	mov	%i0, %i1
1241	mov	%i5, %i0
1242
1243	andcc	%i0, 7, %i3		! is dst double aligned
1244	bz	%ncc, copyout_blkcpy
1245	sub	%i3, 8, %i3
1246	neg	%i3			! bytes till double aligned
1247	sub	%i2, %i3, %i2		! update %i2 with new count
1248
1249	! Align Destination on double-word boundary
1250
12511:	ldub	[%i1], %i4
1252	inc	%i1
1253	stba	%i4, [%i0]ASI_USER
1254	deccc	%i3
1255	bgu	%ncc, 1b
1256	  inc	%i0
1257
1258copyout_blkcpy:
1259	andcc	%i0, 63, %i3
1260	bz,pn	%ncc, copyout_blalign	! now block aligned
1261	sub	%i3, 64, %i3
1262	neg	%i3			! bytes till block aligned
1263	sub	%i2, %i3, %i2		! update %i2 with new count
1264
1265	! Copy %i3 bytes till dst is block (64 byte) aligned. use
1266	! double word copies.
1267
1268	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
1269	bz	%ncc, .co_dbcopy	! %g1 has source offset (last 3-bits)
1270	sll	%g1, 3, %l1		! left shift
1271	mov	0x40, %l2
1272	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
1273
1274	! Now use double word copies to align destination.
1275.co_double:
1276	sub	%i1, %g1, %i1		! align the src at 8 bytes.
1277	ldx	[%i1], %o2
12782:
1279	ldx	[%i1+8], %o4
1280	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
1281	stxa	%o2, [%i0]ASI_USER
1282	mov	%o4, %o2
1283	add	%i1, 0x8, %i1
1284	subcc	%i3, 0x8, %i3
1285	bgu,pt	%ncc, 2b
1286	add	%i0, 0x8, %i0
1287	ba	copyout_blalign
1288	add	%i1, %g1, %i1
1289
1290	! Both source and destination are double aligned.
1291	! No shift and merge of data required in this case.
1292.co_dbcopy:
1293	ldx	[%i1], %o2
1294	stxa	%o2, [%i0]ASI_USER
1295	add	%i1, 0x8, %i1
1296	subcc	%i3, 0x8, %i3
1297	bgu,pt	%ncc, .co_dbcopy
1298	add	%i0, 0x8, %i0
1299
1300copyout_blalign:
1301	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
1302	sub	%i2, %i3, %i2		! Residue bytes in %i2
1303
1304	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
1305
1306	andcc	%i1, 0xf, %o2		! is src quadword aligned
1307	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
1308	nop
1309	cmp	%o2, 0x8
1310	bg	.co_upper_double
1311	nop
1312	bl	.co_lower_double
1313	nop
1314
1315	! Falls through when source offset is equal to 8 i.e.
1316	! source is double word aligned.
1317	! In this case no shift/merge of data is required
1318
1319	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1320	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1321	prefetch [%l0+0x0], #one_read
1322	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1323.co_loop0:
1324	add	%i1, 0x10, %i1
1325	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1326	prefetch [%l0+0x40], #one_read
1327
1328	stxa	%l3, [%i0+0x0]%asi
1329	stxa	%l4, [%i0+0x8]%asi
1330
1331	add	%i1, 0x10, %i1
1332	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1333
1334	stxa	%l5, [%i0+0x10]%asi
1335	stxa	%l2, [%i0+0x18]%asi
1336
1337	add	%i1, 0x10, %i1
1338	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1339
1340	stxa	%l3, [%i0+0x20]%asi
1341	stxa	%l4, [%i0+0x28]%asi
1342
1343	add	%i1, 0x10, %i1
1344	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1345
1346	stxa	%l5, [%i0+0x30]%asi
1347	stxa	%l2, [%i0+0x38]%asi
1348
1349	add	%l0, 0x40, %l0
1350	subcc	%i3, 0x40, %i3
1351	bgu,pt	%xcc, .co_loop0
1352	add	%i0, 0x40, %i0
1353	ba	.co_blkdone
1354	add	%i1, %o2, %i1		! increment the source by src offset
1355					! the src offset was stored in %o2
1356
1357.co_lower_double:
1358
1359	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1360	sll	%o2, 3, %o0		! %o0 left shift
1361	mov	0x40, %o1
1362	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
1363	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1364	prefetch [%l0+0x0], #one_read
1365	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
1366					! complete data
1367.co_loop1:
1368	add	%i1, 0x10, %i1
1369	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
1370							! for this read.
1371	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
1372							! into %l2 and %l3
1373	prefetch [%l0+0x40], #one_read
1374
1375	stxa	%l2, [%i0+0x0]%asi
1376	stxa	%l3, [%i0+0x8]%asi
1377
1378	add	%i1, 0x10, %i1
1379	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1380	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
1381							! %l4 from previous read
1382							! into %l4 and %l5
1383	stxa	%l4, [%i0+0x10]%asi
1384	stxa	%l5, [%i0+0x18]%asi
1385
1386	! Repeat the same for next 32 bytes.
1387
1388	add	%i1, 0x10, %i1
1389	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1390	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
1391
1392	stxa	%l2, [%i0+0x20]%asi
1393	stxa	%l3, [%i0+0x28]%asi
1394
1395	add	%i1, 0x10, %i1
1396	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1397	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
1398
1399	stxa	%l4, [%i0+0x30]%asi
1400	stxa	%l5, [%i0+0x38]%asi
1401
1402	add	%l0, 0x40, %l0
1403	subcc	%i3, 0x40, %i3
1404	bgu,pt	%xcc, .co_loop1
1405	add	%i0, 0x40, %i0
1406	ba	.co_blkdone
1407	add	%i1, %o2, %i1		! increment the source by src offset
1408					! the src offset was stored in %o2
1409
1410.co_upper_double:
1411
1412	sub	%i1, %o2, %i1		! align the src at 16 bytes.
1413	sub	%o2, 0x8, %o0
1414	sll	%o0, 3, %o0		! %o0 left shift
1415	mov	0x40, %o1
1416	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
1417	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
1418	prefetch [%l0+0x0], #one_read
1419	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
1420							! for this read and
1421							! no data in %l2
1422.co_loop2:
1423	add	%i1, 0x10, %i1
1424	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
1425							! and %l5 has partial
1426	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
1427							! into %l3 and %l4
1428	prefetch [%l0+0x40], #one_read
1429
1430	stxa	%l3, [%i0+0x0]%asi
1431	stxa	%l4, [%i0+0x8]%asi
1432
1433	add	%i1, 0x10, %i1
1434	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1435	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
1436							! %l5 from previous read
1437							! into %l5 and %l2
1438
1439	stxa	%l5, [%i0+0x10]%asi
1440	stxa	%l2, [%i0+0x18]%asi
1441
1442	! Repeat the same for next 32 bytes.
1443
1444	add	%i1, 0x10, %i1
1445	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1446	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1447
1448	stxa	%l3, [%i0+0x20]%asi
1449	stxa	%l4, [%i0+0x28]%asi
1450
1451	add	%i1, 0x10, %i1
1452	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1453	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1454
1455	stxa	%l5, [%i0+0x30]%asi
1456	stxa	%l2, [%i0+0x38]%asi
1457
1458	add	%l0, 0x40, %l0
1459	subcc	%i3, 0x40, %i3
1460	bgu,pt	%xcc, .co_loop2
1461	add	%i0, 0x40, %i0
1462	ba	.co_blkdone
1463	add	%i1, %o2, %i1		! increment the source by src offset
1464					! the src offset was stored in %o2
1465
1466
1467	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
1468.co_blkcpy:
1469
1470	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
1471	prefetch [%o0+0x0], #one_read
14721:
1473	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
1474	add	%i1, 0x10, %i1
1475	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
1476	add	%i1, 0x10, %i1
1477
1478	prefetch [%o0+0x40], #one_read
1479
1480	stxa	%l0, [%i0+0x0]%asi
1481
1482	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
1483	add	%i1, 0x10, %i1
1484	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
1485	add	%i1, 0x10, %i1
1486
1487	stxa	%l1, [%i0+0x8]%asi
1488	stxa	%l2, [%i0+0x10]%asi
1489	stxa	%l3, [%i0+0x18]%asi
1490	stxa	%l4, [%i0+0x20]%asi
1491	stxa	%l5, [%i0+0x28]%asi
1492	stxa	%l6, [%i0+0x30]%asi
1493	stxa	%l7, [%i0+0x38]%asi
1494
1495	add	%o0, 0x40, %o0
1496	subcc	%i3, 0x40, %i3
1497	bgu,pt	%xcc, 1b
1498	add	%i0, 0x40, %i0
1499
1500.co_blkdone:
1501	membar	#Sync
1502
1503	! Copy as much rest of the data as double word copy.
1504.co_dwcp:
1505	cmp	%i2, 0x8		! Not enough bytes to copy as double
1506	blu	%ncc, .co_dbdone
1507	nop
1508
1509	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
1510	sub	%i2, %i3, %i2		! Residue bytes in %i2
1511
1512	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
1513	bz	%ncc, .co_cpy_db
1514	nop
1515
1516	sll	%g1, 3, %l0		! left shift
1517	mov	0x40, %l1
1518	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
1519
1520.co_cpy_wd:
1521	sub	%i1, %g1, %i1		! align the src at 8 bytes.
1522	ldx	[%i1], %o2
15233:
1524	ldx	[%i1+8], %o4
1525	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
1526	stxa	%o2, [%i0]ASI_USER
1527	mov	%o4, %o2
1528	add	%i1, 0x8, %i1
1529	subcc	%i3, 0x8, %i3
1530	bgu,pt	%ncc, 3b
1531	add	%i0, 0x8, %i0
1532	ba	.co_dbdone
1533	add	%i1, %g1, %i1
1534
1535.co_cpy_db:
1536	ldx	[%i1], %o2
1537	stxa	%o2, [%i0]ASI_USER
1538	add	%i1, 0x8, %i1
1539	subcc	%i3, 0x8, %i3
1540	bgu,pt	%ncc, .co_cpy_db
1541	add	%i0, 0x8, %i0
1542
1543.co_dbdone:
1544	tst	%i2
1545	bz,pt	%xcc, .copyout_exit
1546	nop
1547
1548	! Copy the residue as byte copy
1549.co_residue:
1550	ldub	[%i1], %i4
1551	stba	%i4, [%i0]ASI_USER
1552	inc	%i1
1553	deccc	%i2
1554	bgu	%xcc, .co_residue
1555	inc	%i0
1556
1557.copyout_exit:
1558	membar	#Sync
1559	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1560	ret
1561	restore	%g0, 0, %o0
1562
1563.copyout_err:
1564	ldn	[THREAD_REG + T_COPYOPS], %o4
1565	brz	%o4, 2f
1566	nop
1567	ldn	[%o4 + CP_COPYOUT], %g2
1568	jmp	%g2
1569	nop
15702:
1571	retl
1572	mov	-1, %o0
1573	SET_SIZE(copyout)
1574
1575#endif	/* lint */
1576
1577
1578#ifdef	lint
1579
1580/*ARGSUSED*/
1581int
1582xcopyout(const void *kaddr, void *uaddr, size_t count)
1583{ return (0); }
1584
1585#else	/* lint */
1586
/*
 * xcopyout(kaddr, uaddr, count)
 *
 * Loads the address of .xcopyout_err into REAL_LOFAULT and then joins
 * the common .do_copyout path; no copying is done in this routine itself.
 *
 * .xcopyout_err: if the thread has a t_copyops vector, jump to its
 * CP_XCOPYOUT entry; otherwise return the value found in %g1
 * (presumably an errno left by the fault path - confirm against the
 * lofault handlers).
 */
1587	ENTRY(xcopyout)
1588	sethi	%hi(.xcopyout_err), REAL_LOFAULT
1589	b	.do_copyout
1590	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT	! delay slot: low bits of handler address
1591.xcopyout_err:
1592	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = curthread->t_copyops
1593	brz	%o4, 2f				! no copyops vector: plain error return
1594	nop
1595	ldn	[%o4 + CP_XCOPYOUT], %g2	! %g2 = copyops entry for xcopyout
1596	jmp	%g2				! jump (not call): the entry returns to our caller
1597	nop
15982:
1599	retl
1600	mov	%g1, %o0			! delay slot: return value taken from %g1
1601	SET_SIZE(xcopyout)
1602
1603#endif	/* lint */
1604
1605#ifdef	lint
1606
1607/*ARGSUSED*/
1608int
1609xcopyout_little(const void *kaddr, void *uaddr, size_t count)
1610{ return (0); }
1611
1612#else	/* lint */
1613
/*
 * xcopyout_little(kaddr, uaddr, count)
 *
 * Entry:	%o0 = kernel source, %o1 = user destination, %o2 = count
 *
 * Byte-for-byte copy that reverses the byte order: destination byte i
 * (written with ASI_AIUSL) receives source byte (count - 1 - i).
 * .little_err (defined in xcopyin_little) is installed in t_lofault for
 * the duration; the previous handler is kept in %o5 and restored on exit.
 * Returns 0 in %o0. A zero count skips the loop.
 */
1614	ENTRY(xcopyout_little)
1615	sethi	%hi(.little_err), %o4
1616	ldn	[THREAD_REG + T_LOFAULT], %o5
1617	or	%o4, %lo(.little_err), %o4
1618	membar	#Sync			! sync error barrier
1619	stn	%o4, [THREAD_REG + T_LOFAULT]
1620
	!
	! Loop setup:
	!	%o3 = -count (counts up to zero)
	!	%o0 = kaddr + 2*count - 1, so %o0 + %o3 is the last source byte
	!	%o1 = uaddr + count, so %o1 + %o3 is the first destination byte
	!
1621	subcc	%g0, %o2, %o3
1622	add	%o0, %o2, %o0
1623	bz,pn	%ncc, 2f		! check for zero bytes
1624	sub	%o2, 1, %o4
1625	add	%o0, %o4, %o0		! start w/last byte
1626	add	%o1, %o2, %o1
1627	ldub	[%o0+%o3], %o4
1628
	!
	! Each pass %o3 goes up by one and %o0 down by two, so the source
	! address %o0 + %o3 steps back one byte while the destination
	! address %o1 + %o3 steps forward one byte. inccc sets carry when
	! %o3 wraps from -1 to 0, which ends the loop; the annulled
	! delay-slot load only executes when the branch is taken.
	!
16291:	stba	%o4, [%o1+%o3]ASI_AIUSL
1630	inccc	%o3
1631	sub	%o0, 2, %o0		! get next byte
1632	bcc,a,pt %ncc, 1b
1633	  ldub	[%o0+%o3], %o4
1634
16352:	membar	#Sync			! sync error barrier
1636	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1637	retl
1638	mov	%g0, %o0		! return (0)
1639	SET_SIZE(xcopyout_little)
1640
1641#endif	/* lint */
1642
1643/*
1644 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
1645 */
1646
1647#if defined(lint)
1648
1649/*ARGSUSED*/
1650int
1651copyin(const void *uaddr, void *kaddr, size_t count)
1652{ return (0); }
1653
1654#else	/* lint */
1655
/*
 * copyin(uaddr, kaddr, count)
 *
 * Entry:	%o0 = user source address (read with ASI_USER loads)
 *		%o1 = kernel destination address
 *		%o2 = byte count
 * Returns 0 in %o0 on the normal exits. A zero count returns 0
 * immediately without touching t_lofault.
 *
 * REAL_LOFAULT is loaded with the address of .copyin_err before falling
 * into .do_copyin; xcopyin branches to .do_copyin with its own handler
 * address in REAL_LOFAULT.
 */
1656	ENTRY(copyin)
1657	sethi	%hi(.copyin_err), REAL_LOFAULT
1658	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
1659
1660.do_copyin:
1661	!
1662	! Check the length and bail if zero.
1663	!
1664	tst	%o2
1665	bnz,pt	%ncc, 1f
1666	  nop
1667	retl
1668	  clr	%o0
16691:
	!
	! Build two handler addresses: copyio_fault in %o4 and
	! copyio_fault_nowindow in %o3. The caller's t_lofault is kept in
	! SAVED_LOFAULT and copyio_fault_nowindow is installed now;
	! .big_copyin stores %o4 into t_lofault just before it takes a
	! register window.
	!
1670	sethi	%hi(copyio_fault), %o4
1671	or	%o4, %lo(copyio_fault), %o4
1672	sethi	%hi(copyio_fault_nowindow), %o3
1673	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
1674	or	%o3, %lo(copyio_fault_nowindow), %o3
1675	membar	#Sync
1676	stn	%o3, [THREAD_REG + T_LOFAULT]
1677
	! Keep the original arguments (presumably for the fault/copyops
	! path - confirm against copyio_fault).
1678	mov	%o0, SAVE_SRC
1679	mov	%o1, SAVE_DST
1680	mov	%o2, SAVE_COUNT
1681
1682	!
1683	! Check to see if we're more than SMALL_LIMIT.
	! If so (unsigned compare), go to .dci_ns with %o3 = %o0 | %o1,
	! computed in the annulled delay slot. Otherwise fall through to
	! the byte loop.
1684	!
1685	subcc	%o2, SMALL_LIMIT, %o3
1686	bgu,a,pt %ncc, .dci_ns
1687	or	%o0, %o1, %o3
1688	!
1689	! What was previously ".small_copyin"
1690	!
1691.dcibcp:
1692	sub	%g0, %o2, %o3		! setup for copy loop
1693	add	%o0, %o2, %o0
1694	add	%o1, %o2, %o1
1695	ba,pt	%ncc, .dcicl
1696	lduba	[%o0 + %o3]ASI_USER, %o4
1697	!
1698	! %o0 and %o1 point at the end and remain pointing at the end
1699	! of their buffers. We pull things out by adding %o3 (which is
1700	! the negation of the length) to the buffer end which gives us
1701	! the current location in the buffers. By incrementing %o3 we walk
1702	! through both buffers without having to bump each buffer's
1703	! pointer. A very fast 4 instruction loop.
	!
	! This loop is also entered from .didebc and .didfbc to finish
	! leftover bytes, with %o4 already holding the next byte.
	! The annulled delay-slot load only executes while %o3 is still
	! negative, so nothing is read past the end of the source.
1704	!
1705	.align 16
1706.dcicl:
1707	stb	%o4, [%o1 + %o3]
1708	inccc	%o3
1709	bl,a,pt %ncc, .dcicl
1710	lduba	[%o0 + %o3]ASI_USER, %o4
1711	!
1712	! We're done. Go home.
1713	!
1714	membar	#Sync
1715	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1716	retl
1717	clr	%o0
1718	!
1719	! Try aligned copies from here.
	!
	! On entry %o3 = %o0 | %o1 (source | destination) and %o2 = count.
	! Each alignment class loads its hw_copy_limit_N. A limit of zero,
	! or count <= limit (signed compare on limit - count), selects the
	! plain loop for that alignment; a larger count goes to .big_copyin.
	! The subcc sits in the delay slot of a non-annulled branch, so it
	! always executes and feeds the bge that follows.
1720	!
1721.dci_ns:
1722	!
1723	! See if we're single byte aligned. If we are, check the
1724	! limit for single byte copies. If we're smaller, or equal,
1725	! bounce to the byte for byte copy loop. Otherwise do it in
1726	! HW (if enabled).
1727	!
1728	btst	1, %o3
1729	bz,a,pt	%icc, .dcih8
1730	btst	7, %o3
1731	!
1732	! We're single byte aligned.
1733	!
1734	sethi	%hi(hw_copy_limit_1), %o3
1735	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1736	!
1737	! Is HW copy on? If not do everything byte for byte.
1738	!
1739	tst	%o3
1740	bz,pn	%icc, .dcibcp
1741	subcc	%o3, %o2, %o3
1742	!
1743	! Are we bigger than the HW limit? If not
1744	! go to byte for byte.
1745	!
1746	bge,pt	%ncc, .dcibcp
1747	nop
1748	!
1749	! We're big enough and copy is on. Do it with HW.
1750	!
1751	ba,pt	%ncc, .big_copyin
1752	nop
1753.dcih8:
1754	!
1755	! 8 byte aligned?
	! (Condition codes are from the "btst 7" in the delay slot that
	! brought us here. If not 8 byte aligned, go to .dcih4 with the
	! "btst 3" result from the annulled delay slot.)
1756	!
1757	bnz,a	%ncc, .dcih4
1758	btst	3, %o3
1759	!
1760	! We're eight byte aligned.
1761	!
1762	sethi	%hi(hw_copy_limit_8), %o3
1763	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1764	!
1765	! Is HW assist on? If not, do it with the aligned copy.
1766	!
1767	tst	%o3
1768	bz,pn	%icc, .dcis8
1769	subcc	%o3, %o2, %o3
1770	bge	%ncc, .dcis8
1771	nop
1772	ba,pt	%ncc, .big_copyin
1773	nop
1774.dcis8:
1775	!
1776	! Housekeeping for copy loops. Uses same idea as in the byte for
1777	! byte copy loop above.
1778	!
1779	add	%o0, %o2, %o0
1780	add	%o1, %o2, %o1
1781	sub	%g0, %o2, %o3
1782	ba,pt	%ncc, .didebc
1783	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1784	!
1785	! 4 byte aligned?
1786	!
1787.dcih4:
1788	bnz	%ncc, .dcih2
1789	sethi	%hi(hw_copy_limit_4), %o3
1790	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1791	!
1792	! Is HW assist on? If not, do it with the aligned copy.
1793	!
1794	tst	%o3
1795	bz,pn	%icc, .dcis4
1796	subcc	%o3, %o2, %o3
1797	!
1798	! We're negative if our size is larger than hw_copy_limit_4.
1799	!
1800	bge	%ncc, .dcis4
1801	nop
1802	ba,pt	%ncc, .big_copyin
1803	nop
1804.dcis4:
1805	!
1806	! Housekeeping for copy loops. Uses same idea as in the byte
1807	! for byte copy loop above.
1808	!
1809	add	%o0, %o2, %o0
1810	add	%o1, %o2, %o1
1811	sub	%g0, %o2, %o3
1812	ba,pt	%ncc, .didfbc
1813	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1814.dcih2:
1815	!
1816	! We're two byte aligned. Check for "smallness"
1817	! done in delay at .dcih4
	!
	! NOTE(review): the delay slot at .dcih4 is a sethi, which sets no
	! condition codes. The codes tested here are still those of the
	! "btst 3, %o3" that was non-zero on the way in (C and Z both
	! clear), so this bleu looks like it can never be taken. Only its
	! delay-slot sethi has an effect. Confirm before relying on it.
1818	!
1819	bleu,pt	%ncc, .dcis2
1820	sethi	%hi(hw_copy_limit_2), %o3
1821	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1822	!
1823	! Is HW assist on? If not, do it with the aligned copy.
1824	!
1825	tst	%o3
1826	bz,pn	%icc, .dcis2
1827	subcc	%o3, %o2, %o3
1828	!
1829	! Are we larger than the HW limit?
1830	!
1831	bge	%ncc, .dcis2
1832	nop
1833	!
1834	! HW assist is on and we're large enough to use it.
1835	!
1836	ba,pt	%ncc, .big_copyin
1837	nop
1838	!
1839	! Housekeeping for copy loops. Uses same idea as in the byte
1840	! for byte copy loop above.
1841	!
1842.dcis2:
1843	add	%o0, %o2, %o0
1844	add	%o1, %o2, %o1
1845	sub	%g0, %o2, %o3
1846	ba,pt	%ncc, .didtbc
1847	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
1848	!
1849.small_copyin:
1850	!
1851	! Why are we doing this AGAIN? There are certain conditions in
1852	! big copyin that will cause us to forgo the HW assisted copies
1853	! and bounce back to a non-hw assisted copy. This dispatches
1854	! those copies. Note that we branch around this in the main line
1855	! code.
1856	!
1857	! We make no check for limits or HW enablement here. We've
1858	! already been told that we're a poster child so just go off
1859	! and do it.
1860	!
1861	or	%o0, %o1, %o3
1862	btst	1, %o3
1863	bnz	%icc, .dcibcp		! Most likely
1864	btst	7, %o3
1865	bz	%icc, .dcis8
1866	btst	3, %o3
1867	bz	%icc, .dcis4
1868	nop
1869	ba,pt	%ncc, .dcis2
1870	nop
1871	!
1872	! Eight byte aligned copies. A steal from the original .small_copyin
1873	! with modifications. %o2 is number of 8 byte chunks to copy. When
1874	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
1875	! to copy.
	!
	! Register use in all three loops below (set up by .dcis8, .dcis4
	! and .dcis2): %o0 and %o1 point just past the end of the source
	! and destination, %o3 is the negative offset of the current
	! position, %o2 is the chunk count and %o4 is the data register.
	! The addcc in each branch delay slot executes on every pass,
	! including the last, so the bz after the loop tests whether %o3
	! has reached zero.
1876	!
1877	.align 32
1878.didebc:
1879	ldxa	[%o0 + %o3]ASI_USER, %o4
1880	deccc	%o2
1881	stx	%o4, [%o1 + %o3]
1882	bg,pt	%ncc, .didebc
1883	addcc	%o3, 8, %o3
1884	!
1885	! End of copy loop. Most 8 byte aligned copies end here.
1886	!
1887	bz,pt	%ncc, .dcifh
1888	nop
1889	!
1890	! Something is left. Do it byte for byte.
	! The first leftover byte is loaded in the delay slot.
1891	!
1892	ba,pt	%ncc, .dcicl
1893	lduba	[%o0 + %o3]ASI_USER, %o4
1894	!
1895	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
1896	!
1897	.align 32
1898.didfbc:
1899	lduwa	[%o0 + %o3]ASI_USER, %o4
1900	deccc	%o2
1901	st	%o4, [%o1 + %o3]
1902	bg,pt	%ncc, .didfbc
1903	addcc	%o3, 4, %o3
1904	!
1905	! End of copy loop. Most 4 byte aligned copies end here.
1906	!
1907	bz,pt	%ncc, .dcifh
1908	nop
1909	!
1910	! Something is left. Do it byte for byte.
1911	!
1912	ba,pt	%ncc, .dcicl
1913	lduba	[%o0 + %o3]ASI_USER, %o4
1914	!
1915	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
1916	! copy.
1917	!
1918	.align 32
1919.didtbc:
1920	lduha	[%o0 + %o3]ASI_USER, %o4
1921	deccc	%o2
1922	sth	%o4, [%o1 + %o3]
1923	bg,pt	%ncc, .didtbc
1924	addcc	%o3, 2, %o3
1925	!
1926	! End of copy loop. Most 2 byte aligned copies end here.
1927	!
1928	bz,pt	%ncc, .dcifh
1929	nop
1930	!
1931	! Deal with the last byte
	! (after 2 byte chunks at most one byte can remain)
1932	!
1933	lduba	[%o0 + %o3]ASI_USER, %o4
1934	stb	%o4, [%o1 + %o3]
	!
	! Common successful exit for the aligned loops: restore the saved
	! t_lofault and return 0. No register window was taken on these
	! paths, hence retl.
	!
1935.dcifh:
1936	membar	#Sync
1937	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1938	retl
1939	clr	%o0
1940
1941.big_copyin:
1942	!
1943	! We're going off to do a block copy.
1944	! Switch fault handlers and grab a window. We
1945	! don't do a membar #Sync since we've done only
1946	! kernel data to this point.
	!
	! %o4 still holds the copyio_fault address built at .do_copyin.
	! After the save the arguments are in %i0 (user source),
	! %i1 (kernel destination) and %i2 (count).
1947	!
1948	stn	%o4, [THREAD_REG + T_LOFAULT]
1949	save	%sp, -SA(MINFRAME), %sp
1950
1951	! Copyins that reach here are larger than 256 bytes. The
1952	! hw_copy_limit_1 is set to 256. Never set this limit to less
1953	! than 128 bytes.
1954.do_blockcopyin:
1955
1956	! Swap src/dst since the code below is memcpy code
1957	! and memcpy/bcopy have different calling sequences
	! From here on: %i0 = destination (kernel), %i1 = source (user),
	! %i2 = bytes remaining.
1958	mov	%i1, %i5
1959	mov	%i0, %i1
1960	mov	%i5, %i0
1961
	! The sub in the delay slot executes whether or not the branch is
	! taken; copyin_blkcpy recomputes %i3, so that is harmless.
1962	andcc	%i0, 7, %i3		! is dst double aligned
1963	bz	%ncc, copyin_blkcpy
1964	sub	%i3, 8, %i3
1965	neg	%i3			! bytes till double aligned
1966	sub	%i2, %i3, %i2		! update %i2 with new count
1967
1968	! Align Destination on double-word boundary
1969
19701:	lduba	[%i1]ASI_USER, %i4
1971	inc	%i1
1972	stb	%i4, [%i0]
1973	deccc	%i3
1974	bgu	%ncc, 1b
1975	  inc	%i0
1976
1977copyin_blkcpy:
1978	andcc	%i0, 63, %i3
1979	bz,pn	%ncc, copyin_blalign	! now block aligned
1980	sub	%i3, 64, %i3
1981	neg	%i3			! bytes till block aligned
1982	sub	%i2, %i3, %i2		! update %i2 with new count
1983
1984	! Copy %i3 bytes till dst is block (64 byte) aligned. use
1985	! double word copies.
1986
1987	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
1988	bz	%ncc, .ci_dbcopy	! %g1 has source offset (last 3-bits)
1989	sll	%g1, 3, %l1		! left shift
1990	mov	0x40, %l2
1991	sub	%l2, %l1, %l2		! right shift = (64 - left shift)
1992
1993	! Now use double word copies to align destination.
	! The source is read as aligned doublewords. %o2 carries the
	! previously loaded doubleword and %o4 the next one;
	! ALIGN_DATA_EW (defined elsewhere) is given both plus the shift
	! counts, and %o2 is stored afterwards - presumably as the merged
	! doubleword; confirm against the macro. %g1 is added back to the
	! source pointer on exit.
1994.ci_double:
1995	sub	%i1, %g1, %i1		! align the src at 8 bytes.
1996	ldxa	[%i1]ASI_USER, %o2
19972:
1998	add	%i1, 0x8, %i1
1999	ldxa	[%i1]ASI_USER, %o4
2000	ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
2001	stx	%o2, [%i0]
2002	mov	%o4, %o2
2003	subcc	%i3, 0x8, %i3
2004	bgu,pt	%ncc, 2b
2005	add	%i0, 0x8, %i0
2006	ba	copyin_blalign
2007	add	%i1, %g1, %i1
2008
2009	! Both source and destination are double aligned.
2010	! No shift and merge of data required in this case.
	! Falls through into copyin_blalign when %i3 is exhausted.
2011.ci_dbcopy:
2012	ldxa	[%i1]ASI_USER, %o2
2013	stx	%o2, [%i0]
2014	add	%i1, 0x8, %i1
2015	subcc	%i3, 0x8, %i3
2016	bgu,pt	%ncc, .ci_dbcopy
2017	add	%i0, 0x8, %i0
2018
	!
	! Destination (%i0) is 64 byte aligned here. Split the remaining
	! count into whole 64 byte blocks (%i3) and a residue (%i2), then
	! pick a block loop by the source offset within 16 bytes (%o2):
	!	0     -> .ci_blkcpy
	!	1..7  -> .ci_lower_double
	!	8     -> fall through to .ci_loop0
	!	9..15 -> .ci_upper_double
	! Source loads use ASI_BLK_INIT_QUAD_LDD_AIUS; destination stores
	! go through %asi, set here to ASI_BLK_INIT_ST_QUAD_LDD_P.
	!
	! NOTE(review): nothing below tests for %i3 == 0 before entering a
	! loop, and each loop copies one block before checking the count.
	! This appears to rely on the size limits noted at .big_copyin to
	! guarantee at least one whole block remains - confirm.
	!
2019copyin_blalign:
2020	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2021	sub	%i2, %i3, %i2		! Residue bytes in %i2
2022
2023	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2024
2025	andcc	%i1, 0xf, %o2		! is src quadword aligned
2026	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
2027	nop
2028	cmp	%o2, 0x8
2029	bg	.ci_upper_double
2030	nop
2031	bl	.ci_lower_double
2032	nop
2033
2034	! Falls through when source offset is equal to 8 i.e.
2035	! source is double word aligned.
2036	! In this case no shift/merge of data is required
	!
	! Each ldda fills a register pair from a 16 byte aligned address.
	! With an offset of 8 the wanted data starts in the second
	! register of the first pair (%l3), so the stores run %l3, %l4,
	! %l5, %l2, ... The last ldda of each pass fetches the pair that
	! the next pass (or nothing, on exit) consumes first.
2037
2038	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2039	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2040	prefetch [%l0+0x0], #one_read
2041	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2042.ci_loop0:
2043	add	%i1, 0x10, %i1
2044	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2045
2046	prefetch [%l0+0x40], #one_read
2047
2048	stxa	%l3, [%i0+0x0]%asi
2049	stxa	%l4, [%i0+0x8]%asi
2050
2051	add	%i1, 0x10, %i1
2052	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2053
2054	stxa	%l5, [%i0+0x10]%asi
2055	stxa	%l2, [%i0+0x18]%asi
2056
2057	add	%i1, 0x10, %i1
2058	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2059
2060	stxa	%l3, [%i0+0x20]%asi
2061	stxa	%l4, [%i0+0x28]%asi
2062
2063	add	%i1, 0x10, %i1
2064	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2065
2066	stxa	%l5, [%i0+0x30]%asi
2067	stxa	%l2, [%i0+0x38]%asi
2068
2069	add	%l0, 0x40, %l0
2070	subcc	%i3, 0x40, %i3
2071	bgu,pt	%xcc, .ci_loop0
2072	add	%i0, 0x40, %i0
2073	ba	.ci_blkdone
2074	add	%i1, %o2, %i1		! increment the source by src offset
2075					! the src offset was stored in %o2
2076
	!
	! Block loop for a source offset of 1..7 within its 16 byte
	! quadword (dispatched from copyin_blalign). The source pointer is
	! rounded down to 16 bytes and each pair of stored doublewords is
	! produced by ALIGN_DATA (defined elsewhere) from three loaded
	! doublewords, using shift counts %o0 (offset * 8) and
	! %o1 (64 - %o0) with %l6 as scratch. %l0 tracks the block aligned
	! source address for prefetching. On exit the offset in %o2 is
	! added back to %i1.
	!
2077.ci_lower_double:
2078
2079	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2080	sll	%o2, 3, %o0		! %o0 left shift
2081	mov	0x40, %o1
2082	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2083	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2084	prefetch [%l0+0x0], #one_read
2085	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
2086							! and %l3 has complete
2087							! data
2088.ci_loop1:
2089	add	%i1, 0x10, %i1
2090	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
2091							! for this read.
2092	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2093							! into %l2 and %l3
2094
2095	prefetch [%l0+0x40], #one_read
2096
2097	stxa	%l2, [%i0+0x0]%asi
2098	stxa	%l3, [%i0+0x8]%asi
2099
2100	add	%i1, 0x10, %i1
2101	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2102	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2103							! %l4 from previous read
2104							! into %l4 and %l5
2105	stxa	%l4, [%i0+0x10]%asi
2106	stxa	%l5, [%i0+0x18]%asi
2107
2108	! Repeat the same for next 32 bytes.
2109
2110	add	%i1, 0x10, %i1
2111	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2112	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2113
2114	stxa	%l2, [%i0+0x20]%asi
2115	stxa	%l3, [%i0+0x28]%asi
2116
2117	add	%i1, 0x10, %i1
2118	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2119	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2120
2121	stxa	%l4, [%i0+0x30]%asi
2122	stxa	%l5, [%i0+0x38]%asi
2123
2124	add	%l0, 0x40, %l0
2125	subcc	%i3, 0x40, %i3
2126	bgu,pt	%xcc, .ci_loop1
2127	add	%i0, 0x40, %i0
2128	ba	.ci_blkdone
2129	add	%i1, %o2, %i1		! increment the source by src offset
2130					! the src offset was stored in %o2
2131
	!
	! Block loop for a source offset of 9..15 within its 16 byte
	! quadword (dispatched from copyin_blalign). Same scheme as
	! .ci_lower_double, except that the shift is derived from
	! (offset - 8) and the first useful doubleword is the second
	! register of the first pair loaded (%l3), so the merge windows
	! are shifted by one register. On exit the offset in %o2 is added
	! back to %i1.
	!
2132.ci_upper_double:
2133
2134	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2135	sub	%o2, 0x8, %o0
2136	sll	%o0, 3, %o0		! %o0 left shift
2137	mov	0x40, %o1
2138	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2139	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2140	prefetch [%l0+0x0], #one_read
2141	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
2142							! for this read and
2143							! no data in %l2
2144.ci_loop2:
2145	add	%i1, 0x10, %i1
2146	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
2147							! and %l5 has partial
2148	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2149							! into %l3 and %l4
2150	prefetch [%l0+0x40], #one_read
2151
2152	stxa	%l3, [%i0+0x0]%asi
2153	stxa	%l4, [%i0+0x8]%asi
2154
2155	add	%i1, 0x10, %i1
2156	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2157	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2158							! %l5 from previous read
2159							! into %l5 and %l2
2160
2161	stxa	%l5, [%i0+0x10]%asi
2162	stxa	%l2, [%i0+0x18]%asi
2163
2164	! Repeat the same for next 32 bytes.
2165
2166	add	%i1, 0x10, %i1
2167	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2168	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2169
2170	stxa	%l3, [%i0+0x20]%asi
2171	stxa	%l4, [%i0+0x28]%asi
2172
2173	add	%i1, 0x10, %i1
2174	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2175	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2176
2177	stxa	%l5, [%i0+0x30]%asi
2178	stxa	%l2, [%i0+0x38]%asi
2179
2180	add	%l0, 0x40, %l0
2181	subcc	%i3, 0x40, %i3
2182	bgu,pt	%xcc, .ci_loop2
2183	add	%i0, 0x40, %i0
2184	ba	.ci_blkdone
2185	add	%i1, %o2, %i1		! increment the source by src offset
2186					! the src offset was stored in %o2
2187
2188
2189	! Do fast copy: source is 16 byte aligned, so no merging is needed.
	! Loads use ASI_BLK_INIT_QUAD_LDD_AIUS; stores go through %asi,
	! which copyin_blalign set to ASI_BLK_INIT_ST_QUAD_LDD_P.
	! Four ldda's fill %l0-%l7 with one 64 byte block, which is then
	! stored as eight doublewords. %o0 tracks the block aligned source
	! address for prefetching. Falls through into .ci_blkdone when the
	! block count in %i3 is exhausted.
2190.ci_blkcpy:
2191
2192	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2193	prefetch [%o0+0x0], #one_read
21941:
2195	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
2196	add	%i1, 0x10, %i1
2197	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
2198	add	%i1, 0x10, %i1
2199
2200	prefetch [%o0+0x40], #one_read
2201
2202	stxa	%l0, [%i0+0x0]%asi
2203
2204	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
2205	add	%i1, 0x10, %i1
2206	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
2207	add	%i1, 0x10, %i1
2208
2209	stxa	%l1, [%i0+0x8]%asi
2210	stxa	%l2, [%i0+0x10]%asi
2211	stxa	%l3, [%i0+0x18]%asi
2212	stxa	%l4, [%i0+0x20]%asi
2213	stxa	%l5, [%i0+0x28]%asi
2214	stxa	%l6, [%i0+0x30]%asi
2215	stxa	%l7, [%i0+0x38]%asi
2216
2217	add	%o0, 0x40, %o0
2218	subcc	%i3, 0x40, %i3
2219	bgu,pt	%xcc, 1b
2220	add	%i0, 0x40, %i0
2221
	!
	! All block loops join here with %i0 = destination, %i1 = source
	! and %i2 = residue (less than 64 bytes, from copyin_blalign).
	! The residue is copied as doublewords while at least 8 bytes
	! remain, then byte for byte.
	!
2222.ci_blkdone:
2223	membar	#Sync
2224
2225	! Copy as much rest of the data as double word copy.
2226.ci_dwcp:
2227	cmp	%i2, 0x8		! Not enough bytes to copy as double
2228	blu	%ncc, .ci_dbdone
2229	nop
2230
2231	andn	%i2, 0x7, %i3		! %i3 count is multiple of 8 bytes size
2232	sub	%i2, %i3, %i2		! Residue bytes in %i2
2233
2234	andcc	%i1, 7, %g1		! is src aligned on a 8 bytes
2235	bz	%ncc, .ci_cpy_db
2236	nop
2237
2238	sll	%g1, 3, %l0		! left shift
2239	mov	0x40, %l1
2240	sub	%l1, %l0, %l1		! right shift = (64 - left shift)
2241
	! Source not 8 byte aligned: read aligned doublewords and combine
	! consecutive pairs with ALIGN_DATA_EW (defined elsewhere), the
	! same scheme as .ci_double. %g1 is added back to the source
	! pointer on exit.
2242.ci_cpy_dbwd:
2243	sub	%i1, %g1, %i1		! align the src at 8 bytes.
2244	ldxa	[%i1]ASI_USER, %o2
22453:
2246	add	%i1, 0x8, %i1
2247	ldxa	[%i1]ASI_USER, %o4
2248	ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
2249	stx	%o2, [%i0]
2250	mov	%o4, %o2
2251	subcc	%i3, 0x8, %i3
2252	bgu,pt	%ncc, 3b
2253	add	%i0, 0x8, %i0
2254	ba	.ci_dbdone
2255	add	%i1, %g1, %i1
2256
	! Source and destination both 8 byte aligned: straight doubleword
	! copy of %i3 bytes.
2257.ci_cpy_db:
2258	ldxa	[%i1]ASI_USER, %o2
2259	stx	%o2, [%i0]
2260	add	%i1, 0x8, %i1
2261	subcc	%i3, 0x8, %i3
2262	bgu,pt	%ncc, .ci_cpy_db
2263	add	%i0, 0x8, %i0
2264
2265.ci_dbdone:
2266	tst	%i2
2267	bz,pt	%xcc, .copyin_exit
2268	nop
2269
2270	! Copy the residue as byte copy
	! (%i2 is non-zero here; falls through to .copyin_exit when done)
2271.ci_residue:
2272	lduba	[%i1]ASI_USER, %i4
2273	stb	%i4, [%i0]
2274	inc	%i1
2275	deccc	%i2
2276	bgu	%xcc, .ci_residue
2277	inc	%i0
2278
	!
	! Successful exit from the block copy path: restore the saved
	! t_lofault, give back the register window and return 0.
	!
2279.copyin_exit:
2280	membar	#Sync
2281	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2282	ret
2283	restore	%g0, 0, %o0
	!
	! Error exit for copyin; its address is what ENTRY(copyin) loads
	! into REAL_LOFAULT. If the thread has a t_copyops vector, jump to
	! its CP_COPYIN entry; otherwise return -1.
	!
2284.copyin_err:
2285	ldn	[THREAD_REG + T_COPYOPS], %o4
2286	brz	%o4, 2f
2287	nop
2288	ldn	[%o4 + CP_COPYIN], %g2
2289	jmp	%g2
2290	nop
22912:
2292	retl
2293	mov	-1, %o0
2294	SET_SIZE(copyin)
2295
2296#endif	/* lint */
2297
/*
 * xcopyin - copy count bytes from user address uaddr to kernel address
 * kaddr.
 *
 * The entry point only selects .xcopyin_err as the fault handler (held
 * in REAL_LOFAULT) and branches to the shared .do_copyin path, which is
 * defined elsewhere in this file.
 *
 * If a fault is taken, .xcopyin_err checks the thread's T_COPYOPS
 * pointer.  When it is non-zero the handler jumps to the routine stored
 * at offset CP_XCOPYIN in that vector; otherwise it returns the value
 * held in %g1 (presumably the errno left by the trap code - confirm
 * against the lofault trap path).
 */
2298#ifdef	lint
2299
2300/*ARGSUSED*/
2301int
2302xcopyin(const void *uaddr, void *kaddr, size_t count)
2303{ return (0); }
2304
2305#else	/* lint */
2306
2307	ENTRY(xcopyin)
	/*
	 * REAL_LOFAULT = address of .xcopyin_err.  The low-order bits are
	 * merged in from the delay slot of the branch to .do_copyin.
	 */
2308	sethi	%hi(.xcopyin_err), REAL_LOFAULT
2309	b	.do_copyin
2310	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
2311.xcopyin_err:
	/* Fault path: use the copyops routine if the thread has a vector. */
2312	ldn	[THREAD_REG + T_COPYOPS], %o4
2313	brz	%o4, 2f
2314	nop
2315	ldn	[%o4 + CP_XCOPYIN], %g2
2316	jmp	%g2
2317	nop
23182:
	/* No copyops vector installed: hand %g1 back as the return value. */
2319	retl
2320	mov	%g1, %o0
2321	SET_SIZE(xcopyin)
2322
2323#endif	/* lint */
2324
/*
 * xcopyin_little - copy count bytes from user address uaddr to kernel
 * address kaddr, reversing the byte order: the byte at uaddr[count - 1]
 * is stored at kaddr[0], uaddr[count - 2] at kaddr[1], and so on.
 * User memory is read with lduba through ASI_AIUSL.
 *
 * Register use:
 *	%o0	biased source pointer (see below)
 *	%o1	kaddr + count
 *	%o2	count
 *	%o3	negative index, runs from -count up to 0
 *	%o4	scratch / byte being copied
 *	%o5	t_lofault value found on entry, restored on every exit
 *
 * Returns 0 on success.  On a fault, .little_err restores t_lofault and
 * returns the value in %g1 (presumably the errno from the trap code -
 * confirm against the lofault trap path).
 */
2325#ifdef	lint
2326
2327/*ARGSUSED*/
2328int
2329xcopyin_little(const void *uaddr, void *kaddr, size_t count)
2330{ return (0); }
2331
2332#else	/* lint */
2333
2334	ENTRY(xcopyin_little)
	/* Save the current t_lofault in %o5 and install .little_err. */
2335	sethi	%hi(.little_err), %o4
2336	ldn	[THREAD_REG + T_LOFAULT], %o5
2337	or	%o4, %lo(.little_err), %o4
2338	membar	#Sync				! sync error barrier
2339	stn	%o4, [THREAD_REG + T_LOFAULT]
2340
	/*
	 * %o3 = -count (also sets the condition codes tested by bz below).
	 * %o0 ends up as uaddr + 2 * count - 1, so that %o0 + %o3 addresses
	 * the last source byte.  The sub in the delay slot is executed
	 * whether or not the zero-length branch is taken.
	 */
2341	subcc	%g0, %o2, %o3
2342	add	%o0, %o2, %o0
2343	bz,pn	%ncc, 2f		! check for zero bytes
2344	sub	%o2, 1, %o4
2345	add	%o0, %o4, %o0		! start w/last byte
2346	add	%o1, %o2, %o1
2347	lduba	[%o0+%o3]ASI_AIUSL, %o4
2348
	/*
	 * Each pass stores one byte at %o1 + %o3 (ascending destination).
	 * %o3 is incremented by one while %o0 is decremented by two, so the
	 * source address %o0 + %o3 moves down by one byte per pass.  The
	 * loop ends when the increment of %o3 carries out, i.e. when %o3
	 * reaches zero; the annulled delay-slot load is then skipped.
	 */
23491:	stb	%o4, [%o1+%o3]
2350	inccc	%o3
2351	sub	%o0, 2, %o0		! get next byte
2352	bcc,a,pt %ncc, 1b
2353	  lduba	[%o0+%o3]ASI_AIUSL, %o4
2354
23552:	membar	#Sync				! sync error barrier
2356	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2357	retl
2358	mov	%g0, %o0		! return (0)
2359
2360.little_err:
	/* Fault path: put the saved handler back and return %g1. */
2361	membar	#Sync				! sync error barrier
2362	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2363	retl
2364	mov	%g1, %o0
2365	SET_SIZE(xcopyin_little)
2366
2367#endif	/* lint */
2368
2369
2370/*
2371 * Copy a block of storage - must not overlap (from + len <= to).
2372 * No fault handler installed (to be called under on_fault())
 *
 * copyin_noerr enters the shared .do_copyin path (defined elsewhere in
 * this file) with REAL_LOFAULT pointing at .copyio_noerr.  That stub does
 * no error handling of its own: it simply jumps to SAVED_LOFAULT, which
 * is presumably the handler that was in place when the routine was
 * entered (SAVED_LOFAULT is set up outside this part of the file).
 *
 * .copyio_noerr is also used by copyout_noerr.
2373 */
2374#if defined(lint)
2375
2376/* ARGSUSED */
2377void
2378copyin_noerr(const void *ufrom, void *kto, size_t count)
2379{}
2380
2381#else	/* lint */
2382
2383	ENTRY(copyin_noerr)
	/*
	 * REAL_LOFAULT = address of .copyio_noerr; the low-order bits are
	 * merged in from the delay slot of the branch.
	 */
2384	sethi	%hi(.copyio_noerr), REAL_LOFAULT
2385	b	.do_copyin
2386	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
2387.copyio_noerr:
	/* Fault path shared by copyin_noerr and copyout_noerr. */
2388	jmp	SAVED_LOFAULT
2389	  nop
2390	SET_SIZE(copyin_noerr)
2391
2392#endif /* lint */
2393
2394/*
2395 * Copy a block of storage - must not overlap (from + len <= to).
2396 * No fault handler installed (to be called under on_fault())
 *
 * copyout_noerr enters the shared .do_copyout path (defined elsewhere in
 * this file) with REAL_LOFAULT pointing at .copyio_noerr, the stub that
 * is defined inside copyin_noerr and jumps straight to SAVED_LOFAULT.
2397 */
2398
2399#if defined(lint)
2400
2401/* ARGSUSED */
2402void
2403copyout_noerr(const void *kfrom, void *uto, size_t count)
2404{}
2405
2406#else	/* lint */
2407
2408	ENTRY(copyout_noerr)
	/*
	 * REAL_LOFAULT = address of .copyio_noerr; the low-order bits are
	 * merged in from the delay slot of the branch.
	 */
2409	sethi	%hi(.copyio_noerr), REAL_LOFAULT
2410	b	.do_copyout
2411	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
2412	SET_SIZE(copyout_noerr)
2413
2414#endif /* lint */
2415
/*
 * Global 32-bit control words for the hardware-assisted copy/zero paths.
 * The lint branch declares them for C; the assembler branch defines the
 * same six words with the same initial values.
 *
 *	use_hw_bcopy	initialised to 1; not referenced in this part of
 *			the file
 *	use_hw_bzero	initialised to 1; bzero takes its block-store path
 *			only when this word is non-zero
 *	hw_copy_limit_1/2/4/8
 *			initialised to 0x100, 0x200, 0x400 and 0x400; not
 *			referenced in this part of the file (presumably
 *			size thresholds keyed by alignment - confirm
 *			against the copy routines)
 */
2416#if defined(lint)
2417
2418int use_hw_bcopy = 1;
2419int use_hw_bzero = 1;
2420uint_t hw_copy_limit_1 = 0x100;
2421uint_t hw_copy_limit_2 = 0x200;
2422uint_t hw_copy_limit_4 = 0x400;
2423uint_t hw_copy_limit_8 = 0x400;
2424
2425#else /* !lint */
2426
2427	.align	4
2428	DGDEF(use_hw_bcopy)
2429	.word	1
2430	DGDEF(use_hw_bzero)
2431	.word	1
2432	DGDEF(hw_copy_limit_1)
2433	.word	0x100
2434	DGDEF(hw_copy_limit_2)
2435	.word	0x200
2436	DGDEF(hw_copy_limit_4)
2437	.word	0x400
2438	DGDEF(hw_copy_limit_8)
2439	.word	0x400
2440
	/* Switch back to the text section for the routines that follow. */
2441	.align	64
2442	.section ".text"
2443#endif /* !lint */
2444
2445/*
2446 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
2447 * at least 256 bytes in length using Niagara's block stores/quad store.
2448 * If the criteria for using this routine are not met then it calls bzero
2449 * and returns 1.  Otherwise 0 is returned indicating success.
2450 * Caller is responsible for ensuring use_hw_bzero is true and that
2451 * kpreempt_disable() has been called.
 *
 * The routine runs in its own register window (save/restore), so the
 * arguments are seen in %i0 (address) and %i1 (length).
 *
 * NOTE(review): on the success path %asi is left set to
 * ASI_BLK_INIT_ST_QUAD_LDD_P; nothing in this routine puts it back.
2452 */
2453#ifdef lint
2454/*ARGSUSED*/
2455int
2456hwblkclr(void *addr, size_t len)
2457{
2458	return(0);
2459}
2460#else /* lint */
2461	! %i0 - start address
2462	! %i1 - length of region (multiple of 64)
2463
2464	ENTRY(hwblkclr)
2465	save	%sp, -SA(MINFRAME), %sp
2466
2467	! Must be block-aligned
2468	andcc	%i0, 0x3f, %g0
2469	bnz,pn	%ncc, 1f
2470	  nop
2471
2472	! ... and must be 256 bytes or more
2473	cmp	%i1, 0x100
2474	blu,pn	%ncc, 1f
2475	  nop
2476
2477	! ... and length must be a multiple of 64
	/*
	 * The write to %asi sits in a non-annulled delay slot, so it is
	 * executed on both the block path and the fall-through punt path.
	 */
2478	andcc	%i1, 0x3f, %g0
2479	bz,pn	%ncc, .pz_doblock
2480	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2481
24821:	! punt, call bzero but notify the caller that bzero was used
2483	mov	%i0, %o0
2484	call	bzero
2485	  mov	%i1, %o1
2486	ret
2487	restore	%g0, 1, %o0	! return (1) - did not use block operations
2488
2489	! Already verified that there are at least 256 bytes to set
	/*
	 * Main loop: clear 256 bytes (four 64-byte blocks) per pass.  The
	 * first doubleword of each of the four blocks is stored before the
	 * remaining doublewords are filled in.
	 */
2490.pz_doblock:
2491	stxa	%g0, [%i0+0x0]%asi
2492	stxa	%g0, [%i0+0x40]%asi
2493	stxa	%g0, [%i0+0x80]%asi
2494	stxa	%g0, [%i0+0xc0]%asi
2495
2496	stxa	%g0, [%i0+0x8]%asi
2497	stxa	%g0, [%i0+0x10]%asi
2498	stxa	%g0, [%i0+0x18]%asi
2499	stxa	%g0, [%i0+0x20]%asi
2500	stxa	%g0, [%i0+0x28]%asi
2501	stxa	%g0, [%i0+0x30]%asi
2502	stxa	%g0, [%i0+0x38]%asi
2503
2504	stxa	%g0, [%i0+0x48]%asi
2505	stxa	%g0, [%i0+0x50]%asi
2506	stxa	%g0, [%i0+0x58]%asi
2507	stxa	%g0, [%i0+0x60]%asi
2508	stxa	%g0, [%i0+0x68]%asi
2509	stxa	%g0, [%i0+0x70]%asi
2510	stxa	%g0, [%i0+0x78]%asi
2511
2512	stxa	%g0, [%i0+0x88]%asi
2513	stxa	%g0, [%i0+0x90]%asi
2514	stxa	%g0, [%i0+0x98]%asi
2515	stxa	%g0, [%i0+0xa0]%asi
2516	stxa	%g0, [%i0+0xa8]%asi
2517	stxa	%g0, [%i0+0xb0]%asi
2518	stxa	%g0, [%i0+0xb8]%asi
2519
2520	stxa	%g0, [%i0+0xc8]%asi
2521	stxa	%g0, [%i0+0xd0]%asi
2522	stxa	%g0, [%i0+0xd8]%asi
2523	stxa	%g0, [%i0+0xe0]%asi
2524	stxa	%g0, [%i0+0xe8]%asi
2525	stxa	%g0, [%i0+0xf0]%asi
2526	stxa	%g0, [%i0+0xf8]%asi
2527
	/*
	 * Repeat only while strictly more than 256 bytes remain; a final
	 * remainder of exactly 256 bytes is handled by the 64-byte loop
	 * below.  The address is advanced in the delay slot either way.
	 */
2528	sub	%i1, 0x100, %i1
2529	cmp	%i1, 0x100
2530	bgu,pt	%ncc, .pz_doblock
2531	add	%i0, 0x100, %i0
2532
25332:
2534	! Check if more than 64 bytes to set
2535	cmp	%i1,0x40
2536	blu	%ncc, .pz_finish
2537	nop
2538
	/* Tail loop: clear one 64-byte block per pass until %i1 is zero. */
25393:
2540	stxa	%g0, [%i0+0x0]%asi
2541	stxa	%g0, [%i0+0x8]%asi
2542	stxa	%g0, [%i0+0x10]%asi
2543	stxa	%g0, [%i0+0x18]%asi
2544	stxa	%g0, [%i0+0x20]%asi
2545	stxa	%g0, [%i0+0x28]%asi
2546	stxa	%g0, [%i0+0x30]%asi
2547	stxa	%g0, [%i0+0x38]%asi
2548
2549	subcc	%i1, 0x40, %i1
2550	bgu,pt	%ncc, 3b
2551	add	%i0, 0x40, %i0
2552
	/*
	 * Block path complete: order the stores with membar #Sync, then
	 * return 0, which tells the caller that bzero was not used.
	 */
2553.pz_finish:
2554	membar	#Sync
2555	ret
2556	restore	%g0, 0, %o0		! return (bzero or not)
2557	SET_SIZE(hwblkclr)
2558#endif	/* lint */
2559
2560#ifdef	lint
2561/* Copy 32 bytes of data from src to dst using physical addresses */
2562/*ARGSUSED*/
2563void
2564hw_pa_bcopy32(uint64_t src, uint64_t dst)
2565{}
2566#else	/*!lint */
2567
2568	/*
2569	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
2570	 * using physical addresses.
	 *
	 * PSTATE_IE is cleared for the duration of the copy and the
	 * original %pstate is written back in the delay slot of the
	 * return.  All four doublewords are loaded (into %o2-%o5) before
	 * any of them is stored.  Both %o0 and %o1 are advanced by 24
	 * along the way.
	 *
	 * Clobbers %g1, %g2 and %o0-%o5.
2571	 */
2572	ENTRY_NP(hw_pa_bcopy32)
	/* %g1 = caller's %pstate; run with PSTATE_IE cleared. */
2573	rdpr    %pstate, %g1
2574	andn    %g1, PSTATE_IE, %g2
2575	wrpr    %g0, %g2, %pstate
2576
	/* Load the four source doublewords through ASI_MEM. */
2577	ldxa    [%o0]ASI_MEM, %o2
2578	add     %o0, 8, %o0
2579	ldxa    [%o0]ASI_MEM, %o3
2580	add     %o0, 8, %o0
2581	ldxa    [%o0]ASI_MEM, %o4
2582	add     %o0, 8, %o0
2583	ldxa    [%o0]ASI_MEM, %o5
	/* Store them to the destination in the same order. */
2584	stxa    %o2, [%o1]ASI_MEM
2585	add     %o1, 8, %o1
2586	stxa    %o3, [%o1]ASI_MEM
2587	add     %o1, 8, %o1
2588	stxa    %o4, [%o1]ASI_MEM
2589	add     %o1, 8, %o1
2590	stxa    %o5, [%o1]ASI_MEM
2591
	/* Sync the stores, then restore the saved %pstate while returning. */
2592	membar	#Sync
2593	retl
2594	  wrpr    %g0, %g1, %pstate
2595	SET_SIZE(hw_pa_bcopy32)
2596#endif /* lint */
2597
2598/*
2599 * Zero a block of storage.
2600 *
2601 * uzero is used by the kernel to zero a block in user address space.
2602 */
2603
2604/*
2605 * Control flow of the bzero/kzero/uzero routine.
2606 *
2607 *	For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
2608 *
2609 *	For stores of fewer than 15 bytes, align the address on a 4-byte
2610 *	boundary, then store as many 4-byte chunks, followed by trailing bytes.
2611 *
2612 *	For sizes of 15 bytes or more, align the address on an 8-byte boundary.
2613 *	if (remaining count >= 128 && use_hw_bzero) {
2614 *		store as many 8-byte chunks as needed to block align the address
2615 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
2616 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
2617 *	}
2618 *	Store as many 8-byte chunks, followed by trailing bytes.
2619 */
2620
2621#if defined(lint)
2622
2623/* ARGSUSED */
2624int
2625kzero(void *addr, size_t count)
2626{ return(0); }
2627
2628/* ARGSUSED */
2629void
2630uzero(void *addr, size_t count)
2631{}
2632
2633#else	/* lint */
2634
	/*
	 * uzero and kzero are thin entry points.  Each selects the ASI used
	 * by the stores (%asi), records the incoming t_lofault value in
	 * %o5, optionally installs .zeroerr as the fault handler, and then
	 * branches to .do_zero, the common body that lives in bzero.
	 * %o0 (address) and %o1 (count) are passed through untouched;
	 * %o2 is used as scratch.
	 */
2635	ENTRY(uzero)
2636	!
2637	! Set a new lo_fault handler only if we came in with one
2638	! already specified.
2639	!
	/*
	 * When t_lofault is zero the first branch is taken and nothing is
	 * installed; the sethi in its delay slot only writes scratch %o2.
	 */
2640	wr	%g0, ASI_USER, %asi
2641	ldn	[THREAD_REG + T_LOFAULT], %o5
2642	tst	%o5
2643	bz,pt	%ncc, .do_zero
2644	sethi	%hi(.zeroerr), %o2
2645	or	%o2, %lo(.zeroerr), %o2
2646	membar	#Sync
2647	ba,pt	%ncc, .do_zero
2648	stn	%o2, [THREAD_REG + T_LOFAULT]
2649
2650	ENTRY(kzero)
2651	!
2652	! Always set a lo_fault handler
2653	!
	/*
	 * LOFAULT_SET is or'ed into the saved value in %o5 so that the exit
	 * and error paths see a non-zero %o5 even when the old handler was
	 * zero, and therefore know that t_lofault must be put back.
	 */
2654	wr	%g0, ASI_P, %asi
2655	ldn	[THREAD_REG + T_LOFAULT], %o5
2656	sethi	%hi(.zeroerr), %o2
2657	or	%o5, LOFAULT_SET, %o5
2658	or	%o2, %lo(.zeroerr), %o2
2659	membar	#Sync
2660	ba,pt	%ncc, .do_zero
2661	stn	%o2, [THREAD_REG + T_LOFAULT]
2662
2663/*
2664 * We got here because of a fault during kzero or if
2665 * uzero or bzero was called with t_lofault non-zero.
2666 * Otherwise we've already run screaming from the room.
2667 * Errno value is in %g1. Note that we're here iff
2668 * we did set t_lofault.
2669 */
2670.zeroerr:
2671	!
2672	! Undo asi register setting. Just set it to be the
2673        ! kernel default without checking.
2674	!
2675	wr	%g0, ASI_P, %asi
2676
2677	!
2678	! We did set t_lofault. It may well have been zero coming in.
2679	!
	/*
	 * The andncc in the delay slot strips LOFAULT_SET from %o5 and sets
	 * the condition codes on the result; the be at label 3 relies on
	 * those codes to tell "old handler was zero" from "old handler was
	 * a real address".
	 */
26801:
2681	tst	%o5
2682	membar #Sync
2683	bne,pn	%ncc, 3f
2684	andncc	%o5, LOFAULT_SET, %o5
26852:
2686	!
2687	! Old handler was zero. Just return the error.
2688	!
2689	retl				! return
2690	mov	%g1, %o0		! error code from %g1
26913:
2692	!
2693	! We're here because %o5 was non-zero. It was non-zero
2694	! because either LOFAULT_SET was present, a previous fault
2695	! handler was present or both. In all cases we need to reset
2696	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
2697	! before we either simply return the error or we invoke the
2698	! previously specified handler.
2699	!
	/*
	 * The store in the delay slot is not annulled, so t_lofault is
	 * restored whether or not the branch back to label 2 is taken.
	 */
2700	be	%ncc, 2b
2701	stn	%o5, [THREAD_REG + T_LOFAULT]
2702	jmp	%o5			! goto real handler
2703	  nop
2704	SET_SIZE(kzero)
2705	SET_SIZE(uzero)
2706
2707#endif	/* lint */
2708
2709/*
2710 * Zero a block of storage.
 *
 * bzero(addr, count) clears count bytes starting at addr and returns 0
 * in %o0.  The body from .do_zero onwards is shared: uzero and kzero
 * branch to .do_zero after choosing %asi and setting up %o5.
 *
 * Register use from .do_zero onwards:
 *	%o0	current store address (advanced as bytes are cleared)
 *	%o1	byte count still to be accounted for
 *	%o2	scratch
 *	%o3	per-phase counter (alignment bytes, doubles, words)
 *	%o4	bytes left in whole 64-byte blocks / scratch
 *	%o5	t_lofault value found on entry, possibly with
 *		LOFAULT_SET or'ed in by kzero; zero means no handler
 *		was installed and none needs restoring
 *	%asi	ASI used by all stores (ASI_P here, ASI_USER from uzero)
 *
 * If a fault is taken while .zeroerr is installed, control goes to
 * .zeroerr, which is defined alongside kzero/uzero.
2711 */
2712
2713#if defined(lint)
2714
2715/* ARGSUSED */
2716void
2717bzero(void *addr, size_t count)
2718{}
2719
2720#else	/* lint */
2721
2722	ENTRY(bzero)
2723	wr	%g0, ASI_P, %asi
2724
	/*
	 * Install .zeroerr only if a handler was already present.  When
	 * t_lofault is zero the branch is taken; the sethi in its delay
	 * slot only writes scratch %o2.
	 */
2725	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
2726	tst	%o5
2727	bz,pt	%ncc, .do_zero
2728	sethi	%hi(.zeroerr), %o2
2729	or	%o2, %lo(.zeroerr), %o2
2730	membar	#Sync				! sync error barrier
2731	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
2732
2733.do_zero:
	/* Fewer than 7 bytes: clear them one byte at a time. */
2734	cmp	%o1, 7
2735	blu,pn	%ncc, .byteclr
2736	nop
2737
	/* Fewer than 15 bytes: word-align, then clear 4 bytes at a time. */
2738	cmp	%o1, 15
2739	blu,pn	%ncc, .wdalign
2740	nop
2741
	/*
	 * 15 bytes or more: byte-store up to the next 8-byte boundary.
	 * %o3 = (addr & 7) - 8 is the negated number of bytes needed; it is
	 * added to the count and then counted up to zero by the loop.
	 */
2742	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
2743	bz,pt	%ncc, .blkalign		! already double aligned
2744	sub	%o3, 8, %o3		! -(bytes till double aligned)
2745	add	%o1, %o3, %o1		! update o1 with new count
2746
27471:
2748	stba	%g0, [%o0]%asi
2749	inccc	%o3
2750	bl,pt	%ncc, 1b
2751	inc	%o0
2752
2753	! Now address is double aligned
	/*
	 * Take the block-store path only when at least 128 bytes remain
	 * and use_hw_bzero is non-zero.  Both branches to .bzero_small copy
	 * the remaining count into %o3 in their delay slot, which is what
	 * .bzero_small expects.
	 */
2754.blkalign:
2755	cmp	%o1, 0x80		! check if there are 128 bytes to set
2756	blu,pn	%ncc, .bzero_small
2757	mov	%o1, %o3
2758
2759	sethi	%hi(use_hw_bzero), %o2
2760	ld	[%o2 + %lo(use_hw_bzero)], %o2
2761	tst	%o2
2762	bz	%ncc, .bzero_small
2763	mov	%o1, %o3
2764
	/*
	 * Choose the block-init ASI that matches the ASI in use on entry:
	 * ASI_BLK_INIT_ST_QUAD_LDD_P if %asi was ASI_P, otherwise
	 * ASI_BLK_INIT_QUAD_LDD_AIUS.  The annulled delay slot is executed
	 * only when the branch is taken, i.e. when %asi was not ASI_P.
	 */
2765	rd	%asi, %o3
2766	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2767	cmp	%o3, ASI_P
2768	bne,a	%ncc, .algnblk
2769	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
2770
	/*
	 * Store doublewords up to the next 64-byte boundary.
	 * %o3 = (addr & 0x3f) - 0x40 is the negated byte distance; it is
	 * added to the count and stepped up to zero in units of 8.
	 */
2771.algnblk:
2772	andcc	%o0, 0x3f, %o3		! is block aligned?
2773	bz,pt	%ncc, .bzero_blk
2774	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
2775	add	%o1, %o3, %o1		! o1 is the remainder
2776
2777	! Clear -(%o3) bytes till block aligned
27781:
2779	stxa	%g0, [%o0]%asi
2780	addcc	%o3, 8, %o3
2781	bl,pt	%ncc, 1b
2782	add	%o0, 8, %o0
2783
	/*
	 * Split the remaining count: %o4 = bytes in whole 64-byte blocks,
	 * %o3 = bytes left over for .bzero_small.  %o1 itself is left
	 * unchanged; only its low three bits are used later on.
	 */
2784.bzero_blk:
2785	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
2786	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
2787
2788	cmp	%o4, 0x100		! 256 bytes or more
2789	blu,pn	%ncc, 3f
2790	nop
2791
	/*
	 * Main loop: clear 256 bytes (four 64-byte blocks) per pass.  The
	 * first doubleword of each of the four blocks is stored before the
	 * remaining doublewords are filled in.
	 */
27922:
2793	stxa	%g0, [%o0+0x0]%asi
2794	stxa	%g0, [%o0+0x40]%asi
2795	stxa	%g0, [%o0+0x80]%asi
2796	stxa	%g0, [%o0+0xc0]%asi
2797
2798	stxa	%g0, [%o0+0x8]%asi
2799	stxa	%g0, [%o0+0x10]%asi
2800	stxa	%g0, [%o0+0x18]%asi
2801	stxa	%g0, [%o0+0x20]%asi
2802	stxa	%g0, [%o0+0x28]%asi
2803	stxa	%g0, [%o0+0x30]%asi
2804	stxa	%g0, [%o0+0x38]%asi
2805
2806	stxa	%g0, [%o0+0x48]%asi
2807	stxa	%g0, [%o0+0x50]%asi
2808	stxa	%g0, [%o0+0x58]%asi
2809	stxa	%g0, [%o0+0x60]%asi
2810	stxa	%g0, [%o0+0x68]%asi
2811	stxa	%g0, [%o0+0x70]%asi
2812	stxa	%g0, [%o0+0x78]%asi
2813
2814	stxa	%g0, [%o0+0x88]%asi
2815	stxa	%g0, [%o0+0x90]%asi
2816	stxa	%g0, [%o0+0x98]%asi
2817	stxa	%g0, [%o0+0xa0]%asi
2818	stxa	%g0, [%o0+0xa8]%asi
2819	stxa	%g0, [%o0+0xb0]%asi
2820	stxa	%g0, [%o0+0xb8]%asi
2821
2822	stxa	%g0, [%o0+0xc8]%asi
2823	stxa	%g0, [%o0+0xd0]%asi
2824	stxa	%g0, [%o0+0xd8]%asi
2825	stxa	%g0, [%o0+0xe0]%asi
2826	stxa	%g0, [%o0+0xe8]%asi
2827	stxa	%g0, [%o0+0xf0]%asi
2828	stxa	%g0, [%o0+0xf8]%asi
2829
	/*
	 * Repeat only while strictly more than 256 block bytes remain; a
	 * final 256 bytes are handled by the 64-byte loop below.  The
	 * address is advanced in the delay slot either way.
	 */
2830	sub	%o4, 0x100, %o4
2831	cmp	%o4, 0x100
2832	bgu,pt	%ncc, 2b
2833	add	%o0, 0x100, %o0
2834
28353:
2836	! ... check if 64 bytes to set
2837	cmp	%o4, 0x40
2838	blu	%ncc, .bzero_blk_done
2839	nop
2840
	/*
	 * Clear one 64-byte block per pass.  The loop branches back to
	 * label 3 rather than label 4, so the 64-byte check above is
	 * repeated on every pass.
	 */
28414:
2842	stxa	%g0, [%o0+0x0]%asi
2843	stxa	%g0, [%o0+0x8]%asi
2844	stxa	%g0, [%o0+0x10]%asi
2845	stxa	%g0, [%o0+0x18]%asi
2846	stxa	%g0, [%o0+0x20]%asi
2847	stxa	%g0, [%o0+0x28]%asi
2848	stxa	%g0, [%o0+0x30]%asi
2849	stxa	%g0, [%o0+0x38]%asi
2850
2851	subcc	%o4, 0x40, %o4
2852	bgu,pt	%ncc, 3b
2853	add	%o0, 0x40, %o0
2854
2855.bzero_blk_done:
2856	membar	#Sync
2857	!
2858	! Undo asi register setting.
2859	!
	/*
	 * Put back the non-block ASI that matches the block ASI in use:
	 * ASI_P if it was ASI_BLK_INIT_ST_QUAD_LDD_P, otherwise ASI_USER
	 * (written by the annulled delay slot, which is executed only when
	 * the branch is taken).  Either way execution continues at
	 * .bzero_small.
	 */
2860	rd	%asi, %o4
2861	wr	%g0, ASI_P, %asi
2862	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
2863	bne,a	%ncc, .bzero_small
2864	wr	%g0, ASI_USER, %asi
2865
	/*
	 * On entry %o3 holds the number of bytes still to clear.  It is
	 * biased by -8 so the loop can test it directly; %o1 is reduced to
	 * the 0-7 bytes that remain after the doublewords.
	 */
2866.bzero_small:
2867	! Set the remaining doubles
2868	subcc	%o3, 8, %o3		! Can we store any doubles?
2869	blu,pn	%ncc, .byteclr
2870	and	%o1, 7, %o1		! calc bytes left after doubles
2871
2872.dbclr:
2873	stxa	%g0, [%o0]%asi		! Clear the doubles
2874	subcc	%o3, 8, %o3
2875	bgeu,pt	%ncc, .dbclr
2876	add	%o0, 8, %o0
2877
2878	ba	.byteclr
2879	nop
2880
	/*
	 * Short (7-14 byte) case.  Clear single bytes until the address is
	 * 4-byte aligned; each pass re-enters .wdalign.  Once aligned, the
	 * delay slot of the branch leaves %o3 = count rounded down to a
	 * multiple of 4 for .wdclr.
	 */
2881.wdalign:
2882	andcc	%o0, 3, %o3		! is add aligned on a word boundary
2883	bz,pn	%ncc, .wdclr
2884	andn	%o1, 3, %o3		! create word sized count in %o3
2885
2886	dec	%o1			! decrement count
2887	stba	%g0, [%o0]%asi		! clear a byte
2888	ba	.wdalign
2889	inc	%o0			! next byte
2890
2891.wdclr:
2892	sta	%g0, [%o0]%asi		! 4-byte clearing loop
2893	subcc	%o3, 4, %o3
2894	bnz,pt	%ncc, .wdclr
2895	inc	4, %o0
2896
2897	and	%o1, 3, %o1		! leftover count, if any
2898
2899.byteclr:
2900	! Set the leftover bytes
2901	brz	%o1, .bzero_exit
2902	nop
2903
29047:
2905	deccc	%o1			! byte clearing loop
2906	stba	%g0, [%o0]%asi
2907	bgu,pt	%ncc, 7b
2908	inc	%o0
2909
2910.bzero_exit:
2911	!
2912	! We're just concerned with whether t_lofault was set
2913	! when we came in. We end up here from either kzero()
2914	! or bzero(). kzero() *always* sets a lofault handler.
2915	! It ors LOFAULT_SET into %o5 to indicate it has done
2916	! this even if the value of %o5 is otherwise zero.
2917	! bzero() sets a lofault handler *only* if one was
2918	! previously set. Accordingly we need to examine
2919	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
2920	! before resetting the error handler.
2921	!
	/*
	 * The branch tests the condition codes set by tst; the andn in the
	 * delay slot does not modify them.
	 */
2922	tst	%o5
2923	bz	%ncc, 1f
2924	andn	%o5, LOFAULT_SET, %o5
2925	membar	#Sync				! sync error barrier
2926	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
29271:
2928	retl
2929	clr	%o0			! return (0)
2930
2931	SET_SIZE(bzero)
2932#endif	/* lint */
2933