xref: /illumos-gate/usr/src/uts/sun4v/cpu/generic_copy.S (revision 9164a50bf932130cbb5097a16f6986873ce0e6e5)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26# ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37
38#include "assym.h"
39
40
41/*
42 * Less then or equal this number of bytes we will always copy byte-for-byte
43 */
44#define	SMALL_LIMIT	7
45
46/*
47 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
48 * handler was set
49 */
50#define	LOFAULT_SET 2
51
52
53/*
54 * Copy a block of storage, returning an error code if `from' or
55 * `to' takes a kernel pagefault which cannot be resolved.
56 * Returns errno value on pagefault error, 0 if all ok
57 */
58
59
60
61	.seg	".text"
62	.align	4
63
64	ENTRY(kcopy)
65
66	save	%sp, -SA(MINFRAME), %sp
67	set	.copyerr, %l7			! copyerr is lofault value
68	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
69	or	%o5, LOFAULT_SET, %o5
70	membar	#Sync				! sync error barrier
71	b	.do_copy			! common code
72	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
73
74/*
75 * We got here because of a fault during kcopy.
76 * Errno value is in %g1.
77 */
78.copyerr:
79	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
80	! into %o5 to indicate it has set t_lofault handler. Need to clear
81	! LOFAULT_SET flag before restoring the error handler.
82	andn	%o5, LOFAULT_SET, %o5
83	membar	#Sync			! sync error barrier
84	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
85	ret
86	restore	%g1, 0, %o0
87
88	SET_SIZE(kcopy)
89
90
91/*
92 * Copy a block of storage - must not overlap (from + len <= to).
93 */
94
95	ENTRY(bcopy)
96
97	save	%sp, -SA(MINFRAME), %sp
98	clr	%o5			! flag LOFAULT_SET is not set for bcopy
99
100.do_copy:
101        mov     %i1, %g5                ! save dest addr start
102
103        mov     %i2, %l6                ! save size
104
105	cmp	%i2, 12			! for small counts
106	blu	%ncc, .bytecp		! just copy bytes
107	  .empty
108
109	!
110	! use aligned transfers where possible
111	!
112	xor	%i0, %i1, %o4		! xor from and to address
113	btst	7, %o4			! if lower three bits zero
114	bz	.aldoubcp		! can align on double boundary
115	.empty	! assembler complaints about label
116
117	xor	%i0, %i1, %o4		! xor from and to address
118	btst	3, %o4			! if lower two bits zero
119	bz	.alwordcp		! can align on word boundary
120	btst	3, %i0			! delay slot, from address unaligned?
121	!
122	! use aligned reads and writes where possible
123	! this differs from wordcp in that it copes
124	! with odd alignment between source and destnation
125	! using word reads and writes with the proper shifts
126	! in between to align transfers to and from memory
127	! i0 - src address, i1 - dest address, i2 - count
128	! i3, i4 - tmps for used generating complete word
129	! i5 (word to write)
130	! l0 size in bits of upper part of source word (US)
131	! l1 size in bits of lower part of source word (LS = 32 - US)
132	! l2 size in bits of upper part of destination word (UD)
133	! l3 size in bits of lower part of destination word (LD = 32 - UD)
134	! l4 number of bytes leftover after aligned transfers complete
135	! l5 the number 32
136	!
137	mov	32, %l5			! load an oft-needed constant
138	bz	.align_dst_only
139	btst	3, %i1			! is destnation address aligned?
140	clr	%i4			! clear registers used in either case
141	bz	.align_src_only
142	clr	%l0
143	!
144	! both source and destination addresses are unaligned
145	!
1461:					! align source
147	ldub	[%i0], %i3		! read a byte from source address
148	add	%i0, 1, %i0		! increment source address
149	or	%i4, %i3, %i4		! or in with previous bytes (if any)
150	btst	3, %i0			! is source aligned?
151	add	%l0, 8, %l0		! increment size of upper source (US)
152	bnz,a	1b
153	sll	%i4, 8, %i4		! make room for next byte
154
155	sub	%l5, %l0, %l1		! generate shift left count (LS)
156	sll	%i4, %l1, %i4		! prepare to get rest
157	ld	[%i0], %i3		! read a word
158	add	%i0, 4, %i0		! increment source address
159	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
160	or	%i4, %i5, %i5		! merge
161	mov	24, %l3			! align destination
1621:
163	srl	%i5, %l3, %i4		! prepare to write a single byte
164	stb	%i4, [%i1]		! write a byte
165	add	%i1, 1, %i1		! increment destination address
166	sub	%i2, 1, %i2		! decrement count
167	btst	3, %i1			! is destination aligned?
168	bnz,a	1b
169	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
170	sub	%l5, %l3, %l2		! generate shift left count (UD)
171	sll	%i5, %l2, %i5		! move leftover into upper bytes
172	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
173	bgu	%ncc, .more_needed	! need more to fill than we have
174	nop
175
176	sll	%i3, %l1, %i3		! clear upper used byte(s)
177	srl	%i3, %l1, %i3
178	! get the odd bytes between alignments
179	sub	%l0, %l2, %l0		! regenerate shift count
180	sub	%l5, %l0, %l1		! generate new shift left count (LS)
181	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
182	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
183	srl	%i3, %l0, %i4
184	or	%i5, %i4, %i5
185	st	%i5, [%i1]		! write a word
186	subcc	%i2, 4, %i2		! decrement count
187	bz	%ncc, .unalign_out
188	add	%i1, 4, %i1		! increment destination address
189
190	b	2f
191	sll	%i3, %l1, %i5		! get leftover into upper bits
192.more_needed:
193	sll	%i3, %l0, %i3		! save remaining byte(s)
194	srl	%i3, %l0, %i3
195	sub	%l2, %l0, %l1		! regenerate shift count
196	sub	%l5, %l1, %l0		! generate new shift left count
197	sll	%i3, %l1, %i4		! move to fill empty space
198	b	3f
199	or	%i5, %i4, %i5		! merge to complete word
200	!
201	! the source address is aligned and destination is not
202	!
203.align_dst_only:
204	ld	[%i0], %i4		! read a word
205	add	%i0, 4, %i0		! increment source address
206	mov	24, %l0			! initial shift alignment count
2071:
208	srl	%i4, %l0, %i3		! prepare to write a single byte
209	stb	%i3, [%i1]		! write a byte
210	add	%i1, 1, %i1		! increment destination address
211	sub	%i2, 1, %i2		! decrement count
212	btst	3, %i1			! is destination aligned?
213	bnz,a	1b
214	sub	%l0, 8, %l0		! delay slot, decrement shift count
215.xfer:
216	sub	%l5, %l0, %l1		! generate shift left count
217	sll	%i4, %l1, %i5		! get leftover
2183:
219	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
220	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2212:
222	ld	[%i0], %i3		! read a source word
223	add	%i0, 4, %i0		! increment source address
224	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
225	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
226	st	%i5, [%i1]		! write a destination word
227	subcc	%i2, 4, %i2		! decrement count
228	bz	%ncc, .unalign_out	! check if done
229	add	%i1, 4, %i1		! increment destination address
230	b	2b			! loop
231	sll	%i3, %l1, %i5		! get leftover
232.unalign_out:
233	tst	%l4			! any bytes leftover?
234	bz	%ncc, .cpdone
235	.empty				! allow next instruction in delay slot
2361:
237	sub	%l0, 8, %l0		! decrement shift
238	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
239	stb	%i4, [%i1]		! write a byte
240	subcc	%l4, 1, %l4		! decrement count
241	bz	%ncc, .cpdone		! done?
242	add	%i1, 1, %i1		! increment destination
243	tst	%l0			! any more previously read bytes
244	bnz	%ncc, 1b		! we have leftover bytes
245	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
246	b	.dbytecp		! let dbytecp do the rest
247	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
248	!
249	! the destination address is aligned and the source is not
250	!
251.align_src_only:
252	ldub	[%i0], %i3		! read a byte from source address
253	add	%i0, 1, %i0		! increment source address
254	or	%i4, %i3, %i4		! or in with previous bytes (if any)
255	btst	3, %i0			! is source aligned?
256	add	%l0, 8, %l0		! increment shift count (US)
257	bnz,a	.align_src_only
258	sll	%i4, 8, %i4		! make room for next byte
259	b,a	.xfer
260	!
261	! if from address unaligned for double-word moves,
262	! move bytes till it is, if count is < 56 it could take
263	! longer to align the thing than to do the transfer
264	! in word size chunks right away
265	!
266.aldoubcp:
267	cmp	%i2, 56			! if count < 56, use wordcp, it takes
268	blu,a	%ncc, .alwordcp		! longer to align doubles than words
269	mov	3, %o0			! mask for word alignment
270	call	.alignit		! copy bytes until aligned
271	mov	7, %o0			! mask for double alignment
272	!
273	! source and destination are now double-word aligned
274	! i3 has aligned count returned by alignit
275	!
276	and	%i2, 7, %i2		! unaligned leftover count
277	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2785:
279	ldx	[%i0+%i1], %o4		! read from address
280	stx	%o4, [%i1]		! write at destination address
281	subcc	%i3, 8, %i3		! dec count
282	bgu	%ncc, 5b
283	add	%i1, 8, %i1		! delay slot, inc to address
284	cmp	%i2, 4			! see if we can copy a word
285	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
286	.empty
287	!
288	! for leftover bytes we fall into wordcp, if needed
289	!
290.wordcp:
291	and	%i2, 3, %i2		! unaligned leftover count
2925:
293	ld	[%i0+%i1], %o4		! read from address
294	st	%o4, [%i1]		! write at destination address
295	subcc	%i3, 4, %i3		! dec count
296	bgu	%ncc, 5b
297	add	%i1, 4, %i1		! delay slot, inc to address
298	b,a	.dbytecp
299
300	! we come here to align copies on word boundaries
301.alwordcp:
302	call	.alignit		! go word-align it
303	mov	3, %o0			! bits that must be zero to be aligned
304	b	.wordcp
305	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
306
307	!
308	! byte copy, works with any alignment
309	!
310.bytecp:
311	b	.dbytecp
312	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
313
314	!
315	! differenced byte copy, works with any alignment
316	! assumes dest in %i1 and (source - dest) in %i0
317	!
3181:
319	stb	%o4, [%i1]		! write to address
320	inc	%i1			! inc to address
321.dbytecp:
322	deccc	%i2			! dec count
323	bgeu,a	%ncc, 1b		! loop till done
324	ldub	[%i0+%i1], %o4		! read from address
325.cpdone:
326	membar	#Sync				! sync error barrier
327	! Restore t_lofault handler, if came here from kcopy().
328	tst	%o5
329	bz	%ncc, 1f
330	andn	%o5, LOFAULT_SET, %o5
331	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3321:
333        mov     %g5, %o0                ! copy dest address
334        call    sync_icache
335        mov     %l6, %o1                ! saved size
336	ret
337	restore %g0, 0, %o0		! return (0)
338
339/*
340 * Common code used to align transfers on word and doubleword
341 * boudaries.  Aligns source and destination and returns a count
342 * of aligned bytes to transfer in %i3
343 */
3441:
345	inc	%i0			! inc from
346	stb	%o4, [%i1]		! write a byte
347	inc	%i1			! inc to
348	dec	%i2			! dec count
349.alignit:
350	btst	%o0, %i0		! %o0 is bit mask to check for alignment
351	bnz,a	1b
352	ldub	[%i0], %o4		! read next byte
353
354	retl
355	andn	%i2, %o0, %i3		! return size of aligned bytes
356	SET_SIZE(bcopy)
357
358/*
359 * Block copy with possibly overlapped operands.
360 */
361
362	ENTRY(ovbcopy)
363	tst	%o2			! check count
364	bgu,a	%ncc, 1f		! nothing to do or bad arguments
365	subcc	%o0, %o1, %o3		! difference of from and to address
366
367	retl				! return
368	nop
3691:
370	bneg,a	%ncc, 2f
371	neg	%o3			! if < 0, make it positive
3722:	cmp	%o2, %o3		! cmp size and abs(from - to)
373	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
374	.empty				!   no overlap
375	cmp	%o0, %o1		! compare from and to addresses
376	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
377	nop
378	!
379	! Copy forwards.
380	!
381.ov_fwd:
382	ldub	[%o0], %o3		! read from address
383	inc	%o0			! inc from address
384	stb	%o3, [%o1]		! write to address
385	deccc	%o2			! dec count
386	bgu	%ncc, .ov_fwd		! loop till done
387	inc	%o1			! inc to address
388
389	retl				! return
390	nop
391	!
392	! Copy backwards.
393	!
394.ov_bkwd:
395	deccc	%o2			! dec count
396	ldub	[%o0 + %o2], %o3	! get byte at end of src
397	bgu	%ncc, .ov_bkwd		! loop till done
398	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
399
400	retl				! return
401	nop
402	SET_SIZE(ovbcopy)
403
404/*
405 * hwblkpagecopy()
406 *
407 * Copies exactly one page.  This routine assumes the caller (ppcopy)
408 * has already disabled kernel preemption and has checked
409 * use_hw_bcopy.
410 */
411	ENTRY(hwblkpagecopy)
412	save	%sp, -SA(MINFRAME), %sp
413
414	! %i0 - source address (arg)
415	! %i1 - destination address (arg)
416	! %i2 - length of region (not arg)
417
418	set	PAGESIZE, %i2
419	mov     %i1,    %o0     ! store destination address for flushing
420
421	/*
422	 * Copying exactly one page and PAGESIZE is in mutliple of 0x80.
423	 */
4241:
425	ldx	[%i0+0x0], %l0
426	ldx	[%i0+0x8], %l1
427	ldx	[%i0+0x10], %l2
428	ldx	[%i0+0x18], %l3
429	ldx	[%i0+0x20], %l4
430	ldx	[%i0+0x28], %l5
431	ldx	[%i0+0x30], %l6
432	ldx	[%i0+0x38], %l7
433	stx	%l0, [%i1+0x0]
434	stx	%l1, [%i1+0x8]
435	stx	%l2, [%i1+0x10]
436	stx	%l3, [%i1+0x18]
437	stx	%l4, [%i1+0x20]
438	stx	%l5, [%i1+0x28]
439	stx	%l6, [%i1+0x30]
440	stx	%l7, [%i1+0x38]
441
442	ldx	[%i0+0x40], %l0
443	ldx	[%i0+0x48], %l1
444	ldx	[%i0+0x50], %l2
445	ldx	[%i0+0x58], %l3
446	ldx	[%i0+0x60], %l4
447	ldx	[%i0+0x68], %l5
448	ldx	[%i0+0x70], %l6
449	ldx	[%i0+0x78], %l7
450	stx	%l0, [%i1+0x40]
451	stx	%l1, [%i1+0x48]
452	stx	%l2, [%i1+0x50]
453	stx	%l3, [%i1+0x58]
454	stx	%l4, [%i1+0x60]
455	stx	%l5, [%i1+0x68]
456	stx	%l6, [%i1+0x70]
457	stx	%l7, [%i1+0x78]
458
459	add	%i0, 0x80, %i0
460	subcc	%i2, 0x80, %i2
461	bgu,pt	%xcc, 1b
462	add	%i1, 0x80, %i1
463
464	! %o0 contains the dest. address
465	set	PAGESIZE, %o1
466	call	sync_icache
467	nop
468
469	membar #Sync
470	ret
471	restore	%g0, 0, %o0
472	SET_SIZE(hwblkpagecopy)
473
474
475/*
476 * Transfer data to and from user space -
477 * Note that these routines can cause faults
478 * It is assumed that the kernel has nothing at
479 * less than KERNELBASE in the virtual address space.
480 *
481 * Note that copyin(9F) and copyout(9F) are part of the
482 * DDI/DKI which specifies that they return '-1' on "errors."
483 *
484 * Sigh.
485 *
486 * So there's two extremely similar routines - xcopyin() and xcopyout()
487 * which return the errno that we've faithfully computed.  This
488 * allows other callers (e.g. uiomove(9F)) to work correctly.
489 * Given that these are used pretty heavily, we expand the calling
490 * sequences inline for all flavours (rather than making wrappers).
491 *
492 * There are also stub routines for xcopyout_little and xcopyin_little,
493 * which currently are intended to handle requests of <= 16 bytes from
494 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
495 * is left as an exercise...
496 */
497
498/*
499 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
500 *
501 * General theory of operation:
502 *
503 * None of the copyops routines grab a window.
504 *
505 * Flow:
506 *
507 * If count == zero return zero.
508 *
509 * Store the previous lo_fault handler into %g6.
510 * Place our secondary lofault handler into %g5.
511 * Place the address of our fault handler into %o3.
512 *
513 * If count is less than or equal to SMALL_LIMIT (7) we
514 * always do a byte for byte copy.
515 *
516 * If count is > SMALL_LIMIT, we check the alignment of the input
517 * and output pointers.  We store -count in %o3, we store the number
518 * of chunks (8, 4, 2 or 1 byte) operated on in our basic copy loop
519 * in %o2. Following this we branch to the appropriate copy loop and
520 * copy that many chunks.  Since we've been adding the chunk size
521 * to %o3 each time through as well as decrementing %o2, we can tell
522 * if any data is is left to be copied by examining %o3. If that is
523 * zero, we're done and can go home. If not, we figure out what the
524 * largest chunk size left to be copied is and branch to that copy
525 * loop unless there's only one byte left. We load that as we're
526 * branching to code that stores it just before we return.
527 *
528 * Fault handlers are invoked if we reference memory that has no
529 * current mapping.  All forms share the same copyio_fault handler.
530 * This routine handles fixing up the stack and general housecleaning.
531 * Each copy operation has a simple fault handler that is then called
532 * to do the work specific to the invidual operation.  The handler
533 * for copyOP and xcopyOP are found at the end of individual function.
534 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
535 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
536 */
537
538/*
539 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
540 */
541
542/*
543 * We save the arguments in the following registers in case of a fault:
544 * 	kaddr - %g2
545 * 	uaddr - %g3
546 * 	count - %g4
547 */
548#define	SAVE_SRC	%g2
549#define	SAVE_DST	%g3
550#define	SAVE_COUNT	%g4
551
552#define	REAL_LOFAULT		%g5
553#define	SAVED_LOFAULT		%g6
554
555/*
556 * Generic copyio fault handler.  This is the first line of defense when a
557 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
558 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
559 * This allows us to share common code for all the flavors of the copy
560 * operations, including the _noerr versions.
561 *
562 * Note that this function will restore the original input parameters before
563 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
564 * member of the t_copyop structure, if needed.
565 */
566	ENTRY(copyio_fault)
567	membar	#Sync
568	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
569
570	mov	SAVE_SRC, %o0
571	mov	SAVE_DST, %o1
572	jmp	REAL_LOFAULT
573	  mov	SAVE_COUNT, %o2
574	SET_SIZE(copyio_fault)
575
576	ENTRY(copyout)
577	sethi	%hi(.copyout_err), REAL_LOFAULT
578	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
579
580.do_copyout:
581	!
582	! Check the length and bail if zero.
583	!
584	tst	%o2
585	bnz,pt	%ncc, 1f
586	  nop
587	retl
588	  clr	%o0
5891:
590	sethi	%hi(copyio_fault), %o3
591	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
592	or	%o3, %lo(copyio_fault), %o3
593	membar	#Sync
594	stn	%o3, [THREAD_REG + T_LOFAULT]
595
596	mov	%o0, SAVE_SRC
597	mov	%o1, SAVE_DST
598	mov	%o2, SAVE_COUNT
599
600	!
601	! Check to see if we're more than SMALL_LIMIT (7 bytes).
602	! Run in leaf mode, using the %o regs as our input regs.
603	!
604	subcc	%o2, SMALL_LIMIT, %o3
605	bgu,a,pt %ncc, .dco_ns
606	or	%o0, %o1, %o3
607
608.dcobcp:
609	sub	%g0, %o2, %o3		! negate count
610	add	%o0, %o2, %o0		! make %o0 point at the end
611	add	%o1, %o2, %o1		! make %o1 point at the end
612	ba,pt	%ncc, .dcocl
613	ldub	[%o0 + %o3], %o4	! load first byte
614	!
615	! %o0 and %o2 point at the end and remain pointing at the end
616	! of their buffers. We pull things out by adding %o3 (which is
617	! the negation of the length) to the buffer end which gives us
618	! the curent location in the buffers. By incrementing %o3 we walk
619	! through both buffers without having to bump each buffer's
620	! pointer. A very fast 4 instruction loop.
621	!
622	.align 16
623.dcocl:
624	stba	%o4, [%o1 + %o3]ASI_USER
625	inccc	%o3
626	bl,a,pt	%ncc, .dcocl
627	ldub	[%o0 + %o3], %o4
628	!
629	! We're done. Go home.
630	!
631	membar	#Sync
632	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
633	retl
634	clr	%o0
635	!
636	! Try aligned copies from here.
637	!
638.dco_ns:
639	! %o0 = kernel addr (to be copied from)
640	! %o1 = user addr (to be copied to)
641	! %o2 = length
642	! %o3 = %o1 | %o2 (used for alignment checking)
643	! %o4 is alternate lo_fault
644	! %o5 is original lo_fault
645	!
646	! See if we're single byte aligned. If we are, check the
647	! limit for single byte copies. If we're smaller or equal,
648	! bounce to the byte for byte copy loop. Otherwise do it in
649	! HW (if enabled).
650	!
651	btst	1, %o3
652	bz,pt	%icc, .dcoh8
653	btst	7, %o3
654
655	ba	.dcobcp
656	nop
657.dcoh8:
658	!
659	! 8 byte aligned?
660	!
661	bnz,a	%ncc, .dcoh4
662	btst	3, %o3
663.dcos8:
664	!
665	! Housekeeping for copy loops. Uses same idea as in the byte for
666	! byte copy loop above.
667	!
668	add	%o0, %o2, %o0
669	add	%o1, %o2, %o1
670	sub	%g0, %o2, %o3
671	ba,pt	%ncc, .dodebc
672	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
673	!
674	! 4 byte aligned?
675	!
676.dcoh4:
677	bnz,pn	%ncc, .dcoh2
678	nop
679.dcos4:
680	add	%o0, %o2, %o0
681	add	%o1, %o2, %o1
682	sub	%g0, %o2, %o3
683	ba,pt	%ncc, .dodfbc
684	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
685	!
686	! We must be 2 byte aligned. Off we go.
687	! The check for small copies was done in the
688	! delay at .dcoh4
689	!
690.dcoh2:
691.dcos2:
692	add	%o0, %o2, %o0
693	add	%o1, %o2, %o1
694	sub	%g0, %o2, %o3
695	ba,pt	%ncc, .dodtbc
696	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
697
698.dodebc:
699	ldx	[%o0 + %o3], %o4
700	deccc	%o2
701	stxa	%o4, [%o1 + %o3]ASI_USER
702	bg,pt	%ncc, .dodebc
703	addcc	%o3, 8, %o3
704	!
705	! End of copy loop. Check to see if we're done. Most
706	! eight byte aligned copies end here.
707	!
708	bz,pt	%ncc, .dcofh
709	nop
710	!
711	! Something is left - do it byte for byte.
712	!
713	ba,pt	%ncc, .dcocl
714	ldub	[%o0 + %o3], %o4	! load next byte
715	!
716	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
717	!
718	.align 32
719.dodfbc:
720	lduw	[%o0 + %o3], %o4
721	deccc	%o2
722	sta	%o4, [%o1 + %o3]ASI_USER
723	bg,pt	%ncc, .dodfbc
724	addcc	%o3, 4, %o3
725	!
726	! End of copy loop. Check to see if we're done. Most
727	! four byte aligned copies end here.
728	!
729	bz,pt	%ncc, .dcofh
730	nop
731	!
732	! Something is left. Do it byte for byte.
733	!
734	ba,pt	%ncc, .dcocl
735	ldub	[%o0 + %o3], %o4	! load next byte
736	!
737	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
738	! copy.
739	!
740	.align 32
741.dodtbc:
742	lduh	[%o0 + %o3], %o4
743	deccc	%o2
744	stha	%o4, [%o1 + %o3]ASI_USER
745	bg,pt	%ncc, .dodtbc
746	addcc	%o3, 2, %o3
747	!
748	! End of copy loop. Anything left?
749	!
750	bz,pt	%ncc, .dcofh
751	nop
752	!
753	! Deal with the last byte
754	!
755	ldub	[%o0 + %o3], %o4
756	stba	%o4, [%o1 + %o3]ASI_USER
757.dcofh:
758	membar	#Sync
759	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
760	retl
761	clr	%o0
762
763.copyout_err:
764	ldn	[THREAD_REG + T_COPYOPS], %o4
765	brz	%o4, 2f
766	nop
767	ldn	[%o4 + CP_COPYOUT], %g2
768	jmp	%g2
769	nop
7702:
771	retl
772	mov	-1, %o0
773	SET_SIZE(copyout)
774
775
776	ENTRY(xcopyout)
777	sethi	%hi(.xcopyout_err), REAL_LOFAULT
778	b	.do_copyout
779	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
780.xcopyout_err:
781	ldn	[THREAD_REG + T_COPYOPS], %o4
782	brz	%o4, 2f
783	nop
784	ldn	[%o4 + CP_XCOPYOUT], %g2
785	jmp	%g2
786	nop
7872:
788	retl
789	mov	%g1, %o0
790	SET_SIZE(xcopyout)
791
792	ENTRY(xcopyout_little)
793	sethi	%hi(.little_err), %o4
794	ldn	[THREAD_REG + T_LOFAULT], %o5
795	or	%o4, %lo(.little_err), %o4
796	membar	#Sync			! sync error barrier
797	stn	%o4, [THREAD_REG + T_LOFAULT]
798
799	subcc	%g0, %o2, %o3
800	add	%o0, %o2, %o0
801	bz,pn	%ncc, 2f		! check for zero bytes
802	sub	%o2, 1, %o4
803	add	%o0, %o4, %o0		! start w/last byte
804	add	%o1, %o2, %o1
805	ldub	[%o0+%o3], %o4
806
8071:	stba	%o4, [%o1+%o3]ASI_AIUSL
808	inccc	%o3
809	sub	%o0, 2, %o0		! get next byte
810	bcc,a,pt %ncc, 1b
811	  ldub	[%o0+%o3], %o4
812
8132:	membar	#Sync			! sync error barrier
814	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
815	retl
816	mov	%g0, %o0		! return (0)
817	SET_SIZE(xcopyout_little)
818
819/*
820 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
821 */
822
823	ENTRY(copyin)
824	sethi	%hi(.copyin_err), REAL_LOFAULT
825	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
826
827.do_copyin:
828	!
829	! Check the length and bail if zero.
830	!
831	tst	%o2
832	bnz,pt	%ncc, 1f
833	  nop
834	retl
835	  clr	%o0
8361:
837	sethi	%hi(copyio_fault), %o3
838	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
839	or	%o3, %lo(copyio_fault), %o3
840	membar	#Sync
841	stn	%o3, [THREAD_REG + T_LOFAULT]
842
843	mov	%o0, SAVE_SRC
844	mov	%o1, SAVE_DST
845	mov	%o2, SAVE_COUNT
846
847	!
848	! Check to see if we're more than SMALL_LIMIT.
849	!
850	subcc	%o2, SMALL_LIMIT, %o3
851	bgu,a,pt %ncc, .dci_ns
852	or	%o0, %o1, %o3
853
854.dcibcp:
855	sub	%g0, %o2, %o3		! setup for copy loop
856	add	%o0, %o2, %o0
857	add	%o1, %o2, %o1
858	ba,pt	%ncc, .dcicl
859	lduba	[%o0 + %o3]ASI_USER, %o4
860	!
861	! %o0 and %o1 point at the end and remain pointing at the end
862	! of their buffers. We pull things out by adding %o3 (which is
863	! the negation of the length) to the buffer end which gives us
864	! the curent location in the buffers. By incrementing %o3 we walk
865	! through both buffers without having to bump each buffer's
866	! pointer. A very fast 4 instruction loop.
867	!
868	.align 16
869.dcicl:
870	stb	%o4, [%o1 + %o3]
871	inccc	%o3
872	bl,a,pt %ncc, .dcicl
873	lduba	[%o0 + %o3]ASI_USER, %o4
874	!
875	! We're done. Go home.
876	!
877	membar	#Sync
878	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
879	retl
880	clr	%o0
881	!
882	! Try aligned copies from here.
883	!
884.dci_ns:
885	!
886	! See if we're single byte aligned. If we are, check the
887	! limit for single byte copies. If we're smaller, or equal,
888	! bounce to the byte for byte copy loop. Otherwise do it in
889	! HW (if enabled).
890	!
891	btst	1, %o3
892	bz,a,pt	%icc, .dcih8
893	btst	7, %o3
894	ba	.dcibcp
895	nop
896
897.dcih8:
898	!
899	! 8 byte aligned?
900	!
901	bnz,a	%ncc, .dcih4
902	btst	3, %o3
903.dcis8:
904	!
905	! Housekeeping for copy loops. Uses same idea as in the byte for
906	! byte copy loop above.
907	!
908	add	%o0, %o2, %o0
909	add	%o1, %o2, %o1
910	sub	%g0, %o2, %o3
911	ba,pt	%ncc, .didebc
912	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
913	!
914	! 4 byte aligned?
915	!
916.dcih4:
917	bnz	%ncc, .dcih2
918	nop
919.dcis4:
920	!
921	! Housekeeping for copy loops. Uses same idea as in the byte
922	! for byte copy loop above.
923	!
924	add	%o0, %o2, %o0
925	add	%o1, %o2, %o1
926	sub	%g0, %o2, %o3
927	ba,pt	%ncc, .didfbc
928	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
929.dcih2:
930.dcis2:
931	add	%o0, %o2, %o0
932	add	%o1, %o2, %o1
933	sub	%g0, %o2, %o3
934	ba,pt	%ncc, .didtbc
935	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
936
937.didebc:
938	ldxa	[%o0 + %o3]ASI_USER, %o4
939	deccc	%o2
940	stx	%o4, [%o1 + %o3]
941	bg,pt	%ncc, .didebc
942	addcc	%o3, 8, %o3
943	!
944	! End of copy loop. Most 8 byte aligned copies end here.
945	!
946	bz,pt	%ncc, .dcifh
947	nop
948	!
949	! Something is left. Do it byte for byte.
950	!
951	ba,pt	%ncc, .dcicl
952	lduba	[%o0 + %o3]ASI_USER, %o4
953	!
954	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
955	!
956	.align 32
957.didfbc:
958	lduwa	[%o0 + %o3]ASI_USER, %o4
959	deccc	%o2
960	st	%o4, [%o1 + %o3]
961	bg,pt	%ncc, .didfbc
962	addcc	%o3, 4, %o3
963	!
964	! End of copy loop. Most 4 byte aligned copies end here.
965	!
966	bz,pt	%ncc, .dcifh
967	nop
968	!
969	! Something is left. Do it byte for byte.
970	!
971	ba,pt	%ncc, .dcicl
972	lduba	[%o0 + %o3]ASI_USER, %o4
973	!
974	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
975	! copy.
976	!
977	.align 32
978.didtbc:
979	lduha	[%o0 + %o3]ASI_USER, %o4
980	deccc	%o2
981	sth	%o4, [%o1 + %o3]
982	bg,pt	%ncc, .didtbc
983	addcc	%o3, 2, %o3
984	!
985	! End of copy loop. Most 2 byte aligned copies end here.
986	!
987	bz,pt	%ncc, .dcifh
988	nop
989	!
990	! Deal with the last byte
991	!
992	lduba	[%o0 + %o3]ASI_USER, %o4
993	stb	%o4, [%o1 + %o3]
994.dcifh:
995	membar	#Sync
996	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
997	retl
998	clr	%o0
999
1000.copyin_err:
1001	ldn	[THREAD_REG + T_COPYOPS], %o4
1002	brz	%o4, 2f
1003	nop
1004	ldn	[%o4 + CP_COPYIN], %g2
1005	jmp	%g2
1006	nop
10072:
1008	retl
1009	mov	-1, %o0
1010	SET_SIZE(copyin)
1011
1012	ENTRY(xcopyin)
1013	sethi	%hi(.xcopyin_err), REAL_LOFAULT
1014	b	.do_copyin
1015	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
1016.xcopyin_err:
1017	ldn	[THREAD_REG + T_COPYOPS], %o4
1018	brz	%o4, 2f
1019	nop
1020	ldn	[%o4 + CP_XCOPYIN], %g2
1021	jmp	%g2
1022	nop
10232:
1024	retl
1025	mov	%g1, %o0
1026	SET_SIZE(xcopyin)
1027
1028	ENTRY(xcopyin_little)
1029	sethi	%hi(.little_err), %o4
1030	ldn	[THREAD_REG + T_LOFAULT], %o5
1031	or	%o4, %lo(.little_err), %o4
1032	membar	#Sync				! sync error barrier
1033	stn	%o4, [THREAD_REG + T_LOFAULT]
1034
1035	subcc	%g0, %o2, %o3
1036	add	%o0, %o2, %o0
1037	bz,pn	%ncc, 2f		! check for zero bytes
1038	sub	%o2, 1, %o4
1039	add	%o0, %o4, %o0		! start w/last byte
1040	add	%o1, %o2, %o1
1041	lduba	[%o0+%o3]ASI_AIUSL, %o4
1042
10431:	stb	%o4, [%o1+%o3]
1044	inccc	%o3
1045	sub	%o0, 2, %o0		! get next byte
1046	bcc,a,pt %ncc, 1b
1047	  lduba	[%o0+%o3]ASI_AIUSL, %o4
1048
10492:	membar	#Sync				! sync error barrier
1050	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1051	retl
1052	mov	%g0, %o0		! return (0)
1053
1054.little_err:
1055	membar	#Sync				! sync error barrier
1056	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1057	retl
1058	mov	%g1, %o0
1059	SET_SIZE(xcopyin_little)
1060
1061
1062/*
1063 * Copy a block of storage - must not overlap (from + len <= to).
1064 * No fault handler installed (to be called under on_fault())
1065 */
1066
1067	ENTRY(copyin_noerr)
1068	sethi	%hi(.copyio_noerr), REAL_LOFAULT
1069	b	.do_copyin
1070	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
1071.copyio_noerr:
1072	jmp	SAVED_LOFAULT
1073	  nop
1074	SET_SIZE(copyin_noerr)
1075
1076/*
1077 * Copy a block of storage - must not overlap (from + len <= to).
1078 * No fault handler installed (to be called under on_fault())
1079 */
1080
1081	ENTRY(copyout_noerr)
1082	sethi	%hi(.copyio_noerr), REAL_LOFAULT
1083	b	.do_copyout
1084	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
1085	SET_SIZE(copyout_noerr)
1086
1087	.align	4
1088	DGDEF(use_hw_bcopy)
1089	.word	1
1090	DGDEF(use_hw_bzero)
1091	.word	1
1092
1093	.align	64
1094	.section ".text"
1095
1096
1097/*
1098 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
1099 * longer than 256 bytes in length. For the generic module we will simply
1100 * call bzero and return 1 to ensure that the pages in cache should be
1101 * flushed to ensure integrity.
1102 * Caller is responsible for ensuring use_hw_bzero is true and that
1103 * kpreempt_disable() has been called.
1104 */
1105	! %i0 - start address
1106	! %i1 - length of region (multiple of 64)
1107
1108	ENTRY(hwblkclr)
1109	save	%sp, -SA(MINFRAME), %sp
1110
1111	! Simply call bzero and notify the caller that bzero was used
1112	mov	%i0, %o0
1113	call	bzero
1114	  mov	%i1, %o1
1115	ret
1116	restore	%g0, 1, %o0	! return (1) - did not use block operations
1117
1118	SET_SIZE(hwblkclr)
1119
1120	/*
1121	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
1122	 * using physical addresses.
1123	 */
1124	ENTRY_NP(hw_pa_bcopy32)
1125	rdpr    %pstate, %g1
1126	andn    %g1, PSTATE_IE, %g2
1127	wrpr    %g0, %g2, %pstate
1128
1129	ldxa    [%o0]ASI_MEM, %o2
1130	add     %o0, 8, %o0
1131	ldxa    [%o0]ASI_MEM, %o3
1132	add     %o0, 8, %o0
1133	ldxa    [%o0]ASI_MEM, %o4
1134	add     %o0, 8, %o0
1135	ldxa    [%o0]ASI_MEM, %o5
1136	stxa    %o2, [%o1]ASI_MEM
1137	add     %o1, 8, %o1
1138	stxa    %o3, [%o1]ASI_MEM
1139	add     %o1, 8, %o1
1140	stxa    %o4, [%o1]ASI_MEM
1141	add     %o1, 8, %o1
1142	stxa    %o5, [%o1]ASI_MEM
1143
1144	membar	#Sync
1145	retl
1146	  wrpr    %g0, %g1, %pstate
1147	SET_SIZE(hw_pa_bcopy32)
1148
1149/*
1150 * Zero a block of storage.
1151 *
1152 * uzero is used by the kernel to zero a block in user address space.
1153 */
1154
1155
1156	ENTRY(uzero)
1157	!
1158	! Set a new lo_fault handler only if we came in with one
1159	! already specified.
1160	!
1161	wr	%g0, ASI_USER, %asi
1162	ldn	[THREAD_REG + T_LOFAULT], %o5
1163	tst	%o5
1164	bz,pt	%ncc, .do_zero
1165	sethi	%hi(.zeroerr), %o2
1166	or	%o2, %lo(.zeroerr), %o2
1167	membar	#Sync
1168	ba,pt	%ncc, .do_zero
1169	stn	%o2, [THREAD_REG + T_LOFAULT]
1170
1171	ENTRY(kzero)
1172	!
1173	! Always set a lo_fault handler
1174	!
1175	wr	%g0, ASI_P, %asi
1176	ldn	[THREAD_REG + T_LOFAULT], %o5
1177	sethi	%hi(.zeroerr), %o2
1178	or	%o5, LOFAULT_SET, %o5
1179	or	%o2, %lo(.zeroerr), %o2
1180	membar	#Sync
1181	ba,pt	%ncc, .do_zero
1182	stn	%o2, [THREAD_REG + T_LOFAULT]
1183
1184/*
1185 * We got here because of a fault during kzero or if
1186 * uzero or bzero was called with t_lofault non-zero.
1187 * Otherwise we've already run screaming from the room.
1188 * Errno value is in %g1. Note that we're here iff
1189 * we did set t_lofault.
1190 */
1191.zeroerr:
1192	!
1193	! Undo asi register setting. Just set it to be the
1194        ! kernel default without checking.
1195	!
1196	wr	%g0, ASI_P, %asi
1197
1198	!
1199	! We did set t_lofault. It may well have been zero coming in.
1200	!
12011:
1202	tst	%o5
1203	membar #Sync
1204	bne,pn	%ncc, 3f
1205	andncc	%o5, LOFAULT_SET, %o5
12062:
1207	!
1208	! Old handler was zero. Just return the error.
1209	!
1210	retl				! return
1211	mov	%g1, %o0		! error code from %g1
12123:
1213	!
1214	! We're here because %o5 was non-zero. It was non-zero
1215	! because either LOFAULT_SET was present, a previous fault
1216	! handler was present or both. In all cases we need to reset
1217	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
1218	! before we either simply return the error or we invoke the
1219	! previously specified handler.
1220	!
1221	be	%ncc, 2b
1222	stn	%o5, [THREAD_REG + T_LOFAULT]
1223	jmp	%o5			! goto real handler
1224	  nop
1225	SET_SIZE(kzero)
1226	SET_SIZE(uzero)
1227
1228/*
1229 * Zero a block of storage.
1230 */
1231
1232	ENTRY(bzero)
1233	wr	%g0, ASI_P, %asi
1234
1235	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
1236	tst	%o5
1237	bz,pt	%ncc, .do_zero
1238	sethi	%hi(.zeroerr), %o2
1239	or	%o2, %lo(.zeroerr), %o2
1240	membar	#Sync				! sync error barrier
1241	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1242
1243.do_zero:
1244	cmp	%o1, 7
1245	blu,pn	%ncc, .byteclr
1246	nop
1247
1248	cmp	%o1, 15
1249	blu,pn	%ncc, .wdalign
1250	nop
1251
1252	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
1253	bz,pt	%ncc, .blkalign		! already double aligned
1254	sub	%o3, 8, %o3		! -(bytes till double aligned)
1255	add	%o1, %o3, %o1		! update o1 with new count
1256
12571:
1258	stba	%g0, [%o0]%asi
1259	inccc	%o3
1260	bl,pt	%ncc, 1b
1261	inc	%o0
1262
1263	! Now address is double aligned
1264.blkalign:
1265	cmp	%o1, 0x80		! check if there are 128 bytes to set
1266	blu,pn	%ncc, .bzero_small
1267	mov	%o1, %o3
1268
1269	andcc	%o0, 0x3f, %o3		! is block aligned?
1270	bz,pt	%ncc, .bzero_blk
1271	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
1272	add	%o1, %o3, %o1		! o1 is the remainder
1273
1274	! Clear -(%o3) bytes till block aligned
12751:
1276	stxa	%g0, [%o0]%asi
1277	addcc	%o3, 8, %o3
1278	bl,pt	%ncc, 1b
1279	add	%o0, 8, %o0
1280
1281.bzero_blk:
1282	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
1283	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
1284
1285	cmp	%o4, 0x100		! 256 bytes or more
1286	blu,pn	%ncc, 3f
1287	nop
1288
12892:
1290	stxa	%g0, [%o0+0x0]%asi
1291	stxa	%g0, [%o0+0x40]%asi
1292	stxa	%g0, [%o0+0x80]%asi
1293	stxa	%g0, [%o0+0xc0]%asi
1294
1295	stxa	%g0, [%o0+0x8]%asi
1296	stxa	%g0, [%o0+0x10]%asi
1297	stxa	%g0, [%o0+0x18]%asi
1298	stxa	%g0, [%o0+0x20]%asi
1299	stxa	%g0, [%o0+0x28]%asi
1300	stxa	%g0, [%o0+0x30]%asi
1301	stxa	%g0, [%o0+0x38]%asi
1302
1303	stxa	%g0, [%o0+0x48]%asi
1304	stxa	%g0, [%o0+0x50]%asi
1305	stxa	%g0, [%o0+0x58]%asi
1306	stxa	%g0, [%o0+0x60]%asi
1307	stxa	%g0, [%o0+0x68]%asi
1308	stxa	%g0, [%o0+0x70]%asi
1309	stxa	%g0, [%o0+0x78]%asi
1310
1311	stxa	%g0, [%o0+0x88]%asi
1312	stxa	%g0, [%o0+0x90]%asi
1313	stxa	%g0, [%o0+0x98]%asi
1314	stxa	%g0, [%o0+0xa0]%asi
1315	stxa	%g0, [%o0+0xa8]%asi
1316	stxa	%g0, [%o0+0xb0]%asi
1317	stxa	%g0, [%o0+0xb8]%asi
1318
1319	stxa	%g0, [%o0+0xc8]%asi
1320	stxa	%g0, [%o0+0xd0]%asi
1321	stxa	%g0, [%o0+0xd8]%asi
1322	stxa	%g0, [%o0+0xe0]%asi
1323	stxa	%g0, [%o0+0xe8]%asi
1324	stxa	%g0, [%o0+0xf0]%asi
1325	stxa	%g0, [%o0+0xf8]%asi
1326
1327	sub	%o4, 0x100, %o4
1328	cmp	%o4, 0x100
1329	bgu,pt	%ncc, 2b
1330	add	%o0, 0x100, %o0
1331
13323:
1333	! ... check if 64 bytes to set
1334	cmp	%o4, 0x40
1335	blu	%ncc, .bzero_blk_done
1336	nop
1337
13384:
1339	stxa	%g0, [%o0+0x0]%asi
1340	stxa	%g0, [%o0+0x8]%asi
1341	stxa	%g0, [%o0+0x10]%asi
1342	stxa	%g0, [%o0+0x18]%asi
1343	stxa	%g0, [%o0+0x20]%asi
1344	stxa	%g0, [%o0+0x28]%asi
1345	stxa	%g0, [%o0+0x30]%asi
1346	stxa	%g0, [%o0+0x38]%asi
1347
1348	subcc	%o4, 0x40, %o4
1349	bgu,pt	%ncc, 3b
1350	add	%o0, 0x40, %o0
1351
1352.bzero_blk_done:
1353	membar	#Sync
1354
1355.bzero_small:
1356	! Set the remaining doubles
1357	subcc	%o3, 8, %o3		! Can we store any doubles?
1358	blu,pn	%ncc, .byteclr
1359	and	%o1, 7, %o1		! calc bytes left after doubles
1360
1361.dbclr:
1362	stxa	%g0, [%o0]%asi		! Clear the doubles
1363	subcc	%o3, 8, %o3
1364	bgeu,pt	%ncc, .dbclr
1365	add	%o0, 8, %o0
1366
1367	ba	.byteclr
1368	nop
1369
1370.wdalign:
1371	andcc	%o0, 3, %o3		! is add aligned on a word boundary
1372	bz,pn	%ncc, .wdclr
1373	andn	%o1, 3, %o3		! create word sized count in %o3
1374
1375	dec	%o1			! decrement count
1376	stba	%g0, [%o0]%asi		! clear a byte
1377	ba	.wdalign
1378	inc	%o0			! next byte
1379
1380.wdclr:
1381	sta	%g0, [%o0]%asi		! 4-byte clearing loop
1382	subcc	%o3, 4, %o3
1383	bnz,pt	%ncc, .wdclr
1384	inc	4, %o0
1385
1386	and	%o1, 3, %o1		! leftover count, if any
1387
1388.byteclr:
1389	! Set the leftover bytes
1390	brz	%o1, .bzero_exit
1391	nop
1392
13937:
1394	deccc	%o1			! byte clearing loop
1395	stba	%g0, [%o0]%asi
1396	bgu,pt	%ncc, 7b
1397	inc	%o0
1398
1399.bzero_exit:
1400	!
1401	! We're just concerned with whether t_lofault was set
1402	! when we came in. We end up here from either kzero()
1403	! or bzero(). kzero() *always* sets a lofault handler.
1404	! It ors LOFAULT_SET into %o5 to indicate it has done
1405	! this even if the value of %o5 is otherwise zero.
1406	! bzero() sets a lofault handler *only* if one was
1407	! previously set. Accordingly we need to examine
1408	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
1409	! before resetting the error handler.
1410	!
1411	tst	%o5
1412	bz	%ncc, 1f
1413	andn	%o5, LOFAULT_SET, %o5
1414	membar	#Sync				! sync error barrier
1415	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14161:
1417	retl
1418	clr	%o0			! return (0)
1419
1420	SET_SIZE(bzero)
1421