xref: /titanic_52/usr/src/uts/sun4v/cpu/generic_copy.s (revision fd9cb95cbb2f626355a60efb9d02c5f0a33c10e6)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/clock.h>
35#include <sys/asi.h>
36#include <sys/fsr.h>
37#include <sys/privregs.h>
38
39#if !defined(lint)
40#include "assym.h"
41#endif	/* lint */
42
43
44/*
45 * Less than or equal to this number of bytes we will always copy byte-for-byte
46 */
47#define	SMALL_LIMIT	7
48
49/*
50 * Flags used by uzero/kzero/bzero functions. They set lower bits of
51 * the t_lofault address :
52 * LOFAULT_SET : Set by kzero to indicate that lo_fault handler was set
53 */
54#define	LOFAULT_SET 2
55
56
57/*
58 * Copy a block of storage, returning an error code if `from' or
59 * `to' takes a kernel pagefault which cannot be resolved.
60 * Returns errno value on pagefault error, 0 if all ok
61 */
62
63
64
65#if defined(lint)
66
67/* ARGSUSED */
68int
69kcopy(const void *from, void *to, size_t count)
70{ return(0); }
71
72#else	/* lint */
73
74	.seg	".text"
75	.align	4
76
/*
 * kcopy(from, to, count)
 *
 * Installs .copyerr as the thread's t_lofault handler and then runs the
 * common bcopy code at .do_copy.  The previous handler is carried in %o5
 * with LOFAULT_SET or'ed in, so that the shared exit path (.cpdone) can
 * tell a kcopy invocation from a plain bcopy and put the old handler back.
 * Without that flag a successful kcopy would return with t_lofault still
 * pointing at .copyerr.
 *
 * Returns 0 on success, or the errno value found in %g1 when a fault
 * vectors to .copyerr.
 *
 * %o5 must not be used as a scratch register anywhere between .do_copy
 * and .cpdone.
 */
77	ENTRY(kcopy)
78
79	save	%sp, -SA(MINFRAME), %sp
80	set	.copyerr, %o5		! copyerr is lofault value
81	ldn	[THREAD_REG + T_LOFAULT], %l7	! save existing handler
82	membar	#Sync			! sync error barrier (see copy.s)
83	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
84	b	.do_copy		! common code
85	  or	%l7, LOFAULT_SET, %o5	! delay slot, %o5 = old handler | flag
86
87/*
88 * We got here because of a fault during kcopy.
89 * Errno value is in %g1.
 * %o5 holds the old t_lofault value with LOFAULT_SET or'ed in; the flag
 * has to be stripped before the handler is written back.
90 */
91.copyerr:
	andn	%o5, LOFAULT_SET, %o5	! recover the real handler address
92	membar	#Sync			! sync error barrier
93	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
94	ret
95	restore	%g1, 0, %o0
96
97	SET_SIZE(kcopy)
98#endif	/* lint */
99
100
101/*
102 * Copy a block of storage - must not overlap (from + len <= to).
103 *
 * Counts below 12 are copied a byte at a time.  Otherwise the copy is
 * done with doubleword or word transfers when source and destination
 * share the same alignment, and with word reads/shifts/word writes when
 * they do not.
 *
 * NOTE(review): an earlier version of this comment also read "Copy a page
 * of memory.  Assumes double word alignment and a count >= 256."  The code
 * below makes no such assumption; that text looks like a leftover.
106 */
107#if defined(lint)
108
109/* ARGSUSED */
110void
111bcopy(const void *from, void *to, size_t count)
112{}
113
114#else	/* lint */
115
116	ENTRY(bcopy)
117
118	save	%sp, -SA(MINFRAME), %sp
	clr	%o5			! no LOFAULT_SET: nothing to restore at .cpdone
119
	!
	! Common entry for kcopy and bcopy.
	! i0 - src address, i1 - dest address, i2 - count
	! o5 - 0 for bcopy, (old t_lofault | LOFAULT_SET) for kcopy
	!
120.do_copy:
121	cmp	%i2, 12			! for small counts
122	blu	%ncc, .bytecp		! just copy bytes
123	  .empty
124
125	!
126	! use aligned transfers where possible
127	!
128	xor	%i0, %i1, %o4		! xor from and to address
129	btst	7, %o4			! if lower three bits zero
130	bz	.aldoubcp		! can align on double boundary
131	.empty	! assembler complaints about label
132
133	xor	%i0, %i1, %o4		! xor from and to address
134	btst	3, %o4			! if lower two bits zero
135	bz	.alwordcp		! can align on word boundary
136	btst	3, %i0			! delay slot, from address unaligned?
137	!
138	! use aligned reads and writes where possible
139	! this differs from wordcp in that it copes
140	! with odd alignment between source and destination
141	! using word reads and writes with the proper shifts
142	! in between to align transfers to and from memory
143	! i0 - src address, i1 - dest address, i2 - count
144	! i3, i4 - tmps for used generating complete word
145	! i5 (word to write)
146	! l0 size in bits of upper part of source word (US)
147	! l1 size in bits of lower part of source word (LS = 32 - US)
148	! l2 size in bits of upper part of destination word (UD)
149	! l3 size in bits of lower part of destination word (LD = 32 - UD)
150	! l4 number of bytes leftover after aligned transfers complete
151	! l5 the number 32
152	!
153	mov	32, %l5			! load an oft-needed constant
154	bz	.align_dst_only
155	btst	3, %i1			! is destination address aligned?
156	clr	%i4			! clear registers used in either case
157	bz	.align_src_only
158	clr	%l0
159	!
160	! both source and destination addresses are unaligned
161	!
1621:					! align source
163	ldub	[%i0], %i3		! read a byte from source address
164	add	%i0, 1, %i0		! increment source address
165	or	%i4, %i3, %i4		! or in with previous bytes (if any)
166	btst	3, %i0			! is source aligned?
167	add	%l0, 8, %l0		! increment size of upper source (US)
168	bnz,a	1b
169	sll	%i4, 8, %i4		! make room for next byte
170
171	sub	%l5, %l0, %l1		! generate shift left count (LS)
172	sll	%i4, %l1, %i4		! prepare to get rest
173	ld	[%i0], %i3		! read a word
174	add	%i0, 4, %i0		! increment source address
175	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
176	or	%i4, %i5, %i5		! merge
177	mov	24, %l3			! align destination
1781:
179	srl	%i5, %l3, %i4		! prepare to write a single byte
180	stb	%i4, [%i1]		! write a byte
181	add	%i1, 1, %i1		! increment destination address
182	sub	%i2, 1, %i2		! decrement count
183	btst	3, %i1			! is destination aligned?
184	bnz,a	1b
185	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
186	sub	%l5, %l3, %l2		! generate shift left count (UD)
187	sll	%i5, %l2, %i5		! move leftover into upper bytes
188	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
189	bgu	%ncc, .more_needed	! need more to fill than we have
190	nop
191
192	sll	%i3, %l1, %i3		! clear upper used byte(s)
193	srl	%i3, %l1, %i3
194	! get the odd bytes between alignments
195	sub	%l0, %l2, %l0		! regenerate shift count
196	sub	%l5, %l0, %l1		! generate new shift left count (LS)
197	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
198	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
199	srl	%i3, %l0, %i4
200	or	%i5, %i4, %i5
201	st	%i5, [%i1]		! write a word
202	subcc	%i2, 4, %i2		! decrement count
203	bz	%ncc, .unalign_out
204	add	%i1, 4, %i1		! increment destination address
205
206	b	2f
207	sll	%i3, %l1, %i5		! get leftover into upper bits
208.more_needed:
209	sll	%i3, %l0, %i3		! save remaining byte(s)
210	srl	%i3, %l0, %i3
211	sub	%l2, %l0, %l1		! regenerate shift count
212	sub	%l5, %l1, %l0		! generate new shift left count
213	sll	%i3, %l1, %i4		! move to fill empty space
214	b	3f
215	or	%i5, %i4, %i5		! merge to complete word
216	!
217	! the source address is aligned and destination is not
218	!
219.align_dst_only:
220	ld	[%i0], %i4		! read a word
221	add	%i0, 4, %i0		! increment source address
222	mov	24, %l0			! initial shift alignment count
2231:
224	srl	%i4, %l0, %i3		! prepare to write a single byte
225	stb	%i3, [%i1]		! write a byte
226	add	%i1, 1, %i1		! increment destination address
227	sub	%i2, 1, %i2		! decrement count
228	btst	3, %i1			! is destination aligned?
229	bnz,a	1b
230	sub	%l0, 8, %l0		! delay slot, decrement shift count
231.xfer:
232	sub	%l5, %l0, %l1		! generate shift left count
233	sll	%i4, %l1, %i5		! get leftover
2343:
235	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
236	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2372:
238	ld	[%i0], %i3		! read a source word
239	add	%i0, 4, %i0		! increment source address
240	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
241	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
242	st	%i5, [%i1]		! write a destination word
243	subcc	%i2, 4, %i2		! decrement count
244	bz	%ncc, .unalign_out	! check if done
245	add	%i1, 4, %i1		! increment destination address
246	b	2b			! loop
247	sll	%i3, %l1, %i5		! get leftover
248.unalign_out:
249	tst	%l4			! any bytes leftover?
250	bz	%ncc, .cpdone
251	.empty				! allow next instruction in delay slot
2521:
253	sub	%l0, 8, %l0		! decrement shift
254	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
255	stb	%i4, [%i1]		! write a byte
256	subcc	%l4, 1, %l4		! decrement count
257	bz	%ncc, .cpdone		! done?
258	add	%i1, 1, %i1		! increment destination
259	tst	%l0			! any more previously read bytes
260	bnz	%ncc, 1b		! we have leftover bytes
261	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
262	b	.dbytecp		! let dbytecp do the rest
263	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
264	!
265	! the destination address is aligned and the source is not
266	!
267.align_src_only:
268	ldub	[%i0], %i3		! read a byte from source address
269	add	%i0, 1, %i0		! increment source address
270	or	%i4, %i3, %i4		! or in with previous bytes (if any)
271	btst	3, %i0			! is source aligned?
272	add	%l0, 8, %l0		! increment shift count (US)
273	bnz,a	.align_src_only
274	sll	%i4, 8, %i4		! make room for next byte
275	b,a	.xfer
276	!
277	! if from address unaligned for double-word moves,
278	! move bytes till it is, if count is < 56 it could take
279	! longer to align the thing than to do the transfer
280	! in word size chunks right away
281	!
282.aldoubcp:
283	cmp	%i2, 56			! if count < 56, use wordcp, it takes
284	blu,a	%ncc, .alwordcp		! longer to align doubles than words
285	mov	3, %o0			! mask for word alignment
286	call	.alignit		! copy bytes until aligned
287	mov	7, %o0			! mask for double alignment
288	!
289	! source and destination are now double-word aligned
290	! i3 has aligned count returned by alignit
291	!
292	and	%i2, 7, %i2		! unaligned leftover count
293	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2945:
295	ldx	[%i0+%i1], %o4		! read from address
296	stx	%o4, [%i1]		! write at destination address
297	subcc	%i3, 8, %i3		! dec count
298	bgu	%ncc, 5b
299	add	%i1, 8, %i1		! delay slot, inc to address
300	cmp	%i2, 4			! see if we can copy a word
301	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
302	.empty
303	!
304	! for leftover bytes we fall into wordcp, if needed
305	!
306.wordcp:
307	and	%i2, 3, %i2		! unaligned leftover count
3085:
309	ld	[%i0+%i1], %o4		! read from address
310	st	%o4, [%i1]		! write at destination address
311	subcc	%i3, 4, %i3		! dec count
312	bgu	%ncc, 5b
313	add	%i1, 4, %i1		! delay slot, inc to address
314	b,a	.dbytecp
315
316	! we come here to align copies on word boundaries
317.alwordcp:
318	call	.alignit		! go word-align it
319	mov	3, %o0			! bits that must be zero to be aligned
320	b	.wordcp
321	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
322
323	!
324	! byte copy, works with any alignment
325	!
326.bytecp:
327	b	.dbytecp
328	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
329
330	!
331	! differenced byte copy, works with any alignment
332	! assumes dest in %i1 and (source - dest) in %i0
333	!
3341:
335	stb	%o4, [%i1]		! write to address
336	inc	%i1			! inc to address
337.dbytecp:
338	deccc	%i2			! dec count
339	bgeu,a	%ncc, 1b		! loop till done
340	ldub	[%i0+%i1], %o4		! read from address
	!
	! Common successful exit for bcopy and kcopy.  If we came in
	! through kcopy, %o5 carries the previous t_lofault value tagged
	! with LOFAULT_SET and it must be reinstated here; for bcopy
	! %o5 is zero and t_lofault is left alone.
	!
341.cpdone:
342	membar	#Sync			! sync error barrier
	btst	LOFAULT_SET, %o5	! did kcopy install a handler?
	bz	%ncc, 4f		! no, plain bcopy
	andn	%o5, LOFAULT_SET, %o5	! delay slot, strip the flag bit
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4:
343	ret
344	restore %g0, 0, %o0		! return (0)
345
346/*
347 * Common code used to align transfers on word and doubleword
348 * boundaries.  Aligns source and destination and returns a count
349 * of aligned bytes to transfer in %i3
 *
 * Called with the alignment mask in %o0 (set in the delay slot of the
 * call).  Runs in the caller's register window and returns with retl.
350 */
3511:
352	inc	%i0			! inc from
353	stb	%o4, [%i1]		! write a byte
354	inc	%i1			! inc to
355	dec	%i2			! dec count
356.alignit:
357	btst	%o0, %i0		! %o0 is bit mask to check for alignment
358	bnz,a	1b
359	ldub	[%i0], %o4		! read next byte
360
361	retl
362	andn	%i2, %o0, %i3		! return size of aligned bytes
363	SET_SIZE(bcopy)
364
365#endif	/* lint */
366
367/*
368 * Block copy with possibly overlapped operands.
369 */
370
371#if defined(lint)
372
373/*ARGSUSED*/
374void
375ovbcopy(const void *from, void *to, size_t count)
376{}
377
378#else	/* lint */
379
	!
	! ovbcopy(from, to, count)
	! %o0 = from, %o1 = to, %o2 = count
	!
	! Leaf routine (no save/restore).  A zero count returns at once.
	! If count <= |from - to| the regions cannot overlap and control
	! is transferred to bcopy.  Otherwise the copy is done one byte at
	! a time: backwards (highest address first) when from < to,
	! forwards otherwise.
	!
380	ENTRY(ovbcopy)
381	tst	%o2			! check count
382	bgu,a	%ncc, 1f		! nothing to do or bad arguments
383	subcc	%o0, %o1, %o3		! difference of from and to address
384
385	retl				! return
386	nop
3871:
	! condition codes here come from the subcc in the annulled delay
	! slot above; neg only executes when the difference was negative
388	bneg,a	%ncc, 2f
389	neg	%o3			! if < 0, make it positive
3902:	cmp	%o2, %o3		! cmp size and abs(from - to)
391	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
392	.empty				!   no overlap
	! the cmp below sits in the delay slot of the branch to bcopy, so
	! it also executes when the branch is taken; it only sets condition
	! codes and writes no register
393	cmp	%o0, %o1		! compare from and to addresses
394	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
395	nop
396	!
397	! Copy forwards.
398	!
399.ov_fwd:
400	ldub	[%o0], %o3		! read from address
401	inc	%o0			! inc from address
402	stb	%o3, [%o1]		! write to address
403	deccc	%o2			! dec count
404	bgu	%ncc, .ov_fwd		! loop till done
405	inc	%o1			! inc to address
406
407	retl				! return
408	nop
409	!
410	! Copy backwards.
	! %o2 is used both as the remaining count and as the byte offset
	! from the start of each buffer.
411	!
412.ov_bkwd:
413	deccc	%o2			! dec count
414	ldub	[%o0 + %o2], %o3	! get byte at end of src
415	bgu	%ncc, .ov_bkwd		! loop till done
416	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
417
418	retl				! return
419	nop
420	SET_SIZE(ovbcopy)
421
422#endif	/* lint */
423
424/*
425 * hwblkpagecopy()
426 *
427 * Copies exactly one page.  This routine assumes the caller (ppcopy)
428 * has already disabled kernel preemption and has checked
429 * use_hw_bcopy.
430 */
431#ifdef lint
432/*ARGSUSED*/
433void
434hwblkpagecopy(const void *src, void *dst)
435{ }
436#else /* lint */
	!
	! hwblkpagecopy(src, dst)
	!
	! Copies PAGESIZE bytes from src to dst with 8-byte loads and
	! stores, 0x80 bytes per loop iteration (two groups of eight
	! doublewords staged through %l0-%l7).  No alignment check is made
	! here.  NOTE(review): ldx/stx require 8-byte aligned addresses, so
	! callers presumably pass page-aligned pointers -- confirm.
	!
437	ENTRY(hwblkpagecopy)
438	save	%sp, -SA(MINFRAME), %sp
439
440	! %i0 - source address (arg)
441	! %i1 - destination address (arg)
442	! %i2 - length of region (not arg)
443
444	set	PAGESIZE, %i2
445
446	/*
447	 * Copying exactly one page and PAGESIZE is in multiple of 0x80.
448	 */
4491:
	! first 0x40 bytes of this 0x80-byte chunk
450	ldx	[%i0+0x0], %l0
451	ldx	[%i0+0x8], %l1
452	ldx	[%i0+0x10], %l2
453	ldx	[%i0+0x18], %l3
454	ldx	[%i0+0x20], %l4
455	ldx	[%i0+0x28], %l5
456	ldx	[%i0+0x30], %l6
457	ldx	[%i0+0x38], %l7
458	stx	%l0, [%i1+0x0]
459	stx	%l1, [%i1+0x8]
460	stx	%l2, [%i1+0x10]
461	stx	%l3, [%i1+0x18]
462	stx	%l4, [%i1+0x20]
463	stx	%l5, [%i1+0x28]
464	stx	%l6, [%i1+0x30]
465	stx	%l7, [%i1+0x38]
466
	! second 0x40 bytes of this 0x80-byte chunk
467	ldx	[%i0+0x40], %l0
468	ldx	[%i0+0x48], %l1
469	ldx	[%i0+0x50], %l2
470	ldx	[%i0+0x58], %l3
471	ldx	[%i0+0x60], %l4
472	ldx	[%i0+0x68], %l5
473	ldx	[%i0+0x70], %l6
474	ldx	[%i0+0x78], %l7
475	stx	%l0, [%i1+0x40]
476	stx	%l1, [%i1+0x48]
477	stx	%l2, [%i1+0x50]
478	stx	%l3, [%i1+0x58]
479	stx	%l4, [%i1+0x60]
480	stx	%l5, [%i1+0x68]
481	stx	%l6, [%i1+0x70]
482	stx	%l7, [%i1+0x78]
483
484	add	%i0, 0x80, %i0		! advance source
485	subcc	%i2, 0x80, %i2		! bytes remaining
486	bgu,pt	%xcc, 1b		! loop while bytes remain
487	add	%i1, 0x80, %i1		! delay slot, advance destination
488
489	membar #Sync
490	ret
491	restore	%g0, 0, %o0		! leaves 0 in the caller's %o0
492	SET_SIZE(hwblkpagecopy)
493#endif	/* lint */
494
495
496/*
497 * Transfer data to and from user space -
498 * Note that these routines can cause faults
499 * It is assumed that the kernel has nothing at
500 * less than KERNELBASE in the virtual address space.
501 *
502 * Note that copyin(9F) and copyout(9F) are part of the
503 * DDI/DKI which specifies that they return '-1' on "errors."
504 *
505 * Sigh.
506 *
507 * So there's two extremely similar routines - xcopyin() and xcopyout()
508 * which return the errno that we've faithfully computed.  This
509 * allows other callers (e.g. uiomove(9F)) to work correctly.
510 * Given that these are used pretty heavily, we expand the calling
511 * sequences inline for all flavours (rather than making wrappers).
512 *
513 * There are also stub routines for xcopyout_little and xcopyin_little,
514 * which currently are intended to handle requests of <= 16 bytes from
515 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
516 * is left as an exercise...
517 */
518
519/*
520 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
521 *
522 * General theory of operation:
523 *
524 * None of the copyops routines grab a window.
525 *
526 * Flow:
527 *
528 * If count == zero return zero.
529 *
530 * Store the previous lo_fault handler into %g6.
531 * Place our secondary lofault handler into %g5.
532 * Place the address of our fault handler into %o3.
533 *
534 * If count is less than or equal to SMALL_LIMIT (7) we
535 * always do a byte for byte copy.
536 *
537 * If count is > SMALL_LIMIT, we check the alignment of the input
538 * and output pointers.  We store -count in %o3, we store the number
539 * of chunks (8, 4, 2 or 1 byte) operated on in our basic copy loop
540 * in %o2. Following this we branch to the appropriate copy loop and
541 * copy that many chunks.  Since we've been adding the chunk size
542 * to %o3 each time through as well as decrementing %o2, we can tell
543 * if any data is left to be copied by examining %o3. If that is
544 * zero, we're done and can go home. If not, we figure out what the
545 * largest chunk size left to be copied is and branch to that copy
546 * loop unless there's only one byte left. We load that as we're
547 * branching to code that stores it just before we return.
548 *
549 * Fault handlers are invoked if we reference memory that has no
550 * current mapping.  All forms share the same copyio_fault handler.
551 * This routine handles fixing up the stack and general housecleaning.
552 * Each copy operation has a simple fault handler that is then called
553 * to do the work specific to the individual operation.  The handlers
554 * for copyOP and xcopyOP are found at the end of each individual function.
555 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
556 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
557 */
558
559/*
560 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
561 */
562
563#if defined(lint)
564
565/*ARGSUSED*/
566int
567copyout(const void *kaddr, void *uaddr, size_t count)
568{ return (0); }
569
570#else	/* lint */
571
572/*
573 * We save the arguments in the following registers in case of a fault:
574 * 	kaddr - %g2
575 * 	uaddr - %g3
576 * 	count - %g4
577 */
578#define	SAVE_SRC	%g2
579#define	SAVE_DST	%g3
580#define	SAVE_COUNT	%g4
581
/*
 * REAL_LOFAULT holds the address of the operation-specific error handler
 * that copyio_fault jumps to; SAVED_LOFAULT holds the t_lofault value that
 * was in effect when the copy routine was entered.
 */
582#define	REAL_LOFAULT		%g5
583#define	SAVED_LOFAULT		%g6
584
585/*
586 * Generic copyio fault handler.  This is the first line of defense when a
587 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
588 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
589 * This allows us to share common code for all the flavors of the copy
590 * operations, including the _noerr versions.
591 *
592 * Note that this function will restore the original input parameters before
593 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
594 * member of the t_copyop structure, if needed.
595 */
596	ENTRY(copyio_fault)
597	membar	#Sync
598	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
599
600	mov	SAVE_SRC, %o0
601	mov	SAVE_DST, %o1
602	jmp	REAL_LOFAULT
603	  mov	SAVE_COUNT, %o2
604	SET_SIZE(copyio_fault)
605
	!
	! copyout(kaddr, uaddr, count)
	! %o0 = kaddr (source), %o1 = uaddr (destination), %o2 = count
	!
	! Leaf routine.  Loads are ordinary kernel loads; stores go through
	! ASI_USER.  Returns 0 on success.  On a fault .copyout_err either
	! tail-jumps to the thread's CP_COPYOUT copyops entry (when
	! T_COPYOPS is non-zero) or returns -1.
	!
	! .do_copyout is the shared body; xcopyout and copyout_noerr enter
	! it with their own handler already in REAL_LOFAULT.
	!
606	ENTRY(copyout)
607	sethi	%hi(.copyout_err), REAL_LOFAULT
608	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
609
610.do_copyout:
611	!
612	! Check the length and bail if zero.
613	!
614	tst	%o2
615	bnz,pt	%ncc, 1f
616	  nop
617	retl
618	  clr	%o0
6191:
	! install copyio_fault as t_lofault, remembering the old value
620	sethi	%hi(copyio_fault), %o3
621	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
622	or	%o3, %lo(copyio_fault), %o3
623	membar	#Sync
624	stn	%o3, [THREAD_REG + T_LOFAULT]
625
	! keep the original arguments for copyio_fault to hand back
626	mov	%o0, SAVE_SRC
627	mov	%o1, SAVE_DST
628	mov	%o2, SAVE_COUNT
629
630	!
631	! Check to see if we're more than SMALL_LIMIT (7 bytes).
632	! Run in leaf mode, using the %o regs as our input regs.
633	!
634	subcc	%o2, SMALL_LIMIT, %o3
635	bgu,a,pt %ncc, .dco_ns
636	or	%o0, %o1, %o3		! annulled delay slot: only when taken
637
638.dcobcp:
639	sub	%g0, %o2, %o3		! negate count
640	add	%o0, %o2, %o0		! make %o0 point at the end
641	add	%o1, %o2, %o1		! make %o1 point at the end
642	ba,pt	%ncc, .dcocl
643	ldub	[%o0 + %o3], %o4	! load first byte
644	!
645	! %o0 and %o1 point at the end and remain pointing at the end
646	! of their buffers. We pull things out by adding %o3 (which is
647	! the negation of the length) to the buffer end which gives us
648	! the current location in the buffers. By incrementing %o3 we walk
649	! through both buffers without having to bump each buffer's
650	! pointer. A very fast 4 instruction loop.
651	!
652	.align 16
653.dcocl:
654	stba	%o4, [%o1 + %o3]ASI_USER
655	inccc	%o3
656	bl,a,pt	%ncc, .dcocl
657	ldub	[%o0 + %o3], %o4
658	!
659	! We're done. Go home.
660	!
661	membar	#Sync
662	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
663	retl
664	clr	%o0
665	!
666	! Try aligned copies from here.
667	!
668.dco_ns:
669	! %o0 = kernel addr (to be copied from)
670	! %o1 = user addr (to be copied to)
671	! %o2 = length
672	! %o3 = %o0 | %o1 (used for alignment checking)
673	! REAL_LOFAULT (%g5) is the operation-specific fault handler
674	! SAVED_LOFAULT (%g6) is the original t_lofault
675	!
676	! See if we're single byte aligned. If we are, check the
677	! limit for single byte copies. If we're smaller or equal,
678	! bounce to the byte for byte copy loop. Otherwise do it in
679	! HW (if enabled).
	! NOTE(review): no limit check or HW path is visible on this
	! route; odd alignment goes straight to .dcobcp.
680	!
681	btst	1, %o3
682	bz,pt	%icc, .dcoh8
683	btst	7, %o3			! delay slot, sets cc tested at .dcoh8
684
685	ba	.dcobcp
686	nop
687.dcoh8:
688	!
689	! 8 byte aligned?
690	!
691	bnz,a	%ncc, .dcoh4
692	btst	3, %o3			! annulled delay slot, cc tested at .dcoh4
693.dcos8:
694	!
695	! Housekeeping for copy loops. Uses same idea as in the byte for
696	! byte copy loop above.
697	!
698	add	%o0, %o2, %o0
699	add	%o1, %o2, %o1
700	sub	%g0, %o2, %o3
701	ba,pt	%ncc, .dodebc
702	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
703	!
704	! 4 byte aligned?
705	!
706.dcoh4:
707	bnz,pn	%ncc, .dcoh2
708	nop
709.dcos4:
710	add	%o0, %o2, %o0
711	add	%o1, %o2, %o1
712	sub	%g0, %o2, %o3
713	ba,pt	%ncc, .dodfbc
714	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
715	!
716	! We must be 2 byte aligned. Off we go.
717	! NOTE(review): the delay slot at .dcoh4 is a nop; the check
718	! for small copies was done before reaching .dco_ns.
719	!
720.dcoh2:
721.dcos2:
722	add	%o0, %o2, %o0
723	add	%o1, %o2, %o1
724	sub	%g0, %o2, %o3
725	ba,pt	%ncc, .dodtbc
726	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
727
	!
	! Eight byte copy loop. %o2 is the number of 8 byte chunks to copy.
	! The addcc in the delay slot leaves cc reflecting whether %o3 has
	! reached zero, which the bz after the loop tests.
	!
728.dodebc:
729	ldx	[%o0 + %o3], %o4
730	deccc	%o2
731	stxa	%o4, [%o1 + %o3]ASI_USER
732	bg,pt	%ncc, .dodebc
733	addcc	%o3, 8, %o3
734	!
735	! End of copy loop. Check to see if we're done. Most
736	! eight byte aligned copies end here.
737	!
738	bz,pt	%ncc, .dcofh
739	nop
740	!
741	! Something is left - do it byte for byte.
742	!
743	ba,pt	%ncc, .dcocl
744	ldub	[%o0 + %o3], %o4	! load next byte
745	!
746	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
747	!
748	.align 32
749.dodfbc:
750	lduw	[%o0 + %o3], %o4
751	deccc	%o2
752	sta	%o4, [%o1 + %o3]ASI_USER
753	bg,pt	%ncc, .dodfbc
754	addcc	%o3, 4, %o3
755	!
756	! End of copy loop. Check to see if we're done. Most
757	! four byte aligned copies end here.
758	!
759	bz,pt	%ncc, .dcofh
760	nop
761	!
762	! Something is left. Do it byte for byte.
763	!
764	ba,pt	%ncc, .dcocl
765	ldub	[%o0 + %o3], %o4	! load next byte
766	!
767	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
768	! copy.
769	!
770	.align 32
771.dodtbc:
772	lduh	[%o0 + %o3], %o4
773	deccc	%o2
774	stha	%o4, [%o1 + %o3]ASI_USER
775	bg,pt	%ncc, .dodtbc
776	addcc	%o3, 2, %o3
777	!
778	! End of copy loop. Anything left?
779	!
780	bz,pt	%ncc, .dcofh
781	nop
782	!
783	! Deal with the last byte
784	!
785	ldub	[%o0 + %o3], %o4
786	stba	%o4, [%o1 + %o3]ASI_USER
787.dcofh:
788	membar	#Sync
789	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
790	retl
791	clr	%o0
792
	!
	! Reached via copyio_fault, which has already restored t_lofault
	! and reloaded %o0-%o2 with the original arguments.
	!
793.copyout_err:
794	ldn	[THREAD_REG + T_COPYOPS], %o4
795	brz	%o4, 2f			! no copyops installed
796	nop
797	ldn	[%o4 + CP_COPYOUT], %g2
798	jmp	%g2			! tail-jump to the copyops copyout
799	nop
8002:
801	retl
802	mov	-1, %o0
803	SET_SIZE(copyout)
804
805#endif	/* lint */
806
807
808#ifdef	lint
809
810/*ARGSUSED*/
811int
812xcopyout(const void *kaddr, void *uaddr, size_t count)
813{ return (0); }
814
815#else	/* lint */
816
	!
	! xcopyout(kaddr, uaddr, count)
	!
	! Same copy path as copyout (.do_copyout), but with .xcopyout_err
	! as the operation-specific fault handler: on a fault it either
	! tail-jumps to the thread's CP_XCOPYOUT copyops entry or returns
	! the value found in %g1 instead of -1.
	!
817	ENTRY(xcopyout)
818	sethi	%hi(.xcopyout_err), REAL_LOFAULT
819	b	.do_copyout
820	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
821.xcopyout_err:
822	ldn	[THREAD_REG + T_COPYOPS], %o4
823	brz	%o4, 2f			! no copyops installed
824	nop
825	ldn	[%o4 + CP_XCOPYOUT], %g2
826	jmp	%g2
827	nop
8282:
829	retl
830	mov	%g1, %o0		! return the value left in %g1
831	SET_SIZE(xcopyout)
832
833#endif	/* lint */
834
835#ifdef	lint
836
837/*ARGSUSED*/
838int
839xcopyout_little(const void *kaddr, void *uaddr, size_t count)
840{ return (0); }
841
842#else	/* lint */
843
	!
	! xcopyout_little(kaddr, uaddr, count)
	! %o0 = kaddr, %o1 = uaddr, %o2 = count
	!
	! Byte-at-a-time copy that reverses byte order: the source is read
	! from its last byte down to its first while the destination is
	! written from its first byte up, through ASI_AIUSL.  Installs
	! .little_err as t_lofault for the duration, with the old handler
	! kept in %o5.  Returns 0 on success.
	!
844	ENTRY(xcopyout_little)
845	sethi	%hi(.little_err), %o4
846	ldn	[THREAD_REG + T_LOFAULT], %o5
847	or	%o4, %lo(.little_err), %o4
848	membar	#Sync			! sync error barrier
849	stn	%o4, [THREAD_REG + T_LOFAULT]
850
851	subcc	%g0, %o2, %o3		! %o3 = -count, sets cc for the bz
852	add	%o0, %o2, %o0
853	bz,pn	%ncc, 2f		! check for zero bytes
854	sub	%o2, 1, %o4
855	add	%o0, %o4, %o0		! start w/last byte
856	add	%o1, %o2, %o1
	! %o0 is now kaddr + 2*count - 1, so %o0 + %o3 addresses the last
	! source byte; %o1 + %o3 addresses the first destination byte
857	ldub	[%o0+%o3], %o4
858
8591:	stba	%o4, [%o1+%o3]ASI_AIUSL
860	inccc	%o3
	! %o3 went up by one, so dropping %o0 by two moves the effective
	! source address down by one byte
861	sub	%o0, 2, %o0		! get next byte
862	bcc,a,pt %ncc, 1b		! loop until inccc carries (-1 -> 0)
863	  ldub	[%o0+%o3], %o4
864
8652:	membar	#Sync			! sync error barrier
866	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
867	retl
868	mov	%g0, %o0		! return (0)
869	SET_SIZE(xcopyout_little)
870
871#endif	/* lint */
872
873/*
874 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
875 */
876
877#if defined(lint)
878
879/*ARGSUSED*/
880int
881copyin(const void *uaddr, void *kaddr, size_t count)
882{ return (0); }
883
884#else	/* lint */
885
	!
	! copyin(uaddr, kaddr, count)
	! %o0 = uaddr (source), %o1 = kaddr (destination), %o2 = count
	!
	! Leaf routine.  Loads go through ASI_USER; stores are ordinary
	! kernel stores.  Returns 0 on success.  On a fault .copyin_err
	! either tail-jumps to the thread's CP_COPYIN copyops entry (when
	! T_COPYOPS is non-zero) or returns -1.
	!
	! .do_copyin is the shared body; xcopyin and copyin_noerr enter it
	! with their own handler already in REAL_LOFAULT.
	!
886	ENTRY(copyin)
887	sethi	%hi(.copyin_err), REAL_LOFAULT
888	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
889
890.do_copyin:
891	!
892	! Check the length and bail if zero.
893	!
894	tst	%o2
895	bnz,pt	%ncc, 1f
896	  nop
897	retl
898	  clr	%o0
8991:
	! install copyio_fault as t_lofault, remembering the old value
900	sethi	%hi(copyio_fault), %o3
901	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
902	or	%o3, %lo(copyio_fault), %o3
903	membar	#Sync
904	stn	%o3, [THREAD_REG + T_LOFAULT]
905
	! keep the original arguments for copyio_fault to hand back
906	mov	%o0, SAVE_SRC
907	mov	%o1, SAVE_DST
908	mov	%o2, SAVE_COUNT
909
910	!
911	! Check to see if we're more than SMALL_LIMIT.
912	!
913	subcc	%o2, SMALL_LIMIT, %o3
914	bgu,a,pt %ncc, .dci_ns
915	or	%o0, %o1, %o3		! annulled delay slot: %o3 = src | dst
916
917.dcibcp:
918	sub	%g0, %o2, %o3		! setup for copy loop
919	add	%o0, %o2, %o0
920	add	%o1, %o2, %o1
921	ba,pt	%ncc, .dcicl
922	lduba	[%o0 + %o3]ASI_USER, %o4
923	!
924	! %o0 and %o1 point at the end and remain pointing at the end
925	! of their buffers. We pull things out by adding %o3 (which is
926	! the negation of the length) to the buffer end which gives us
927	! the current location in the buffers. By incrementing %o3 we walk
928	! through both buffers without having to bump each buffer's
929	! pointer. A very fast 4 instruction loop.
930	!
931	.align 16
932.dcicl:
933	stb	%o4, [%o1 + %o3]
934	inccc	%o3
935	bl,a,pt %ncc, .dcicl
936	lduba	[%o0 + %o3]ASI_USER, %o4
937	!
938	! We're done. Go home.
939	!
940	membar	#Sync
941	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
942	retl
943	clr	%o0
944	!
945	! Try aligned copies from here.
946	!
947.dci_ns:
948	!
949	! See if we're single byte aligned. If we are, check the
950	! limit for single byte copies. If we're smaller, or equal,
951	! bounce to the byte for byte copy loop. Otherwise do it in
952	! HW (if enabled).
	! NOTE(review): no limit check or HW path is visible on this
	! route; odd alignment goes straight to .dcibcp.
953	!
954	btst	1, %o3
955	bz,a,pt	%icc, .dcih8
956	btst	7, %o3			! annulled delay slot, cc tested at .dcih8
957	ba	.dcibcp
958	nop
959
960.dcih8:
961	!
962	! 8 byte aligned?
963	!
964	bnz,a	%ncc, .dcih4
965	btst	3, %o3			! annulled delay slot, cc tested at .dcih4
966.dcis8:
967	!
968	! Housekeeping for copy loops. Uses same idea as in the byte for
969	! byte copy loop above.
970	!
971	add	%o0, %o2, %o0
972	add	%o1, %o2, %o1
973	sub	%g0, %o2, %o3
974	ba,pt	%ncc, .didebc
975	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
976	!
977	! 4 byte aligned?
978	!
979.dcih4:
980	bnz	%ncc, .dcih2
981	nop
982.dcis4:
983	!
984	! Housekeeping for copy loops. Uses same idea as in the byte
985	! for byte copy loop above.
986	!
987	add	%o0, %o2, %o0
988	add	%o1, %o2, %o1
989	sub	%g0, %o2, %o3
990	ba,pt	%ncc, .didfbc
991	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
	!
	! Neither 8 nor 4 byte aligned, so 2 byte aligned.
	!
992.dcih2:
993.dcis2:
994	add	%o0, %o2, %o0
995	add	%o1, %o2, %o1
996	sub	%g0, %o2, %o3
997	ba,pt	%ncc, .didtbc
998	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
999
	!
	! 8 byte copy loop. %o2 is number of 8 byte chunks to copy.
	! The addcc in the delay slot leaves cc reflecting whether %o3 has
	! reached zero, which the bz after the loop tests.
	!
1000.didebc:
1001	ldxa	[%o0 + %o3]ASI_USER, %o4
1002	deccc	%o2
1003	stx	%o4, [%o1 + %o3]
1004	bg,pt	%ncc, .didebc
1005	addcc	%o3, 8, %o3
1006	!
1007	! End of copy loop. Most 8 byte aligned copies end here.
1008	!
1009	bz,pt	%ncc, .dcifh
1010	nop
1011	!
1012	! Something is left. Do it byte for byte.
1013	!
1014	ba,pt	%ncc, .dcicl
1015	lduba	[%o0 + %o3]ASI_USER, %o4
1016	!
1017	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
1018	!
1019	.align 32
1020.didfbc:
1021	lduwa	[%o0 + %o3]ASI_USER, %o4
1022	deccc	%o2
1023	st	%o4, [%o1 + %o3]
1024	bg,pt	%ncc, .didfbc
1025	addcc	%o3, 4, %o3
1026	!
1027	! End of copy loop. Most 4 byte aligned copies end here.
1028	!
1029	bz,pt	%ncc, .dcifh
1030	nop
1031	!
1032	! Something is left. Do it byte for byte.
1033	!
1034	ba,pt	%ncc, .dcicl
1035	lduba	[%o0 + %o3]ASI_USER, %o4
1036	!
1037	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
1038	! copy.
1039	!
1040	.align 32
1041.didtbc:
1042	lduha	[%o0 + %o3]ASI_USER, %o4
1043	deccc	%o2
1044	sth	%o4, [%o1 + %o3]
1045	bg,pt	%ncc, .didtbc
1046	addcc	%o3, 2, %o3
1047	!
1048	! End of copy loop. Most 2 byte aligned copies end here.
1049	!
1050	bz,pt	%ncc, .dcifh
1051	nop
1052	!
1053	! Deal with the last byte
1054	!
1055	lduba	[%o0 + %o3]ASI_USER, %o4
1056	stb	%o4, [%o1 + %o3]
1057.dcifh:
1058	membar	#Sync
1059	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1060	retl
1061	clr	%o0
1062
	!
	! Reached via copyio_fault, which has already restored t_lofault
	! and reloaded %o0-%o2 with the original arguments.
	!
1063.copyin_err:
1064	ldn	[THREAD_REG + T_COPYOPS], %o4
1065	brz	%o4, 2f			! no copyops installed
1066	nop
1067	ldn	[%o4 + CP_COPYIN], %g2
1068	jmp	%g2			! tail-jump to the copyops copyin
1069	nop
10702:
1071	retl
1072	mov	-1, %o0
1073	SET_SIZE(copyin)
1074
1075#endif	/* lint */
1076
1077#ifdef	lint
1078
1079/*ARGSUSED*/
1080int
1081xcopyin(const void *uaddr, void *kaddr, size_t count)
1082{ return (0); }
1083
1084#else	/* lint */
1085
	!
	! xcopyin(uaddr, kaddr, count)
	!
	! Same copy path as copyin (.do_copyin), but with .xcopyin_err as
	! the operation-specific fault handler: on a fault it either
	! tail-jumps to the thread's CP_XCOPYIN copyops entry or returns
	! the value found in %g1 instead of -1.
	!
1086	ENTRY(xcopyin)
1087	sethi	%hi(.xcopyin_err), REAL_LOFAULT
1088	b	.do_copyin
1089	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
1090.xcopyin_err:
1091	ldn	[THREAD_REG + T_COPYOPS], %o4
1092	brz	%o4, 2f			! no copyops installed
1093	nop
1094	ldn	[%o4 + CP_XCOPYIN], %g2
1095	jmp	%g2
1096	nop
10972:
1098	retl
1099	mov	%g1, %o0		! return the value left in %g1
1100	SET_SIZE(xcopyin)
1101
1102#endif	/* lint */
1103
1104#ifdef	lint
1105
1106/*ARGSUSED*/
1107int
1108xcopyin_little(const void *uaddr, void *kaddr, size_t count)
1109{ return (0); }
1110
1111#else	/* lint */
1112
	!
	! xcopyin_little(uaddr, kaddr, count)
	! %o0 = uaddr, %o1 = kaddr, %o2 = count
	!
	! Byte-at-a-time copy that reverses byte order: the source is read
	! through ASI_AIUSL from its last byte down to its first while the
	! destination is written from its first byte up.  Installs
	! .little_err as t_lofault for the duration, with the old handler
	! kept in %o5.  Returns 0 on success; .little_err returns the value
	! found in %g1.  .little_err is also the fault handler installed by
	! xcopyout_little.
	!
1113	ENTRY(xcopyin_little)
1114	sethi	%hi(.little_err), %o4
1115	ldn	[THREAD_REG + T_LOFAULT], %o5
1116	or	%o4, %lo(.little_err), %o4
1117	membar	#Sync				! sync error barrier
1118	stn	%o4, [THREAD_REG + T_LOFAULT]
1119
1120	subcc	%g0, %o2, %o3		! %o3 = -count, sets cc for the bz
1121	add	%o0, %o2, %o0
1122	bz,pn	%ncc, 2f		! check for zero bytes
1123	sub	%o2, 1, %o4
1124	add	%o0, %o4, %o0		! start w/last byte
1125	add	%o1, %o2, %o1
	! %o0 is now uaddr + 2*count - 1, so %o0 + %o3 addresses the last
	! source byte; %o1 + %o3 addresses the first destination byte
1126	lduba	[%o0+%o3]ASI_AIUSL, %o4
1127
11281:	stb	%o4, [%o1+%o3]
1129	inccc	%o3
	! %o3 went up by one, so dropping %o0 by two moves the effective
	! source address down by one byte
1130	sub	%o0, 2, %o0		! get next byte
1131	bcc,a,pt %ncc, 1b		! loop until inccc carries (-1 -> 0)
1132	  lduba	[%o0+%o3]ASI_AIUSL, %o4
1133
11342:	membar	#Sync				! sync error barrier
1135	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1136	retl
1137	mov	%g0, %o0		! return (0)
1138
	!
	! Fault handler for xcopyin_little and xcopyout_little.
	! %o5 still holds the t_lofault value saved on entry.
	!
1139.little_err:
1140	membar	#Sync				! sync error barrier
1141	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1142	retl
1143	mov	%g1, %o0
1144	SET_SIZE(xcopyin_little)
1145
1146#endif	/* lint */
1147
1148
1149/*
1150 * Copy a block of storage - must not overlap (from + len <= to).
1151 * No fault handler installed (to be called under on_fault())
1152 */
1153#if defined(lint)
1154
1155/* ARGSUSED */
1156void
1157copyin_noerr(const void *ufrom, void *kto, size_t count)
1158{}
1159
1160#else	/* lint */
1161
	!
	! copyin_noerr(ufrom, kto, count)
	!
	! Runs the copyin body (.do_copyin) with .copyio_noerr as the
	! operation-specific fault handler.  On a fault, copyio_fault
	! restores t_lofault and the original arguments and then
	! .copyio_noerr jumps to SAVED_LOFAULT, i.e. to the handler that was
	! installed before this routine was entered.  .copyio_noerr is also
	! used by copyout_noerr.
	!
1162	ENTRY(copyin_noerr)
1163	sethi	%hi(.copyio_noerr), REAL_LOFAULT
1164	b	.do_copyin
1165	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
1166.copyio_noerr:
1167	jmp	SAVED_LOFAULT		! hand the fault to the caller's handler
1168	  nop
1169	SET_SIZE(copyin_noerr)
1170
1171#endif /* lint */
1172
1173/*
1174 * Copy a block of storage - must not overlap (from + len <= to).
1175 * No fault handler installed (to be called under on_fault())
1176 */
1177
1178#if defined(lint)
1179
1180/* ARGSUSED */
1181void
1182copyout_noerr(const void *kfrom, void *uto, size_t count)
1183{}
1184
1185#else	/* lint */
1186
	!
	! copyout_noerr(kfrom, uto, count)
	!
	! Runs the copyout body (.do_copyout) with .copyio_noerr (defined
	! inside copyin_noerr) as the operation-specific fault handler, so
	! a fault ends up in the handler that was installed before this
	! routine was entered.
	!
1187	ENTRY(copyout_noerr)
1188	sethi	%hi(.copyio_noerr), REAL_LOFAULT
1189	b	.do_copyout
1190	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
1191	SET_SIZE(copyout_noerr)
1192
1193#endif /* lint */
1194
1195#if defined(lint)
1196
1197int use_hw_bcopy = 1;
1198int use_hw_bzero = 1;
1199
1200#else /* !lint */
1201
	/*
	 * Two word-sized globals, use_hw_bcopy and use_hw_bzero, both
	 * initialised to 1.  Nothing in this section reads them; the
	 * hwblkclr header says callers must check use_hw_bzero themselves.
	 * DGDEF comes from a header outside this section (presumably it
	 * switches to the data section and emits a global label - confirm).
	 * Afterwards assembly returns to .text for the routines that follow.
	 */
1202	.align	4
1203	DGDEF(use_hw_bcopy)
1204	.word	1
1205	DGDEF(use_hw_bzero)
1206	.word	1
1207
1208	.align	64
1209	.section ".text"
1210#endif /* !lint */
1211
1212
1213/*
1214 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
1215 * at least 256 bytes in length using load/stores.  If
1216 * the criteria for using this routine are not met then it calls bzero
1217 * and returns 1.  Otherwise 0 is returned indicating success.
1218 * Caller is responsible for ensuring use_hw_bzero is true and that
1219 * kpreempt_disable() has been called.
1220 */
1221#ifdef lint
1222/*ARGSUSED*/
1223int
1224hwblkclr(void *addr, size_t len)
1225{
1226	return(0);
1227}
1228#else /* lint */
	/*
	 * hwblkclr(addr, len)
	 *
	 * Runs in its own register window (save/restore), so the arguments
	 * are seen as %i0/%i1 and the return value is written to the
	 * caller's %o0 by the final restore.
	 *
	 * Fast path requirements, checked in this order:
	 *	addr is 64-byte aligned
	 *	len is at least 0x100 (256)
	 *	len is a multiple of 64
	 * If any check fails the request is handed to bzero(addr, len) and
	 * 1 is returned.  Otherwise the region is cleared with stx %g0 in
	 * 256-byte chunks, then 64-byte chunks, and 0 is returned.
	 */
1229	! %i0 - start address
1230	! %i1 - length of region (multiple of 64)
1231
1232	ENTRY(hwblkclr)
1233	save	%sp, -SA(MINFRAME), %sp
1234
1235	! Must be block-aligned
1236	andcc	%i0, 0x3f, %g0
1237	bnz,pn	%ncc, 1f
1238	  nop
1239
1240	! ... and must be 256 bytes or more
1241	cmp	%i1, 0x100
1242	blu,pn	%ncc, 1f
1243	  nop
1244
	/*
	 * NOTE(review): this branch is taken whenever all criteria are met,
	 * yet it carries a predict-not-taken (,pn) hint - confirm whether
	 * that is intentional.
	 */
1245	! ... and length must be a multiple of 64
1246	andcc	%i1, 0x3f, %g0
1247	bz,pn	%ncc, .pz_doblock
1248	nop
1249
12501:	! punt, call bzero but notify the caller that bzero was used
1251	mov	%i0, %o0
1252	call	bzero
1253	  mov	%i1, %o1
1254	ret
1255	restore	%g0, 1, %o0	! return (1) - did not use block operations
1256
	/*
	 * 256-byte loop.  The first four stores hit offset 0 of each of the
	 * four 64-byte blocks before the remaining words are filled in
	 * (presumably to start all four cache-line fills early - confirm).
	 * The loop repeats only while more than 0x100 bytes remain, so a
	 * final chunk of exactly 0x100 bytes is finished by the 64-byte
	 * loop below.
	 */
1257	! Already verified that there are at least 256 bytes to set
1258.pz_doblock:
1259	stx	%g0, [%i0+0x0]
1260	stx	%g0, [%i0+0x40]
1261	stx	%g0, [%i0+0x80]
1262	stx	%g0, [%i0+0xc0]
1263
1264	stx	%g0, [%i0+0x8]
1265	stx	%g0, [%i0+0x10]
1266	stx	%g0, [%i0+0x18]
1267	stx	%g0, [%i0+0x20]
1268	stx	%g0, [%i0+0x28]
1269	stx	%g0, [%i0+0x30]
1270	stx	%g0, [%i0+0x38]
1271
1272	stx	%g0, [%i0+0x48]
1273	stx	%g0, [%i0+0x50]
1274	stx	%g0, [%i0+0x58]
1275	stx	%g0, [%i0+0x60]
1276	stx	%g0, [%i0+0x68]
1277	stx	%g0, [%i0+0x70]
1278	stx	%g0, [%i0+0x78]
1279
1280	stx	%g0, [%i0+0x88]
1281	stx	%g0, [%i0+0x90]
1282	stx	%g0, [%i0+0x98]
1283	stx	%g0, [%i0+0xa0]
1284	stx	%g0, [%i0+0xa8]
1285	stx	%g0, [%i0+0xb0]
1286	stx	%g0, [%i0+0xb8]
1287
1288	stx	%g0, [%i0+0xc8]
1289	stx	%g0, [%i0+0xd0]
1290	stx	%g0, [%i0+0xd8]
1291	stx	%g0, [%i0+0xe0]
1292	stx	%g0, [%i0+0xe8]
1293	stx	%g0, [%i0+0xf0]
1294	stx	%g0, [%i0+0xf8]
1295
1296	sub	%i1, 0x100, %i1
1297	cmp	%i1, 0x100
1298	bgu,pt	%ncc, .pz_doblock
1299	add	%i0, 0x100, %i0
1300
	/*
	 * The blu below skips the 64-byte loop only when fewer than 0x40
	 * bytes remain, i.e. the loop runs for 64 bytes or more.
	 */
13012:
1302	! Check if more than 64 bytes to set
1303	cmp	%i1,0x40
1304	blu	%ncc, .pz_finish
1305	nop
1306
	/*
	 * 64-byte loop: one block per pass, repeated while the remaining
	 * length after the subtraction is still non-zero.
	 */
13073:
1308	stx	%g0, [%i0+0x0]
1309	stx	%g0, [%i0+0x8]
1310	stx	%g0, [%i0+0x10]
1311	stx	%g0, [%i0+0x18]
1312	stx	%g0, [%i0+0x20]
1313	stx	%g0, [%i0+0x28]
1314	stx	%g0, [%i0+0x30]
1315	stx	%g0, [%i0+0x38]
1316
1317	subcc	%i1, 0x40, %i1
1318	bgu,pt	%ncc, 3b
1319	add	%i0, 0x40, %i0
1320
	/*
	 * Fast-path exit: always returns 0, meaning bzero was not used.
	 */
1321.pz_finish:
1322	membar	#Sync
1323	ret
1324	restore	%g0, 0, %o0		! return (bzero or not)
1325	SET_SIZE(hwblkclr)
1326#endif	/* lint */
1327
1328#ifdef	lint
1329/* Copy 32 bytes of data from src to dst using physical addresses */
1330/*ARGSUSED*/
1331void
1332hw_pa_bcopy32(uint64_t src, uint64_t dst)
1333{}
1334#else	/*!lint */
1335
1336	/*
1337	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
1338	 * using physical addresses.
1339	 */
	/*
	 * Sequence: save %pstate in %g1 and write it back with PSTATE_IE
	 * cleared, so interrupts are disabled for the duration; load four
	 * 8-byte words through ASI_MEM into %o2-%o5; store them through
	 * ASI_MEM to dst; membar #Sync; restore the saved %pstate in the
	 * retl delay slot.  All four loads complete before the first store.
	 * %o0 and %o1 are advanced as the copy proceeds and %g1/%g2 are
	 * used as scratch.
	 */
1340	ENTRY_NP(hw_pa_bcopy32)
1341	rdpr    %pstate, %g1
1342	andn    %g1, PSTATE_IE, %g2
1343	wrpr    %g0, %g2, %pstate
1344
1345	ldxa    [%o0]ASI_MEM, %o2
1346	add     %o0, 8, %o0
1347	ldxa    [%o0]ASI_MEM, %o3
1348	add     %o0, 8, %o0
1349	ldxa    [%o0]ASI_MEM, %o4
1350	add     %o0, 8, %o0
1351	ldxa    [%o0]ASI_MEM, %o5
1352	stxa    %o2, [%o1]ASI_MEM
1353	add     %o1, 8, %o1
1354	stxa    %o3, [%o1]ASI_MEM
1355	add     %o1, 8, %o1
1356	stxa    %o4, [%o1]ASI_MEM
1357	add     %o1, 8, %o1
1358	stxa    %o5, [%o1]ASI_MEM
1359
1360	membar	#Sync
1361	retl
1362	  wrpr    %g0, %g1, %pstate
1363	SET_SIZE(hw_pa_bcopy32)
1364#endif /* lint */
1365
1366/*
1367 * Zero a block of storage.
1368 *
1369 * uzero is used by the kernel to zero a block in user address space.
1370 */
1371
1372
1373#if defined(lint)
1374
1375/* ARGSUSED */
1376int
1377kzero(void *addr, size_t count)
1378{ return(0); }
1379
1380/* ARGSUSED */
1381void
1382uzero(void *addr, size_t count)
1383{}
1384
1385#else	/* lint */
1386
	/*
	 * uzero(addr = %o0, count = %o1)
	 * kzero(addr = %o0, count = %o1)
	 *
	 * Both entry points select the ASI used by the store loop (ASI_USER
	 * for uzero, ASI_P for kzero), deal with t_lofault and then branch
	 * to .do_zero, the common clearing code that follows ENTRY(bzero).
	 *
	 * %o5 carries the previous t_lofault value into .do_zero and
	 * .zeroerr.  kzero additionally ors LOFAULT_SET into %o5 so that the
	 * exit paths know a handler was installed even when the previous
	 * value was zero.
	 */
1387	ENTRY(uzero)
1388	!
1389	! Set a new lo_fault handler only if we came in with one
1390	! already specified.
1391	!
	/*
	 * The sethi sits in the bz delay slot, so it also executes on the
	 * taken path; it only writes %o2, which is then left unused.
	 */
1392	wr	%g0, ASI_USER, %asi
1393	ldn	[THREAD_REG + T_LOFAULT], %o5
1394	tst	%o5
1395	bz,pt	%ncc, .do_zero
1396	sethi	%hi(.zeroerr), %o2
1397	or	%o2, %lo(.zeroerr), %o2
1398	membar	#Sync
1399	ba,pt	%ncc, .do_zero
1400	stn	%o2, [THREAD_REG + T_LOFAULT]
1401
1402	ENTRY(kzero)
1403	!
1404	! Always set a lo_fault handler
1405	!
1406	wr	%g0, ASI_P, %asi
1407	ldn	[THREAD_REG + T_LOFAULT], %o5
1408	sethi	%hi(.zeroerr), %o2
1409	or	%o5, LOFAULT_SET, %o5
1410	or	%o2, %lo(.zeroerr), %o2
1411	membar	#Sync
1412	ba,pt	%ncc, .do_zero
1413	stn	%o2, [THREAD_REG + T_LOFAULT]
1414
1415/*
1416 * We got here because of a fault during kzero or if
1417 * uzero or bzero was called with t_lofault non-zero.
1418 * Otherwise we've already run screaming from the room.
1419 * Errno value is in %g1. Note that we're here iff
1420 * we did set t_lofault.
1421 */
1422.zeroerr:
1423	!
1424	! Undo asi register setting. Just set it to be the
1425        ! kernel default without checking.
1426	!
1427	wr	%g0, ASI_P, %asi
1428
1429	!
1430	! We did set t_lofault. It may well have been zero coming in.
1431	!
	/*
	 * Two sets of condition codes are in play here.  tst sets them from
	 * the raw %o5 (flag bit included) for the bne.  The andncc in the
	 * bne delay slot always executes: it strips LOFAULT_SET from %o5 and
	 * sets the codes that the be at label 3 consumes.
	 */
14321:
1433	tst	%o5
1434	membar #Sync
1435	bne,pn	%ncc, 3f
1436	andncc	%o5, LOFAULT_SET, %o5
14372:
1438	!
1439	! Old handler was zero. Just return the error.
1440	!
1441	retl				! return
1442	mov	%g1, %o0		! error code from %g1
14433:
1444	!
1445	! We're here because %o5 was non-zero. It was non-zero
1446	! because either LOFAULT_SET was present, a previous fault
1447	! handler was present or both. In all cases we need to reset
1448	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
1449	! before we either simply return the error or we invoke the
1450	! previously specified handler.
1451	!
	/*
	 * The stn in the be delay slot runs on both paths, so t_lofault is
	 * restored whether we return the error (old handler was zero) or
	 * jump to the old handler.
	 */
1452	be	%ncc, 2b
1453	stn	%o5, [THREAD_REG + T_LOFAULT]
1454	jmp	%o5			! goto real handler
1455	  nop
1456	SET_SIZE(kzero)
1457	SET_SIZE(uzero)
1458
1459#endif	/* lint */
1460
1461/*
1462 * Zero a block of storage.
1463 */
1464
1465#if defined(lint)
1466
1467/* ARGSUSED */
1468void
1469bzero(void *addr, size_t count)
1470{}
1471
1472#else	/* lint */
1473
	/*
	 * bzero(addr = %o0, count = %o1)
	 *
	 * Selects ASI_P, installs .zeroerr in t_lofault only if a handler
	 * was already present, and falls into .do_zero.  uzero and kzero
	 * branch to .do_zero as well, after choosing their own ASI.
	 *
	 * Register use from .do_zero onward:
	 *	%o0	next address to clear
	 *	%o1	bytes still to clear (reduced as each phase runs)
	 *	%o3	scratch counter for the alignment and remainder loops
	 *	%o4	bytes to clear as whole 64-byte blocks
	 *	%o5	t_lofault value saved by the entry point; kzero also
	 *		sets LOFAULT_SET in it
	 * Every store uses %asi, so the same code serves all entry points.
	 *
	 * Strategy by size:
	 *	count < 7	byte loop only
	 *	count < 15	bytes up to a word boundary, then words,
	 *			then bytes
	 *	otherwise	bytes up to an 8-byte boundary; when at least
	 *			0x80 bytes remain, 8-byte stores up to a
	 *			64-byte boundary followed by 256-byte and
	 *			64-byte unrolled loops; then 8-byte stores
	 *			and trailing bytes
	 */
1474	ENTRY(bzero)
1475	wr	%g0, ASI_P, %asi
1476
1477	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
1478	tst	%o5
1479	bz,pt	%ncc, .do_zero
1480	sethi	%hi(.zeroerr), %o2
1481	or	%o2, %lo(.zeroerr), %o2
1482	membar	#Sync				! sync error barrier
1483	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1484
1485.do_zero:
1486	cmp	%o1, 7
1487	blu,pn	%ncc, .byteclr
1488	nop
1489
1490	cmp	%o1, 15
1491	blu,pn	%ncc, .wdalign
1492	nop
1493
	/*
	 * %o3 = (addr & 7) - 8, a negative count of the bytes needed to
	 * reach an 8-byte boundary.  Adding it to %o1 removes those bytes
	 * from the remaining count up front; the byte loop then runs until
	 * %o3 climbs to zero.
	 */
1494	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
1495	bz,pt	%ncc, .blkalign		! already double aligned
1496	sub	%o3, 8, %o3		! -(bytes till double aligned)
1497	add	%o1, %o3, %o1		! update o1 with new count
1498
14991:
1500	stba	%g0, [%o0]%asi
1501	inccc	%o3
1502	bl,pt	%ncc, 1b
1503	inc	%o0
1504
	/*
	 * The mov in the blu delay slot always executes.  On the taken path
	 * it leaves %o3 = %o1 for .bzero_small; on the fall-through path the
	 * andcc that follows overwrites %o3.
	 */
1505	! Now address is double aligned
1506.blkalign:
1507	cmp	%o1, 0x80		! check if there are 128 bytes to set
1508	blu,pn	%ncc, .bzero_small
1509	mov	%o1, %o3
1510
1511	andcc	%o0, 0x3f, %o3		! is block aligned?
1512	bz,pt	%ncc, .bzero_blk
1513	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
1514	add	%o1, %o3, %o1		! o1 is the remainder
1515
1516	! Clear -(%o3) bytes till block aligned
15171:
1518	stxa	%g0, [%o0]%asi
1519	addcc	%o3, 8, %o3
1520	bl,pt	%ncc, 1b
1521	add	%o0, 8, %o0
1522
1523.bzero_blk:
1524	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
1525	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
1526
1527	cmp	%o4, 0x100		! 256 bytes or more
1528	blu,pn	%ncc, 3f
1529	nop
1530
	/*
	 * 256-byte loop.  Offset 0 of each of the four 64-byte blocks is
	 * stored first, then the remaining words.  The loop repeats only
	 * while more than 0x100 block bytes remain; a final 0x100 is left
	 * to the 64-byte loop.
	 */
15312:
1532	stxa	%g0, [%o0+0x0]%asi
1533	stxa	%g0, [%o0+0x40]%asi
1534	stxa	%g0, [%o0+0x80]%asi
1535	stxa	%g0, [%o0+0xc0]%asi
1536
1537	stxa	%g0, [%o0+0x8]%asi
1538	stxa	%g0, [%o0+0x10]%asi
1539	stxa	%g0, [%o0+0x18]%asi
1540	stxa	%g0, [%o0+0x20]%asi
1541	stxa	%g0, [%o0+0x28]%asi
1542	stxa	%g0, [%o0+0x30]%asi
1543	stxa	%g0, [%o0+0x38]%asi
1544
1545	stxa	%g0, [%o0+0x48]%asi
1546	stxa	%g0, [%o0+0x50]%asi
1547	stxa	%g0, [%o0+0x58]%asi
1548	stxa	%g0, [%o0+0x60]%asi
1549	stxa	%g0, [%o0+0x68]%asi
1550	stxa	%g0, [%o0+0x70]%asi
1551	stxa	%g0, [%o0+0x78]%asi
1552
1553	stxa	%g0, [%o0+0x88]%asi
1554	stxa	%g0, [%o0+0x90]%asi
1555	stxa	%g0, [%o0+0x98]%asi
1556	stxa	%g0, [%o0+0xa0]%asi
1557	stxa	%g0, [%o0+0xa8]%asi
1558	stxa	%g0, [%o0+0xb0]%asi
1559	stxa	%g0, [%o0+0xb8]%asi
1560
1561	stxa	%g0, [%o0+0xc8]%asi
1562	stxa	%g0, [%o0+0xd0]%asi
1563	stxa	%g0, [%o0+0xd8]%asi
1564	stxa	%g0, [%o0+0xe0]%asi
1565	stxa	%g0, [%o0+0xe8]%asi
1566	stxa	%g0, [%o0+0xf0]%asi
1567	stxa	%g0, [%o0+0xf8]%asi
1568
1569	sub	%o4, 0x100, %o4
1570	cmp	%o4, 0x100
1571	bgu,pt	%ncc, 2b
1572	add	%o0, 0x100, %o0
1573
15743:
1575	! ... check if 64 bytes to set
1576	cmp	%o4, 0x40
1577	blu	%ncc, .bzero_blk_done
1578	nop
1579
	/*
	 * 64-byte loop.  Note that the back edge targets label 3, so the
	 * cmp/blu check above is repeated on every pass.
	 */
15804:
1581	stxa	%g0, [%o0+0x0]%asi
1582	stxa	%g0, [%o0+0x8]%asi
1583	stxa	%g0, [%o0+0x10]%asi
1584	stxa	%g0, [%o0+0x18]%asi
1585	stxa	%g0, [%o0+0x20]%asi
1586	stxa	%g0, [%o0+0x28]%asi
1587	stxa	%g0, [%o0+0x30]%asi
1588	stxa	%g0, [%o0+0x38]%asi
1589
1590	subcc	%o4, 0x40, %o4
1591	bgu,pt	%ncc, 3b
1592	add	%o0, 0x40, %o0
1593
1594.bzero_blk_done:
1595	membar	#Sync
1596
	/*
	 * On entry %o3 holds the bytes left for this phase: the whole
	 * remaining count when arriving from .blkalign, or the sub-block
	 * remainder after the block loops.  The and in the blu delay slot
	 * always executes and leaves the trailing byte count in %o1.
	 */
1597.bzero_small:
1598	! Set the remaining doubles
1599	subcc	%o3, 8, %o3		! Can we store any doubles?
1600	blu,pn	%ncc, .byteclr
1601	and	%o1, 7, %o1		! calc bytes left after doubles
1602
1603.dbclr:
1604	stxa	%g0, [%o0]%asi		! Clear the doubles
1605	subcc	%o3, 8, %o3
1606	bgeu,pt	%ncc, .dbclr
1607	add	%o0, 8, %o0
1608
1609	ba	.byteclr
1610	nop
1611
	/*
	 * Medium sizes (7 <= count < 15).  Loops one byte at a time until
	 * %o0 is word aligned.  The andn in the bz delay slot recomputes the
	 * word-multiple count in %o3 on every pass, so it reflects the
	 * current %o1 when .wdclr is finally entered.
	 */
1612.wdalign:
1613	andcc	%o0, 3, %o3		! is add aligned on a word boundary
1614	bz,pn	%ncc, .wdclr
1615	andn	%o1, 3, %o3		! create word sized count in %o3
1616
1617	dec	%o1			! decrement count
1618	stba	%g0, [%o0]%asi		! clear a byte
1619	ba	.wdalign
1620	inc	%o0			! next byte
1621
1622.wdclr:
1623	sta	%g0, [%o0]%asi		! 4-byte clearing loop
1624	subcc	%o3, 4, %o3
1625	bnz,pt	%ncc, .wdclr
1626	inc	4, %o0
1627
1628	and	%o1, 3, %o1		! leftover count, if any
1629
1630.byteclr:
1631	! Set the leftover bytes
1632	brz	%o1, .bzero_exit
1633	nop
1634
16357:
1636	deccc	%o1			! byte clearing loop
1637	stba	%g0, [%o0]%asi
1638	bgu,pt	%ncc, 7b
1639	inc	%o0
1640
1641.bzero_exit:
1642	!
1643	! We're just concerned with whether t_lofault was set
1644	! when we came in. We end up here from either kzero()
1645	! or bzero(). kzero() *always* sets a lofault handler.
1646	! It ors LOFAULT_SET into %o5 to indicate it has done
1647	! this even if the value of %o5 is otherwise zero.
1648	! bzero() sets a lofault handler *only* if one was
1649	! previously set. Accordingly we need to examine
1650	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
1651	! before resetting the error handler.
1652	!
	/*
	 * The andn sits in the bz delay slot and always executes; when %o5
	 * is zero it is a no-op.  uzero also exits through here, since it
	 * branches to .do_zero as well.
	 */
1653	tst	%o5
1654	bz	%ncc, 1f
1655	andn	%o5, LOFAULT_SET, %o5
1656	membar	#Sync				! sync error barrier
1657	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
16581:
1659	retl
1660	clr	%o0			! return (0)
1661
1662	SET_SIZE(bzero)
1663#endif	/* lint */
1664