xref: /titanic_50/usr/src/uts/sun4v/cpu/generic_copy.s (revision 6dfee4834394825da35b977ca71cdc965bc7b6a4)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */


/*
 * For counts less than or equal to this number of bytes we always copy
 * byte-for-byte.
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2


/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
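/*
 * Usage sketch (illustrative only, not part of this file):
 *
 *	int err = kcopy(from, to, count);
 *	if (err != 0)
 *		return (err);		the errno from the unresolved fault
 */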



#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! kcopy() *always* sets a t_lofault handler, and it ORs LOFAULT_SET
	! into %o5 to indicate that it has done so. We need to clear the
	! LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	save	%sp, -SA(MINFRAME), %sp
	clr	%o5			! flag LOFAULT_SET is not set for bcopy

.do_copy:
	cmp	%i2, 12			! for small counts
	blu	%ncc, .bytecp		! just copy bytes
	  .empty

	!
	! use aligned transfers where possible
	!
	xor	%i0, %i1, %o4		! xor from and to address
	btst	7, %o4			! if lower three bits zero
	bz	.aldoubcp		! can align on double boundary
	.empty	! assembler complains about label

	xor	%i0, %i1, %o4		! xor from and to address
	btst	3, %o4			! if lower two bits zero
	bz	.alwordcp		! can align on word boundary
	btst	3, %i0			! delay slot, from address unaligned?
	!
	! use aligned reads and writes where possible
	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
	! using word reads and writes with the proper shifts
	! in between to align transfers to and from memory
	! i0 - src address, i1 - dest address, i2 - count
	! i3, i4 - tmps used for generating a complete word
	! i5 (word to write)
	! l0 size in bits of upper part of source word (US)
	! l1 size in bits of lower part of source word (LS = 32 - US)
	! l2 size in bits of upper part of destination word (UD)
	! l3 size in bits of lower part of destination word (LD = 32 - UD)
	! l4 number of bytes leftover after aligned transfers complete
	! l5 the number 32
	!
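	!
	! C-level sketch of the steady-state step of this scheme
	! (illustrative only; w, left, US and LS mirror the registers above):
	!	w = *src++;			aligned 32-bit read
	!	*dst++ = left | (w >> US);	left fills the top US bits
	!	left = w << LS;			w's low bits carry to next word
	!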
	mov	32, %l5			! load an oft-needed constant
	bz	.align_dst_only
	btst	3, %i1			! is destination address aligned?
	clr	%i4			! clear registers used in either case
	bz	.align_src_only
	clr	%l0
	!
	! both source and destination addresses are unaligned
	!
1:					! align source
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment size of upper source (US)
	bnz,a	1b
	sll	%i4, 8, %i4		! make room for next byte

	sub	%l5, %l0, %l1		! generate shift left count (LS)
	sll	%i4, %l1, %i4		! prepare to get rest
	ld	[%i0], %i3		! read a word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
	or	%i4, %i5, %i5		! merge
	mov	24, %l3			! align destination
1:
	srl	%i5, %l3, %i4		! prepare to write a single byte
	stb	%i4, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
	sub	%l5, %l3, %l2		! generate shift left count (UD)
	sll	%i5, %l2, %i5		! move leftover into upper bytes
	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
	bgu	%ncc, .more_needed	! need more to fill than we have
	nop

	sll	%i3, %l1, %i3		! clear upper used byte(s)
	srl	%i3, %l1, %i3
	! get the odd bytes between alignments
	sub	%l0, %l2, %l0		! regenerate shift count
	sub	%l5, %l0, %l1		! generate new shift left count (LS)
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
	srl	%i3, %l0, %i4
	or	%i5, %i4, %i5
	st	%i5, [%i1]		! write a word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out
	add	%i1, 4, %i1		! increment destination address

	b	2f
	sll	%i3, %l1, %i5		! get leftover into upper bits
.more_needed:
	sll	%i3, %l0, %i3		! save remaining byte(s)
	srl	%i3, %l0, %i3
	sub	%l2, %l0, %l1		! regenerate shift count
	sub	%l5, %l1, %l0		! generate new shift left count
	sll	%i3, %l1, %i4		! move to fill empty space
	b	3f
	or	%i5, %i4, %i5		! merge to complete word
	!
	! the source address is aligned and destination is not
	!
.align_dst_only:
	ld	[%i0], %i4		! read a word
	add	%i0, 4, %i0		! increment source address
	mov	24, %l0			! initial shift alignment count
1:
	srl	%i4, %l0, %i3		! prepare to write a single byte
	stb	%i3, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l0, 8, %l0		! delay slot, decrement shift count
.xfer:
	sub	%l5, %l0, %l1		! generate shift left count
	sll	%i4, %l1, %i5		! get leftover
3:
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2:
	ld	[%i0], %i3		! read a source word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
	st	%i5, [%i1]		! write a destination word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out	! check if done
	add	%i1, 4, %i1		! increment destination address
	b	2b			! loop
	sll	%i3, %l1, %i5		! get leftover
.unalign_out:
	tst	%l4			! any bytes leftover?
	bz	%ncc, .cpdone
	.empty				! allow next instruction in delay slot
1:
	sub	%l0, 8, %l0		! decrement shift
	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
	stb	%i4, [%i1]		! write a byte
	subcc	%l4, 1, %l4		! decrement count
	bz	%ncc, .cpdone		! done?
	add	%i1, 1, %i1		! increment destination
	tst	%l0			! any more previously read bytes
	bnz	%ncc, 1b		! we have leftover bytes
	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
	b	.dbytecp		! let dbytecp do the rest
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
	!
	! the destination address is aligned and the source is not
	!
.align_src_only:
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment shift count (US)
	bnz,a	.align_src_only
	sll	%i4, 8, %i4		! make room for next byte
	b,a	.xfer
	!
	! if from address unaligned for double-word moves,
	! move bytes till it is, if count is < 56 it could take
	! longer to align the thing than to do the transfer
	! in word size chunks right away
	!
.aldoubcp:
	cmp	%i2, 56			! if count < 56, use wordcp, it takes
	blu,a	%ncc, .alwordcp		! longer to align doubles than words
	mov	3, %o0			! mask for word alignment
	call	.alignit		! copy bytes until aligned
	mov	7, %o0			! mask for double alignment
	!
	! source and destination are now double-word aligned
	! i3 has aligned count returned by alignit
	!
	and	%i2, 7, %i2		! unaligned leftover count
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
5:
	ldx	[%i0+%i1], %o4		! read from address
	stx	%o4, [%i1]		! write at destination address
	subcc	%i3, 8, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 8, %i1		! delay slot, inc to address
	cmp	%i2, 4			! see if we can copy a word
	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
	.empty
	!
	! for leftover bytes we fall into wordcp, if needed
	!
.wordcp:
	and	%i2, 3, %i2		! unaligned leftover count
5:
	ld	[%i0+%i1], %o4		! read from address
	st	%o4, [%i1]		! write at destination address
	subcc	%i3, 4, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 4, %i1		! delay slot, inc to address
	b,a	.dbytecp

	! we come here to align copies on word boundaries
.alwordcp:
	call	.alignit		! go word-align it
	mov	3, %o0			! bits that must be zero to be aligned
	b	.wordcp
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst

	!
	! byte copy, works with any alignment
	!
.bytecp:
	b	.dbytecp
	sub	%i0, %i1, %i0		! i0 gets difference of src and dst

	!
	! differenced byte copy, works with any alignment
	! assumes dest in %i1 and (source - dest) in %i0
	!
1:
	stb	%o4, [%i1]		! write to address
	inc	%i1			! inc to address
.dbytecp:
	deccc	%i2			! dec count
	bgeu,a	%ncc, 1b		! loop till done
	ldub	[%i0+%i1], %o4		! read from address
.cpdone:
	membar	#Sync				! sync error barrier
	! Restore t_lofault handler, if we came here from kcopy().
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	ret
	restore %g0, 0, %o0		! return (0)

/*
 * Common code used to align transfers on word and doubleword
 * boundaries.  Aligns source and destination and returns a count
 * of aligned bytes to transfer in %i3
 */
1:
	inc	%i0			! inc from
	stb	%o4, [%i1]		! write a byte
	inc	%i1			! inc to
	dec	%i2			! dec count
.alignit:
	btst	%o0, %i0		! %o0 is bit mask to check for alignment
	bnz,a	1b
	ldub	[%i0], %o4		! read next byte

	retl
	andn	%i2, %o0, %i3		! return size of aligned bytes
	SET_SIZE(bcopy)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 */
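/*
 * C-level sketch of the dispatch below (illustrative only):
 *
 *	if (count == 0)
 *		return;
 *	if (count <= labs((char *)from - (char *)to))
 *		bcopy(from, to, count);		the regions cannot overlap
 *	else if (from < to)
 *		copy backwards, last byte first;
 *	else
 *		copy forwards, first byte first;
 */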

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.
 */
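/*
 * Hypothetical caller sketch (ppcopy-style; illustrative only):
 *
 *	kpreempt_disable();
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(src, dst);	copies exactly PAGESIZE bytes
 *	else
 *		bcopy(src, dst, PAGESIZE);
 *	kpreempt_enable();
 */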
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
	ENTRY(hwblkpagecopy)
	save	%sp, -SA(MINFRAME), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)

	set	PAGESIZE, %i2

	/*
	 * Copying exactly one page, and PAGESIZE is a multiple of 0x80.
	 */
1:
	ldx	[%i0+0x0], %l0
	ldx	[%i0+0x8], %l1
	ldx	[%i0+0x10], %l2
	ldx	[%i0+0x18], %l3
	ldx	[%i0+0x20], %l4
	ldx	[%i0+0x28], %l5
	ldx	[%i0+0x30], %l6
	ldx	[%i0+0x38], %l7
	stx	%l0, [%i1+0x0]
	stx	%l1, [%i1+0x8]
	stx	%l2, [%i1+0x10]
	stx	%l3, [%i1+0x18]
	stx	%l4, [%i1+0x20]
	stx	%l5, [%i1+0x28]
	stx	%l6, [%i1+0x30]
	stx	%l7, [%i1+0x38]

	ldx	[%i0+0x40], %l0
	ldx	[%i0+0x48], %l1
	ldx	[%i0+0x50], %l2
	ldx	[%i0+0x58], %l3
	ldx	[%i0+0x60], %l4
	ldx	[%i0+0x68], %l5
	ldx	[%i0+0x70], %l6
	ldx	[%i0+0x78], %l7
	stx	%l0, [%i1+0x40]
	stx	%l1, [%i1+0x48]
	stx	%l2, [%i1+0x50]
	stx	%l3, [%i1+0x58]
	stx	%l4, [%i1+0x60]
	stx	%l5, [%i1+0x68]
	stx	%l6, [%i1+0x70]
	stx	%l7, [%i1+0x78]

	add	%i0, 0x80, %i0
	subcc	%i2, 0x80, %i2
	bgu,pt	%xcc, 1b
	add	%i1, 0x80, %i1

	membar #Sync
	ret
	restore	%g0, 0, %o0
	SET_SIZE(hwblkpagecopy)
#endif	/* lint */


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
 * is left as an exercise...
 */
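/*
 * Contract sketch for the two flavours (illustrative only):
 *
 *	if (copyout(kaddr, uaddr, len) != 0)
 *		return (EFAULT);		DDI flavour: -1 on any error
 *
 *	error = xcopyout(kaddr, uaddr, len);	errno flavour, suitable
 *	if (error != 0)				for callers like uiomove(9F)
 *		return (error);
 */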

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * None of the copyops routines grab a window.
 *
 * Flow:
 *
 * If count == zero return zero.
 *
 * Store the previous lo_fault handler into %g6.
 * Place our secondary lofault handler into %g5.
 * Place the address of our fault handler into %o3.
 *
 * If count is less than or equal to SMALL_LIMIT (7) we
 * always do a byte for byte copy.
 *
 * If count is > SMALL_LIMIT, we check the alignment of the input
 * and output pointers.  We store -count in %o3, and we store the number
 * of chunks (8, 4, 2 or 1 byte) operated on in our basic copy loop
 * in %o2. Following this we branch to the appropriate copy loop and
 * copy that many chunks.  Since we've been adding the chunk size
 * to %o3 each time through as well as decrementing %o2, we can tell
 * if any data is left to be copied by examining %o3. If that is
 * zero, we're done and can go home. If not, we figure out what the
 * largest chunk size left to be copied is and branch to that copy
 * loop unless there's only one byte left. We load that as we're
 * branching to code that stores it just before we return.
 *
 * Fault handlers are invoked if we reference memory that has no
 * current mapping.  All forms share the same copyio_fault handler.
 * This routine handles fixing up the stack and general housecleaning.
 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of the individual functions.
 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
 */
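/*
 * A minimal C-level sketch of the loop shape described above
 * (illustrative only; the variable names are ours):
 *
 *	from += count;				both pointers park at the end
 *	to += count;
 *	for (off = -(ssize_t)count; off != 0; off++)
 *		((char *)to)[off] = ((const char *)from)[off];
 *
 * A single induction variable (the negative offset) walks both buffers,
 * which is what keeps the inner copy loops below at four instructions.
 */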

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

#if defined(lint)

/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

/*
 * We save the arguments in the following registers in case of a fault:
 * 	kaddr - %g2
 * 	uaddr - %g3
 * 	count - %g4
 */
#define	SAVE_SRC	%g2
#define	SAVE_DST	%g3
#define	SAVE_COUNT	%g4

#define	REAL_LOFAULT		%g5
#define	SAVED_LOFAULT		%g6

/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
 * This allows us to share common code for all the flavors of the copy
 * operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault)

	ENTRY(copyout)
	sethi	%hi(.copyout_err), REAL_LOFAULT
	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT

.do_copyout:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	  nop
	retl
	  clr	%o0
1:
	sethi	%hi(copyio_fault), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT (7 bytes).
	! Run in leaf mode, using the %o regs as our input regs.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dco_ns
	or	%o0, %o1, %o3

.dcobcp:
	sub	%g0, %o2, %o3		! negate count
	add	%o0, %o2, %o0		! make %o0 point at the end
	add	%o1, %o2, %o1		! make %o1 point at the end
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load first byte
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcocl:
	stba	%o4, [%o1 + %o3]ASI_USER
	inccc	%o3
	bl,a,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
	!
	! Try aligned copies from here.
	!
.dco_ns:
	! %o0 = kernel addr (to be copied from)
	! %o1 = user addr (to be copied to)
	! %o2 = length
	! %o3 = %o0 | %o1 (used for alignment checking)
	! %o4 is alternate lo_fault
	! %o5 is original lo_fault
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,pt	%icc, .dcoh8
	btst	7, %o3

	ba	.dcobcp
	nop
.dcoh8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcoh4
	btst	3, %o3
.dcos8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcoh4:
	bnz,pn	%ncc, .dcoh2
	nop
.dcos4:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
	!
	! We must be 2 byte aligned. Off we go.
	! The check for small copies was done in the
	! delay at .dcoh4
	!
.dcoh2:
.dcos2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy

.dodebc:
	ldx	[%o0 + %o3], %o4
	deccc	%o2
	stxa	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! eight byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left - do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
	!
	.align 32
.dodfbc:
	lduw	[%o0 + %o3], %o4
	deccc	%o2
	sta	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! four byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
	! copy.
	!
	.align 32
.dodtbc:
	lduh	[%o0 + %o3], %o4
	deccc	%o2
	stha	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Anything left?
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Deal with the last byte
	!
	ldub	[%o0 + %o3], %o4
	stba	%o4, [%o1 + %o3]ASI_USER
.dcofh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
	SET_SIZE(copyout)

#endif	/* lint */


#ifdef	lint

/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout)
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	b	.do_copyout
	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyout)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5
	or	%o4, %lo(.little_err), %o4
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0+%o3], %o4

1:	stba	%o4, [%o1+%o3]ASI_AIUSL
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	  ldub	[%o0+%o3], %o4

2:	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)
	SET_SIZE(xcopyout_little)

#endif	/* lint */

/*
 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
 */

#if defined(lint)

/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyin)
	sethi	%hi(.copyin_err), REAL_LOFAULT
	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT

.do_copyin:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	  nop
	retl
	  clr	%o0
1:
	sethi	%hi(copyio_fault), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dci_ns
	or	%o0, %o1, %o3

.dcibcp:
	sub	%g0, %o2, %o3		! setup for copy loop
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcicl:
	stb	%o4, [%o1 + %o3]
	inccc	%o3
	bl,a,pt %ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
	!
	! Try aligned copies from here.
	!
.dci_ns:
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,a,pt	%icc, .dcih8
	btst	7, %o3
	ba	.dcibcp
	nop

.dcih8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcih4
	btst	3, %o3
.dcis8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcih4:
	bnz	%ncc, .dcih2
	nop
.dcis4:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
.dcih2:
.dcis2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy

.didebc:
	ldxa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	stx	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Most 8 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
	!
	.align 32
.didfbc:
	lduwa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	st	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Most 4 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
	! copy.
	!
	.align 32
.didtbc:
	lduha	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	sth	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Most 2 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Deal with the last byte
	!
	lduba	[%o0 + %o3]ASI_USER, %o4
	stb	%o4, [%o1 + %o3]
.dcifh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
	SET_SIZE(copyin)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin)
	sethi	%hi(.xcopyin_err), REAL_LOFAULT
	b	.do_copyin
	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5
	or	%o4, %lo(.little_err), %o4
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0+%o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1+%o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	  lduba	[%o0+%o3]ASI_AIUSL, %o4

2:	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

.little_err:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin_little)

#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}

#else	/* lint */

	ENTRY(copyin_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyin
	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
	jmp	SAVED_LOFAULT
	  nop
	SET_SIZE(copyin_noerr)

#endif /* lint */

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

#if defined(lint)

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}

#else	/* lint */

	ENTRY(copyout_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyout
	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
	SET_SIZE(copyout_noerr)

#endif /* lint */

#if defined(lint)

int use_hw_bcopy = 1;
int use_hw_bzero = 1;

#else /* !lint */

	.align	4
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1

	.align	64
	.section ".text"
#endif /* !lint */


/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long, using load/stores.  If the criteria for using
 * this routine are not met then it calls bzero and returns 1.  Otherwise
 * 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
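/*
 * Contract sketch (illustrative only; the caller checks use_hw_bzero
 * and disables preemption, as noted above):
 *
 *	if (hwblkclr(addr, len) != 0)
 *		... region was cleared by bzero instead ...
 *	else
 *		... region was cleared by the block loop below ...
 */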
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
	return(0);
}
#else /* lint */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)

	ENTRY(hwblkclr)
	save	%sp, -SA(MINFRAME), %sp

	! Must be block-aligned
	andcc	%i0, 0x3f, %g0
	bnz,pn	%ncc, 1f
	  nop

	! ... and must be 256 bytes or more
	cmp	%i1, 0x100
	blu,pn	%ncc, 1f
	  nop

	! ... and length must be a multiple of 64
	andcc	%i1, 0x3f, %g0
	bz,pn	%ncc, .pz_doblock
	nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	  mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

	! Already verified that there are at least 256 bytes to set
.pz_doblock:
	stx	%g0, [%i0+0x0]
	stx	%g0, [%i0+0x40]
	stx	%g0, [%i0+0x80]
	stx	%g0, [%i0+0xc0]

	stx	%g0, [%i0+0x8]
	stx	%g0, [%i0+0x10]
	stx	%g0, [%i0+0x18]
	stx	%g0, [%i0+0x20]
	stx	%g0, [%i0+0x28]
	stx	%g0, [%i0+0x30]
	stx	%g0, [%i0+0x38]

	stx	%g0, [%i0+0x48]
	stx	%g0, [%i0+0x50]
	stx	%g0, [%i0+0x58]
	stx	%g0, [%i0+0x60]
	stx	%g0, [%i0+0x68]
	stx	%g0, [%i0+0x70]
	stx	%g0, [%i0+0x78]

	stx	%g0, [%i0+0x88]
	stx	%g0, [%i0+0x90]
	stx	%g0, [%i0+0x98]
	stx	%g0, [%i0+0xa0]
	stx	%g0, [%i0+0xa8]
	stx	%g0, [%i0+0xb0]
	stx	%g0, [%i0+0xb8]

	stx	%g0, [%i0+0xc8]
	stx	%g0, [%i0+0xd0]
	stx	%g0, [%i0+0xd8]
	stx	%g0, [%i0+0xe0]
	stx	%g0, [%i0+0xe8]
	stx	%g0, [%i0+0xf0]
	stx	%g0, [%i0+0xf8]

	sub	%i1, 0x100, %i1
	cmp	%i1, 0x100
	bgu,pt	%ncc, .pz_doblock
	add	%i0, 0x100, %i0

2:
	! Check if more than 64 bytes to set
	cmp	%i1, 0x40
	blu	%ncc, .pz_finish
	nop

3:
	stx	%g0, [%i0+0x0]
	stx	%g0, [%i0+0x8]
	stx	%g0, [%i0+0x10]
	stx	%g0, [%i0+0x18]
	stx	%g0, [%i0+0x20]
	stx	%g0, [%i0+0x28]
	stx	%g0, [%i0+0x30]
	stx	%g0, [%i0+0x38]

	subcc	%i1, 0x40, %i1
	bgu,pt	%ncc, 3b
	add	%i0, 0x40, %i0

.pz_finish:
	membar	#Sync
	ret
	restore	%g0, 0, %o0		! return (bzero or not)
	SET_SIZE(hwblkclr)
#endif	/* lint */

#ifdef	lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else	/*!lint */

	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	membar	#Sync
	retl
	  wrpr	%g0, %g1, %pstate
	SET_SIZE(hw_pa_bcopy32)
#endif /* lint */

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */
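/*
 * Usage sketch for the three zeroing entry points (illustrative only):
 *
 *	bzero(kaddr, len);		kernel space, no return value
 *	uzero(uaddr, len);		user space, via ASI_USER
 *	error = kzero(kaddr, len);	kernel space; errno if a fault occurs
 */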


#if defined(lint)

/* ARGSUSED */
int
kzero(void *addr, size_t count)
{ return(0); }

/* ARGSUSED */
void
uzero(void *addr, size_t count)
{}

#else	/* lint */

	ENTRY(uzero)
	!
	! Set a new lo_fault handler only if we came in with one
	! already specified.
	!
	wr	%g0, ASI_USER, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

	ENTRY(kzero)
	!
	! Always set a lo_fault handler
	!
	wr	%g0, ASI_P, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	sethi	%hi(.zeroerr), %o2
	or	%o5, LOFAULT_SET, %o5
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero, or during uzero or
 * bzero if they were called with t_lofault non-zero.  Otherwise
 * we've already run screaming from the room.  Errno value is in %g1.
 * Note that we're here iff we did set t_lofault.
 */
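/*
 * C-level restatement of the handler below (illustrative only):
 *
 *	if (o5 == 0)
 *		return (g1);		no old handler, just return errno
 *	o5 &= ~LOFAULT_SET;
 *	t_lofault = o5;
 *	if (o5 == 0)
 *		return (g1);		only LOFAULT_SET was present
 *	goto *o5;			tail-call the previous handler
 */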
.zeroerr:
	!
	! Undo asi register setting. Just set it to be the
	! kernel default without checking.
	!
	wr	%g0, ASI_P, %asi

	!
	! We did set t_lofault. It may well have been zero coming in.
	!
1:
	tst	%o5
	membar #Sync
	bne,pn	%ncc, 3f
	andncc	%o5, LOFAULT_SET, %o5
2:
	!
	! Old handler was zero. Just return the error.
	!
	retl				! return
	mov	%g1, %o0		! error code from %g1
3:
	!
	! We're here because %o5 was non-zero. It was non-zero
	! because either LOFAULT_SET was present, a previous fault
	! handler was present or both. In all cases we need to reset
	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
	! before we either simply return the error or we invoke the
	! previously specified handler.
	!
	be	%ncc, 2b
	stn	%o5, [THREAD_REG + T_LOFAULT]
	jmp	%o5			! goto real handler
	  nop
	SET_SIZE(kzero)
	SET_SIZE(uzero)

#endif	/* lint */

/*
 * Zero a block of storage.
 */

#if defined(lint)

/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}

#else	/* lint */

	ENTRY(bzero)
	wr	%g0, ASI_P, %asi

	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector

.do_zero:
	cmp	%o1, 7
	blu,pn	%ncc, .byteclr
	nop

	cmp	%o1, 15
	blu,pn	%ncc, .wdalign
	nop

	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
	bz,pt	%ncc, .blkalign		! already double aligned
	sub	%o3, 8, %o3		! -(bytes till double aligned)
	add	%o1, %o3, %o1		! update o1 with new count

1:
	stba	%g0, [%o0]%asi
	inccc	%o3
	bl,pt	%ncc, 1b
	inc	%o0

	! Now address is double aligned
.blkalign:
	cmp	%o1, 0x80		! check if there are 128 bytes to set
	blu,pn	%ncc, .bzero_small
	mov	%o1, %o3

	andcc	%o0, 0x3f, %o3		! is block aligned?
	bz,pt	%ncc, .bzero_blk
	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
	add	%o1, %o3, %o1		! o1 is the remainder

	! Clear -(%o3) bytes till block aligned
1:
	stxa	%g0, [%o0]%asi
	addcc	%o3, 8, %o3
	bl,pt	%ncc, 1b
	add	%o0, 8, %o0

.bzero_blk:
	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes

	cmp	%o4, 0x100		! 256 bytes or more
	blu,pn	%ncc, 3f
	nop

2:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x40]%asi
	stxa	%g0, [%o0+0x80]%asi
	stxa	%g0, [%o0+0xc0]%asi

	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	stxa	%g0, [%o0+0x48]%asi
	stxa	%g0, [%o0+0x50]%asi
	stxa	%g0, [%o0+0x58]%asi
	stxa	%g0, [%o0+0x60]%asi
	stxa	%g0, [%o0+0x68]%asi
	stxa	%g0, [%o0+0x70]%asi
	stxa	%g0, [%o0+0x78]%asi

	stxa	%g0, [%o0+0x88]%asi
	stxa	%g0, [%o0+0x90]%asi
	stxa	%g0, [%o0+0x98]%asi
	stxa	%g0, [%o0+0xa0]%asi
	stxa	%g0, [%o0+0xa8]%asi
	stxa	%g0, [%o0+0xb0]%asi
	stxa	%g0, [%o0+0xb8]%asi

	stxa	%g0, [%o0+0xc8]%asi
	stxa	%g0, [%o0+0xd0]%asi
	stxa	%g0, [%o0+0xd8]%asi
	stxa	%g0, [%o0+0xe0]%asi
	stxa	%g0, [%o0+0xe8]%asi
	stxa	%g0, [%o0+0xf0]%asi
	stxa	%g0, [%o0+0xf8]%asi

	sub	%o4, 0x100, %o4
	cmp	%o4, 0x100
	bgu,pt	%ncc, 2b
	add	%o0, 0x100, %o0

3:
	! ... check if 64 bytes to set
	cmp	%o4, 0x40
	blu	%ncc, .bzero_blk_done
	nop

4:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	subcc	%o4, 0x40, %o4
	bgu,pt	%ncc, 3b
	add	%o0, 0x40, %o0

.bzero_blk_done:
	membar	#Sync

.bzero_small:
	! Set the remaining doubles
	subcc	%o3, 8, %o3		! Can we store any doubles?
	blu,pn	%ncc, .byteclr
	and	%o1, 7, %o1		! calc bytes left after doubles

.dbclr:
	stxa	%g0, [%o0]%asi		! Clear the doubles
	subcc	%o3, 8, %o3
	bgeu,pt	%ncc, .dbclr
	add	%o0, 8, %o0

	ba	.byteclr
	nop

.wdalign:
	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
	bz,pn	%ncc, .wdclr
	andn	%o1, 3, %o3		! create word sized count in %o3

	dec	%o1			! decrement count
	stba	%g0, [%o0]%asi		! clear a byte
	ba	.wdalign
	inc	%o0			! next byte

.wdclr:
	sta	%g0, [%o0]%asi		! 4-byte clearing loop
	subcc	%o3, 4, %o3
	bnz,pt	%ncc, .wdclr
	inc	4, %o0

	and	%o1, 3, %o1		! leftover count, if any

.byteclr:
	! Set the leftover bytes
	brz	%o1, .bzero_exit
	nop

7:
	deccc	%o1			! byte clearing loop
	stba	%g0, [%o0]%asi
	bgu,pt	%ncc, 7b
	inc	%o0

.bzero_exit:
	!
	! We're just concerned with whether t_lofault was set
	! when we came in. We end up here from either kzero()
	! or bzero(). kzero() *always* sets a lofault handler.
	! It ors LOFAULT_SET into %o5 to indicate it has done
	! this even if the value of %o5 is otherwise zero.
	! bzero() sets a lofault handler *only* if one was
	! previously set. Accordingly we need to examine
	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
	! before resetting the error handler.
	!
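	!
	! C-level restatement (illustrative only):
	!	if (o5 != 0)
	!		t_lofault = o5 & ~LOFAULT_SET;
	!	return (0);
	!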
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	retl
	clr	%o0			! return (0)

	SET_SIZE(bzero)
#endif	/* lint */
