xref: /titanic_50/usr/src/uts/sun4u/cpu/common_asm.s (revision 84ba300aaa958c8e8427c2ec66a932d86bee71c4)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#if !defined(lint)
26#include "assym.h"
27#endif	/* !lint */
28
29/*
30 * General assembly language routines.
31 * It is the intent of this file to contain routines that are
32 * specific to cpu architecture.
33 */
34
35/*
36 * WARNING: If you add a fast trap handler which can be invoked by a
37 * non-privileged user, you may have to use the FAST_TRAP_DONE macro
38 * instead of "done" instruction to return back to the user mode. See
39 * comments for the "fast_trap_done" entry point for more information.
40 */
/*
 * Expands to a single annulled branch (ba,a) to fast_trap_done; the handlers
 * in this file follow it directly with SET_SIZE, i.e. they supply no
 * delay-slot instruction.
 */
41#define	FAST_TRAP_DONE	\
42	ba,a	fast_trap_done
43
44/*
45 * Override GET_NATIVE_TIME for the cpu module code.  This is not
46 * guaranteed to be exactly one instruction, be careful of using
47 * the macro in delay slots.
48 *
49 * Do not use any instruction that modifies condition codes as the
50 * caller may depend on these to remain unchanged across the macro.
51 */
52#if defined(CHEETAH) || defined(OLYMPUS_C)
53
/*
 * CHEETAH / OLYMPUS_C variants: the native time source is STICK and the
 * compare register is STICK_COMPARE, each accessed with a single rd/wr.
 * The scr* and label arguments are not used by these expansions; they are
 * accepted so every variant of a macro is invoked the same way.
 *
 *   GET_NATIVE_TIME(out, ...)		out = STICK
 *   DELTA_NATIVE_TIME(delta, reg, ...)	STICK += delta (reg is clobbered and
 *					holds the value written)
 *   RD_TICKCMPR(out, ...)		out = STICK_COMPARE
 *   WR_TICKCMPR(in, ...)		STICK_COMPARE = in
 */
54#define	GET_NATIVE_TIME(out, scr1, scr2) \
55	rd	STICK, out
56#define	DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
57	rd	STICK, reg;		\
58	add	reg, delta, reg;	\
59	wr	reg, STICK
60#define	RD_TICKCMPR(out, scr)		\
61	rd	STICK_COMPARE, out
62#define	WR_TICKCMPR(in, scr1, scr2, label) \
63	wr	in, STICK_COMPARE
64
65#elif defined(HUMMINGBIRD)
66#include <sys/spitregs.h>
67
68/*
69 * the current hummingbird version of %stick and %stick_cmp
70 * were both implemented as (2) 32-bit locations in ASI_IO space;
71 * the hdwr should support atomic r/w; meanwhile: ugly alert! ...
72 *
73 * 64-bit opcodes are required, but move only 32-bits:
74 *
75 * ldxa [phys]ASI_IO, %dst 	reads  the low 32-bits from phys into %dst
76 * stxa %src, [phys]ASI_IO 	writes the low 32-bits from %src into phys
77 *
78 * reg equivalent		[phys]ASI_IO
79 * ------------------		---------------
80 * %stick_cmp  low-32		0x1FE.0000.F060
81 * %stick_cmp high-32		0x1FE.0000.F068
82 * %stick      low-32		0x1FE.0000.F070
83 * %stick     high-32		0x1FE.0000.F078
84 */
/*
 * Low byte of each ASI_IO physical address; SETL41() supplies the common
 * 0x1FE.0000.F0 prefix and ORs one of these in.  HST_DIFF is the distance
 * from a low-32 location to its high-32 partner, used with inc/dec to step
 * an address register between the two halves.
 */
85#define	HSTC_LOW	0x60			/* stick_cmp low  32-bits */
86#define	HSTC_HIGH	0x68			/* stick_cmp high 32-bits */
87#define	HST_LOW		0x70			/* stick low  32-bits */
88#define	HST_HIGH	0x78			/* stick high 32-bits */
89#define	HST_DIFF	0x08			/* low<-->high diff */
90
91/*
 * SETL41(reg, byte): build the physical address 0x1FE.0000.F0xx in reg,
 * where xx is byte (one of the HST* or HSTC* offsets above).
 *
92 * Any change in the number of instructions in SETL41()
93 * will affect SETL41_OFF
94 */
95#define	SETL41(reg, byte) \
96	sethi	%hi(0x1FE00000), reg;		/* 0000.0000.1FE0.0000 */ \
97	or	reg, 0xF, reg;			/* 0000.0000.1FE0.000F */ \
98	sllx	reg, 12, reg;			/* 0000.01FE.0000.F000 */ \
99	or	reg, byte, reg;			/* 0000.01FE.0000.F0xx */
100
101/*
102 * SETL41_OFF is used to calculate the relative PC value when a
103 * branch instruction needs to go over SETL41() macro
 * (SETL41() is 4 instructions of 4 bytes each, hence 16).
104 */
105#define SETL41_OFF  16
106
107/*
108 * reading stick requires 2 loads, and there could be an intervening
109 * low-to-high 32-bit rollover resulting in a return value that is
110 * off by about (2 ^ 32); this rare case is prevented by re-reading
111 * the low-32 bits after the high-32 and verifying the "after" value
112 * is >= the "before" value; if not, increment the high-32 value.
113 *
114 * this method is limited to 1 rollover, and based on the fixed
115 * stick-frequency (5555555), requires the loads to complete within
116 * 773 seconds; incrementing the high-32 value will not overflow for
117 * about 52644 years.
118 *
119 * writing stick requires 2 stores; if the old/new low-32 value is
120 * near 0xffffffff, there could be another rollover (also rare).
121 * to prevent this, we first write a 0 to the low-32, then write
122 * new values to the high-32 then the low-32.
123 *
124 * When we detect a carry in the lower %stick register, we need to
125 * read HST_HIGH again. However at the point where we detect this,
126 * we need to rebuild the register address HST_HIGH.  This involves more
127 * than one instruction and a branch is unavoidable. However, most of
128 * the time, there is no carry. So we take the penalty of a branch
129 * instruction only when there is carry (less frequent).
130 *
131 * For GET_NATIVE_TIME(), we start afresh and branch to SETL41().
132 * For DELTA_NATIVE_TIME(), we branch to just after SETL41() since
133 * addr already points to HST_LOW.
134 *
135 * NOTE: this method requires disabling interrupts before using
136 * DELTA_NATIVE_TIME.
137 */
/*
 * Hummingbird GET_NATIVE_TIME(out, scr, tmp): out = 64-bit stick value
 * assembled from the two 32-bit ASI_IO locations.  scr and tmp are
 * clobbered.  Only add/sub/shift/or and branch-on-register instructions are
 * used, so the condition codes are left untouched.
 */
138#define	GET_NATIVE_TIME(out, scr, tmp)	\
139	SETL41(scr, HST_LOW);		/* scr = address of stick low-32 */ \
140	ldxa	[scr]ASI_IO, tmp;	/* tmp = low-32, "before" reading */ \
141	inc	HST_DIFF, scr;		/* scr = address of stick high-32 */ \
142	ldxa	[scr]ASI_IO, out;	/* out = high-32 */ \
143	dec	HST_DIFF, scr;		/* back to the low-32 address */ \
144	ldxa	[scr]ASI_IO, scr;	/* scr = low-32, "after" (address lost) */ \
145	sub	scr, tmp, tmp;		/* tmp = after - before */ \
146	brlz,pn tmp, .-(SETL41_OFF+24); /* low-32 went backwards: redo from SETL41 (6 instrs + SETL41 back) */ \
147	sllx	out, 32, out;		/* delay slot: high half into place */ \
148	or	out, scr, out		/* out = high:low */
/*
 * Hummingbird DELTA_NATIVE_TIME(delta, addr, high, low, tmp): read the
 * 64-bit stick value, add delta, and write it back as two 32-bit stores.
 * addr, high, low and tmp are all clobbered.
 */
149#define	DELTA_NATIVE_TIME(delta, addr, high, low, tmp) \
150	SETL41(addr, HST_LOW);		/* addr = address of stick low-32 */ \
151	ldxa	[addr]ASI_IO, tmp;	/* tmp = low-32, "before" reading */ \
152	inc	HST_DIFF, addr;		/* addr = address of stick high-32 */ \
153	ldxa	[addr]ASI_IO, high;	/* high = high-32 */ \
154	dec	HST_DIFF, addr;		/* addr = low-32 address again */ \
155	ldxa	[addr]ASI_IO, low;	/* low = low-32, "after" reading */ \
156	sub	low, tmp, tmp;		/* tmp = after - before */ \
157	brlz,pn tmp, .-24;		/* low-32 went backwards: redo the 3 loads (addr is still valid) */ \
158	sllx	high, 32, high;		/* delay slot: high half into place */ \
159	or	high, low, high;	/* high = current 64-bit stick */ \
160	add	high, delta, high;	/* high = new 64-bit stick */ \
161	srl	high, 0, low;		/* low = new low-32 */ \
162	srlx	high, 32, high;		/* high = new high-32 */ \
163	stxa	%g0, [addr]ASI_IO;	/* zero low-32 first (see rollover note above) */ \
164	inc	HST_DIFF, addr;		/* addr = high-32 address */ \
165	stxa	high, [addr]ASI_IO;	/* write new high-32 */ \
166	dec	HST_DIFF, addr;		/* addr = low-32 address */ \
167	stxa	low, [addr]ASI_IO	/* write new low-32 */
/*
 * Hummingbird RD_TICKCMPR(out, scr): out = 64-bit stick_cmp value built from
 * its low-32 and high-32 ASI_IO locations.  scr is clobbered.  Unlike
 * GET_NATIVE_TIME there is no re-read/verify step.
 */
168#define RD_TICKCMPR(out, scr)		\
169	SETL41(scr, HSTC_LOW);		/* scr = address of stick_cmp low-32 */ \
170	ldxa	[scr]ASI_IO, out;	/* out = low-32 */ \
171	inc	HST_DIFF, scr;		/* scr = address of stick_cmp high-32 */ \
172	ldxa	[scr]ASI_IO, scr;	/* scr = high-32 */ \
173	sllx	scr, 32, scr;		/* move it to the upper half */ \
174	or	scr, out, out		/* out = high:low */
/*
 * Hummingbird WR_TICKCMPR(in, scra, scrd, label): store the 64-bit value in
 * to stick_cmp, high-32 first and then low-32.  scra (address) and scrd
 * (data) are clobbered; in is preserved; label is unused in this variant.
 */
175#define WR_TICKCMPR(in, scra, scrd, label) \
176	SETL41(scra, HSTC_HIGH);	/* scra = address of stick_cmp high-32 */ \
177	srlx	in, 32, scrd;		/* scrd = high-32 of in */ \
178	stxa	scrd, [scra]ASI_IO;	/* write high-32 */ \
179	dec	HST_DIFF, scra;		/* scra = address of stick_cmp low-32 */ \
180	stxa	in, [scra]ASI_IO	/* write low-32 (per the note above, only the low 32 bits are stored) */
181
182#else	/* !CHEETAH && !HUMMINGBIRD */
183
/*
 * Default variants (neither of the branches above): the native time source
 * is the privileged %tick register (rdpr/wrpr) and the compare register is
 * TICK_COMPARE (rd/wr).  The scr* arguments are unused here.
 *
 *   GET_NATIVE_TIME(out, ...)		out = %tick
 *   DELTA_NATIVE_TIME(delta, reg, ...)	%tick += delta (reg is clobbered)
 *   RD_TICKCMPR(out, ...)		out = TICK_COMPARE
 *   WR_TICKCMPR(in, ..., label)	TICK_COMPARE = in
 */
184#define	GET_NATIVE_TIME(out, scr1, scr2) \
185	rdpr	%tick, out
186#define	DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
187	rdpr	%tick, reg;		\
188	add	reg, delta, reg;	\
189	wrpr	reg, %tick
190#define	RD_TICKCMPR(out, scr)		\
191	rd	TICK_COMPARE, out
192#ifdef BB_ERRATA_1 /* writes to TICK_COMPARE may fail */
193/*
194 * Writes to the TICK_COMPARE register sometimes fail on blackbird modules.
195 * The failure occurs only when the following instruction decodes to wr or
196 * wrpr.  The workaround is to immediately follow writes to TICK_COMPARE
197 * with a read, thus stalling the pipe and keeping following instructions
198 * from causing data corruption.  Aligning to a quadword will ensure these
199 * two instructions are not split due to i$ misses.
 *
 * The expansion branches over the alignment padding to a local label whose
 * name includes the label argument; callers in this file pass __LINE__,
 * presumably so that each expansion gets a distinct label.
 * NOTE(review): the text above says "quadword" but the directive used is
 * .align 64 -- confirm which is intended.
200 */
201#define WR_TICKCMPR(cmpr,scr1,scr2,label)	\
202	ba,a	.bb_errata_1.label		;\
203	.align	64				;\
204.bb_errata_1.label:				;\
205	wr	cmpr, TICK_COMPARE		;\
206	rd	TICK_COMPARE, %g0
207#else	/* BB_ERRATA_1 */
208#define	WR_TICKCMPR(in,scr1,scr2,label)		\
209	wr	in, TICK_COMPARE
210#endif	/* BB_ERRATA_1 */
211
212#endif	/* !CHEETAH && !HUMMINGBIRD */
213
214#include <sys/clock.h>
215
216#if defined(lint)
217#include <sys/types.h>
218#include <sys/scb.h>
219#include <sys/systm.h>
220#include <sys/regset.h>
221#include <sys/sunddi.h>
222#include <sys/lockstat.h>
223#endif	/* lint */
224
225
226#include <sys/asm_linkage.h>
227#include <sys/privregs.h>
228#include <sys/machparam.h>	/* To get SYSBASE and PAGESIZE */
229#include <sys/machthread.h>
230#include <sys/clock.h>
231#include <sys/intreg.h>
232#include <sys/psr_compat.h>
233#include <sys/isa_defs.h>
234#include <sys/dditypes.h>
235#include <sys/intr.h>
236
237#if !defined(lint)
238#include "assym.h"
239#endif	/* !lint */
240
241#if defined(lint)
242
243uint_t
244get_impl(void)
245{ return (0); }
246
247#else	/* lint */
248
/*
 * uint_t get_impl(void)
 * Leaf routine: returns in %o0 the value produced by GET_CPU_IMPL().
 */
249	ENTRY(get_impl)
250	GET_CPU_IMPL(%o0)
251	retl
252	nop
253	SET_SIZE(get_impl)
254
255#endif	/* lint */
256
257#if defined(lint)
258/*
259 * Softint generated when counter field of tick reg matches value field
260 * of tick_cmpr reg
261 */
262/*ARGSUSED*/
263void
264tickcmpr_set(uint64_t clock_cycles)
265{}
266
267#else	/* lint */
268
/*
 * void tickcmpr_set(uint64_t clock_cycles)
 *
 * Writes clock_cycles to the tick-compare register, then reads the native
 * time back (with bit 63 shifted out) and checks that the value written is
 * strictly greater (signed 64-bit compare).  If it is not, the step in %o3
 * is doubled and "time just read + step" is written instead; this repeats
 * until the written value is in the future.  The step starts at 8 and is
 * doubled before its first use, so the first retry adds 16.
 *
 * Registers: %o0 = current time, %o2 = value being programmed,
 * %o3 = step, %o4/%o5 = scratch for the timer macros.
 */
269	ENTRY_NP(tickcmpr_set)
270	! get 64-bit clock_cycles interval
271	mov	%o0, %o2
272	mov	8, %o3			! A reasonable initial step size
2731:
274	WR_TICKCMPR(%o2,%o4,%o5,__LINE__)	! Write to TICK_CMPR
275
276	GET_NATIVE_TIME(%o0, %o4, %o5)	! Read %tick to confirm the
277	sllx	%o0, 1, %o0		!   value we wrote was in the future.
278	srlx	%o0, 1, %o0
279
280	cmp	%o2, %o0		! If the value we wrote was in the
281	bg,pt	%xcc, 2f		!   future, then blow out of here.
282	sllx	%o3, 1, %o3		! If not, then double our step size,
283	ba,pt	%xcc, 1b		!   and take another lap.
284	add	%o0, %o3, %o2		!
2852:
286	retl
287	nop
288	SET_SIZE(tickcmpr_set)
289
290#endif	/* lint */
291
292#if defined(lint)
293
294void
295tickcmpr_disable(void)
296{}
297
298#else	/* lint */
299
/*
 * void tickcmpr_disable(void)
 * Writes the value (1 << TICKINT_DIS_SHFT), i.e. only the disable bit set,
 * to the tick-compare register.  Clobbers %g1, %o0 and the macro scratch
 * registers %o4/%o5.
 */
300	ENTRY_NP(tickcmpr_disable)
301	mov	1, %g1
302	sllx	%g1, TICKINT_DIS_SHFT, %o0
303	WR_TICKCMPR(%o0,%o4,%o5,__LINE__)	! Write to TICK_CMPR
304	retl
305	nop
306	SET_SIZE(tickcmpr_disable)
307
308#endif	/* lint */
309
310#if defined(lint)
311
312/*
313 * tick_write_delta() increments %tick by the specified delta.  This should
314 * only be called after a CPR event to assure that gethrtime() continues to
315 * increase monotonically.  Obviously, writing %tick needs to be done very
316 * carefully to avoid introducing unnecessary %tick skew across CPUs.  For
317 * this reason, we make sure we're i-cache hot before actually writing to
318 * %tick.
319 */
320/*ARGSUSED*/
321void
322tick_write_delta(uint64_t delta)
323{}
324
325#else	/* lint */
326
327#ifdef DEBUG
328	.seg	".text"
329tick_write_panic:
330	.asciz	"tick_write_delta: interrupts already disabled on entry"
331#endif	/* DEBUG */
332
/*
 * void tick_write_delta(uint64_t delta)
 *
 * Saves %pstate in %g1, toggles PSTATE_IE off (interrupts are expected to be
 * enabled on entry; DEBUG kernels panic otherwise), branches to a 16-byte
 * aligned instruction and then applies DELTA_NATIVE_TIME with delta in %o2.
 * The saved %pstate is restored in the delay slot of the return.
 *
 * Registers: %g1 = saved %pstate, %o2 = delta, %o3/%o4/%o5/%g2 = scratch
 * for DELTA_NATIVE_TIME.
 */
333	ENTRY_NP(tick_write_delta)
334	rdpr	%pstate, %g1
335#ifdef DEBUG
336	andcc	%g1, PSTATE_IE, %g0	! If DEBUG, check that interrupts
337	bnz	0f			! aren't already disabled.
338	sethi	%hi(tick_write_panic), %o1
339        save    %sp, -SA(MINFRAME), %sp ! get a new window to preserve caller
340	call	panic
341	or	%i1, %lo(tick_write_panic), %o0	! the %o1 set above is %i1 after the save
342#endif	/* DEBUG */
3430:	wrpr	%g1, PSTATE_IE, %pstate	! Disable interrupts
344	mov	%o0, %o2
345	ba	0f			! Branch to cache line-aligned instr.
346	nop
347	.align	16
3480:	nop				! The next 3 instructions are now hot.
349	DELTA_NATIVE_TIME(%o2, %o3, %o4, %o5, %g2)	! read/inc/write %tick
350
351	retl				! Return
352	wrpr	%g0, %g1, %pstate	!     delay: Re-enable interrupts
	SET_SIZE(tick_write_delta)
353#endif	/* lint */
354
355#if defined(lint)
356/*
357 *  return 1 if disabled
358 */
359
360int
361tickcmpr_disabled(void)
362{ return (0); }
363
364#else	/* lint */
365
/*
 * int tickcmpr_disabled(void)
 * Reads the tick-compare register into %g1 and returns it shifted right by
 * TICKINT_DIS_SHFT (computed in the delay slot of the return), so the
 * result is non-zero exactly when the disable bit (or anything above it)
 * is set.  %o0 doubles as the scratch register for RD_TICKCMPR.
 */
366	ENTRY_NP(tickcmpr_disabled)
367	RD_TICKCMPR(%g1, %o0)
368	retl
369	srlx	%g1, TICKINT_DIS_SHFT, %o0
370	SET_SIZE(tickcmpr_disabled)
371
372#endif	/* lint */
373
374/*
375 * Get current tick
 *
 * gettick() and randtick() are two names for the same entry point: both
 * return the raw GET_NATIVE_TIME() value in %o0 (no bits are masked off).
 * %o2 and %o3 are used as scratch.
376 */
377#if defined(lint)
378
379u_longlong_t
380gettick(void)
381{ return (0); }
382
383u_longlong_t
384randtick(void)
385{ return (0); }
386
387#else	/* lint */
388
389	ENTRY(gettick)
390	ALTENTRY(randtick)
391	GET_NATIVE_TIME(%o0, %o2, %o3)
392	retl
393	nop
394	SET_SIZE(randtick)
395	SET_SIZE(gettick)
396
397#endif	/* lint */
398
399
400/*
401 * Return the counter portion of the tick register.
 *
 * Reads %tick directly with rdpr (not through GET_NATIVE_TIME) and clears
 * bit 63 by shifting left and then right by one.
402 */
403
404#if defined(lint)
405
406uint64_t
407gettick_counter(void)
408{ return(0); }
409
410#else	/* lint */
411
412	ENTRY_NP(gettick_counter)
413	rdpr	%tick, %o0
414	sllx	%o0, 1, %o0
415	retl
416	srlx	%o0, 1, %o0		! shake off npt bit
417	SET_SIZE(gettick_counter)
418#endif	/* lint */
419
420/*
421 * Provide a C callable interface to the trap that reads the hi-res timer.
422 * Returns 64-bit nanosecond timestamp in %o0 and %o1.
423 */
424
425#if defined(lint)
426
/*
 * Lint-only C stubs for the assembly routines defined in the !lint half of
 * this conditional.  They exist to give lint a C definition of each routine;
 * the bodies just return zero or zero their output argument.
 */
427hrtime_t
428gethrtime(void)
429{
430	return ((hrtime_t)0);
431}
432
433hrtime_t
434gethrtime_unscaled(void)
435{
436	return ((hrtime_t)0);
437}
438
439hrtime_t
440gethrtime_max(void)
441{
442	return ((hrtime_t)0);
443}
444
445void
446scalehrtime(hrtime_t *hrt)
447{
448	*hrt = 0;
449}
450
451void
452gethrestime(timespec_t *tp)
453{
454	tp->tv_sec = 0;
455	tp->tv_nsec = 0;
456}
457
458time_t
459gethrestime_sec(void)
460{
461	return (0);
462}
463
464void
465gethrestime_lasttick(timespec_t *tp)
466{
467	tp->tv_sec = 0;
468	tp->tv_nsec = 0;
469}
470
471/*ARGSUSED*/
472void
473hres_tick(void)
474{
475}
476
477void
478panic_hres_tick(void)
479{
480}
481
482#else	/* lint */
483
/*
 * hrtime_t gethrtime(void)
 * Leaf routine.  GET_HRTIME() leaves the result in %g1 and uses %o0-%o5 and
 * %g2 as scratch; the value is returned as one 64-bit quantity in %o0
 * (copied in the delay slot of the return).
 */
484	ENTRY_NP(gethrtime)
485	GET_HRTIME(%g1, %o0, %o1, %o2, %o3, %o4, %o5, %g2)
486							! %g1 = hrtime
487	retl
488	mov	%g1, %o0
489	SET_SIZE(gethrtime)
490
/*
 * hrtime_t gethrtime_unscaled(void)
 * Returns the raw GET_NATIVE_TIME() value in %o0, with no conversion to
 * nanoseconds.  %g1, %o2 and %o3 are clobbered.
 */
491	ENTRY_NP(gethrtime_unscaled)
492	GET_NATIVE_TIME(%g1, %o2, %o3)			! %g1 = native time
493	retl
494	mov	%g1, %o0
495	SET_SIZE(gethrtime_unscaled)
496
/*
 * gethrtime_waitfree() / dtrace_gethrtime(): two names for one entry point.
 * Reads the native time and converts it with NATIVE_TIME_TO_NSEC(); unlike
 * gethrtime() it does not go through GET_HRTIME().  The result is returned
 * in %o0; %g1, %o2 and %o3 are clobbered.
 */
497	ENTRY_NP(gethrtime_waitfree)
498	ALTENTRY(dtrace_gethrtime)
499	GET_NATIVE_TIME(%g1, %o2, %o3)			! %g1 = native time
500	NATIVE_TIME_TO_NSEC(%g1, %o2, %o3)
501	retl
502	mov	%g1, %o0
503	SET_SIZE(dtrace_gethrtime)
504	SET_SIZE(gethrtime_waitfree)
505
/*
 * hrtime_t gethrtime_max(void)
 * Converts NATIVE_TIME_MAX() to nanoseconds.  If the converted value is
 * negative when viewed as a signed 64-bit number, it is replaced by
 * (-1 >> 1), the largest positive signed 64-bit value; the replacement sits
 * in an annulled delay slot, so it only executes when the branch is taken.
 */
506	ENTRY(gethrtime_max)
507	NATIVE_TIME_MAX(%g1)
508	NATIVE_TIME_TO_NSEC(%g1, %o0, %o1)
509
510	! hrtime_t's are signed, max hrtime_t must be positive
511	mov	-1, %o2
512	brlz,a	%g1, 1f
513	srlx	%o2, 1, %g1
5141:
515	retl
516	mov	%g1, %o0
517	SET_SIZE(gethrtime_max)
518
/*
 * void scalehrtime(hrtime_t *hrt)
 * Loads the 64-bit value at *hrt, converts it in place with
 * NATIVE_TIME_TO_NSEC() and stores it back (in the delay slot of the
 * return).  %o1-%o3 are clobbered.
 */
519	ENTRY(scalehrtime)
520	ldx	[%o0], %o1
521	NATIVE_TIME_TO_NSEC(%o1, %o2, %o3)
522	retl
523	stx	%o1, [%o0]
524	SET_SIZE(scalehrtime)
525
526/*
527 * Fast trap to return a timestamp, uses trap window, leaves traps
528 * disabled.  Returns a 64-bit nanosecond timestamp in %o0 and %o1.
529 *
530 * This is the handler for the ST_GETHRTIME trap.
 *
 * %o0 receives the upper 32 bits and %o1 the lower 32 bits of the value.
 * %g1-%g5 and %o2 are clobbered by GET_HRTIME().  The handler leaves via
 * FAST_TRAP_DONE rather than a plain "done" (see the warning at the top of
 * this file).
531 */
532
533	ENTRY_NP(get_timestamp)
534	GET_HRTIME(%g1, %g2, %g3, %g4, %g5, %o0, %o1, %o2)	! %g1 = hrtime
535	srlx	%g1, 32, %o0				! %o0 = hi32(%g1)
536	srl	%g1, 0, %o1				! %o1 = lo32(%g1)
537	FAST_TRAP_DONE
538	SET_SIZE(get_timestamp)
539
540/*
541 * Macro to convert GET_HRESTIME() bits into a timestamp.
542 *
543 * We use two separate macros so that the platform-dependent GET_HRESTIME()
544 * can be as small as possible; CONV_HRESTIME() implements the generic part.
 *
 * Arguments (all registers):
 *	hrestsec   in/out  seconds
 *	hrestnsec  in/out  nanoseconds
 *	adj        in      hrestime_adj; clobbered
 *	nslt       in      value added to hrestnsec (presumably nanoseconds
 *	                   since the last tick); clobbered (shifted right by
 *	                   ADJ_SHIFT on the non-zero adj paths)
 *	nano       in      the value compared against for carry into seconds
 *	                   (NANOSEC, per the comments below)
 *
 * The adjustment applied is adj limited in magnitude to nslt >> ADJ_SHIFT.
 * Afterwards hrestnsec is reduced below nano, carrying into hrestsec.
 * Uses the numeric local labels 2, 3 and 4 and modifies the condition
 * codes.
545 */
546#define	CONV_HRESTIME(hrestsec, hrestnsec, adj, nslt, nano) \
547	brz,pt	adj, 3f;		/* no adjustments, it's easy */	\
548	add	hrestnsec, nslt, hrestnsec; /* hrest.tv_nsec += nslt */	\
549	brlz,pn	adj, 2f;		/* if hrestime_adj negative */	\
550	srlx	nslt, ADJ_SHIFT, nslt;	/* delay: nslt >>= 4 */		\
551	subcc	adj, nslt, %g0;		/* hrestime_adj - nslt/16 */	\
552	movg	%xcc, nslt, adj;	/* adj by min(adj, nslt/16) */	\
553	ba	3f;			/* go convert to sec/nsec */	\
554	add	hrestnsec, adj, hrestnsec; /* delay: apply adjustment */ \
5552:	addcc	adj, nslt, %g0;		/* hrestime_adj + nslt/16 */	\
556	bge,a,pt %xcc, 3f;		/* is adj less negative? */	\
557	add	hrestnsec, adj, hrestnsec; /* yes: hrest.nsec += adj */	\
558	sub	hrestnsec, nslt, hrestnsec; /* no: hrest.nsec -= nslt/16 */ \
5593:	cmp	hrestnsec, nano;	/* more than a billion? */	\
560	bl,pt	%xcc, 4f;		/* if not, we're done */	\
561	nop;				/* delay: do nothing :( */	\
562	add	hrestsec, 1, hrestsec;	/* hrest.tv_sec++; */		\
563	sub	hrestnsec, nano, hrestnsec; /* hrest.tv_nsec -= NANOSEC; */ \
564	ba,a	3b;			/* check >= billion again */	\
5654:
566
/*
 * void gethrestime(timespec_t *tp)
 * Computes the current time into %o1 (seconds) and %o2 (nanoseconds) via
 * GET_HRESTIME()/CONV_HRESTIME() and stores them at [tp] and
 * [tp + CLONGSIZE]; the second store is in the delay slot of the return.
 * %o3-%o5 and %g1-%g4 are scratch.
 */
567	ENTRY_NP(gethrestime)
568	GET_HRESTIME(%o1, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
569	CONV_HRESTIME(%o1, %o2, %o3, %o4, %o5)
570	stn	%o1, [%o0]
571	retl
572	stn	%o2, [%o0 + CLONGSIZE]
573	SET_SIZE(gethrestime)
574
575/*
576 * Similar to gethrestime(), but gethrestime_sec() returns current hrestime
577 * seconds.
 *
 * The seconds value is produced directly in %o0; the nanoseconds value in
 * %o2 is computed (it is needed for the carry into seconds) and discarded.
578 */
579	ENTRY_NP(gethrestime_sec)
580	GET_HRESTIME(%o0, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
581	CONV_HRESTIME(%o0, %o2, %o3, %o4, %o5)
582	retl					! %o0 current hrestime seconds
583	nop
584	SET_SIZE(gethrestime_sec)
585
586/*
587 * Returns the hrestime on the last tick.  This is simpler than gethrestime()
588 * and gethrestime_sec():  no conversion is required.  gethrestime_lasttick()
589 * follows the same locking algorithm as GET_HRESTIME and GET_HRTIME,
590 * outlined in detail in clock.h.  (Unlike GET_HRESTIME/GET_HRTIME, we don't
591 * rely on load dependencies to effect the membar #LoadLoad, instead declaring
592 * it explicitly.)
593 */
/*
 * void gethrestime_lasttick(timespec_t *tp)
 * Copies hrestime (seconds, nanoseconds) to *tp.  The copy is retried until
 * hres_lock read after the loads equals the value read before them with its
 * low bit cleared.  The store of the seconds is in a non-annulled delay
 * slot, so it is also performed on passes that are retried; a later pass
 * overwrites it.
 */
594	ENTRY_NP(gethrestime_lasttick)
595	sethi	%hi(hres_lock), %o1		! high bits shared by hres_lock and hrestime
5960:
597	lduw	[%o1 + %lo(hres_lock)], %o2	! Load lock value
598	membar	#LoadLoad			! Load of lock must complete
599	andn	%o2, 1, %o2			! Mask off lowest bit
600	ldn	[%o1 + %lo(hrestime)], %g1	! Seconds.
601	add	%o1, %lo(hrestime), %o4		! %o4 = address of hrestime
602	ldn	[%o4 + CLONGSIZE], %g2		! Nanoseconds.
603	membar	#LoadLoad			! All loads must complete
604	lduw	[%o1 + %lo(hres_lock)], %o3	! Reload lock value
605	cmp	%o3, %o2			! If lock is locked or has
606	bne	0b				!   changed, retry.
607	stn	%g1, [%o0]			! Delay: store seconds
608	retl
609	stn	%g2, [%o0 + CLONGSIZE]		! Delay: store nanoseconds
610	SET_SIZE(gethrestime_lasttick)
611
612/*
613 * Fast trap for gettimeofday().  Returns a timestruc_t in %o0 and %o1.
614 *
615 * This is the handler for the ST_GETHRESTIME trap.
 *
 * %o0 = seconds and %o1 = nanoseconds after CONV_HRESTIME().  %g1-%g5, %o2
 * and %o3 are scratch.  The handler leaves via FAST_TRAP_DONE.
616 */
617
618	ENTRY_NP(get_hrestime)
619	GET_HRESTIME(%o0, %o1, %g1, %g2, %g3, %g4, %g5, %o2, %o3)
620	CONV_HRESTIME(%o0, %o1, %g1, %g2, %g3)
621	FAST_TRAP_DONE
622	SET_SIZE(get_hrestime)
623
624/*
625 * Fast trap to return lwp virtual time, uses trap window, leaves traps
626 * disabled.  Returns a 64-bit number in %o0:%o1, which is the number
627 * of nanoseconds consumed.
628 *
629 * This is the handler for the ST_GETHRVTIME trap.
630 *
 * The value is (native time now - ms_state_start) + ms_acct[LMS_USER],
 * converted to nanoseconds.
 *
631 * Register usage:
632 *	%o0, %o1 = return lwp virtual time (hi32, lo32); %o0 is also
 *		   scratch for NATIVE_TIME_TO_NSEC
633 * 	%g2 = CPU/thread
634 * 	%g3 = lwp (and scratch for CPU_ADDR)
635 * 	%g1 = scratch
636 * 	%g5 = native time, then the running result
637 */
638	ENTRY_NP(get_virtime)
639	GET_NATIVE_TIME(%g5, %g1, %g2)	! %g5 = native time in ticks
640	CPU_ADDR(%g2, %g3)			! CPU struct ptr to %g2
641	ldn	[%g2 + CPU_THREAD], %g2		! thread pointer to %g2
642	ldn	[%g2 + T_LWP], %g3		! lwp pointer to %g3
643
644	/*
645	 * Subtract start time of current microstate from time
646	 * of day to get increment for lwp virtual time.
647	 */
648	ldx	[%g3 + LWP_STATE_START], %g1	! ms_state_start
649	sub	%g5, %g1, %g5
650
651	/*
652	 * Add current value of ms_acct[LMS_USER]
653	 */
654	ldx	[%g3 + LWP_ACCT_USER], %g1	! ms_acct[LMS_USER]
655	add	%g5, %g1, %g5
656	NATIVE_TIME_TO_NSEC(%g5, %g1, %o0)
657
658	srl	%g5, 0, %o1			! %o1 = lo32(%g5)
659	srlx	%g5, 32, %o0			! %o0 = hi32(%g5)
660
661	FAST_TRAP_DONE
662	SET_SIZE(get_virtime)
663
664
665
666	.seg	".text"
667hrtime_base_panic:
668	.asciz	"hrtime_base stepping back"
669
670
/*
 * void hres_tick(void)
 *
 * Advances the shared time variables.  In order:
 *   1. Spin until the hres_lock byte at HRES_LOCK_OFFSET is acquired with
 *      ldstub (while it is held, poll with plain loads before retrying).
 *   2. Read the native time, record it in hres_last_tick, and scale it to
 *      nanoseconds with nsec_scale.
 *   3. If the stored hrtime_base is greater than the new value, release the
 *      lock and panic ("hrtime_base stepping back"); otherwise store the new
 *      hrtime_base.
 *   4. Compute adj: zero if hrestime_adj is zero, otherwise hrestime_adj
 *      limited in magnitude to (nsec delta >> ADJ_SHIFT).
 *   5. timedelta -= adj; hrestime_adj = timedelta.
 *   6. hrestime.tv_nsec += adj + nsec delta; if that reaches NANOSEC, carry
 *      one second into tv_sec and set one_sec to 1.
 *   7. Release the lock by incrementing the hres_lock word.
 *      NOTE(review): presumably the increment rolls the 0xff byte written
 *      by ldstub over to zero while bumping the rest of the word -- confirm
 *      against HRES_LOCK_OFFSET and the description in clock.h.
 *
 * Every shared variable is addressed as %l4 + %lo(var), where %l4 holds
 * %hi(hrestime); this depends on the variables sharing their high address
 * bits.  The numeric label 9 is used twice (spin loop and panic path).
 */
671	ENTRY_NP(hres_tick)
672	save	%sp, -SA(MINFRAME), %sp	! get a new window
673
674	sethi	%hi(hrestime), %l4
675	ldstub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5	! try locking
6767:	tst	%l5
677	bz,pt	%xcc, 8f			! if we got it, drive on
678	ld	[%l4 + %lo(nsec_scale)], %l5	! delay: %l5 = scaling factor
679	ldub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
6809:	tst	%l5
681	bz,a,pn	%xcc, 7b
682	ldstub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
683	ba,pt	%xcc, 9b
684	ldub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
6858:
686	membar	#StoreLoad|#StoreStore
687
688	!
689	! update hres_last_tick.  %l5 has the scaling factor (nsec_scale).
690	!
691	ldx	[%l4 + %lo(hrtime_base)], %g1	! load current hrtime_base
692	GET_NATIVE_TIME(%l0, %l3, %l6)		! current native time
693	stx	%l0, [%l4 + %lo(hres_last_tick)]! prev = current
694	! convert native time to nsecs
695	NATIVE_TIME_TO_NSEC_SCALE(%l0, %l5, %l2, NSEC_SHIFT)
696
697	sub	%l0, %g1, %i1			! get accurate nsec delta
698
699	ldx	[%l4 + %lo(hrtime_base)], %l1
700	cmp	%l1, %l0
701	bg,pn	%xcc, 9f
702	nop
703
704	stx	%l0, [%l4 + %lo(hrtime_base)]	! update hrtime_base
705
706	!
707	! apply adjustment, if any
708	!
709	ldx	[%l4 + %lo(hrestime_adj)], %l0	! %l0 = hrestime_adj
710	brz	%l0, 2f
711						! hrestime_adj == 0 ?
712						! yes, skip adjustments
713	clr	%l5				! delay: set adj to zero
714	tst	%l0				! is hrestime_adj >= 0 ?
715	bge,pt	%xcc, 1f			! yes, go handle positive case
716	srl	%i1, ADJ_SHIFT, %l5		! delay: %l5 = adj
717
718	addcc	%l0, %l5, %g0			! hrestime_adj < -adj ?
719	bl,pt	%xcc, 2f			! yes, use current adj
720	neg	%l5				! delay: %l5 = -adj
721	ba,pt	%xcc, 2f
722	mov	%l0, %l5			! no, so set adj = hrestime_adj
7231:
724	subcc	%l0, %l5, %g0			! hrestime_adj < adj ?
725	bl,a,pt	%xcc, 2f			! yes, set adj = hrestime_adj
726	mov	%l0, %l5			! delay: adj = hrestime_adj
7272:
728	ldx	[%l4 + %lo(timedelta)], %l0	! %l0 = timedelta
729	sub	%l0, %l5, %l0			! timedelta -= adj
730
731	stx	%l0, [%l4 + %lo(timedelta)]	! store new timedelta
732	stx	%l0, [%l4 + %lo(hrestime_adj)]	! hrestime_adj = timedelta
733
734	or	%l4, %lo(hrestime), %l2
735	ldn	[%l2], %i2			! %i2:%i3 = hrestime sec:nsec
736	ldn	[%l2 + CLONGSIZE], %i3
737	add	%i3, %l5, %i3			! hrestime.nsec += adj
738	add	%i3, %i1, %i3			! hrestime.nsec += nslt
739
740	set	NANOSEC, %l5			! %l5 = NANOSEC
741	cmp	%i3, %l5
742	bl,pt	%xcc, 5f			! if hrestime.tv_nsec < NANOSEC
743	sethi	%hi(one_sec), %i1		! delay
744	add	%i2, 0x1, %i2			! hrestime.tv_sec++
745	sub	%i3, %l5, %i3			! hrestime.tv_nsec - NANOSEC
746	mov	0x1, %l5
747	st	%l5, [%i1 + %lo(one_sec)]
7485:
749	stn	%i2, [%l2]
750	stn	%i3, [%l2 + CLONGSIZE]		! store the new hrestime
751
752	membar	#StoreStore
753
754	ld	[%l4 + %lo(hres_lock)], %i1
755	inc	%i1				! release lock
756	st	%i1, [%l4 + %lo(hres_lock)]	! clear hres_lock
757
758	ret
759	restore
760
7619:
762	!
763	! release hres_lock
764	!
765	ld	[%l4 + %lo(hres_lock)], %i1
766	inc	%i1
767	st	%i1, [%l4 + %lo(hres_lock)]
768
769	sethi	%hi(hrtime_base_panic), %o0
770	call	panic
771	or	%o0, %lo(hrtime_base_panic), %o0
772
773	SET_SIZE(hres_tick)
774
775#endif	/* lint */
776
777#if !defined(lint) && !defined(__lint)
778
779	.seg	".text"
780kstat_q_panic_msg:
781	.asciz	"kstat_q_exit: qlen == 0"
782
/*
 * kstat_q_panic: branch target used by the DEBUG forms of the kstat queue
 * exit routines when the old queue length is zero.  Takes a new register
 * window and calls panic() with kstat_q_panic_msg.
 */
783	ENTRY(kstat_q_panic)
784	save	%sp, -SA(MINFRAME), %sp
785	sethi	%hi(kstat_q_panic_msg), %o0
786	call	panic
787	or	%o0, %lo(kstat_q_panic_msg), %o0
788	/*NOTREACHED*/
789	SET_SIZE(kstat_q_panic)
790
/* Branch-on-zero mnemonics, passed as the QBR argument of KSTAT_Q_UPDATE. */
791#define	BRZPN	brz,pn
792#define	BRZPT	brz,pt
793
/*
 * KSTAT_Q_UPDATE(QOP, QBR, QZERO, QRETURN, QTYPE)
 *
 * Updates one queue's counters in the structure addressed by %o0, using the
 * timestamp in %g1.  %o1-%o5 are clobbered.
 *
 *	QOP	"add" or "sub": how the queue length changes (by 1)
 *	QBR	branch mnemonic taken when the OLD queue length is zero
 *	QZERO	target of that branch
 *	QRETURN	text emitted immediately before the final store: a label
 *		and/or "retl" (with retl, the final store is its delay slot)
 *	QTYPE	prefix pasted onto CNT, LASTUPDATE, TIME and LENTIME to form
 *		the field-offset symbols
 *
 * The new length is stored in the delay slot of QBR, so it is written even
 * when the branch is taken.  When the old length is non-zero, the elapsed
 * time since LASTUPDATE is added to TIME and (old length * elapsed) to
 * LENTIME.  LASTUPDATE is then set to %g1.
 */
794#define	KSTAT_Q_UPDATE(QOP, QBR, QZERO, QRETURN, QTYPE) \
795	ld	[%o0 + QTYPE/**/CNT], %o1;	/* %o1 = old qlen */	\
796	QOP	%o1, 1, %o2;			/* %o2 = new qlen */	\
797	QBR	%o1, QZERO;			/* done if qlen == 0 */	\
798	st	%o2, [%o0 + QTYPE/**/CNT];	/* delay: save qlen */	\
799	ldx	[%o0 + QTYPE/**/LASTUPDATE], %o3;			\
800	ldx	[%o0 + QTYPE/**/TIME], %o4;	/* %o4 = old time */	\
801	ldx	[%o0 + QTYPE/**/LENTIME], %o5;	/* %o5 = old lentime */	\
802	sub	%g1, %o3, %o2;			/* %o2 = time delta */	\
803	mulx	%o1, %o2, %o3;			/* %o3 = cur lentime */	\
804	add	%o4, %o2, %o4;			/* %o4 = new time */	\
805	add	%o5, %o3, %o5;			/* %o5 = new lentime */	\
806	stx	%o4, [%o0 + QTYPE/**/TIME];	/* save time */		\
807	stx	%o5, [%o0 + QTYPE/**/LENTIME];	/* save lentime */	\
808QRETURN;								\
809	stx	%g1, [%o0 + QTYPE/**/LASTUPDATE]; /* lastupdate = now */
810
811#if !defined(DEBUG)
812/*
813 * same as KSTAT_Q_UPDATE but without:
814 * QBR     %o1, QZERO;
815 * to be used only with non-debug build. mimics ASSERT() behaviour.
 *
 * Because there is no zero check, TIME and LENTIME are always updated.
 * Arguments QOP, QRETURN and QTYPE have the same meaning as in
 * KSTAT_Q_UPDATE; %o0 = structure, %g1 = timestamp, %o1-%o5 clobbered.
816 */
817#define	KSTAT_Q_UPDATE_ND(QOP, QRETURN, QTYPE) \
818	ld	[%o0 + QTYPE/**/CNT], %o1;	/* %o1 = old qlen */	\
819	QOP	%o1, 1, %o2;			/* %o2 = new qlen */	\
820	st	%o2, [%o0 + QTYPE/**/CNT];	/* save qlen (not a delay slot here) */	\
821	ldx	[%o0 + QTYPE/**/LASTUPDATE], %o3;			\
822	ldx	[%o0 + QTYPE/**/TIME], %o4;	/* %o4 = old time */	\
823	ldx	[%o0 + QTYPE/**/LENTIME], %o5;	/* %o5 = old lentime */	\
824	sub	%g1, %o3, %o2;			/* %o2 = time delta */	\
825	mulx	%o1, %o2, %o3;			/* %o3 = cur lentime */	\
826	add	%o4, %o2, %o4;			/* %o4 = new time */	\
827	add	%o5, %o3, %o5;			/* %o5 = new lentime */	\
828	stx	%o4, [%o0 + QTYPE/**/TIME];	/* save time */		\
829	stx	%o5, [%o0 + QTYPE/**/LENTIME];	/* save lentime */	\
830QRETURN;								\
831	stx	%g1, [%o0 + QTYPE/**/LASTUPDATE]; /* lastupdate = now */
832#endif
833
/*
 * kstat_waitq_enter: %o0 = structure holding the KSTAT_IO_W* fields.
 * Increments the wait-queue length.  If the old length was zero, the
 * time/lentime accumulation is skipped (branch to the local label 1);
 * either way lastupdate is set to the native time read on entry, in the
 * delay slot of the return.
 */
834	.align 16
835	ENTRY(kstat_waitq_enter)
836	GET_NATIVE_TIME(%g1, %g2, %g3)
837	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
838	SET_SIZE(kstat_waitq_enter)
839
/*
 * kstat_waitq_exit: %o0 = structure holding the KSTAT_IO_W* fields.
 * Decrements the wait-queue length and accumulates time/lentime.  DEBUG
 * builds branch to kstat_q_panic when the old length is zero; non-DEBUG
 * builds perform no such check.
 */
840	.align 16
841	ENTRY(kstat_waitq_exit)
842	GET_NATIVE_TIME(%g1, %g2, %g3)
843#if defined(DEBUG)
844	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_W)
845#else
846	KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_W)
847#endif
848	SET_SIZE(kstat_waitq_exit)
849
/*
 * kstat_runq_enter: same as kstat_waitq_enter but operating on the
 * KSTAT_IO_R* (run queue) fields of the structure in %o0.
 */
850	.align 16
851	ENTRY(kstat_runq_enter)
852	GET_NATIVE_TIME(%g1, %g2, %g3)
853	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
854	SET_SIZE(kstat_runq_enter)
855
/*
 * kstat_runq_exit: same as kstat_waitq_exit but operating on the
 * KSTAT_IO_R* (run queue) fields of the structure in %o0.
 */
856	.align 16
857	ENTRY(kstat_runq_exit)
858	GET_NATIVE_TIME(%g1, %g2, %g3)
859#if defined(DEBUG)
860	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_R)
861#else
862	KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_R)
863#endif
864	SET_SIZE(kstat_runq_exit)
865
/*
 * kstat_waitq_to_runq: a wait-queue exit followed by a run-queue enter on
 * the structure in %o0, both using the single timestamp read on entry.
 * The first expansion passes "1:" as QRETURN, so it falls through into the
 * second instead of returning.
 */
866	.align 16
867	ENTRY(kstat_waitq_to_runq)
868	GET_NATIVE_TIME(%g1, %g2, %g3)
869#if defined(DEBUG)
870	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_W)
871#else
872	KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_W)
873#endif
874	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
875	SET_SIZE(kstat_waitq_to_runq)
876
/*
 * kstat_runq_back_to_waitq: the reverse of kstat_waitq_to_runq -- a
 * run-queue exit followed by a wait-queue enter, sharing one timestamp.
 */
877	.align 16
878	ENTRY(kstat_runq_back_to_waitq)
879	GET_NATIVE_TIME(%g1, %g2, %g3)
880#if defined(DEBUG)
881	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_R)
882#else
883	KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_R)
884#endif
885	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
886	SET_SIZE(kstat_runq_back_to_waitq)
887
888#endif	/* !(lint || __lint) */
889
890#ifdef lint
891
/* Lint-only C declarations of the variables defined in assembly below. */
892int64_t timedelta;
893hrtime_t hres_last_tick;
894volatile timestruc_t hrestime;
895int64_t hrestime_adj;
896volatile int hres_lock;
897uint_t nsec_scale;
898hrtime_t hrtime_base;
899int traptrace_use_stick;
900
901#else	/* lint */
902	/*
903	 *  -- WARNING --
904	 *
905	 * The following variables MUST be together on a 128-byte boundary.
906	 * In addition to the primary performance motivation (having them all
907	 * on the same cache line(s)), code here and in the GET*TIME() macros
908	 * assumes that they all have the same high 22 address bits (so
909	 * there's only one sethi).
910	 */
911	.seg	".data"
912	.global	timedelta, hres_last_tick, hrestime, hrestime_adj
913	.global	hres_lock, nsec_scale, hrtime_base, traptrace_use_stick
914	.global	nsec_shift, adj_shift
915
916	/* XXX - above comment claims 128-bytes is necessary */
	/*
	 * NOTE(review): the directive below aligns to 64 bytes, not 128.
	 * Assuming .nword emits 8-byte words, timedelta through hrtime_base
	 * occupy the first 56 bytes, so they would still share their high
	 * address bits with 64-byte alignment, while adj_shift would land at
	 * offset 64 -- confirm before changing either the comment or the
	 * alignment.
	 */
917	.align	64
918timedelta:
919	.word	0, 0		/* int64_t */
920hres_last_tick:
921	.word	0, 0		/* hrtime_t */
922hrestime:
923	.nword	0, 0		/* 2 longs */
924hrestime_adj:
925	.word	0, 0		/* int64_t */
926hres_lock:
927	.word	0
928nsec_scale:
929	.word	0
930hrtime_base:
931	.word	0, 0
932traptrace_use_stick:
933	.word	0
934nsec_shift:
935	.word	NSEC_SHIFT
936adj_shift:
937	.word	ADJ_SHIFT
938
939#endif	/* lint */
940
941
942/*
943 * drv_usecwait(clock_t n)	[DDI/DKI - section 9F]
944 * usec_delay(int n)		[compatibility - should go one day]
945 * Delay by spinning.
946 *
947 * delay for n microseconds.  numbers <= 0 delay 1 usec
948 *
949 * With UltraSPARC-III the combination of supporting mixed-speed CPUs
950 * and variable clock rate for power management requires that we
951 * use %stick to implement this routine.
952 *
953 * For OPL platforms that support the "sleep" instruction, we
954 * conditionally (ifdef'ed) insert a "sleep" instruction in
955 * the loop. Note that theoretically we should have moved (duplicated)
956 * the code down to spitfire/us3/opl specific asm files - but this
957 * is a lot of code duplication just to add one "sleep" instruction.
958 * We chose less code duplication for this.
959 */
960
961#if defined(lint)
962
963/*ARGSUSED*/
964void
965drv_usecwait(clock_t n)
966{}
967
968/*ARGSUSED*/
969void
970usec_delay(int n)
971{}
972
973#else	/* lint */
974
/*
 * drv_usecwait(n) / usec_delay(n): two names for one entry point.
 *
 * n <= 0 is treated as 1.  The target time is
 * (native time now) + n * sticks_per_usec + 1.  The loop then re-reads the
 * native time until the target is (unsigned) less than the value read on
 * the PREVIOUS iteration: the cmp precedes the GET_NATIVE_TIME whose result
 * it will test next time round, which relies on GET_NATIVE_TIME leaving the
 * condition codes alone.
 *
 * Registers: %o1 = target time, %o2 = last time read, %o3/%o4 = scratch.
 */
975	ENTRY(drv_usecwait)
976	ALTENTRY(usec_delay)
977	brlez,a,pn %o0, 0f
978	mov	1, %o0
9790:
980	sethi	%hi(sticks_per_usec), %o1
981	lduw	[%o1 + %lo(sticks_per_usec)], %o1
982	mulx	%o1, %o0, %o1		! Scale usec to ticks
983	inc	%o1			! We don't start on a tick edge
984	GET_NATIVE_TIME(%o2, %o3, %o4)
985	add	%o1, %o2, %o1
986
9871:
988#ifdef	_OPL
989	.word 0x81b01060		! insert "sleep" instruction
990#endif /* _OPL */			! use byte code for now
991	cmp	%o1, %o2
992	GET_NATIVE_TIME(%o2, %o3, %o4)
993	bgeu,pt	%xcc, 1b
994	nop
995	retl
996	nop
997	SET_SIZE(usec_delay)
998	SET_SIZE(drv_usecwait)
999#endif	/* lint */
1000
1001#if defined(lint)
1002
/* Lint-only C stub for the assembly prologue defined below. */
1003/* ARGSUSED */
1004void
1005pil14_interrupt(int level)
1006{}
1007
1008#else	/* lint */
1009
1010/*
1011 * Level-14 interrupt prologue.
 *
 * Records in the CPU structure the interrupted PIL and the trapped PC:
 * if TSTATE_PRIV is set in %tstate the PC goes to CPU_PROFILE_PC and
 * CPU_PROFILE_UPC is zeroed, otherwise the PC goes to CPU_PROFILE_UPC and
 * CPU_PROFILE_PC is zeroed.  Control then continues at
 * pil_interrupt_common; the zeroing store is in that branch's delay slot.
 * Uses %g1, %g2, %g5 and %g6.
1012 */
1013	ENTRY_NP(pil14_interrupt)
1014	CPU_ADDR(%g1, %g2)
1015	rdpr	%pil, %g6			! %g6 = interrupted PIL
1016	stn	%g6, [%g1 + CPU_PROFILE_PIL]	! record interrupted PIL
1017	rdpr	%tstate, %g6
1018	rdpr	%tpc, %g5
1019	btst	TSTATE_PRIV, %g6		! trap from supervisor mode?
1020	bnz,a,pt %xcc, 1f
1021	stn	%g5, [%g1 + CPU_PROFILE_PC]	! if so, record kernel PC
1022	stn	%g5, [%g1 + CPU_PROFILE_UPC]	! if not, record user PC
1023	ba	pil_interrupt_common		! must be large-disp branch
1024	stn	%g0, [%g1 + CPU_PROFILE_PC]	! zero kernel PC
10251:	ba	pil_interrupt_common		! must be large-disp branch
1026	stn	%g0, [%g1 + CPU_PROFILE_UPC]	! zero user PC
1027	SET_SIZE(pil14_interrupt)
1028
/*
 * tick_rtt: every path ends by branching to current_thread_complete.
 *
 * If the compare register has its disable bit set, nothing else is done.
 * Otherwise, with PSTATE_IE cleared, the level-14 request (cbe_level14_inum
 * at PIL_14) is enqueued via intr_enqueue_req.  If neither TICK_INT_MASK nor
 * STICK_INT_MASK is pending in SOFTINT, interrupts are restored and we are
 * done.  If one is pending it is cleared, and the compare value is checked
 * against the current time; when it is not in the future, the register is
 * reprogrammed with "now + step", doubling the step (starting at 8) until a
 * future value sticks.
 *
 * Registers: %o5 = compare value, %g5 = saved %pstate, %o4 = SOFTINT and
 * later the step, %o0 = mask and later the current time, %g1/%g2 = scratch.
 */
1029	ENTRY_NP(tick_rtt)
1030	!
1031	! Load TICK_COMPARE into %o5; if bit 63 is set, then TICK_COMPARE is
1032	! disabled.  If TICK_COMPARE is enabled, we know that we need to
1033	! reenqueue the interrupt request structure.  We'll then check TICKINT
1034	! in SOFTINT; if it's set, then we know that we were in a TICK_COMPARE
1035	! interrupt.  In this case, TICK_COMPARE may have been rewritten
1036	! recently; we'll compare %o5 to the current time to verify that it's
1037	! in the future.
1038	!
1039	! Note that %o5 is live until after 1f.
1040	! XXX - there is a subroutine call while %o5 is live!
1041	!
1042	RD_TICKCMPR(%o5, %g1)
1043	srlx	%o5, TICKINT_DIS_SHFT, %g1
1044	brnz,pt	%g1, 2f
1045	nop
1046
1047	rdpr 	%pstate, %g5
1048	andn	%g5, PSTATE_IE, %g1
1049	wrpr	%g0, %g1, %pstate		! Disable vec interrupts
1050
1051	sethi	%hi(cbe_level14_inum), %o1
1052	ldx	[%o1 + %lo(cbe_level14_inum)], %o1
1053	call	intr_enqueue_req ! preserves %o5 and %g5
1054	mov	PIL_14, %o0
1055
1056	! Check SOFTINT for TICKINT/STICKINT
1057	rd	SOFTINT, %o4
1058	set	(TICK_INT_MASK | STICK_INT_MASK), %o0
1059	andcc	%o4, %o0, %g0
1060	bz,a,pn	%icc, 2f
1061	wrpr	%g0, %g5, %pstate		! Enable vec interrupts
1062
1063	! clear TICKINT/STICKINT
1064	wr	%o0, CLEAR_SOFTINT
1065
1066	!
1067	! Now that we've cleared TICKINT, we can reread %tick and confirm
1068	! that the value we programmed is still in the future.  If it isn't,
1069	! we need to reprogram TICK_COMPARE to fire as soon as possible.
1070	!
1071	GET_NATIVE_TIME(%o0, %g1, %g2)		! %o0 = tick
1072	sllx	%o0, 1, %o0			! Clear the DIS bit
1073	srlx	%o0, 1, %o0
1074	cmp	%o5, %o0			! In the future?
1075	bg,a,pt	%xcc, 2f			! Yes, drive on.
1076	wrpr	%g0, %g5, %pstate		!   delay: enable vec intr
1077
1078	!
1079	! If we're here, then we have programmed TICK_COMPARE with a %tick
1080	! which is in the past; we'll now load an initial step size, and loop
1081	! until we've managed to program TICK_COMPARE to fire in the future.
1082	!
1083	mov	8, %o4				! 8 = arbitrary initial step
10841:	add	%o0, %o4, %o5			! Add the step
1085	WR_TICKCMPR(%o5,%g1,%g2,__LINE__)	! Write to TICK_CMPR
1086	GET_NATIVE_TIME(%o0, %g1, %g2)		! %o0 = tick
1087	sllx	%o0, 1, %o0			! Clear the DIS bit
1088	srlx	%o0, 1, %o0
1089	cmp	%o5, %o0			! In the future?
1090	bg,a,pt	%xcc, 2f			! Yes, drive on.
1091	wrpr	%g0, %g5, %pstate		!    delay: enable vec intr
1092	ba	1b				! No, try again.
1093	sllx	%o4, 1, %o4			!    delay: double step size
1094
10952:	ba	current_thread_complete
1096	nop
1097	SET_SIZE(tick_rtt)
1098
1099#endif	/* lint */
1100
1101#if defined(lint)
1102
/*
 * Lint-only stub: an empty C definition of pil15_interrupt() that is
 * compiled only when lint is defined.  The real implementation is the
 * assembly routine in the #else branch.
 */
1103/* ARGSUSED */
1104void
1105pil15_interrupt(int level)
1106{}
1107
1108#else  /* lint */
1109
1110/*
1111 * Level-15 interrupt prologue.
 *
 * Reads the trap state (%tstate -> %g6) and the trapped PC (%tpc -> %g5)
 * and records the PC in exactly one of two fields of the structure
 * addressed by %g1, zeroing the other field:
 *
 *   TSTATE_PRIV set   (trap from supervisor mode):
 *	CPU_CPCPROFILE_PC  = %tpc,  CPU_CPCPROFILE_UPC = 0
 *   TSTATE_PRIV clear (trap from user mode):
 *	CPU_CPCPROFILE_UPC = %tpc,  CPU_CPCPROFILE_PC  = 0
 *
 * Either way control then transfers to pil15_epilogue.
 *
 * The bnz carries the annul bit (",a"), so the store in its delay slot
 * (the kernel-PC store) executes only when the branch is taken; on the
 * fall-through (user) path it is annulled.  Each "ba" performs the
 * zeroing store from its delay slot.
 *
 * NOTE(review): CPU_ADDR(%g1, %g2) is defined elsewhere; presumably it
 * leaves the current cpu pointer in %g1 and uses %g2 as scratch -- confirm.
1112 */
1113       ENTRY_NP(pil15_interrupt)
1114       CPU_ADDR(%g1, %g2)
1115       rdpr    %tstate, %g6
1116       rdpr    %tpc, %g5
1117       btst    TSTATE_PRIV, %g6                ! trap from supervisor mode?
1118       bnz,a,pt %xcc, 1f
1119       stn     %g5, [%g1 + CPU_CPCPROFILE_PC]  ! if so, record kernel PC
1120       stn     %g5, [%g1 + CPU_CPCPROFILE_UPC] ! if not, record user PC
1121       ba      pil15_epilogue                  ! must be large-disp branch
1122       stn     %g0, [%g1 + CPU_CPCPROFILE_PC]  ! zero kernel PC
11231:     ba      pil15_epilogue                  ! must be large-disp branch
1124       stn     %g0, [%g1 + CPU_CPCPROFILE_UPC] ! zero user PC
1125       SET_SIZE(pil15_interrupt)
1126
1127#endif /* lint */
1128
1129#if defined(lint) || defined(__lint)
1130
/*
 * Lint-only stub for find_cpufrequency(); always returns 0.  The real
 * implementation is the assembly routine in the #else branch.
 */
1131/* ARGSUSED */
1132uint64_t
1133find_cpufrequency(volatile uchar_t *clock_ptr)
1134{
1135	return (0);
1136}
1137
1138#else	/* lint */
1139
1140#ifdef DEBUG
1141	.seg	".text"
1142find_cpufreq_panic:
1143	.asciz	"find_cpufrequency: interrupts already disabled on entry"
1144#endif	/* DEBUG */
1145
/*
 * find_cpufrequency(volatile uchar_t *clock_ptr)
 *
 * Measures how far the native time counter advances between two
 * consecutive changes of the byte at *clock_ptr (the "seconds" register)
 * and returns that difference in %o0.  Interrupts are disabled for the
 * duration of the measurement.
 *
 * Register use:
 *	%o0	clock_ptr on entry; tick difference on return
 *	%g1	%pstate as read on entry, written back before returning
 *	%o1/%o2	previous/current seconds byte in the first wait loop (1:)
 *	%o3	native time sampled in the first wait loop
 *	%o2/%o4	previous/current seconds byte in the second wait loop (2:)
 *	%o5	native time sampled in the second wait loop
 *	%g4/%g5	scratch registers for GET_NATIVE_TIME
 *
 * Notes:
 *  - "wrpr %g1, PSTATE_IE, %pstate" writes (%g1 XOR PSTATE_IE), so it
 *    clears IE only if IE was set on entry; the DEBUG build panics if it
 *    was not.
 *  - None of the branches here is annulled, so every delay-slot
 *    instruction executes whether or not its branch is taken.  Each wait
 *    loop therefore reloads the seconds byte once more on exit, and the
 *    final brz writes the saved %pstate back on both the retry path and
 *    the return path.
 *  - A new seconds value of zero is treated as a minute rollover and the
 *    measurement restarts (at 3: after the first loop, at 0: after the
 *    second).
 *  - In the DEBUG check the sethi sits in the bnz delay slot and runs on
 *    both paths; clobbering %o1 is harmless because %o1 is loaded at 3:.
 */
1146	ENTRY_NP(find_cpufrequency)
1147	rdpr	%pstate, %g1
1148
1149#ifdef DEBUG
1150	andcc	%g1, PSTATE_IE, %g0	! If DEBUG, check that interrupts
1151	bnz	0f			! are currently enabled
1152	sethi	%hi(find_cpufreq_panic), %o1
1153	call	panic
1154	or	%o1, %lo(find_cpufreq_panic), %o0
1155#endif	/* DEBUG */
1156
11570:
1158	wrpr	%g1, PSTATE_IE, %pstate	! Disable interrupts
11593:
1160	ldub	[%o0], %o1		! Read the number of seconds
1161	mov	%o1, %o2		! remember initial value in %o2
11621:
1163	GET_NATIVE_TIME(%o3, %g4, %g5)
1164	cmp	%o1, %o2		! did the seconds register roll over?
1165	be,pt	%icc, 1b		! branch back if unchanged
1166	ldub	[%o0], %o2		!   delay: load the new seconds val
1167
1168	brz,pn	%o2, 3b			! if the minutes just rolled over,
1169					! the last second could have been
1170					! inaccurate; try again.
1171	mov	%o2, %o4		!   delay: store init. val. in %o4
11722:
1173	GET_NATIVE_TIME(%o5, %g4, %g5)
1174	cmp	%o2, %o4		! did the seconds register roll over?
1175	be,pt	%icc, 2b		! branch back if unchanged
1176	ldub	[%o0], %o4		!   delay: load the new seconds val
1177
1178	brz,pn	%o4, 0b			! if the minutes just rolled over,
1179					! the last second could have been
1180					! inaccurate; try again.
1181	wrpr	%g0, %g1, %pstate	!   delay: re-enable interrupts
1182
1183	retl
1184	sub	%o5, %o3, %o0		! return the difference in ticks
1185	SET_SIZE(find_cpufrequency)
1186
1187#endif	/* lint */
1188
1189#if defined(lint)
1190/*
1191 * Prefetch a page_t for write or read; this assumes a linear
1192 * scan of sequential page_t's.
 *
 * The two definitions here are empty lint-only stubs (they sit in the
 * lint branch of the enclosing #if); the per-CPU assembly versions are
 * in the #else branch.
1193 */
1194/*ARGSUSED*/
1195void
1196prefetch_page_w(void *pp)
1197{}
1198
1199/*ARGSUSED*/
1200void
1201prefetch_page_r(void *pp)
1202{}
1203#else	/* lint */
1204
1205#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
1206	defined(SERRANO)
1207	!
1208	! On US-III, the prefetch instruction queue is 8 entries deep.
1209	! Also, prefetches for write put data in the E$, which has
1210	! lines of 512 bytes for an 8MB cache. Each E$ line is further
1211	! subblocked into 64 byte chunks.
1212	!
1213	! Since prefetch can only bring in 64 bytes at a time (See Sparc
1214	! v9 Architecture Manual pp.204) and a page_t is 128 bytes,
1215	! then 2 prefetches are required in order to bring an entire
1216	! page into the E$.
1217	!
1218	! Since the prefetch queue is 8 entries deep, we currently can
1219	! only have 4 prefetches for page_t's outstanding. Thus, we
1220	! prefetch n+4 ahead of where we are now:
1221	!
1222	!      4 * sizeof(page_t)     -> 512
1223	!      4 * sizeof(page_t) +64 -> 576
1224	!
1225	! Example
1226	! =======
1227	! contiguous page array in memory...
1228	!
1229	! |AAA1|AAA2|BBB1|BBB2|CCC1|CCC2|DDD1|DDD2|XXX1|XXX2|YYY1|YYY2|...
1230	! ^         ^         ^         ^         ^    ^
1231	! pp                                      |    pp+4*sizeof(page)+64
1232	!                                         |
1233	!                                         pp+4*sizeof(page)
1234	!
1235	!  Prefetch
1236	!   Queue
1237	! +-------+<--- In this iteration, we're working with pp (AAA1),
1238	! |Preftch|     but we enqueue prefetch for addr = XXX1
1239	! | XXX1  |
1240	! +-------+<--- this queue slot will be a prefetch instruction for
1241	! |Preftch|     addr = pp + 4*sizeof(page_t) + 64 (or second
1242	! | XXX2  |     half of page XXX)
1243	! +-------+
1244	! |Preftch|<-+- The next time around this function, we'll be
1245	! | YYY1  |  |  working with pp = BBB1, but will be enqueueing
1246	! +-------+  |  prefetches for both halves of page YYY,
1247	! |Preftch|  |  while both halves of page XXX are in transit,
1248	! | YYY2  |<-+  making their way into the E$.
1249	! +-------+
1250	! |Preftch|
1251	! | ZZZ1  |
1252	! +-------+
1253	! .       .
1254	! :       :
1255	!
1256	!  E$
1257	! +============================================...
1258	! | XXX1 | XXX2 | YYY1 | YYY2 | ZZZ1 | ZZZ2 |
1259	! +============================================...
1260	! |      |      |      |      |      |      |
1261	! +============================================...
1262	! .
1263	! :
1264	!
1265	! So we should expect the first four page accesses to stall
1266	! while we warm up the cache, after which most of the pages
1267	! will have their pp ready in the E$.
1268	!
1269	! Also note that if sizeof(page_t) grows beyond 128, then
1270	! we'll need an additional prefetch to get an entire page
1271	! into the E$, thus reducing the number of outstanding page
1272	! prefetches to 2 (ie. 3 prefetches/page = 6 queue slots)
1273	! etc.
1274	!
1275	! Cheetah+
1276	! ========
1277	! On Cheetah+ we use "#n_write" prefetches as these avoid
1278	! unnecessary RTS->RTO bus transaction state change, and
1279	! just issues RTO transaction. (See pp.77 of Cheetah+ Delta
1280	! PRM). On Cheetah, #n_write prefetches are reflected with
1281	! RTS->RTO state transition regardless.
1282	!
/*
 * Byte offsets, relative to the page_t pointer passed in %o0, of the two
 * 64-byte chunks prefetched by each routine in this variant: four
 * page_t's ahead (512) and the 64 bytes after that (576).
 */
1283#define STRIDE1 512
1284#define STRIDE2 576
1285
/*
 * Build-time guard: STRIDE1 must equal four times PAGE_SIZE.
 * NOTE(review): for this to hold with STRIDE1 == 512, PAGE_SIZE must be
 * 128 here, i.e. presumably the generated sizeof (page_t) rather than
 * the MMU page size -- confirm against assym.h.
 */
1286#if	STRIDE1 != (PAGE_SIZE * 4)
1287#error	"STRIDE1 != (PAGE_SIZE * 4)"
1288#endif	/* STRIDE1 != (PAGE_SIZE * 4) */
1289
/*
 * prefetch_page_w(void *pp)
 *
 * Issues two prefetches with the #n_writes function code, at
 * pp + STRIDE1 and pp + STRIDE2, and returns.  The second prefetch sits
 * in the delay slot of retl.  Nothing is returned.
 */
1290        ENTRY(prefetch_page_w)
1291        prefetch        [%o0+STRIDE1], #n_writes
1292        retl
1293        prefetch        [%o0+STRIDE2], #n_writes
1294        SET_SIZE(prefetch_page_w)
1295
1296	!
1297	! Note on CHEETAH to prefetch for read, we really use #one_write.
1298	! This fetches to E$ (general use) rather than P$ (floating point use).
1299	!
/*
 * prefetch_page_r(void *pp)
 *
 * Same shape as prefetch_page_w above, but both prefetches use the
 * #one_write function code (see the note preceding this routine).
 */
1300        ENTRY(prefetch_page_r)
1301        prefetch        [%o0+STRIDE1], #one_write
1302        retl
1303        prefetch        [%o0+STRIDE2], #one_write
1304        SET_SIZE(prefetch_page_r)
1305
1306#elif defined(SPITFIRE) || defined(HUMMINGBIRD)
1307
1308	!
1309	! UltraSparcII can have up to 3 prefetches outstanding.
1310	! A page_t is 128 bytes (2 prefetches of 64 bytes each)
1311	! So prefetch for pp + 1, which is
1312	!
1313	!       pp + sizeof(page_t)
1314	! and
1315	!       pp + sizeof(page_t) + 64
1316	!
/*
 * Byte offsets, relative to the page_t pointer passed in %o0, of the two
 * 64-byte chunks prefetched by each routine in this variant: one page_t
 * ahead (128) and the 64 bytes after that (192).
 */
1317#define STRIDE1	128
1318#define STRIDE2	192
1319
/*
 * Build-time guard: STRIDE1 must equal PAGE_SIZE.
 * NOTE(review): for this to hold with STRIDE1 == 128, PAGE_SIZE must be
 * 128 here, i.e. presumably the generated sizeof (page_t) rather than
 * the MMU page size -- confirm against assym.h.
 */
1320#if	STRIDE1 != PAGE_SIZE
1321#error	"STRIDE1 != PAGE_SIZE"
1322#endif	/* STRIDE1 != PAGE_SIZE */
1323
/*
 * prefetch_page_w(void *pp)
 *
 * Issues two #n_writes prefetches, at pp + STRIDE1 and pp + STRIDE2, and
 * returns.  The second prefetch sits in the delay slot of retl.
 */
1324        ENTRY(prefetch_page_w)
1325        prefetch        [%o0+STRIDE1], #n_writes
1326        retl
1327        prefetch        [%o0+STRIDE2], #n_writes
1328        SET_SIZE(prefetch_page_w)
1329
/*
 * prefetch_page_r(void *pp)
 *
 * Same shape as prefetch_page_w above, but both prefetches use the
 * #n_reads function code.
 */
1330        ENTRY(prefetch_page_r)
1331        prefetch        [%o0+STRIDE1], #n_reads
1332        retl
1333        prefetch        [%o0+STRIDE2], #n_reads
1334        SET_SIZE(prefetch_page_r)
1335
1336#elif defined(OLYMPUS_C)
1337	!
1338	! Prefetch strides for Olympus-C
1339	!
1340
/*
 * Byte offsets, relative to the pointer passed in %o0, of the two
 * addresses prefetched by each routine in this variant:
 * 0x440 == 1088 and 0x640 == 1600 (0x200 == 512 bytes apart).
 * Unlike the other CPU variants there is no build-time check tying
 * these values to PAGE_SIZE.
 */
1341#define STRIDE1	0x440
1342#define STRIDE2	0x640
1343
/*
 * prefetch_page_w(void *pp)
 *
 * Issues two #n_writes prefetches, at pp + STRIDE1 and pp + STRIDE2, and
 * returns.  The second prefetch sits in the delay slot of retl.
 */
1344	ENTRY(prefetch_page_w)
1345        prefetch        [%o0+STRIDE1], #n_writes
1346	retl
1347        prefetch        [%o0+STRIDE2], #n_writes
1348	SET_SIZE(prefetch_page_w)
1349
/*
 * prefetch_page_r(void *pp)
 *
 * Instruction-for-instruction identical to prefetch_page_w above.
 * NOTE(review): the read variant uses #n_writes here, whereas the other
 * CPU variants use a different function code for reads; presumably
 * deliberate for Olympus-C -- confirm.
 */
1350	ENTRY(prefetch_page_r)
1351        prefetch        [%o0+STRIDE1], #n_writes
1352	retl
1353        prefetch        [%o0+STRIDE2], #n_writes
1354	SET_SIZE(prefetch_page_r)
1355#else	/* OLYMPUS_C */
1356
1357#error "You need to fix this for your new cpu type."
1358
1359#endif	/* OLYMPUS_C */
1360
1361#endif	/* lint */
1362
1363#if defined(lint)
1364/*
1365 * Prefetch struct smap for write.
 *
 * This is the empty lint-only stub; the assembly implementation is in
 * the #else branch.
1366 */
1367/*ARGSUSED*/
1368void
1369prefetch_smap_w(void *smp)
1370{}
1371#else	/* lint */
1372
/*
 * PREFETCH_Q_LEN is chosen per CPU type and is used only to compute
 * SMAP_STRIDE, the distance ahead that prefetch_smap_w prefetches.
 * A build for a CPU type not listed here fails with #error.
 */
1373#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
1374	defined(SERRANO)
1375
1376#define	PREFETCH_Q_LEN 8
1377
1378#elif defined(SPITFIRE) || defined(HUMMINGBIRD)
1379
1380#define	PREFETCH_Q_LEN 3
1381
1382#elif defined(OLYMPUS_C)
1383	!
1384	! Use length of one for now.
1385	!
1386#define	PREFETCH_Q_LEN	1
1387
1388#else 	/* OLYMPUS_C */
1389
1390#error You need to fix this for your new cpu type.
1391
1392#endif	/* OLYMPUS_C */
1393
1394#include <vm/kpm.h>
1395
/*
 * SMAP_SIZE is hard-coded for each configuration.
 * NOTE(review): presumably it must match sizeof (struct smap) for that
 * configuration; there is no build-time check of this here -- confirm.
 */
1396#ifdef	SEGKPM_SUPPORT
1397
1398#define	SMAP_SIZE 72
1399#define SMAP_STRIDE (((PREFETCH_Q_LEN * 64) / SMAP_SIZE) * 64)
1400
1401#else	/* SEGKPM_SUPPORT */
1402
1403	!
1404	! The hardware will prefetch the 64 byte cache aligned block
1405	! that contains the address specified in the prefetch instruction.
1406	! Since the size of the smap struct is 48 bytes, issuing 1 prefetch
1407	! per pass will suffice as long as we prefetch far enough ahead to
1408	! make sure we don't stall for the cases where the smap object
1409	! spans multiple hardware prefetch blocks.  Let's prefetch as far
1410	! ahead as the hardware will allow.
1411	!
1412	! The smap array is processed with decreasing address pointers.
1413	!
1414#define	SMAP_SIZE 48
1415#define	SMAP_STRIDE (PREFETCH_Q_LEN * SMAP_SIZE)
1416
1417#endif	/* SEGKPM_SUPPORT */
1418
/*
 * prefetch_smap_w(void *smp)
 *
 * Issues a single #n_writes prefetch, from the delay slot of retl, for
 * the address SMAP_STRIDE bytes below smp (%o0).  Nothing is returned.
 *
 * With the definitions above, SMAP_STRIDE works out to the following
 * (assuming the expression is evaluated with integer division):
 *
 *	PREFETCH_Q_LEN	SEGKPM_SUPPORT	no SEGKPM_SUPPORT
 *	8		448		384
 *	3		128		144
 *	1		0		48
 *
 * NOTE(review): for PREFETCH_Q_LEN == 1 with SEGKPM_SUPPORT the stride
 * is 0, i.e. the prefetch targets smp itself -- confirm this is intended.
 */
1419	ENTRY(prefetch_smap_w)
1420	retl
1421	prefetch	[%o0-SMAP_STRIDE], #n_writes
1422	SET_SIZE(prefetch_smap_w)
1423
1424#endif	/* lint */
1425
1426#if defined(lint) || defined(__lint)
1427
/*
 * Lint-only stub for getidsr(); always returns 0.  The real
 * implementation is the assembly routine in the #else branch.
 */
1428/* ARGSUSED */
1429uint64_t
1430getidsr(void)
1431{ return 0; }
1432
1433#else	/* lint */
1434
/*
 * getidsr(void)
 *
 * Returns in %o0 the 64-bit value loaded (ldxa) from address 0 (%g0) in
 * the ASI_INTR_DISPATCH_STATUS address space.  The load sits in the
 * delay slot of retl, so it completes as part of the return sequence.
 * Takes no arguments.
 */
1435	ENTRY_NP(getidsr)
1436	retl
1437	ldxa	[%g0]ASI_INTR_DISPATCH_STATUS, %o0
1438	SET_SIZE(getidsr)
1439
1440#endif	/* lint */
1441