xref: /illumos-gate/usr/src/uts/i86pc/os/mp_machdep.c (revision a38ddfee9c8c6b6c5a2947ff52fd2338362a4444)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #define	PSMI_1_6
27 #include <sys/smp_impldefs.h>
28 #include <sys/psm.h>
29 #include <sys/psm_modctl.h>
30 #include <sys/pit.h>
31 #include <sys/cmn_err.h>
32 #include <sys/strlog.h>
33 #include <sys/clock.h>
34 #include <sys/debug.h>
35 #include <sys/rtc.h>
36 #include <sys/x86_archext.h>
37 #include <sys/cpupart.h>
38 #include <sys/cpuvar.h>
39 #include <sys/cmt.h>
40 #include <sys/cpu.h>
41 #include <sys/disp.h>
42 #include <sys/archsystm.h>
43 #include <sys/machsystm.h>
44 #include <sys/sysmacros.h>
45 #include <sys/memlist.h>
46 #include <sys/param.h>
47 #include <sys/promif.h>
48 #if defined(__xpv)
49 #include <sys/hypervisor.h>
50 #endif
51 #include <sys/mach_intr.h>
52 #include <vm/hat_i86.h>
53 #include <sys/kdi_machimpl.h>
54 #include <sys/sdt.h>
55 
/*
 * Byte offset of member m within aggregate type s.
 *
 * The whole expansion is parenthesized so that the macro composes safely
 * inside larger expressions (for example as the operand of sizeof or of a
 * postfix operator), where the bare cast expression would mis-parse.
 */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))
57 
58 /*
59  *	Local function prototypes
60  */
61 static int mp_disable_intr(processorid_t cpun);
62 static void mp_enable_intr(processorid_t cpun);
63 static void mach_init();
64 static void mach_picinit();
65 static int machhztomhz(uint64_t cpu_freq_hz);
66 static uint64_t mach_getcpufreq(void);
67 static void mach_fixcpufreq(void);
68 static int mach_clkinit(int, int *);
69 static void mach_smpinit(void);
70 static int mach_softlvl_to_vect(int ipl);
71 static void mach_get_platform(int owner);
72 static void mach_construct_info();
73 static int mach_translate_irq(dev_info_t *dip, int irqno);
74 static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
75     psm_intr_op_t, int *);
76 static void mach_notify_error(int level, char *errmsg);
77 static hrtime_t dummy_hrtime(void);
78 static void dummy_scalehrtime(hrtime_t *);
79 static void cpu_idle(void);
80 static void cpu_wakeup(cpu_t *, int);
81 #ifndef __xpv
82 static void cpu_idle_mwait(void);
83 static void cpu_wakeup_mwait(cpu_t *, int);
84 #endif
85 /*
86  *	External reference functions
87  */
/*
 * return_instr is installed below (via casts) as the default target for
 * most PSM hooks until a PSM module supplies a real implementation.
 * NOTE(review): presumably a bare "return" stub -- confirm in the
 * assembly source that defines it.
 */
extern void return_instr();
/* Measures elapsed TSC clock cycles sampled against the PIT */
extern uint64_t freq_tsc(uint32_t *);
#if defined(__i386)
/* 32-bit only: cycle measurement for processors without a TSC */
extern uint64_t freq_notsc(uint32_t *);
#endif
/* Default target of the gethrestimef hook */
extern void pc_gethrestime(timestruc_t *);
/* Core and chip identifiers, used for the pg_plat_* sharing queries */
extern int cpuid_get_coreid(cpu_t *);
extern int cpuid_get_chipid(cpu_t *);
96 
97 /*
98  *	PSM functions initialization
99  */
/*
 * Platform hook vectors.  Most start out pointing at return_instr (cast to
 * the hook's type) or at a local default, and are later overwritten from the
 * merged PSM ops vector by mach_init(), mach_smpinit() and mach_picinit().
 * Hooks initialized to NULL have no default implementation.
 */
void (*psm_shutdownf)(int, int)	= (void (*)(int, int))return_instr;
void (*psm_preshutdownf)(int, int) = (void (*)(int, int))return_instr;
void (*psm_notifyf)(int)	= (void (*)(int))return_instr;
void (*psm_set_idle_cpuf)(int)	= (void (*)(int))return_instr;
void (*psm_unset_idle_cpuf)(int) = (void (*)(int))return_instr;
/* psminitf is the entry point that triggers all other hook registration */
void (*psminitf)()		= mach_init;
void (*picinitf)() 		= return_instr;
int (*clkinitf)(int, int *) 	= (int (*)(int, int *))return_instr;
int (*ap_mlsetup)() 		= (int (*)(void))return_instr;
void (*send_dirintf)() 		= return_instr;
void (*setspl)(int)		= (void (*)(int))return_instr;
int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
void (*kdisetsoftint)(int, struct av_softinfo *)=
	(void (*)(int, struct av_softinfo *))return_instr;
void (*setsoftint)(int, struct av_softinfo *)=
	(void (*)(int, struct av_softinfo *))return_instr;
int (*slvltovect)(int)		= (int (*)(int))return_instr;
int (*setlvl)(int, int *)	= (int (*)(int, int *))return_instr;
void (*setlvlx)(int, int)	= (void (*)(int, int))return_instr;
/* Local defaults; replaced in mach_smpinit() if the PSM provides its own */
int (*psm_disable_intr)(int)	= mp_disable_intr;
void (*psm_enable_intr)(int)	= mp_enable_intr;
/* Until a real hrtime source is installed, hrtime reads return 0 */
hrtime_t (*gethrtimef)(void)	= dummy_hrtime;
hrtime_t (*gethrtimeunscaledf)(void)	= dummy_hrtime;
void (*scalehrtimef)(hrtime_t *)	= dummy_scalehrtime;
int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq;
void (*gethrestimef)(timestruc_t *) = pc_gethrestime;
void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL;
int (*psm_get_clockirq)(int) = NULL;
int (*psm_get_ipivect)(int, int) = NULL;

/* Timer-related hooks; remain NULL unless the PSM supplies them */
int (*psm_clkinit)(int) = NULL;
void (*psm_timer_reprogram)(hrtime_t) = NULL;
void (*psm_timer_enable)(void) = NULL;
void (*psm_timer_disable)(void) = NULL;
void (*psm_post_cyclic_setup)(void *arg) = NULL;
int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t,
    int *) = mach_intr_ops;
int (*psm_state)(psm_state_request_t *) = (int (*)(psm_state_request_t *))
    return_instr;

void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr;
void (*hrtime_tick)(void)	= return_instr;
143 
144 /*
145  * True if the generic TSC code is our source of hrtime, rather than whatever
146  * the PSM can provide.
147  */
148 #ifdef __xpv
149 int tsc_gethrtime_enable = 0;
150 #else
151 int tsc_gethrtime_enable = 1;
152 #endif
153 int tsc_gethrtime_initted = 0;
154 
/*
 * True if the hrtime implementation is "hires"; namely, better than
 * microsecond resolution.
 */
158 int gethrtime_hires = 0;
159 
160 /*
161  * Local Static Data
162  */
/*
 * mach_ops is the merged ops vector actually used by this file.
 * mach_set[] and mach_ver[] are indexed by PSM owner class (p_owner):
 * slot 0 is the merged result (mach_ops and the version of the last module
 * merged in), the remaining slots are filled in by mach_construct_info()
 * from the registered PSM modules.
 */
static struct psm_ops mach_ops;
static struct psm_ops *mach_set[4] = {&mach_ops, NULL, NULL, NULL};
static ushort_t mach_ver[4] = {0, 0, 0, 0};
166 
167 /*
168  * If non-zero, idle cpus will become "halted" when there's
169  * no work to do.
170  */
171 int	idle_cpu_use_hlt = 1;
172 
173 #ifndef __xpv
174 /*
175  * If non-zero, idle cpus will use mwait if available to halt instead of hlt.
176  */
177 int	idle_cpu_prefer_mwait = 1;
178 /*
179  * Set to 0 to avoid MONITOR+CLFLUSH assertion.
180  */
181 int	idle_cpu_assert_cflush_monitor = 1;
182 
183 #endif
184 
185 /*ARGSUSED*/
186 int
187 pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
188 {
189 	switch (hw) {
190 	case PGHW_IPIPE:
191 		if (x86_feature & (X86_HTT)) {
192 			/*
193 			 * Hyper-threading is SMT
194 			 */
195 			return (1);
196 		} else {
197 			return (0);
198 		}
199 	case PGHW_CHIP:
200 		if (x86_feature & (X86_CMP|X86_HTT))
201 			return (1);
202 		else
203 			return (0);
204 	case PGHW_CACHE:
205 		if (cpuid_get_ncpu_sharing_last_cache(cp) > 1)
206 			return (1);
207 		else
208 			return (0);
209 	default:
210 		return (0);
211 	}
212 }
213 
214 /*
215  * Compare two CPUs and see if they have a pghw_type_t sharing relationship
216  * If pghw_type_t is an unsupported hardware type, then return -1
217  */
218 int
219 pg_plat_cpus_share(cpu_t *cpu_a, cpu_t *cpu_b, pghw_type_t hw)
220 {
221 	id_t pgp_a, pgp_b;
222 
223 	pgp_a = pg_plat_hw_instance_id(cpu_a, hw);
224 	pgp_b = pg_plat_hw_instance_id(cpu_b, hw);
225 
226 	if (pgp_a == -1 || pgp_b == -1)
227 		return (-1);
228 
229 	return (pgp_a == pgp_b);
230 }
231 
232 /*
233  * Return a physical instance identifier for known hardware sharing
234  * relationships
235  */
236 id_t
237 pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
238 {
239 	switch (hw) {
240 	case PGHW_IPIPE:
241 		return (cpuid_get_coreid(cpu));
242 	case PGHW_CACHE:
243 		return (cpuid_get_last_lvl_cacheid(cpu));
244 	case PGHW_CHIP:
245 		return (cpuid_get_chipid(cpu));
246 	default:
247 		return (-1);
248 	}
249 }
250 
251 int
252 pg_plat_hw_level(pghw_type_t hw)
253 {
254 	int i;
255 	static pghw_type_t hw_hier[] = {
256 		PGHW_IPIPE,
257 		PGHW_CACHE,
258 		PGHW_CHIP,
259 		PGHW_NUM_COMPONENTS
260 	};
261 
262 	for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
263 		if (hw_hier[i] == hw)
264 			return (i);
265 	}
266 	return (-1);
267 }
268 
269 /*
270  * Return 1 if CMT load balancing policies should be
271  * implemented across instances of the specified hardware
272  * sharing relationship.
273  */
274 int
275 pg_plat_cmt_load_bal_hw(pghw_type_t hw)
276 {
277 	if (hw == PGHW_IPIPE ||
278 	    hw == PGHW_FPU ||
279 	    hw == PGHW_CHIP ||
280 	    hw == PGHW_CACHE)
281 		return (1);
282 	else
283 		return (0);
284 }
285 
286 
287 /*
288  * Return 1 if thread affinity polices should be implemented
289  * for instances of the specifed hardware sharing relationship.
290  */
291 int
292 pg_plat_cmt_affinity_hw(pghw_type_t hw)
293 {
294 	if (hw == PGHW_CACHE)
295 		return (1);
296 	else
297 		return (0);
298 }
299 
300 id_t
301 pg_plat_get_core_id(cpu_t *cpu)
302 {
303 	return ((id_t)cpuid_get_coreid(cpu));
304 }
305 
306 void
307 cmp_set_nosteal_interval(void)
308 {
309 	/* Set the nosteal interval (used by disp_getbest()) to 100us */
310 	nosteal_nsec = 100000UL;
311 }
312 
313 /*
314  * Routine to ensure initial callers to hrtime gets 0 as return
315  */
316 static hrtime_t
317 dummy_hrtime(void)
318 {
319 	return (0);
320 }
321 
322 /* ARGSUSED */
323 static void
324 dummy_scalehrtime(hrtime_t *ticks)
325 {}
326 
/*
 * Idle the present CPU until awoken via an interrupt
 *
 * Installed as the dispatcher's idle_cpu hook by mach_init() when
 * idle_cpu_use_hlt is set.  The function returns without halting if work
 * shows up, or if another CPU has already claimed this CPU for a wakeup,
 * at any of the checkpoints below.  The statement order here and in
 * cpu_wakeup() is what prevents lost wakeups; do not reorder.
 */
static void
cpu_idle(void)
{
	cpu_t		*cpup = CPU;
	processorid_t	cpun = cpup->cpu_id;
	cpupart_t	*cp = cpup->cpu_part;
	int		hset_update = 1;	/* publish ourselves in haltset? */

	/*
	 * If this CPU is online, and there's multiple CPUs
	 * in the system, then we should notate our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awoken. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		CPUSET_ATOMIC_ADD(cp->cp_mach->mc_haltset, cpun);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a poke.
	 */
	if (disp_anywork()) {
		/* Work exists: undo the halted bookkeeping and go run it */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpun);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * Disable interrupts now, so that we'll awaken immediately
	 * after halting if someone tries to poke us between now and
	 * the time we actually halt.
	 *
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the poke will pop us out of the halted state.
	 *
	 * This means that the ordering of the poke and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear, then poke.
	 * cpu_idle() must disable interrupts, then check for the bit.
	 */
	cli();

	/*
	 * Our haltset bit is already gone, so a waker removed it; only the
	 * HALTED flag is left for us to clear before returning.
	 */
	if (hset_update && !CPU_IN_SET(cp->cp_mach->mc_haltset, cpun)) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		sti();
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpun);
		}
		sti();
		return;
	}

	DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);

	/*
	 * Called with interrupts still disabled by the cli() above.
	 * NOTE(review): presumably re-enables interrupts as part of
	 * halting -- confirm in mach_cpu_idle().
	 */
	mach_cpu_idle();

	DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpun);
	}
}
435 
436 
/*
 * If "cpu" is halted, then wake it up clearing its halted bit in advance.
 * Otherwise, see if other CPUs in the cpu partition are halted and need to
 * be woken up so that they can steal the thread we placed on this CPU.
 * This function is only used on MP systems.
 *
 * Installed as the dispatcher's disp_enq_thread hook by mach_smpinit().
 *
 *	cpu	- CPU on which a thread was just enqueued
 *	bound	- non-zero if that thread is bound, in which case no other
 *		  CPU is woken on its behalf
 */
static void
cpu_wakeup(cpu_t *cpu, int bound)
{
	uint_t		cpu_found;
	int		result;
	cpupart_t	*cp;

	cp = cpu->cpu_part;
	if (CPU_IN_SET(cp->cp_mach->mc_haltset, cpu->cpu_id)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * poked in a moment.
		 */
		CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpu->cpu_id);
		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in cpu_idle().
		 * Poking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 */
		if (cpu != CPU)
			poke_cpu(cpu->cpu_id);
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cpu->cpu_thread == cpu->cpu_idle_thread ||
		    cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there's any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		CPUSET_FIND(cp->cp_mach->mc_haltset, cpu_found);
		if (cpu_found == CPUSET_NOTINSET)
			return;

		/* cpu_found is unsigned, so only the upper bound can fail */
		ASSERT(cpu_found >= 0 && cpu_found < NCPU);
		/* A negative result sends us back around to look again */
		CPUSET_ATOMIC_XDEL(cp->cp_mach->mc_haltset, cpu_found, result);
	} while (result < 0);

	/* As above: no point poking the CPU we are running on */
	if (cpu_found != CPU->cpu_id)
		poke_cpu(cpu_found);
}
504 
505 #ifndef __xpv
/*
 * Idle the present CPU until awoken via touching its monitored line
 *
 * monitor/mwait counterpart of cpu_idle(); installed as idle_cpu by
 * mach_init() when MWAIT is available, preferred, and the per-CPU mwait
 * buffer could be allocated.  Wakeups arrive as a write to this CPU's
 * mcpu_mwait word (see cpu_wakeup_mwait()) rather than as a poke.
 */
static void
cpu_idle_mwait(void)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpun = cpup->cpu_id;
	cpupart_t		*cp = cpup->cpu_part;
	int			hset_update = 1;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 */
	*mcpu_mwait = MWAIT_HALTED;

	/*
	 * If this CPU is online, and there's multiple CPUs
	 * in the system, then we should notate our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awoken. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		CPUSET_ATOMIC_ADD(cp->cp_mach->mc_haltset, cpun);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpun);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 * To avoid a lost wakeup, arm the monitor before checking if another
	 * cpu wrote to mcpu_mwait to wake us up.
	 */
	i86_monitor(mcpu_mwait, 0, 0);
	/* Only wait if nobody has overwritten the word since we set it */
	if (*mcpu_mwait == MWAIT_HALTED) {
		DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C1);

		tlb_going_idle();
		i86_mwait(0, 0);
		tlb_service();

		DTRACE_PROBE1(idle__state__transition, uint_t, IDLE_STATE_C0);
	}

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		CPUSET_ATOMIC_DEL(cp->cp_mach->mc_haltset, cpun);
	}
}
595 
/*
 * If "cpu" is halted in mwait, then wake it up clearing its halted bit in
 * advance.  Otherwise, see if other CPUs in the cpu partition are halted and
 * need to be woken up so that they can steal the thread we placed on this CPU.
 * This function is only used on MP systems.
 *
 * Installed as disp_enq_thread by mach_smpinit() when monitor/mwait idling
 * is in use.
 *
 *	cp	- CPU on which a thread was just enqueued.  The parameter is
 *		  not called "cpu" because the body also indexes the global
 *		  cpu[] array.
 *	bound	- non-zero if that thread is bound, in which case no other
 *		  CPU is woken on its behalf
 */
static void
cpu_wakeup_mwait(cpu_t *cp, int bound)
{
	cpupart_t	*cpu_part;
	uint_t		cpu_found;
	int		result;

	cpu_part = cp->cpu_part;

	if (CPU_IN_SET(cpu_part->cp_mach->mc_haltset, cp->cpu_id)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * woken up in a moment.
		 */
		CPUSET_ATOMIC_DEL(cpu_part->cp_mach->mc_haltset, cp->cpu_id);
		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in
		 * cpu_idle_mwait().
		 * Waking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 *
		 * monitor/mwait wakeup via writing to our cache line is
		 * harmless and less expensive than always checking if we
		 * are waking ourself which is an uncommon case.
		 */
		MWAIT_WAKEUP(cp);	/* write to monitored line */
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cp->cpu_thread == cp->cpu_idle_thread ||
		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there's any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		CPUSET_FIND(cpu_part->cp_mach->mc_haltset, cpu_found);
		if (cpu_found == CPUSET_NOTINSET)
			return;

		/* cpu_found is unsigned, so only the upper bound can fail */
		ASSERT(cpu_found >= 0 && cpu_found < NCPU);
		CPUSET_ATOMIC_XDEL(cpu_part->cp_mach->mc_haltset, cpu_found,
		    result);
	} while (result < 0);

	/*
	 * Do not check if cpu_found is ourself as monitor/mwait wakeup is
	 * cheap.
	 */
	MWAIT_WAKEUP(cpu[cpu_found]);	/* write to monitored line */
}
675 #endif
676 
677 void (*cpu_pause_handler)(volatile char *) = NULL;
678 
679 static int
680 mp_disable_intr(int cpun)
681 {
682 	/*
683 	 * switch to the offline cpu
684 	 */
685 	affinity_set(cpun);
686 	/*
687 	 * raise ipl to just below cross call
688 	 */
689 	splx(XC_MED_PIL-1);
690 	/*
691 	 *	set base spl to prevent the next swtch to idle from
692 	 *	lowering back to ipl 0
693 	 */
694 	CPU->cpu_intr_actv |= (1 << (XC_MED_PIL-1));
695 	set_base_spl();
696 	affinity_clear();
697 	return (DDI_SUCCESS);
698 }
699 
700 static void
701 mp_enable_intr(int cpun)
702 {
703 	/*
704 	 * switch to the online cpu
705 	 */
706 	affinity_set(cpun);
707 	/*
708 	 * clear the interrupt active mask
709 	 */
710 	CPU->cpu_intr_actv &= ~(1 << (XC_MED_PIL-1));
711 	set_base_spl();
712 	(void) spl0();
713 	affinity_clear();
714 }
715 
716 static void
717 mach_get_platform(int owner)
718 {
719 	void		**srv_opsp;
720 	void		**clt_opsp;
721 	int		i;
722 	int		total_ops;
723 
724 	/* fix up psm ops */
725 	srv_opsp = (void **)mach_set[0];
726 	clt_opsp = (void **)mach_set[owner];
727 	if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01)
728 		total_ops = sizeof (struct psm_ops_ver01) /
729 		    sizeof (void (*)(void));
730 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_1)
731 		/* no psm_notify_func */
732 		total_ops = OFFSETOF(struct psm_ops, psm_notify_func) /
733 		    sizeof (void (*)(void));
734 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_2)
735 		/* no psm_timer funcs */
736 		total_ops = OFFSETOF(struct psm_ops, psm_timer_reprogram) /
737 		    sizeof (void (*)(void));
738 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_3)
739 		/* no psm_preshutdown function */
740 		total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) /
741 		    sizeof (void (*)(void));
742 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4)
743 		/* no psm_preshutdown function */
744 		total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) /
745 		    sizeof (void (*)(void));
746 	else
747 		total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void));
748 
749 	/*
750 	 * Save the version of the PSM module, in case we need to
751 	 * bahave differently based on version.
752 	 */
753 	mach_ver[0] = mach_ver[owner];
754 
755 	for (i = 0; i < total_ops; i++)
756 		if (clt_opsp[i] != NULL)
757 			srv_opsp[i] = clt_opsp[i];
758 }
759 
760 static void
761 mach_construct_info()
762 {
763 	struct psm_sw *swp;
764 	int	mach_cnt[PSM_OWN_OVERRIDE+1] = {0};
765 	int	conflict_owner = 0;
766 
767 	if (psmsw->psw_forw == psmsw)
768 		panic("No valid PSM modules found");
769 	mutex_enter(&psmsw_lock);
770 	for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
771 		if (!(swp->psw_flag & PSM_MOD_IDENTIFY))
772 			continue;
773 		mach_set[swp->psw_infop->p_owner] = swp->psw_infop->p_ops;
774 		mach_ver[swp->psw_infop->p_owner] = swp->psw_infop->p_version;
775 		mach_cnt[swp->psw_infop->p_owner]++;
776 	}
777 	mutex_exit(&psmsw_lock);
778 
779 	mach_get_platform(PSM_OWN_SYS_DEFAULT);
780 
781 	/* check to see are there any conflicts */
782 	if (mach_cnt[PSM_OWN_EXCLUSIVE] > 1)
783 		conflict_owner = PSM_OWN_EXCLUSIVE;
784 	if (mach_cnt[PSM_OWN_OVERRIDE] > 1)
785 		conflict_owner = PSM_OWN_OVERRIDE;
786 	if (conflict_owner) {
787 		/* remove all psm modules except uppc */
788 		cmn_err(CE_WARN,
789 		    "Conflicts detected on the following PSM modules:");
790 		mutex_enter(&psmsw_lock);
791 		for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
792 			if (swp->psw_infop->p_owner == conflict_owner)
793 				cmn_err(CE_WARN, "%s ",
794 				    swp->psw_infop->p_mach_idstring);
795 		}
796 		mutex_exit(&psmsw_lock);
797 		cmn_err(CE_WARN,
798 		    "Setting the system back to SINGLE processor mode!");
799 		cmn_err(CE_WARN,
800 		    "Please edit /etc/mach to remove the invalid PSM module.");
801 		return;
802 	}
803 
804 	if (mach_set[PSM_OWN_EXCLUSIVE])
805 		mach_get_platform(PSM_OWN_EXCLUSIVE);
806 
807 	if (mach_set[PSM_OWN_OVERRIDE])
808 		mach_get_platform(PSM_OWN_OVERRIDE);
809 }
810 
811 static void
812 mach_init()
813 {
814 	struct psm_ops  *pops;
815 
816 	mach_construct_info();
817 
818 	pops = mach_set[0];
819 
820 	/* register the interrupt and clock initialization rotuines */
821 	picinitf = mach_picinit;
822 	clkinitf = mach_clkinit;
823 	psm_get_clockirq = pops->psm_get_clockirq;
824 
825 	/* register the interrupt setup code */
826 	slvltovect = mach_softlvl_to_vect;
827 	addspl	= pops->psm_addspl;
828 	delspl	= pops->psm_delspl;
829 
830 	if (pops->psm_translate_irq)
831 		psm_translate_irq = pops->psm_translate_irq;
832 	if (pops->psm_intr_ops)
833 		psm_intr_ops = pops->psm_intr_ops;
834 
835 #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4)
836 	/*
837 	 * Time-of-day functionality now handled in TOD modules.
838 	 * (Warn about PSM modules that think that we're going to use
839 	 * their ops vectors.)
840 	 */
841 	if (pops->psm_tod_get)
842 		cmn_err(CE_WARN, "obsolete psm_tod_get op %p",
843 		    (void *)pops->psm_tod_get);
844 
845 	if (pops->psm_tod_set)
846 		cmn_err(CE_WARN, "obsolete psm_tod_set op %p",
847 		    (void *)pops->psm_tod_set);
848 #endif
849 
850 	if (pops->psm_notify_error) {
851 		psm_notify_error = mach_notify_error;
852 		notify_error = pops->psm_notify_error;
853 	}
854 
855 	(*pops->psm_softinit)();
856 
857 	/*
858 	 * Initialize the dispatcher's function hooks
859 	 * to enable CPU halting when idle.
860 	 * Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle)
861 	 * or idle_cpu_prefer_mwait is not set.
862 	 * Allocate monitor/mwait buffer for cpu0.
863 	 */
864 	if (idle_cpu_use_hlt) {
865 		idle_cpu = cpu_idle;
866 #ifndef __xpv
867 		if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
868 			CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
869 			/*
870 			 * Protect ourself from insane mwait size.
871 			 */
872 			if (CPU->cpu_m.mcpu_mwait == NULL) {
873 #ifdef DEBUG
874 				cmn_err(CE_NOTE, "Using hlt idle.  Cannot "
875 				    "handle cpu 0 mwait size.");
876 #endif
877 				idle_cpu_prefer_mwait = 0;
878 				idle_cpu = cpu_idle;
879 			} else {
880 				idle_cpu = cpu_idle_mwait;
881 			}
882 		} else {
883 			idle_cpu = cpu_idle;
884 		}
885 #endif
886 	}
887 
888 	mach_smpinit();
889 }
890 
891 static void
892 mach_smpinit(void)
893 {
894 	struct psm_ops  *pops;
895 	processorid_t cpu_id;
896 	int cnt;
897 	cpuset_t cpumask;
898 
899 	pops = mach_set[0];
900 	CPUSET_ZERO(cpumask);
901 
902 	cpu_id = -1;
903 	cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
904 	for (cnt = 0; cpu_id != -1; cnt++) {
905 		CPUSET_ADD(cpumask, cpu_id);
906 		cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
907 	}
908 
909 	mp_cpus = cpumask;
910 
911 	/* MP related routines */
912 	ap_mlsetup = pops->psm_post_cpu_start;
913 	send_dirintf = pops->psm_send_ipi;
914 
915 	/* optional MP related routines */
916 	if (pops->psm_shutdown)
917 		psm_shutdownf = pops->psm_shutdown;
918 	if (pops->psm_preshutdown)
919 		psm_preshutdownf = pops->psm_preshutdown;
920 	if (pops->psm_notify_func)
921 		psm_notifyf = pops->psm_notify_func;
922 	if (pops->psm_set_idlecpu)
923 		psm_set_idle_cpuf = pops->psm_set_idlecpu;
924 	if (pops->psm_unset_idlecpu)
925 		psm_unset_idle_cpuf = pops->psm_unset_idlecpu;
926 
927 	psm_clkinit = pops->psm_clkinit;
928 
929 	if (pops->psm_timer_reprogram)
930 		psm_timer_reprogram = pops->psm_timer_reprogram;
931 
932 	if (pops->psm_timer_enable)
933 		psm_timer_enable = pops->psm_timer_enable;
934 
935 	if (pops->psm_timer_disable)
936 		psm_timer_disable = pops->psm_timer_disable;
937 
938 	if (pops->psm_post_cyclic_setup)
939 		psm_post_cyclic_setup = pops->psm_post_cyclic_setup;
940 
941 	if (pops->psm_state)
942 		psm_state = pops->psm_state;
943 
944 	/*
945 	 * Set these vectors here so they can be used by Suspend/Resume
946 	 * on UP machines.
947 	 */
948 	if (pops->psm_disable_intr)
949 		psm_disable_intr = pops->psm_disable_intr;
950 	if (pops->psm_enable_intr)
951 		psm_enable_intr  = pops->psm_enable_intr;
952 
953 	/* check for multiple CPUs */
954 	if (cnt < 2)
955 		return;
956 
957 	/* check for MP platforms */
958 	if (pops->psm_cpu_start == NULL)
959 		return;
960 
961 	/*
962 	 * Set the dispatcher hook to enable cpu "wake up"
963 	 * when a thread becomes runnable.
964 	 */
965 	if (idle_cpu_use_hlt) {
966 		disp_enq_thread = cpu_wakeup;
967 #ifndef __xpv
968 		if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
969 			disp_enq_thread = cpu_wakeup_mwait;
970 #endif
971 	}
972 
973 	psm_get_ipivect = pops->psm_get_ipivect;
974 
975 	(void) add_avintr((void *)NULL, XC_HI_PIL, xc_serv, "xc_hi_intr",
976 	    (*pops->psm_get_ipivect)(XC_HI_PIL, PSM_INTR_IPI_HI),
977 	    (caddr_t)X_CALL_HIPRI, NULL, NULL, NULL);
978 	(void) add_avintr((void *)NULL, XC_MED_PIL, xc_serv, "xc_med_intr",
979 	    (*pops->psm_get_ipivect)(XC_MED_PIL, PSM_INTR_IPI_LO),
980 	    (caddr_t)X_CALL_MEDPRI, NULL, NULL, NULL);
981 
982 	(void) (*pops->psm_get_ipivect)(XC_CPUPOKE_PIL, PSM_INTR_POKE);
983 }
984 
985 static void
986 mach_picinit()
987 {
988 	struct psm_ops  *pops;
989 
990 	pops = mach_set[0];
991 
992 	/* register the interrupt handlers */
993 	setlvl = pops->psm_intr_enter;
994 	setlvlx = pops->psm_intr_exit;
995 
996 	/* initialize the interrupt hardware */
997 	(*pops->psm_picinit)();
998 
999 	/* set interrupt mask for current ipl */
1000 	setspl = pops->psm_setspl;
1001 	cli();
1002 	setspl(CPU->cpu_pri);
1003 }
1004 
1005 uint_t	cpu_freq;	/* MHz */
1006 uint64_t cpu_freq_hz;	/* measured (in hertz) */
1007 
1008 #define	MEGA_HZ		1000000
1009 
1010 #ifdef __xpv
1011 
1012 int xpv_cpufreq_workaround = 1;
1013 int xpv_cpufreq_verbose = 0;
1014 
1015 #else	/* __xpv */
1016 
1017 static uint64_t
1018 mach_calchz(uint32_t pit_counter, uint64_t *processor_clks)
1019 {
1020 	uint64_t cpu_hz;
1021 
1022 	if ((pit_counter == 0) || (*processor_clks == 0) ||
1023 	    (*processor_clks > (((uint64_t)-1) / PIT_HZ)))
1024 		return (0);
1025 
1026 	cpu_hz = ((uint64_t)PIT_HZ * *processor_clks) / pit_counter;
1027 
1028 	return (cpu_hz);
1029 }
1030 
1031 #endif	/* __xpv */
1032 
1033 static uint64_t
1034 mach_getcpufreq(void)
1035 {
1036 #if defined(__xpv)
1037 	vcpu_time_info_t *vti = &CPU->cpu_m.mcpu_vcpu_info->time;
1038 	uint64_t cpu_hz;
1039 
1040 	/*
1041 	 * During dom0 bringup, it was noted that on at least one older
1042 	 * Intel HT machine, the hypervisor initially gives a tsc_to_system_mul
1043 	 * value that is quite wrong (the 3.06GHz clock was reported
1044 	 * as 4.77GHz)
1045 	 *
1046 	 * The curious thing is, that if you stop the kernel at entry,
1047 	 * breakpoint here and inspect the value with kmdb, the value
1048 	 * is correct - but if you don't stop and simply enable the
1049 	 * printf statement (below), you can see the bad value printed
1050 	 * here.  Almost as if something kmdb did caused the hypervisor to
1051 	 * figure it out correctly.  And, note that the hypervisor
1052 	 * eventually -does- figure it out correctly ... if you look at
1053 	 * the field later in the life of dom0, it is correct.
1054 	 *
1055 	 * For now, on dom0, we employ a slightly cheesy workaround of
1056 	 * using the DOM0_PHYSINFO hypercall.
1057 	 */
1058 	if (DOMAIN_IS_INITDOMAIN(xen_info) && xpv_cpufreq_workaround) {
1059 		xen_sysctl_t op0, *op = &op0;
1060 
1061 		op->cmd = XEN_SYSCTL_physinfo;
1062 		op->interface_version = XEN_SYSCTL_INTERFACE_VERSION;
1063 		if (HYPERVISOR_sysctl(op) != 0)
1064 			panic("physinfo op refused");
1065 
1066 		cpu_hz = 1000 * (uint64_t)op->u.physinfo.cpu_khz;
1067 	} else {
1068 		cpu_hz = (UINT64_C(1000000000) << 32) / vti->tsc_to_system_mul;
1069 
1070 		if (vti->tsc_shift < 0)
1071 			cpu_hz <<= -vti->tsc_shift;
1072 		else
1073 			cpu_hz >>= vti->tsc_shift;
1074 	}
1075 
1076 	if (xpv_cpufreq_verbose)
1077 		printf("mach_getcpufreq: system_mul 0x%x, shift %d, "
1078 		    "cpu_hz %" PRId64 "Hz\n",
1079 		    vti->tsc_to_system_mul, vti->tsc_shift, cpu_hz);
1080 
1081 	return (cpu_hz);
1082 #else	/* __xpv */
1083 	uint32_t pit_counter;
1084 	uint64_t processor_clks;
1085 
1086 	if (x86_feature & X86_TSC) {
1087 		/*
1088 		 * We have a TSC. freq_tsc() knows how to measure the number
1089 		 * of clock cycles sampled against the PIT.
1090 		 */
1091 		ulong_t flags = clear_int_flag();
1092 		processor_clks = freq_tsc(&pit_counter);
1093 		restore_int_flag(flags);
1094 		return (mach_calchz(pit_counter, &processor_clks));
1095 	} else if (x86_vendor == X86_VENDOR_Cyrix || x86_type == X86_TYPE_P5) {
1096 #if defined(__amd64)
1097 		panic("mach_getcpufreq: no TSC!");
1098 #elif defined(__i386)
1099 		/*
1100 		 * We are a Cyrix based on a 6x86 core or an Intel Pentium
1101 		 * for which freq_notsc() knows how to measure the number of
1102 		 * elapsed clock cycles sampled against the PIT
1103 		 */
1104 		ulong_t flags = clear_int_flag();
1105 		processor_clks = freq_notsc(&pit_counter);
1106 		restore_int_flag(flags);
1107 		return (mach_calchz(pit_counter, &processor_clks));
1108 #endif	/* __i386 */
1109 	}
1110 
1111 	/* We do not know how to calculate cpu frequency for this cpu. */
1112 	return (0);
1113 #endif	/* __xpv */
1114 }
1115 
/*
 * Nominal clock speeds, in MHz, of older and slower processors whose speed
 * is not a multiple of 50 or 66 MHz and which mach_fixcpufreq() could not
 * account for otherwise.
 *
 * If the clock speed of a cpu is found to be reported incorrectly, do not add
 * to this array; instead, improve the accuracy of the algorithm that
 * determines the clock speed of the processor or extend the implementation
 * to support the vendor as appropriate.
 *
 * The entries must stay in ascending order: mach_fixcpufreq() scans the
 * array from the last entry downwards and stops at the first entry that is
 * not greater than the measured speed.
 */
static int x86_cpu_freq[] = { 60, 75, 80, 90, 120, 160, 166, 175, 180, 233 };
1125 
1126 /*
1127  * On fast processors the clock frequency that is measured may be off by
1128  * a few MHz from the value printed on the part. This is a combination of
1129  * the factors that for such fast parts being off by this much is within
1130  * the tolerances for manufacture and because of the difficulties in the
1131  * measurement that can lead to small error. This function uses some
1132  * heuristics in order to tweak the value that was measured to match what
1133  * is most likely printed on the part.
1134  *
1135  * Some examples:
1136  * 	AMD Athlon 1000 mhz measured as 998 mhz
1137  * 	Intel Pentium III Xeon 733 mhz measured as 731 mhz
1138  * 	Intel Pentium IV 1500 mhz measured as 1495mhz
1139  *
1140  * If in the future this function is no longer sufficient to correct
1141  * for the error in the measurement, then the algorithm used to perform
1142  * the measurement will have to be improved in order to increase accuracy
1143  * rather than adding horrible and questionable kludges here.
1144  *
1145  * This is called after the cyclics subsystem because of the potential
1146  * that the heuristics within may give a worse estimate of the clock
1147  * frequency than the value that was measured.
1148  */
1149 static void
1150 mach_fixcpufreq(void)
1151 {
1152 	uint32_t freq, mul, near66, delta66, near50, delta50, fixed, delta, i;
1153 
1154 	freq = (uint32_t)cpu_freq;
1155 
1156 	/*
1157 	 * Find the nearest integer multiple of 200/3 (about 66) MHz to the
1158 	 * measured speed taking into account that the 667 MHz parts were
1159 	 * the first to round-up.
1160 	 */
1161 	mul = (uint32_t)((3 * (uint64_t)freq + 100) / 200);
1162 	near66 = (uint32_t)((200 * (uint64_t)mul + ((mul >= 10) ? 1 : 0)) / 3);
1163 	delta66 = (near66 > freq) ? (near66 - freq) : (freq - near66);
1164 
1165 	/* Find the nearest integer multiple of 50 MHz to the measured speed */
1166 	mul = (freq + 25) / 50;
1167 	near50 = mul * 50;
1168 	delta50 = (near50 > freq) ? (near50 - freq) : (freq - near50);
1169 
1170 	/* Find the closer of the two */
1171 	if (delta66 < delta50) {
1172 		fixed = near66;
1173 		delta = delta66;
1174 	} else {
1175 		fixed = near50;
1176 		delta = delta50;
1177 	}
1178 
1179 	if (fixed > INT_MAX)
1180 		return;
1181 
1182 	/*
1183 	 * Some older parts have a core clock frequency that is not an
1184 	 * integral multiple of 50 or 66 MHz. Check if one of the old
1185 	 * clock frequencies is closer to the measured value than any
1186 	 * of the integral multiples of 50 an 66, and if so set fixed
1187 	 * and delta appropriately to represent the closest value.
1188 	 */
1189 	i = sizeof (x86_cpu_freq) / sizeof (int);
1190 	while (i > 0) {
1191 		i--;
1192 
1193 		if (x86_cpu_freq[i] <= freq) {
1194 			mul = freq - x86_cpu_freq[i];
1195 
1196 			if (mul < delta) {
1197 				fixed = x86_cpu_freq[i];
1198 				delta = mul;
1199 			}
1200 
1201 			break;
1202 		}
1203 
1204 		mul = x86_cpu_freq[i] - freq;
1205 
1206 		if (mul < delta) {
1207 			fixed = x86_cpu_freq[i];
1208 			delta = mul;
1209 		}
1210 	}
1211 
1212 	/*
1213 	 * Set a reasonable maximum for how much to correct the measured
1214 	 * result by. This check is here to prevent the adjustment made
1215 	 * by this function from being more harm than good. It is entirely
1216 	 * possible that in the future parts will be made that are not
1217 	 * integral multiples of 66 or 50 in clock frequency or that
1218 	 * someone may overclock a part to some odd frequency. If the
1219 	 * measured value is farther from the corrected value than
1220 	 * allowed, then assume the corrected value is in error and use
1221 	 * the measured value.
1222 	 */
1223 	if (6 < delta)
1224 		return;
1225 
1226 	cpu_freq = (int)fixed;
1227 }
1228 
1229 
1230 static int
1231 machhztomhz(uint64_t cpu_freq_hz)
1232 {
1233 	uint64_t cpu_mhz;
1234 
1235 	/* Round to nearest MHZ */
1236 	cpu_mhz = (cpu_freq_hz + (MEGA_HZ / 2)) / MEGA_HZ;
1237 
1238 	if (cpu_mhz > INT_MAX)
1239 		return (0);
1240 
1241 	return ((int)cpu_mhz);
1242 
1243 }
1244 
1245 
1246 static int
1247 mach_clkinit(int preferred_mode, int *set_mode)
1248 {
1249 	struct psm_ops  *pops;
1250 	int resolution;
1251 
1252 	pops = mach_set[0];
1253 
1254 	cpu_freq_hz = mach_getcpufreq();
1255 
1256 	cpu_freq = machhztomhz(cpu_freq_hz);
1257 
1258 	if (!(x86_feature & X86_TSC) || (cpu_freq == 0))
1259 		tsc_gethrtime_enable = 0;
1260 
1261 #ifndef __xpv
1262 	if (tsc_gethrtime_enable) {
1263 		tsc_hrtimeinit(cpu_freq_hz);
1264 	} else
1265 #endif
1266 	{
1267 		if (pops->psm_hrtimeinit)
1268 			(*pops->psm_hrtimeinit)();
1269 		gethrtimef = pops->psm_gethrtime;
1270 		gethrtimeunscaledf = gethrtimef;
1271 		/* scalehrtimef will remain dummy */
1272 	}
1273 
1274 	mach_fixcpufreq();
1275 
1276 	if (mach_ver[0] >= PSM_INFO_VER01_3) {
1277 		if (preferred_mode == TIMER_ONESHOT) {
1278 
1279 			resolution = (*pops->psm_clkinit)(0);
1280 			if (resolution != 0)  {
1281 				*set_mode = TIMER_ONESHOT;
1282 				return (resolution);
1283 			}
1284 		}
1285 
1286 		/*
1287 		 * either periodic mode was requested or could not set to
1288 		 * one-shot mode
1289 		 */
1290 		resolution = (*pops->psm_clkinit)(hz);
1291 		/*
1292 		 * psm should be able to do periodic, so we do not check
1293 		 * for return value of psm_clkinit here.
1294 		 */
1295 		*set_mode = TIMER_PERIODIC;
1296 		return (resolution);
1297 	} else {
1298 		/*
1299 		 * PSMI interface prior to PSMI_3 does not define a return
1300 		 * value for psm_clkinit, so the return value is ignored.
1301 		 */
1302 		(void) (*pops->psm_clkinit)(hz);
1303 		*set_mode = TIMER_PERIODIC;
1304 		return (nsec_per_tick);
1305 	}
1306 }
1307 
1308 
1309 /*ARGSUSED*/
1310 static int
1311 mach_softlvl_to_vect(int ipl)
1312 {
1313 	setsoftint = av_set_softint_pending;
1314 	kdisetsoftint = kdi_av_set_softint_pending;
1315 
1316 	return (PSM_SV_SOFTWARE);
1317 }
1318 
#ifdef DEBUG
/*
 * This is here to allow us to simulate cpus that refuse to start.
 * For any cpu id present in this set, mach_cpu_start() and
 * mach_cpuid_start() return 0 without calling the PSM's psm_cpu_start
 * routine.
 */
cpuset_t cpufailset;
#endif
1325 
1326 int
1327 mach_cpu_start(struct cpu *cp, void *ctx)
1328 {
1329 	struct psm_ops *pops = mach_set[0];
1330 	processorid_t id = cp->cpu_id;
1331 
1332 #ifdef DEBUG
1333 	if (CPU_IN_SET(cpufailset, id))
1334 		return (0);
1335 #endif
1336 	return ((*pops->psm_cpu_start)(id, ctx));
1337 }
1338 
1339 int
1340 mach_cpuid_start(processorid_t id, void *ctx)
1341 {
1342 	struct psm_ops *pops = mach_set[0];
1343 
1344 #ifdef DEBUG
1345 	if (CPU_IN_SET(cpufailset, id))
1346 		return (0);
1347 #endif
1348 	return ((*pops->psm_cpu_start)(id, ctx));
1349 }
1350 
/*
 * Default irq translation routine: performs no translation.  The dip
 * argument is ignored and irqno is returned unchanged.
 */
/*ARGSUSED*/
static int
mach_translate_irq(dev_info_t *dip, int irqno)
{
	return (irqno);	/* default to NO translation */
}
1357 
1358 static void
1359 mach_notify_error(int level, char *errmsg)
1360 {
1361 	/*
1362 	 * SL_FATAL is pass in once panicstr is set, deliver it
1363 	 * as CE_PANIC.  Also, translate SL_ codes back to CE_
1364 	 * codes for the psmi handler
1365 	 */
1366 	if (level & SL_FATAL)
1367 		(*notify_error)(CE_PANIC, errmsg);
1368 	else if (level & SL_WARN)
1369 		(*notify_error)(CE_WARN, errmsg);
1370 	else if (level & SL_NOTE)
1371 		(*notify_error)(CE_NOTE, errmsg);
1372 	else if (level & SL_CONSOLE)
1373 		(*notify_error)(CE_CONT, errmsg);
1374 }
1375 
1376 /*
1377  * It provides the default basic intr_ops interface for the new DDI
1378  * interrupt framework if the PSM doesn't have one.
1379  *
1380  * Input:
1381  * dip     - pointer to the dev_info structure of the requested device
1382  * hdlp    - pointer to the internal interrupt handle structure for the
1383  *	     requested interrupt
1384  * intr_op - opcode for this call
1385  * result  - pointer to the integer that will hold the result to be
1386  *	     passed back if return value is PSM_SUCCESS
1387  *
1388  * Output:
1389  * return value is either PSM_SUCCESS or PSM_FAILURE
1390  */
1391 static int
1392 mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
1393     psm_intr_op_t intr_op, int *result)
1394 {
1395 	struct intrspec *ispec;
1396 
1397 	switch (intr_op) {
1398 	case PSM_INTR_OP_CHECK_MSI:
1399 		*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
1400 		    DDI_INTR_TYPE_MSIX);
1401 		break;
1402 	case PSM_INTR_OP_ALLOC_VECTORS:
1403 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1404 			*result = 1;
1405 		else
1406 			*result = 0;
1407 		break;
1408 	case PSM_INTR_OP_FREE_VECTORS:
1409 		break;
1410 	case PSM_INTR_OP_NAVAIL_VECTORS:
1411 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1412 			*result = 1;
1413 		else
1414 			*result = 0;
1415 		break;
1416 	case PSM_INTR_OP_XLATE_VECTOR:
1417 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
1418 		*result = psm_translate_irq(dip, ispec->intrspec_vec);
1419 		break;
1420 	case PSM_INTR_OP_GET_CAP:
1421 		*result = 0;
1422 		break;
1423 	case PSM_INTR_OP_GET_PENDING:
1424 	case PSM_INTR_OP_CLEAR_MASK:
1425 	case PSM_INTR_OP_SET_MASK:
1426 	case PSM_INTR_OP_GET_SHARED:
1427 	case PSM_INTR_OP_SET_PRI:
1428 	case PSM_INTR_OP_SET_CAP:
1429 	case PSM_INTR_OP_SET_CPU:
1430 	case PSM_INTR_OP_GET_INTR:
1431 	default:
1432 		return (PSM_FAILURE);
1433 	}
1434 	return (PSM_SUCCESS);
1435 }
1436 /*
1437  * Return 1 if CMT load balancing policies should be
1438  * implemented across instances of the specified hardware
1439  * sharing relationship.
1440  */
1441 int
1442 pg_cmt_load_bal_hw(pghw_type_t hw)
1443 {
1444 	if (hw == PGHW_IPIPE ||
1445 	    hw == PGHW_FPU ||
1446 	    hw == PGHW_CHIP)
1447 		return (1);
1448 	else
1449 		return (0);
1450 }
1451 /*
1452  * Return 1 if thread affinity polices should be implemented
1453  * for instances of the specifed hardware sharing relationship.
1454  */
1455 int
1456 pg_cmt_affinity_hw(pghw_type_t hw)
1457 {
1458 	if (hw == PGHW_CACHE)
1459 		return (1);
1460 	else
1461 		return (0);
1462 }
1463