xref: /titanic_52/usr/src/uts/i86pc/os/mp_machdep.c (revision 17f1e64a433a4ca00ffed7539e10c297580a7002)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #define	PSMI_1_6
27 #include <sys/smp_impldefs.h>
28 #include <sys/psm.h>
29 #include <sys/psm_modctl.h>
30 #include <sys/pit.h>
31 #include <sys/cmn_err.h>
32 #include <sys/strlog.h>
33 #include <sys/clock.h>
34 #include <sys/debug.h>
35 #include <sys/rtc.h>
36 #include <sys/x86_archext.h>
37 #include <sys/cpupart.h>
38 #include <sys/cpuvar.h>
39 #include <sys/cmt.h>
40 #include <sys/cpu.h>
41 #include <sys/disp.h>
42 #include <sys/archsystm.h>
43 #include <sys/machsystm.h>
44 #include <sys/sysmacros.h>
45 #include <sys/memlist.h>
46 #include <sys/param.h>
47 #include <sys/promif.h>
48 #include <sys/cpu_pm.h>
49 #if defined(__xpv)
50 #include <sys/hypervisor.h>
51 #endif
52 #include <sys/mach_intr.h>
53 #include <vm/hat_i86.h>
54 #include <sys/kdi_machimpl.h>
55 #include <sys/sdt.h>
56 #include <sys/hpet.h>
57 
/*
 * Byte offset of member "m" within aggregate type "s".
 *
 * The whole replacement list is parenthesized so that the macro always
 * behaves as a single operand, whatever operators surround its use.
 */
#define	OFFSETOF(s, m)		((size_t)(&(((s *)0)->m)))
59 
/*
 *	Local function prototypes
 *
 *	Forward declarations for routines defined in this file.  Many of them
 *	are not called directly; they are installed into the PSM hook vectors
 *	defined further down (for example mach_init via psminitf and
 *	mp_disable_intr via psm_disable_intr).
 */
static int mp_disable_intr(processorid_t cpun);
static void mp_enable_intr(processorid_t cpun);
static void mach_init();
static void mach_picinit();
static int machhztomhz(uint64_t cpu_freq_hz);
static uint64_t mach_getcpufreq(void);
static void mach_fixcpufreq(void);
static int mach_clkinit(int, int *);
static void mach_smpinit(void);
static int mach_softlvl_to_vect(int ipl);
static void mach_get_platform(int owner);
static void mach_construct_info();
static int mach_translate_irq(dev_info_t *dip, int irqno);
static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
    psm_intr_op_t, int *);
static void mach_notify_error(int level, char *errmsg);
static hrtime_t dummy_hrtime(void);
static void dummy_scalehrtime(hrtime_t *);
void cpu_idle(void);
static void cpu_wakeup(cpu_t *, int);
/* The monitor/mwait idle and wakeup pair is not built for __xpv. */
#ifndef __xpv
void cpu_idle_mwait(void);
static void cpu_wakeup_mwait(cpu_t *, int);
#endif
/*
 *	External reference functions
 *
 *	return_instr is declared without a parameter list because it is cast
 *	to many different function-pointer types when the hook vectors below
 *	are initialized.
 */
extern void return_instr();
extern uint64_t freq_tsc(uint32_t *);
#if defined(__i386)
extern uint64_t freq_notsc(uint32_t *);
#endif
extern void pc_gethrestime(timestruc_t *);
extern int cpuid_get_coreid(cpu_t *);
extern int cpuid_get_chipid(cpu_t *);
98 
/*
 *	PSM functions initialization
 *
 *	Each hook below starts out pointing at return_instr, at a local
 *	default routine, or at NULL.  mach_init(), mach_smpinit() and
 *	mach_picinit() later overwrite many of them with the entry points
 *	taken from the merged PSM ops vector (mach_set[0]).
 */
void (*psm_shutdownf)(int, int)	= (void (*)(int, int))return_instr;
void (*psm_preshutdownf)(int, int) = (void (*)(int, int))return_instr;
void (*psm_notifyf)(int)	= (void (*)(int))return_instr;
void (*psm_set_idle_cpuf)(int)	= (void (*)(int))return_instr;
void (*psm_unset_idle_cpuf)(int) = (void (*)(int))return_instr;
void (*psminitf)()		= mach_init;
void (*picinitf)() 		= return_instr;
int (*clkinitf)(int, int *) 	= (int (*)(int, int *))return_instr;
int (*ap_mlsetup)() 		= (int (*)(void))return_instr;
void (*send_dirintf)() 		= return_instr;
void (*setspl)(int)		= (void (*)(int))return_instr;
int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
void (*kdisetsoftint)(int, struct av_softinfo *)=
	(void (*)(int, struct av_softinfo *))return_instr;
void (*setsoftint)(int, struct av_softinfo *)=
	(void (*)(int, struct av_softinfo *))return_instr;
int (*slvltovect)(int)		= (int (*)(int))return_instr;
int (*setlvl)(int, int *)	= (int (*)(int, int *))return_instr;
void (*setlvlx)(int, int)	= (void (*)(int, int))return_instr;
/* Interrupt enable/disable default to the local mp_*_intr routines. */
int (*psm_disable_intr)(int)	= mp_disable_intr;
void (*psm_enable_intr)(int)	= mp_enable_intr;
/* hrtime hooks report 0 / do nothing until a real source is installed. */
hrtime_t (*gethrtimef)(void)	= dummy_hrtime;
hrtime_t (*gethrtimeunscaledf)(void)	= dummy_hrtime;
void (*scalehrtimef)(hrtime_t *)	= dummy_scalehrtime;
int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq;
void (*gethrestimef)(timestruc_t *) = pc_gethrestime;
void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL;
int (*psm_get_clockirq)(int) = NULL;
int (*psm_get_ipivect)(int, int) = NULL;

/* Timer-related hooks; they stay NULL unless the PSM supplies them. */
int (*psm_clkinit)(int) = NULL;
void (*psm_timer_reprogram)(hrtime_t) = NULL;
void (*psm_timer_enable)(void) = NULL;
void (*psm_timer_disable)(void) = NULL;
void (*psm_post_cyclic_setup)(void *arg) = NULL;
int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t,
    int *) = mach_intr_ops;
int (*psm_state)(psm_state_request_t *) = (int (*)(psm_state_request_t *))
    return_instr;

/*
 * notify_error receives the PSM's psm_notify_error entry point in
 * mach_init() when the PSM provides one.
 */
void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr;
void (*hrtime_tick)(void)	= return_instr;
145 
/*
 * True if the generic TSC code is our source of hrtime, rather than whatever
 * the PSM can provide.  Defaults to off when built for the hypervisor (__xpv)
 * and on otherwise.
 */
#ifdef __xpv
int tsc_gethrtime_enable = 0;
#else
int tsc_gethrtime_enable = 1;
#endif
/*
 * NOTE(review): presumably set once the TSC-based hrtime has been set up;
 * the code that writes it is not in this part of the file.
 */
int tsc_gethrtime_initted = 0;

/*
 * True if the hrtime implementation is "hires"; namely, better than microdata.
 */
int gethrtime_hires = 0;

/*
 * Local Static Data
 *
 * mach_ops is the merged ops vector that the rest of this file uses through
 * mach_set[0].  mach_set[] and mach_ver[] are indexed by PSM owner class:
 * mach_construct_info() records each identified module's ops pointer and
 * version in the slot given by its p_owner, and mach_get_platform() copies
 * the non-NULL entries of a slot into mach_ops.
 */
static struct psm_ops mach_ops;
static struct psm_ops *mach_set[4] = {&mach_ops, NULL, NULL, NULL};
static ushort_t mach_ver[4] = {0, 0, 0, 0};

/*
 * virtualization support for psm
 */
void *psm_vt_ops = NULL;
/*
 * If non-zero, idle cpus will become "halted" when there's
 * no work to do.  When zero, mach_init() and mach_smpinit() leave the
 * dispatcher's idle and wakeup hooks untouched.
 */
int	idle_cpu_use_hlt = 1;

#ifndef __xpv
/*
 * If non-zero, idle cpus will use mwait if available to halt instead of hlt.
 * mach_init() clears this if the mwait buffer for cpu 0 cannot be allocated.
 */
int	idle_cpu_prefer_mwait = 1;
/*
 * Set to 0 to avoid MONITOR+CLFLUSH assertion.
 */
int	idle_cpu_assert_cflush_monitor = 1;

/*
 * If non-zero, idle cpus will not use power saving Deep C-States idle loop.
 */
int	idle_cpu_no_deep_c = 0;
/*
 * Non-power saving idle loop and wakeup pointers.
 * Allows user to toggle Deep Idle power saving feature on/off.
 *
 * mach_init() points non_deep_idle_cpu at the idle routine chosen for the
 * boot cpu (cpu_idle or cpu_idle_mwait); mach_init() and mach_smpinit()
 * record the matching disp_enq_thread value in
 * non_deep_idle_disp_enq_thread.
 */
void	(*non_deep_idle_cpu)() = cpu_idle;
void	(*non_deep_idle_disp_enq_thread)(cpu_t *, int);

/*
 * Object for the kernel to access the HPET.
 */
hpet_t hpet;

#endif	/* ifndef __xpv */
206 
207 /*ARGSUSED*/
208 int
209 pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
210 {
211 	switch (hw) {
212 	case PGHW_IPIPE:
213 		if (x86_feature & (X86_HTT)) {
214 			/*
215 			 * Hyper-threading is SMT
216 			 */
217 			return (1);
218 		} else {
219 			return (0);
220 		}
221 	case PGHW_CHIP:
222 		if (x86_feature & (X86_CMP|X86_HTT))
223 			return (1);
224 		else
225 			return (0);
226 	case PGHW_CACHE:
227 		if (cpuid_get_ncpu_sharing_last_cache(cp) > 1)
228 			return (1);
229 		else
230 			return (0);
231 	case PGHW_POW_ACTIVE:
232 		if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1)
233 			return (1);
234 		else
235 			return (0);
236 	case PGHW_POW_IDLE:
237 		if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1)
238 			return (1);
239 		else
240 			return (0);
241 	default:
242 		return (0);
243 	}
244 }
245 
246 /*
247  * Compare two CPUs and see if they have a pghw_type_t sharing relationship
248  * If pghw_type_t is an unsupported hardware type, then return -1
249  */
250 int
251 pg_plat_cpus_share(cpu_t *cpu_a, cpu_t *cpu_b, pghw_type_t hw)
252 {
253 	id_t pgp_a, pgp_b;
254 
255 	pgp_a = pg_plat_hw_instance_id(cpu_a, hw);
256 	pgp_b = pg_plat_hw_instance_id(cpu_b, hw);
257 
258 	if (pgp_a == -1 || pgp_b == -1)
259 		return (-1);
260 
261 	return (pgp_a == pgp_b);
262 }
263 
264 /*
265  * Return a physical instance identifier for known hardware sharing
266  * relationships
267  */
268 id_t
269 pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
270 {
271 	switch (hw) {
272 	case PGHW_IPIPE:
273 		return (cpuid_get_coreid(cpu));
274 	case PGHW_CACHE:
275 		return (cpuid_get_last_lvl_cacheid(cpu));
276 	case PGHW_CHIP:
277 		return (cpuid_get_chipid(cpu));
278 	case PGHW_POW_ACTIVE:
279 		return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE));
280 	case PGHW_POW_IDLE:
281 		return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE));
282 	default:
283 		return (-1);
284 	}
285 }
286 
287 /*
288  * Express preference for optimizing for sharing relationship
289  * hw1 vs hw2
290  */
291 pghw_type_t
292 pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
293 {
294 	int i, rank1, rank2;
295 
296 	static pghw_type_t hw_hier[] = {
297 		PGHW_IPIPE,
298 		PGHW_CACHE,
299 		PGHW_CHIP,
300 		PGHW_POW_IDLE,
301 		PGHW_POW_ACTIVE,
302 		PGHW_NUM_COMPONENTS
303 	};
304 
305 	for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
306 		if (hw_hier[i] == hw1)
307 			rank1 = i;
308 		if (hw_hier[i] == hw2)
309 			rank2 = i;
310 	}
311 
312 	if (rank1 > rank2)
313 		return (hw1);
314 	else
315 		return (hw2);
316 }
317 
318 /*
319  * Override the default CMT dispatcher policy for the specified
320  * hardware sharing relationship
321  */
322 pg_cmt_policy_t
323 pg_plat_cmt_policy(pghw_type_t hw)
324 {
325 	/*
326 	 * For shared caches, also load balance across them to
327 	 * maximize aggregate cache capacity
328 	 */
329 	switch (hw) {
330 	case PGHW_CACHE:
331 		return (CMT_BALANCE|CMT_AFFINITY);
332 	default:
333 		return (CMT_NO_POLICY);
334 	}
335 }
336 
337 id_t
338 pg_plat_get_core_id(cpu_t *cpu)
339 {
340 	return ((id_t)cpuid_get_coreid(cpu));
341 }
342 
343 void
344 cmp_set_nosteal_interval(void)
345 {
346 	/* Set the nosteal interval (used by disp_getbest()) to 100us */
347 	nosteal_nsec = 100000UL;
348 }
349 
350 /*
351  * Routine to ensure initial callers to hrtime gets 0 as return
352  */
353 static hrtime_t
354 dummy_hrtime(void)
355 {
356 	return (0);
357 }
358 
359 /* ARGSUSED */
360 static void
361 dummy_scalehrtime(hrtime_t *ticks)
362 {}
363 
364 /*
365  * Supports Deep C-State power saving idle loop.
366  */
367 void
368 cpu_idle_adaptive(void)
369 {
370 	(*CPU->cpu_m.mcpu_idle_cpu)();
371 }
372 
373 void
374 cpu_dtrace_idle_probe(uint_t cstate)
375 {
376 	cpu_t		*cpup = CPU;
377 	struct machcpu	*mcpu = &(cpup->cpu_m);
378 
379 	mcpu->curr_cstate = cstate;
380 	DTRACE_PROBE1(idle__state__transition, uint_t, cstate);
381 }
382 
383 /*
384  * Idle the present CPU until awoken via an interrupt
385  */
386 void
387 cpu_idle(void)
388 {
389 	cpu_t		*cpup = CPU;
390 	processorid_t	cpu_sid = cpup->cpu_seqid;
391 	cpupart_t	*cp = cpup->cpu_part;
392 	int		hset_update = 1;
393 
394 	/*
395 	 * If this CPU is online, and there's multiple CPUs
396 	 * in the system, then we should notate our halting
397 	 * by adding ourselves to the partition's halted CPU
398 	 * bitmap. This allows other CPUs to find/awaken us when
399 	 * work becomes available.
400 	 */
401 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
402 		hset_update = 0;
403 
404 	/*
405 	 * Add ourselves to the partition's halted CPUs bitmap
406 	 * and set our HALTED flag, if necessary.
407 	 *
408 	 * When a thread becomes runnable, it is placed on the queue
409 	 * and then the halted CPU bitmap is checked to determine who
410 	 * (if anyone) should be awoken. We therefore need to first
411 	 * add ourselves to the bitmap, and and then check if there
412 	 * is any work available. The order is important to prevent a race
413 	 * that can lead to work languishing on a run queue somewhere while
414 	 * this CPU remains halted.
415 	 *
416 	 * Either the producing CPU will see we're halted and will awaken us,
417 	 * or this CPU will see the work available in disp_anywork().
418 	 *
419 	 * Note that memory barriers after updating the HALTED flag
420 	 * are not necessary since an atomic operation (updating the bitset)
421 	 * immediately follows. On x86 the atomic operation acts as a
422 	 * memory barrier for the update of cpu_disp_flags.
423 	 */
424 	if (hset_update) {
425 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
426 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
427 	}
428 
429 	/*
430 	 * Check to make sure there's really nothing to do.
431 	 * Work destined for this CPU may become available after
432 	 * this check. We'll be notified through the clearing of our
433 	 * bit in the halted CPU bitmap, and a poke.
434 	 */
435 	if (disp_anywork()) {
436 		if (hset_update) {
437 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
438 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
439 		}
440 		return;
441 	}
442 
443 	/*
444 	 * We're on our way to being halted.
445 	 *
446 	 * Disable interrupts now, so that we'll awaken immediately
447 	 * after halting if someone tries to poke us between now and
448 	 * the time we actually halt.
449 	 *
450 	 * We check for the presence of our bit after disabling interrupts.
451 	 * If it's cleared, we'll return. If the bit is cleared after
452 	 * we check then the poke will pop us out of the halted state.
453 	 *
454 	 * This means that the ordering of the poke and the clearing
455 	 * of the bit by cpu_wakeup is important.
456 	 * cpu_wakeup() must clear, then poke.
457 	 * cpu_idle() must disable interrupts, then check for the bit.
458 	 */
459 	cli();
460 
461 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
462 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
463 		sti();
464 		return;
465 	}
466 
467 	/*
468 	 * The check for anything locally runnable is here for performance
469 	 * and isn't needed for correctness. disp_nrunnable ought to be
470 	 * in our cache still, so it's inexpensive to check, and if there
471 	 * is anything runnable we won't have to wait for the poke.
472 	 */
473 	if (cpup->cpu_disp->disp_nrunnable != 0) {
474 		if (hset_update) {
475 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
476 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
477 		}
478 		sti();
479 		return;
480 	}
481 
482 	cpu_dtrace_idle_probe(IDLE_STATE_C1);
483 
484 	mach_cpu_idle();
485 
486 	cpu_dtrace_idle_probe(IDLE_STATE_C0);
487 
488 	/*
489 	 * We're no longer halted
490 	 */
491 	if (hset_update) {
492 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
493 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
494 	}
495 }
496 
497 
498 /*
499  * If "cpu" is halted, then wake it up clearing its halted bit in advance.
500  * Otherwise, see if other CPUs in the cpu partition are halted and need to
501  * be woken up so that they can steal the thread we placed on this CPU.
502  * This function is only used on MP systems.
503  */
504 static void
505 cpu_wakeup(cpu_t *cpu, int bound)
506 {
507 	uint_t		cpu_found;
508 	processorid_t	cpu_sid;
509 	cpupart_t	*cp;
510 
511 	cp = cpu->cpu_part;
512 	cpu_sid = cpu->cpu_seqid;
513 	if (bitset_in_set(&cp->cp_haltset, cpu_sid)) {
514 		/*
515 		 * Clear the halted bit for that CPU since it will be
516 		 * poked in a moment.
517 		 */
518 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
519 		/*
520 		 * We may find the current CPU present in the halted cpuset
521 		 * if we're in the context of an interrupt that occurred
522 		 * before we had a chance to clear our bit in cpu_idle().
523 		 * Poking ourself is obviously unnecessary, since if
524 		 * we're here, we're not halted.
525 		 */
526 		if (cpu != CPU)
527 			poke_cpu(cpu->cpu_id);
528 		return;
529 	} else {
530 		/*
531 		 * This cpu isn't halted, but it's idle or undergoing a
532 		 * context switch. No need to awaken anyone else.
533 		 */
534 		if (cpu->cpu_thread == cpu->cpu_idle_thread ||
535 		    cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
536 			return;
537 	}
538 
539 	/*
540 	 * No need to wake up other CPUs if this is for a bound thread.
541 	 */
542 	if (bound)
543 		return;
544 
545 	/*
546 	 * The CPU specified for wakeup isn't currently halted, so check
547 	 * to see if there are any other halted CPUs in the partition,
548 	 * and if there are then awaken one.
549 	 */
550 	do {
551 		cpu_found = bitset_find(&cp->cp_haltset);
552 		if (cpu_found == (uint_t)-1)
553 			return;
554 	} while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0);
555 
556 	if (cpu_found != CPU->cpu_seqid) {
557 		poke_cpu(cpu_seq[cpu_found]->cpu_id);
558 	}
559 }
560 
561 #ifndef __xpv
562 /*
563  * Idle the present CPU until awoken via touching its monitored line
564  */
565 void
566 cpu_idle_mwait(void)
567 {
568 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
569 	cpu_t			*cpup = CPU;
570 	processorid_t		cpu_sid = cpup->cpu_seqid;
571 	cpupart_t		*cp = cpup->cpu_part;
572 	int			hset_update = 1;
573 
574 	/*
575 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
576 	 * wake us between now and when we call mwait.  No other cpu will
577 	 * attempt to set our mcpu_mwait until we add ourself to the halted
578 	 * CPU bitmap.
579 	 */
580 	*mcpu_mwait = MWAIT_HALTED;
581 
582 	/*
583 	 * If this CPU is online, and there's multiple CPUs
584 	 * in the system, then we should note our halting
585 	 * by adding ourselves to the partition's halted CPU
586 	 * bitmap. This allows other CPUs to find/awaken us when
587 	 * work becomes available.
588 	 */
589 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
590 		hset_update = 0;
591 
592 	/*
593 	 * Add ourselves to the partition's halted CPUs bitmap
594 	 * and set our HALTED flag, if necessary.
595 	 *
596 	 * When a thread becomes runnable, it is placed on the queue
597 	 * and then the halted CPU bitmap is checked to determine who
598 	 * (if anyone) should be awakened. We therefore need to first
599 	 * add ourselves to the bitmap, and and then check if there
600 	 * is any work available.
601 	 *
602 	 * Note that memory barriers after updating the HALTED flag
603 	 * are not necessary since an atomic operation (updating the bitmap)
604 	 * immediately follows. On x86 the atomic operation acts as a
605 	 * memory barrier for the update of cpu_disp_flags.
606 	 */
607 	if (hset_update) {
608 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
609 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
610 	}
611 
612 	/*
613 	 * Check to make sure there's really nothing to do.
614 	 * Work destined for this CPU may become available after
615 	 * this check. We'll be notified through the clearing of our
616 	 * bit in the halted CPU bitmap, and a write to our mcpu_mwait.
617 	 *
618 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
619 	 */
620 	if (disp_anywork()) {
621 		if (hset_update) {
622 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
623 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
624 		}
625 		return;
626 	}
627 
628 	/*
629 	 * We're on our way to being halted.
630 	 * To avoid a lost wakeup, arm the monitor before checking if another
631 	 * cpu wrote to mcpu_mwait to wake us up.
632 	 */
633 	i86_monitor(mcpu_mwait, 0, 0);
634 	if (*mcpu_mwait == MWAIT_HALTED) {
635 		cpu_dtrace_idle_probe(IDLE_STATE_C1);
636 
637 		tlb_going_idle();
638 		i86_mwait(0, 0);
639 		tlb_service();
640 
641 		cpu_dtrace_idle_probe(IDLE_STATE_C0);
642 	}
643 
644 	/*
645 	 * We're no longer halted
646 	 */
647 	if (hset_update) {
648 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
649 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
650 	}
651 }
652 
653 /*
654  * If "cpu" is halted in mwait, then wake it up clearing its halted bit in
655  * advance.  Otherwise, see if other CPUs in the cpu partition are halted and
656  * need to be woken up so that they can steal the thread we placed on this CPU.
657  * This function is only used on MP systems.
658  */
659 static void
660 cpu_wakeup_mwait(cpu_t *cp, int bound)
661 {
662 	cpupart_t	*cpu_part;
663 	uint_t		cpu_found;
664 	processorid_t	cpu_sid;
665 
666 	cpu_part = cp->cpu_part;
667 	cpu_sid = cp->cpu_seqid;
668 
669 	/*
670 	 * Clear the halted bit for that CPU since it will be woken up
671 	 * in a moment.
672 	 */
673 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
674 		/*
675 		 * Clear the halted bit for that CPU since it will be
676 		 * poked in a moment.
677 		 */
678 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
679 		/*
680 		 * We may find the current CPU present in the halted cpuset
681 		 * if we're in the context of an interrupt that occurred
682 		 * before we had a chance to clear our bit in cpu_idle().
683 		 * Waking ourself is obviously unnecessary, since if
684 		 * we're here, we're not halted.
685 		 *
686 		 * monitor/mwait wakeup via writing to our cache line is
687 		 * harmless and less expensive than always checking if we
688 		 * are waking ourself which is an uncommon case.
689 		 */
690 		MWAIT_WAKEUP(cp);	/* write to monitored line */
691 		return;
692 	} else {
693 		/*
694 		 * This cpu isn't halted, but it's idle or undergoing a
695 		 * context switch. No need to awaken anyone else.
696 		 */
697 		if (cp->cpu_thread == cp->cpu_idle_thread ||
698 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
699 			return;
700 	}
701 
702 	/*
703 	 * No need to wake up other CPUs if the thread we just enqueued
704 	 * is bound.
705 	 */
706 	if (bound || ncpus == 1)
707 		return;
708 
709 	/*
710 	 * See if there's any other halted CPUs. If there are, then
711 	 * select one, and awaken it.
712 	 * It's possible that after we find a CPU, somebody else
713 	 * will awaken it before we get the chance.
714 	 * In that case, look again.
715 	 */
716 	do {
717 		cpu_found = bitset_find(&cpu_part->cp_haltset);
718 		if (cpu_found == (uint_t)-1)
719 			return;
720 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
721 	    cpu_found) < 0);
722 
723 	/*
724 	 * Do not check if cpu_found is ourself as monitor/mwait
725 	 * wakeup is cheap.
726 	 */
727 	MWAIT_WAKEUP(cpu_seq[cpu_found]); /* write to monitored line */
728 }
729 
730 #endif
731 
/*
 * Optional hook taking a volatile char pointer; NULL (not installed) by
 * default.
 * NOTE(review): nothing in this part of the file installs or calls it --
 * presumably it is consulted by the CPU pause code; confirm against its users.
 */
void (*cpu_pause_handler)(volatile char *) = NULL;
733 
734 static int
735 mp_disable_intr(int cpun)
736 {
737 	/*
738 	 * switch to the offline cpu
739 	 */
740 	affinity_set(cpun);
741 	/*
742 	 * raise ipl to just below cross call
743 	 */
744 	splx(XC_SYS_PIL - 1);
745 	/*
746 	 *	set base spl to prevent the next swtch to idle from
747 	 *	lowering back to ipl 0
748 	 */
749 	CPU->cpu_intr_actv |= (1 << (XC_SYS_PIL - 1));
750 	set_base_spl();
751 	affinity_clear();
752 	return (DDI_SUCCESS);
753 }
754 
755 static void
756 mp_enable_intr(int cpun)
757 {
758 	/*
759 	 * switch to the online cpu
760 	 */
761 	affinity_set(cpun);
762 	/*
763 	 * clear the interrupt active mask
764 	 */
765 	CPU->cpu_intr_actv &= ~(1 << (XC_SYS_PIL - 1));
766 	set_base_spl();
767 	(void) spl0();
768 	affinity_clear();
769 }
770 
771 static void
772 mach_get_platform(int owner)
773 {
774 	void		**srv_opsp;
775 	void		**clt_opsp;
776 	int		i;
777 	int		total_ops;
778 
779 	/* fix up psm ops */
780 	srv_opsp = (void **)mach_set[0];
781 	clt_opsp = (void **)mach_set[owner];
782 	if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01)
783 		total_ops = sizeof (struct psm_ops_ver01) /
784 		    sizeof (void (*)(void));
785 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_1)
786 		/* no psm_notify_func */
787 		total_ops = OFFSETOF(struct psm_ops, psm_notify_func) /
788 		    sizeof (void (*)(void));
789 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_2)
790 		/* no psm_timer funcs */
791 		total_ops = OFFSETOF(struct psm_ops, psm_timer_reprogram) /
792 		    sizeof (void (*)(void));
793 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_3)
794 		/* no psm_preshutdown function */
795 		total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) /
796 		    sizeof (void (*)(void));
797 	else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4)
798 		/* no psm_preshutdown function */
799 		total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) /
800 		    sizeof (void (*)(void));
801 	else
802 		total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void));
803 
804 	/*
805 	 * Save the version of the PSM module, in case we need to
806 	 * bahave differently based on version.
807 	 */
808 	mach_ver[0] = mach_ver[owner];
809 
810 	for (i = 0; i < total_ops; i++)
811 		if (clt_opsp[i] != NULL)
812 			srv_opsp[i] = clt_opsp[i];
813 }
814 
815 static void
816 mach_construct_info()
817 {
818 	struct psm_sw *swp;
819 	int	mach_cnt[PSM_OWN_OVERRIDE+1] = {0};
820 	int	conflict_owner = 0;
821 
822 	if (psmsw->psw_forw == psmsw)
823 		panic("No valid PSM modules found");
824 	mutex_enter(&psmsw_lock);
825 	for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
826 		if (!(swp->psw_flag & PSM_MOD_IDENTIFY))
827 			continue;
828 		mach_set[swp->psw_infop->p_owner] = swp->psw_infop->p_ops;
829 		mach_ver[swp->psw_infop->p_owner] = swp->psw_infop->p_version;
830 		mach_cnt[swp->psw_infop->p_owner]++;
831 	}
832 	mutex_exit(&psmsw_lock);
833 
834 	mach_get_platform(PSM_OWN_SYS_DEFAULT);
835 
836 	/* check to see are there any conflicts */
837 	if (mach_cnt[PSM_OWN_EXCLUSIVE] > 1)
838 		conflict_owner = PSM_OWN_EXCLUSIVE;
839 	if (mach_cnt[PSM_OWN_OVERRIDE] > 1)
840 		conflict_owner = PSM_OWN_OVERRIDE;
841 	if (conflict_owner) {
842 		/* remove all psm modules except uppc */
843 		cmn_err(CE_WARN,
844 		    "Conflicts detected on the following PSM modules:");
845 		mutex_enter(&psmsw_lock);
846 		for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
847 			if (swp->psw_infop->p_owner == conflict_owner)
848 				cmn_err(CE_WARN, "%s ",
849 				    swp->psw_infop->p_mach_idstring);
850 		}
851 		mutex_exit(&psmsw_lock);
852 		cmn_err(CE_WARN,
853 		    "Setting the system back to SINGLE processor mode!");
854 		cmn_err(CE_WARN,
855 		    "Please edit /etc/mach to remove the invalid PSM module.");
856 		return;
857 	}
858 
859 	if (mach_set[PSM_OWN_EXCLUSIVE])
860 		mach_get_platform(PSM_OWN_EXCLUSIVE);
861 
862 	if (mach_set[PSM_OWN_OVERRIDE])
863 		mach_get_platform(PSM_OWN_OVERRIDE);
864 }
865 
866 static void
867 mach_init()
868 {
869 	struct psm_ops  *pops;
870 
871 	mach_construct_info();
872 
873 	pops = mach_set[0];
874 
875 	/* register the interrupt and clock initialization rotuines */
876 	picinitf = mach_picinit;
877 	clkinitf = mach_clkinit;
878 	psm_get_clockirq = pops->psm_get_clockirq;
879 
880 	/* register the interrupt setup code */
881 	slvltovect = mach_softlvl_to_vect;
882 	addspl	= pops->psm_addspl;
883 	delspl	= pops->psm_delspl;
884 
885 	if (pops->psm_translate_irq)
886 		psm_translate_irq = pops->psm_translate_irq;
887 	if (pops->psm_intr_ops)
888 		psm_intr_ops = pops->psm_intr_ops;
889 
890 #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4)
891 	/*
892 	 * Time-of-day functionality now handled in TOD modules.
893 	 * (Warn about PSM modules that think that we're going to use
894 	 * their ops vectors.)
895 	 */
896 	if (pops->psm_tod_get)
897 		cmn_err(CE_WARN, "obsolete psm_tod_get op %p",
898 		    (void *)pops->psm_tod_get);
899 
900 	if (pops->psm_tod_set)
901 		cmn_err(CE_WARN, "obsolete psm_tod_set op %p",
902 		    (void *)pops->psm_tod_set);
903 #endif
904 
905 	if (pops->psm_notify_error) {
906 		psm_notify_error = mach_notify_error;
907 		notify_error = pops->psm_notify_error;
908 	}
909 
910 	(*pops->psm_softinit)();
911 
912 	/*
913 	 * Initialize the dispatcher's function hooks to enable CPU halting
914 	 * when idle.  Set both the deep-idle and non-deep-idle hooks.
915 	 *
916 	 * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
917 	 * Platform deep-idle driver will reset our idle loop to
918 	 * non_deep_idle_cpu if power saving deep-idle feature is not available.
919 	 *
920 	 * Do not use monitor/mwait if idle_cpu_use_hlt is not set(spin idle)
921 	 * or idle_cpu_prefer_mwait is not set.
922 	 * Allocate monitor/mwait buffer for cpu0.
923 	 */
924 #ifndef __xpv
925 	non_deep_idle_disp_enq_thread = disp_enq_thread;
926 #endif
927 	if (idle_cpu_use_hlt) {
928 		idle_cpu = cpu_idle_adaptive;
929 		CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
930 #ifndef __xpv
931 		if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) {
932 			CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
933 			/*
934 			 * Protect ourself from insane mwait size.
935 			 */
936 			if (CPU->cpu_m.mcpu_mwait == NULL) {
937 #ifdef DEBUG
938 				cmn_err(CE_NOTE, "Using hlt idle.  Cannot "
939 				    "handle cpu 0 mwait size.");
940 #endif
941 				idle_cpu_prefer_mwait = 0;
942 				CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
943 			} else {
944 				CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
945 			}
946 		} else {
947 			CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
948 		}
949 		non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu;
950 
951 		/*
952 		 * Disable power saving deep idle loop?
953 		 */
954 		if (idle_cpu_no_deep_c) {
955 			idle_cpu = non_deep_idle_cpu;
956 		}
957 #endif
958 	}
959 
960 	mach_smpinit();
961 }
962 
963 static void
964 mach_smpinit(void)
965 {
966 	struct psm_ops  *pops;
967 	processorid_t cpu_id;
968 	int cnt;
969 	cpuset_t cpumask;
970 
971 	pops = mach_set[0];
972 	CPUSET_ZERO(cpumask);
973 
974 	cpu_id = -1;
975 	cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
976 	for (cnt = 0; cpu_id != -1; cnt++) {
977 		CPUSET_ADD(cpumask, cpu_id);
978 		cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
979 	}
980 
981 	mp_cpus = cpumask;
982 
983 	/* MP related routines */
984 	ap_mlsetup = pops->psm_post_cpu_start;
985 	send_dirintf = pops->psm_send_ipi;
986 
987 	/* optional MP related routines */
988 	if (pops->psm_shutdown)
989 		psm_shutdownf = pops->psm_shutdown;
990 	if (pops->psm_preshutdown)
991 		psm_preshutdownf = pops->psm_preshutdown;
992 	if (pops->psm_notify_func)
993 		psm_notifyf = pops->psm_notify_func;
994 	if (pops->psm_set_idlecpu)
995 		psm_set_idle_cpuf = pops->psm_set_idlecpu;
996 	if (pops->psm_unset_idlecpu)
997 		psm_unset_idle_cpuf = pops->psm_unset_idlecpu;
998 
999 	psm_clkinit = pops->psm_clkinit;
1000 
1001 	if (pops->psm_timer_reprogram)
1002 		psm_timer_reprogram = pops->psm_timer_reprogram;
1003 
1004 	if (pops->psm_timer_enable)
1005 		psm_timer_enable = pops->psm_timer_enable;
1006 
1007 	if (pops->psm_timer_disable)
1008 		psm_timer_disable = pops->psm_timer_disable;
1009 
1010 	if (pops->psm_post_cyclic_setup)
1011 		psm_post_cyclic_setup = pops->psm_post_cyclic_setup;
1012 
1013 	if (pops->psm_state)
1014 		psm_state = pops->psm_state;
1015 
1016 	/*
1017 	 * Set these vectors here so they can be used by Suspend/Resume
1018 	 * on UP machines.
1019 	 */
1020 	if (pops->psm_disable_intr)
1021 		psm_disable_intr = pops->psm_disable_intr;
1022 	if (pops->psm_enable_intr)
1023 		psm_enable_intr  = pops->psm_enable_intr;
1024 
1025 	/* check for multiple CPUs */
1026 	if (cnt < 2)
1027 		return;
1028 
1029 	/* check for MP platforms */
1030 	if (pops->psm_cpu_start == NULL)
1031 		return;
1032 
1033 	/*
1034 	 * Set the dispatcher hook to enable cpu "wake up"
1035 	 * when a thread becomes runnable.
1036 	 */
1037 	if (idle_cpu_use_hlt) {
1038 		disp_enq_thread = cpu_wakeup;
1039 #ifndef __xpv
1040 		if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait)
1041 			disp_enq_thread = cpu_wakeup_mwait;
1042 		non_deep_idle_disp_enq_thread = disp_enq_thread;
1043 #endif
1044 	}
1045 
1046 	psm_get_ipivect = pops->psm_get_ipivect;
1047 
1048 	(void) add_avintr((void *)NULL, XC_HI_PIL, xc_serv, "xc_intr",
1049 	    (*pops->psm_get_ipivect)(XC_HI_PIL, PSM_INTR_IPI_HI),
1050 	    NULL, NULL, NULL, NULL);
1051 
1052 	(void) (*pops->psm_get_ipivect)(XC_CPUPOKE_PIL, PSM_INTR_POKE);
1053 }
1054 
1055 static void
1056 mach_picinit()
1057 {
1058 	struct psm_ops  *pops;
1059 
1060 	pops = mach_set[0];
1061 
1062 	/* register the interrupt handlers */
1063 	setlvl = pops->psm_intr_enter;
1064 	setlvlx = pops->psm_intr_exit;
1065 
1066 	/* initialize the interrupt hardware */
1067 	(*pops->psm_picinit)();
1068 
1069 	/* set interrupt mask for current ipl */
1070 	setspl = pops->psm_setspl;
1071 	cli();
1072 	setspl(CPU->cpu_pri);
1073 }
1074 
/*
 * Processor clock speed.  Both values are set by mach_clkinit(): cpu_freq_hz
 * from mach_getcpufreq() and cpu_freq from machhztomhz(cpu_freq_hz).
 * cpu_freq may afterwards be adjusted by mach_fixcpufreq().
 */
uint_t	cpu_freq;	/* MHz */
uint64_t cpu_freq_hz;	/* measured (in hertz) */

/* Number of hertz in one megahertz; used when rounding hertz to MHz. */
#define	MEGA_HZ		1000000
1079 
1080 #ifdef __xpv
1081 
/*
 * Settings consulted by mach_getcpufreq() in the __xpv build:
 *
 * xpv_cpufreq_workaround - when nonzero, the initial domain obtains the
 *	clock rate with the XEN_SYSCTL_physinfo sysctl instead of deriving
 *	it from the vcpu time info.
 * xpv_cpufreq_verbose - when nonzero, mach_getcpufreq() prints the
 *	tsc_to_system_mul and tsc_shift values and the resulting rate.
 */
int xpv_cpufreq_workaround = 1;
int xpv_cpufreq_verbose = 0;
1084 
1085 #else	/* __xpv */
1086 
1087 static uint64_t
1088 mach_calchz(uint32_t pit_counter, uint64_t *processor_clks)
1089 {
1090 	uint64_t cpu_hz;
1091 
1092 	if ((pit_counter == 0) || (*processor_clks == 0) ||
1093 	    (*processor_clks > (((uint64_t)-1) / PIT_HZ)))
1094 		return (0);
1095 
1096 	cpu_hz = ((uint64_t)PIT_HZ * *processor_clks) / pit_counter;
1097 
1098 	return (cpu_hz);
1099 }
1100 
1101 #endif	/* __xpv */
1102 
1103 static uint64_t
1104 mach_getcpufreq(void)
1105 {
1106 #if defined(__xpv)
1107 	vcpu_time_info_t *vti = &CPU->cpu_m.mcpu_vcpu_info->time;
1108 	uint64_t cpu_hz;
1109 
1110 	/*
1111 	 * During dom0 bringup, it was noted that on at least one older
1112 	 * Intel HT machine, the hypervisor initially gives a tsc_to_system_mul
1113 	 * value that is quite wrong (the 3.06GHz clock was reported
1114 	 * as 4.77GHz)
1115 	 *
1116 	 * The curious thing is, that if you stop the kernel at entry,
1117 	 * breakpoint here and inspect the value with kmdb, the value
1118 	 * is correct - but if you don't stop and simply enable the
1119 	 * printf statement (below), you can see the bad value printed
1120 	 * here.  Almost as if something kmdb did caused the hypervisor to
1121 	 * figure it out correctly.  And, note that the hypervisor
1122 	 * eventually -does- figure it out correctly ... if you look at
1123 	 * the field later in the life of dom0, it is correct.
1124 	 *
1125 	 * For now, on dom0, we employ a slightly cheesy workaround of
1126 	 * using the DOM0_PHYSINFO hypercall.
1127 	 */
1128 	if (DOMAIN_IS_INITDOMAIN(xen_info) && xpv_cpufreq_workaround) {
1129 		xen_sysctl_t op0, *op = &op0;
1130 
1131 		op->cmd = XEN_SYSCTL_physinfo;
1132 		op->interface_version = XEN_SYSCTL_INTERFACE_VERSION;
1133 		if (HYPERVISOR_sysctl(op) != 0)
1134 			panic("physinfo op refused");
1135 
1136 		cpu_hz = 1000 * (uint64_t)op->u.physinfo.cpu_khz;
1137 	} else {
1138 		cpu_hz = (UINT64_C(1000000000) << 32) / vti->tsc_to_system_mul;
1139 
1140 		if (vti->tsc_shift < 0)
1141 			cpu_hz <<= -vti->tsc_shift;
1142 		else
1143 			cpu_hz >>= vti->tsc_shift;
1144 	}
1145 
1146 	if (xpv_cpufreq_verbose)
1147 		printf("mach_getcpufreq: system_mul 0x%x, shift %d, "
1148 		    "cpu_hz %" PRId64 "Hz\n",
1149 		    vti->tsc_to_system_mul, vti->tsc_shift, cpu_hz);
1150 
1151 	return (cpu_hz);
1152 #else	/* __xpv */
1153 	uint32_t pit_counter;
1154 	uint64_t processor_clks;
1155 
1156 	if (x86_feature & X86_TSC) {
1157 		/*
1158 		 * We have a TSC. freq_tsc() knows how to measure the number
1159 		 * of clock cycles sampled against the PIT.
1160 		 */
1161 		ulong_t flags = clear_int_flag();
1162 		processor_clks = freq_tsc(&pit_counter);
1163 		restore_int_flag(flags);
1164 		return (mach_calchz(pit_counter, &processor_clks));
1165 	} else if (x86_vendor == X86_VENDOR_Cyrix || x86_type == X86_TYPE_P5) {
1166 #if defined(__amd64)
1167 		panic("mach_getcpufreq: no TSC!");
1168 #elif defined(__i386)
1169 		/*
1170 		 * We are a Cyrix based on a 6x86 core or an Intel Pentium
1171 		 * for which freq_notsc() knows how to measure the number of
1172 		 * elapsed clock cycles sampled against the PIT
1173 		 */
1174 		ulong_t flags = clear_int_flag();
1175 		processor_clks = freq_notsc(&pit_counter);
1176 		restore_int_flag(flags);
1177 		return (mach_calchz(pit_counter, &processor_clks));
1178 #endif	/* __i386 */
1179 	}
1180 
1181 	/* We do not know how to calculate cpu frequency for this cpu. */
1182 	return (0);
1183 #endif	/* __xpv */
1184 }
1185 
/*
 * Clock speeds, in MHz, of older slower processors that mach_fixcpufreq()
 * would not be able to account for otherwise.
 *
 * If the clock speed of a cpu is found to be reported incorrectly, do not add
 * to this array; instead improve the accuracy of the algorithm that determines
 * the clock speed of the processor or extend the implementation to support the
 * vendor as appropriate.
 *
 * Keep the entries in ascending order: mach_fixcpufreq() walks the array from
 * the last element downwards and stops at the first entry that does not
 * exceed the measured frequency.
 */
static int x86_cpu_freq[] = { 60, 75, 80, 90, 120, 160, 166, 175, 180, 233 };
1195 
1196 /*
1197  * On fast processors the clock frequency that is measured may be off by
1198  * a few MHz from the value printed on the part. This is a combination of
1199  * the factors that for such fast parts being off by this much is within
1200  * the tolerances for manufacture and because of the difficulties in the
1201  * measurement that can lead to small error. This function uses some
1202  * heuristics in order to tweak the value that was measured to match what
1203  * is most likely printed on the part.
1204  *
1205  * Some examples:
1206  * 	AMD Athlon 1000 mhz measured as 998 mhz
1207  * 	Intel Pentium III Xeon 733 mhz measured as 731 mhz
1208  * 	Intel Pentium IV 1500 mhz measured as 1495mhz
1209  *
1210  * If in the future this function is no longer sufficient to correct
1211  * for the error in the measurement, then the algorithm used to perform
1212  * the measurement will have to be improved in order to increase accuracy
1213  * rather than adding horrible and questionable kludges here.
1214  *
1215  * This is called after the cyclics subsystem because of the potential
1216  * that the heuristics within may give a worse estimate of the clock
1217  * frequency than the value that was measured.
1218  */
1219 static void
1220 mach_fixcpufreq(void)
1221 {
1222 	uint32_t freq, mul, near66, delta66, near50, delta50, fixed, delta, i;
1223 
1224 	freq = (uint32_t)cpu_freq;
1225 
1226 	/*
1227 	 * Find the nearest integer multiple of 200/3 (about 66) MHz to the
1228 	 * measured speed taking into account that the 667 MHz parts were
1229 	 * the first to round-up.
1230 	 */
1231 	mul = (uint32_t)((3 * (uint64_t)freq + 100) / 200);
1232 	near66 = (uint32_t)((200 * (uint64_t)mul + ((mul >= 10) ? 1 : 0)) / 3);
1233 	delta66 = (near66 > freq) ? (near66 - freq) : (freq - near66);
1234 
1235 	/* Find the nearest integer multiple of 50 MHz to the measured speed */
1236 	mul = (freq + 25) / 50;
1237 	near50 = mul * 50;
1238 	delta50 = (near50 > freq) ? (near50 - freq) : (freq - near50);
1239 
1240 	/* Find the closer of the two */
1241 	if (delta66 < delta50) {
1242 		fixed = near66;
1243 		delta = delta66;
1244 	} else {
1245 		fixed = near50;
1246 		delta = delta50;
1247 	}
1248 
1249 	if (fixed > INT_MAX)
1250 		return;
1251 
1252 	/*
1253 	 * Some older parts have a core clock frequency that is not an
1254 	 * integral multiple of 50 or 66 MHz. Check if one of the old
1255 	 * clock frequencies is closer to the measured value than any
1256 	 * of the integral multiples of 50 an 66, and if so set fixed
1257 	 * and delta appropriately to represent the closest value.
1258 	 */
1259 	i = sizeof (x86_cpu_freq) / sizeof (int);
1260 	while (i > 0) {
1261 		i--;
1262 
1263 		if (x86_cpu_freq[i] <= freq) {
1264 			mul = freq - x86_cpu_freq[i];
1265 
1266 			if (mul < delta) {
1267 				fixed = x86_cpu_freq[i];
1268 				delta = mul;
1269 			}
1270 
1271 			break;
1272 		}
1273 
1274 		mul = x86_cpu_freq[i] - freq;
1275 
1276 		if (mul < delta) {
1277 			fixed = x86_cpu_freq[i];
1278 			delta = mul;
1279 		}
1280 	}
1281 
1282 	/*
1283 	 * Set a reasonable maximum for how much to correct the measured
1284 	 * result by. This check is here to prevent the adjustment made
1285 	 * by this function from being more harm than good. It is entirely
1286 	 * possible that in the future parts will be made that are not
1287 	 * integral multiples of 66 or 50 in clock frequency or that
1288 	 * someone may overclock a part to some odd frequency. If the
1289 	 * measured value is farther from the corrected value than
1290 	 * allowed, then assume the corrected value is in error and use
1291 	 * the measured value.
1292 	 */
1293 	if (6 < delta)
1294 		return;
1295 
1296 	cpu_freq = (int)fixed;
1297 }
1298 
1299 
1300 static int
1301 machhztomhz(uint64_t cpu_freq_hz)
1302 {
1303 	uint64_t cpu_mhz;
1304 
1305 	/* Round to nearest MHZ */
1306 	cpu_mhz = (cpu_freq_hz + (MEGA_HZ / 2)) / MEGA_HZ;
1307 
1308 	if (cpu_mhz > INT_MAX)
1309 		return (0);
1310 
1311 	return ((int)cpu_mhz);
1312 
1313 }
1314 
1315 
1316 static int
1317 mach_clkinit(int preferred_mode, int *set_mode)
1318 {
1319 	struct psm_ops  *pops;
1320 	int resolution;
1321 
1322 	pops = mach_set[0];
1323 
1324 	cpu_freq_hz = mach_getcpufreq();
1325 
1326 	cpu_freq = machhztomhz(cpu_freq_hz);
1327 
1328 	if (!(x86_feature & X86_TSC) || (cpu_freq == 0))
1329 		tsc_gethrtime_enable = 0;
1330 
1331 #ifndef __xpv
1332 	if (tsc_gethrtime_enable) {
1333 		tsc_hrtimeinit(cpu_freq_hz);
1334 	} else
1335 #endif
1336 	{
1337 		if (pops->psm_hrtimeinit)
1338 			(*pops->psm_hrtimeinit)();
1339 		gethrtimef = pops->psm_gethrtime;
1340 		gethrtimeunscaledf = gethrtimef;
1341 		/* scalehrtimef will remain dummy */
1342 	}
1343 
1344 	mach_fixcpufreq();
1345 
1346 	if (mach_ver[0] >= PSM_INFO_VER01_3) {
1347 		if (preferred_mode == TIMER_ONESHOT) {
1348 
1349 			resolution = (*pops->psm_clkinit)(0);
1350 			if (resolution != 0)  {
1351 				*set_mode = TIMER_ONESHOT;
1352 				return (resolution);
1353 			}
1354 		}
1355 
1356 		/*
1357 		 * either periodic mode was requested or could not set to
1358 		 * one-shot mode
1359 		 */
1360 		resolution = (*pops->psm_clkinit)(hz);
1361 		/*
1362 		 * psm should be able to do periodic, so we do not check
1363 		 * for return value of psm_clkinit here.
1364 		 */
1365 		*set_mode = TIMER_PERIODIC;
1366 		return (resolution);
1367 	} else {
1368 		/*
1369 		 * PSMI interface prior to PSMI_3 does not define a return
1370 		 * value for psm_clkinit, so the return value is ignored.
1371 		 */
1372 		(void) (*pops->psm_clkinit)(hz);
1373 		*set_mode = TIMER_PERIODIC;
1374 		return (nsec_per_tick);
1375 	}
1376 }
1377 
1378 
1379 /*ARGSUSED*/
1380 static int
1381 mach_softlvl_to_vect(int ipl)
1382 {
1383 	setsoftint = av_set_softint_pending;
1384 	kdisetsoftint = kdi_av_set_softint_pending;
1385 
1386 	return (PSM_SV_SOFTWARE);
1387 }
1388 
#ifdef DEBUG
/*
 * This is here to allow us to simulate cpus that refuse to start:
 * for any cpu id present in this set, mach_cpu_start() and
 * mach_cpuid_start() return 0 without calling the PSM's psm_cpu_start
 * routine.
 */
cpuset_t cpufailset;
#endif
1395 
1396 int
1397 mach_cpu_start(struct cpu *cp, void *ctx)
1398 {
1399 	struct psm_ops *pops = mach_set[0];
1400 	processorid_t id = cp->cpu_id;
1401 
1402 #ifdef DEBUG
1403 	if (CPU_IN_SET(cpufailset, id))
1404 		return (0);
1405 #endif
1406 	return ((*pops->psm_cpu_start)(id, ctx));
1407 }
1408 
1409 int
1410 mach_cpuid_start(processorid_t id, void *ctx)
1411 {
1412 	struct psm_ops *pops = mach_set[0];
1413 
1414 #ifdef DEBUG
1415 	if (CPU_IN_SET(cpufailset, id))
1416 		return (0);
1417 #endif
1418 	return ((*pops->psm_cpu_start)(id, ctx));
1419 }
1420 
1421 /*ARGSUSED*/
1422 static int
1423 mach_translate_irq(dev_info_t *dip, int irqno)
1424 {
1425 	return (irqno);	/* default to NO translation */
1426 }
1427 
1428 static void
1429 mach_notify_error(int level, char *errmsg)
1430 {
1431 	/*
1432 	 * SL_FATAL is pass in once panicstr is set, deliver it
1433 	 * as CE_PANIC.  Also, translate SL_ codes back to CE_
1434 	 * codes for the psmi handler
1435 	 */
1436 	if (level & SL_FATAL)
1437 		(*notify_error)(CE_PANIC, errmsg);
1438 	else if (level & SL_WARN)
1439 		(*notify_error)(CE_WARN, errmsg);
1440 	else if (level & SL_NOTE)
1441 		(*notify_error)(CE_NOTE, errmsg);
1442 	else if (level & SL_CONSOLE)
1443 		(*notify_error)(CE_CONT, errmsg);
1444 }
1445 
1446 /*
1447  * It provides the default basic intr_ops interface for the new DDI
1448  * interrupt framework if the PSM doesn't have one.
1449  *
1450  * Input:
1451  * dip     - pointer to the dev_info structure of the requested device
1452  * hdlp    - pointer to the internal interrupt handle structure for the
1453  *	     requested interrupt
1454  * intr_op - opcode for this call
1455  * result  - pointer to the integer that will hold the result to be
1456  *	     passed back if return value is PSM_SUCCESS
1457  *
1458  * Output:
1459  * return value is either PSM_SUCCESS or PSM_FAILURE
1460  */
1461 static int
1462 mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
1463     psm_intr_op_t intr_op, int *result)
1464 {
1465 	struct intrspec *ispec;
1466 
1467 	switch (intr_op) {
1468 	case PSM_INTR_OP_CHECK_MSI:
1469 		*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
1470 		    DDI_INTR_TYPE_MSIX);
1471 		break;
1472 	case PSM_INTR_OP_ALLOC_VECTORS:
1473 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1474 			*result = 1;
1475 		else
1476 			*result = 0;
1477 		break;
1478 	case PSM_INTR_OP_FREE_VECTORS:
1479 		break;
1480 	case PSM_INTR_OP_NAVAIL_VECTORS:
1481 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1482 			*result = 1;
1483 		else
1484 			*result = 0;
1485 		break;
1486 	case PSM_INTR_OP_XLATE_VECTOR:
1487 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
1488 		*result = psm_translate_irq(dip, ispec->intrspec_vec);
1489 		break;
1490 	case PSM_INTR_OP_GET_CAP:
1491 		*result = 0;
1492 		break;
1493 	case PSM_INTR_OP_GET_PENDING:
1494 	case PSM_INTR_OP_CLEAR_MASK:
1495 	case PSM_INTR_OP_SET_MASK:
1496 	case PSM_INTR_OP_GET_SHARED:
1497 	case PSM_INTR_OP_SET_PRI:
1498 	case PSM_INTR_OP_SET_CAP:
1499 	case PSM_INTR_OP_SET_CPU:
1500 	case PSM_INTR_OP_GET_INTR:
1501 	default:
1502 		return (PSM_FAILURE);
1503 	}
1504 	return (PSM_SUCCESS);
1505 }
1506 /*
1507  * Return 1 if CMT load balancing policies should be
1508  * implemented across instances of the specified hardware
1509  * sharing relationship.
1510  */
1511 int
1512 pg_cmt_load_bal_hw(pghw_type_t hw)
1513 {
1514 	if (hw == PGHW_IPIPE ||
1515 	    hw == PGHW_FPU ||
1516 	    hw == PGHW_CHIP)
1517 		return (1);
1518 	else
1519 		return (0);
1520 }
1521 /*
1522  * Return 1 if thread affinity polices should be implemented
1523  * for instances of the specifed hardware sharing relationship.
1524  */
1525 int
1526 pg_cmt_affinity_hw(pghw_type_t hw)
1527 {
1528 	if (hw == PGHW_CACHE)
1529 		return (1);
1530 	else
1531 		return (0);
1532 }
1533