xref: /titanic_51/usr/src/uts/i86pc/os/mp_pc.c (revision 0ed5c46e82c989cfa9726d9dae452e3d24ef83be)
1ae115bc7Smrj /*
2ae115bc7Smrj  * CDDL HEADER START
3ae115bc7Smrj  *
4ae115bc7Smrj  * The contents of this file are subject to the terms of the
5ae115bc7Smrj  * Common Development and Distribution License (the "License").
6ae115bc7Smrj  * You may not use this file except in compliance with the License.
7ae115bc7Smrj  *
8ae115bc7Smrj  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9ae115bc7Smrj  * or http://www.opensolaris.org/os/licensing.
10ae115bc7Smrj  * See the License for the specific language governing permissions
11ae115bc7Smrj  * and limitations under the License.
12ae115bc7Smrj  *
13ae115bc7Smrj  * When distributing Covered Code, include this CDDL HEADER in each
14ae115bc7Smrj  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15ae115bc7Smrj  * If applicable, add the following below this CDDL HEADER, with the
16ae115bc7Smrj  * fields enclosed by brackets "[]" replaced with your own identifying
17ae115bc7Smrj  * information: Portions Copyright [yyyy] [name of copyright owner]
18ae115bc7Smrj  *
19ae115bc7Smrj  * CDDL HEADER END
20ae115bc7Smrj  */
21ae115bc7Smrj /*
227417cfdeSKuriakose Kuruvilla  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23ae115bc7Smrj  */
24a3114836SGerry Liu /*
25a3114836SGerry Liu  * Copyright (c) 2010, Intel Corporation.
26a3114836SGerry Liu  * All rights reserved.
27a3114836SGerry Liu  */
28f16a0f4cSRobert Mustacchi /*
29f16a0f4cSRobert Mustacchi  * Copyright 2011 Joyent, Inc. All rights reserved.
30f16a0f4cSRobert Mustacchi  */
31ae115bc7Smrj 
32ae115bc7Smrj /*
33ae115bc7Smrj  * Welcome to the world of the "real mode platter".
34ae115bc7Smrj  * See also startup.c, mpcore.s and apic.c for related routines.
35ae115bc7Smrj  */
36ae115bc7Smrj 
37ae115bc7Smrj #include <sys/types.h>
38ae115bc7Smrj #include <sys/systm.h>
39ae115bc7Smrj #include <sys/cpuvar.h>
40a3114836SGerry Liu #include <sys/cpu_module.h>
41ae115bc7Smrj #include <sys/kmem.h>
42ae115bc7Smrj #include <sys/archsystm.h>
43ae115bc7Smrj #include <sys/machsystm.h>
44ae115bc7Smrj #include <sys/controlregs.h>
45ae115bc7Smrj #include <sys/x86_archext.h>
46ae115bc7Smrj #include <sys/smp_impldefs.h>
47ae115bc7Smrj #include <sys/sysmacros.h>
48ae115bc7Smrj #include <sys/mach_mmu.h>
49ae115bc7Smrj #include <sys/promif.h>
50ae115bc7Smrj #include <sys/cpu.h>
51a3114836SGerry Liu #include <sys/cpu_event.h>
52a3114836SGerry Liu #include <sys/sunndi.h>
53a3114836SGerry Liu #include <sys/fs/dv_node.h>
5495c0a3c8Sjosephb #include <vm/hat_i86.h>
55a3114836SGerry Liu #include <vm/as.h>
56ae115bc7Smrj 
57a3114836SGerry Liu extern cpuset_t cpu_ready_set;
58a3114836SGerry Liu 
59a3114836SGerry Liu extern int  mp_start_cpu_common(cpu_t *cp, boolean_t boot);
60a3114836SGerry Liu extern void real_mode_start_cpu(void);
61a3114836SGerry Liu extern void real_mode_start_cpu_end(void);
62a3114836SGerry Liu extern void real_mode_stop_cpu_stage1(void);
63a3114836SGerry Liu extern void real_mode_stop_cpu_stage1_end(void);
64a3114836SGerry Liu extern void real_mode_stop_cpu_stage2(void);
65a3114836SGerry Liu extern void real_mode_stop_cpu_stage2_end(void);
662df1fe9cSrandyf 
672df1fe9cSrandyf void rmp_gdt_init(rm_platter_t *);
68ae115bc7Smrj 
69ae115bc7Smrj /*
70ae115bc7Smrj  * Fill up the real mode platter to make it easy for real mode code to
71ae115bc7Smrj  * kick it off. This area should really be one passed by boot to kernel
72ae115bc7Smrj  * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
73ae115bc7Smrj  * have identical physical and virtual address in paged mode.
74ae115bc7Smrj  */
75ae115bc7Smrj static ushort_t *warm_reset_vector = NULL;
76ae115bc7Smrj 
77ae115bc7Smrj int
78ae115bc7Smrj mach_cpucontext_init(void)
79ae115bc7Smrj {
80ae115bc7Smrj 	ushort_t *vec;
81a3114836SGerry Liu 	ulong_t addr;
82a3114836SGerry Liu 	struct rm_platter *rm = (struct rm_platter *)rm_platter_va;
83ae115bc7Smrj 
84ae115bc7Smrj 	if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
85ae115bc7Smrj 	    sizeof (vec), PROT_READ | PROT_WRITE)))
86ae115bc7Smrj 		return (-1);
87a3114836SGerry Liu 
88ae115bc7Smrj 	/*
89ae115bc7Smrj 	 * setup secondary cpu bios boot up vector
90a3114836SGerry Liu 	 * Write page offset to 0x467 and page frame number to 0x469.
91ae115bc7Smrj 	 */
92a3114836SGerry Liu 	addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
93a3114836SGerry Liu 	vec[0] = (ushort_t)(addr & PAGEOFFSET);
94a3114836SGerry Liu 	vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
95ae115bc7Smrj 	warm_reset_vector = vec;
96ae115bc7Smrj 
97a3114836SGerry Liu 	/* Map real mode platter into kas so kernel can access it. */
98a3114836SGerry Liu 	hat_devload(kas.a_hat,
99a3114836SGerry Liu 	    (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
100a3114836SGerry Liu 	    btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
101a3114836SGerry Liu 	    HAT_LOAD_NOCONSIST);
102a3114836SGerry Liu 
103a3114836SGerry Liu 	/* Copy CPU startup code to rm_platter if it's still during boot. */
104a3114836SGerry Liu 	if (!plat_dr_enabled()) {
105a3114836SGerry Liu 		ASSERT((size_t)real_mode_start_cpu_end -
106a3114836SGerry Liu 		    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
107a3114836SGerry Liu 		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
108a3114836SGerry Liu 		    (size_t)real_mode_start_cpu_end -
109a3114836SGerry Liu 		    (size_t)real_mode_start_cpu);
110a3114836SGerry Liu 	}
111ae115bc7Smrj 
112ae115bc7Smrj 	return (0);
113ae115bc7Smrj }
114ae115bc7Smrj 
115ae115bc7Smrj void
116ae115bc7Smrj mach_cpucontext_fini(void)
117ae115bc7Smrj {
118ae115bc7Smrj 	if (warm_reset_vector)
119ae115bc7Smrj 		psm_unmap_phys((caddr_t)warm_reset_vector,
120ae115bc7Smrj 		    sizeof (warm_reset_vector));
121ae115bc7Smrj 	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
122ae115bc7Smrj 	    HAT_UNLOAD);
123ae115bc7Smrj }
124ae115bc7Smrj 
125ae115bc7Smrj #if defined(__amd64)
126ae115bc7Smrj extern void *long_mode_64(void);
127ae115bc7Smrj #endif	/* __amd64 */
128ae115bc7Smrj 
129a3114836SGerry Liu /*ARGSUSED*/
130a3114836SGerry Liu void
131a3114836SGerry Liu rmp_gdt_init(rm_platter_t *rm)
132ae115bc7Smrj {
133a3114836SGerry Liu 
134a3114836SGerry Liu #if defined(__amd64)
135a3114836SGerry Liu 	/* Use the kas address space for the CPU startup thread. */
136a3114836SGerry Liu 	if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
137a3114836SGerry Liu 		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
138a3114836SGerry Liu 		    "located above 4G in physical memory (@ 0x%lx)",
139a3114836SGerry Liu 		    MAKECR3(kas.a_hat->hat_htable->ht_pfn));
140a3114836SGerry Liu 
141a3114836SGerry Liu 	/*
142a3114836SGerry Liu 	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
143a3114836SGerry Liu 	 * by code in real_mode_start_cpu():
144a3114836SGerry Liu 	 *
145a3114836SGerry Liu 	 * GDT[0]:  NULL selector
146a3114836SGerry Liu 	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
147a3114836SGerry Liu 	 *
148a3114836SGerry Liu 	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
149a3114836SGerry Liu 	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
150a3114836SGerry Liu 	 * a course of action as any other, though it may cause the entire
151a3114836SGerry Liu 	 * platform to reset in some cases...
152a3114836SGerry Liu 	 */
153a3114836SGerry Liu 	rm->rm_temp_gdt[0] = 0;
154a3114836SGerry Liu 	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
155a3114836SGerry Liu 
156a3114836SGerry Liu 	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
157a3114836SGerry Liu 	rm->rm_temp_gdt_base = rm_platter_pa +
158a3114836SGerry Liu 	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
159a3114836SGerry Liu 	rm->rm_temp_idt_lim = 0;
160a3114836SGerry Liu 	rm->rm_temp_idt_base = 0;
161a3114836SGerry Liu 
162a3114836SGerry Liu 	/*
163a3114836SGerry Liu 	 * Since the CPU needs to jump to protected mode using an identity
164a3114836SGerry Liu 	 * mapped address, we need to calculate it here.
165a3114836SGerry Liu 	 */
166a3114836SGerry Liu 	rm->rm_longmode64_addr = rm_platter_pa +
167*c909a41bSRichard Lowe 	    (uint32_t)((uintptr_t)long_mode_64 -
168*c909a41bSRichard Lowe 	    (uintptr_t)real_mode_start_cpu);
169a3114836SGerry Liu #endif	/* __amd64 */
170a3114836SGerry Liu }
171a3114836SGerry Liu 
172a3114836SGerry Liu static void *
173a3114836SGerry Liu mach_cpucontext_alloc_tables(struct cpu *cp)
174a3114836SGerry Liu {
175f16a0f4cSRobert Mustacchi 	tss_t *ntss;
176a3114836SGerry Liu 	struct cpu_tables *ct;
177ae115bc7Smrj 
178ae115bc7Smrj 	/*
1790cfdb603Sjosephb 	 * Allocate space for stack, tss, gdt and idt. We round the size
180fb2caebeSRandy Fishel 	 * allotted for cpu_tables up, so that the TSS is on a unique page.
1810cfdb603Sjosephb 	 * This is more efficient when running in virtual machines.
182ae115bc7Smrj 	 */
1830cfdb603Sjosephb 	ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
1840cfdb603Sjosephb 	if ((uintptr_t)ct & PAGEOFFSET)
185a3114836SGerry Liu 		panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
186a3114836SGerry Liu 		    cp->cpu_id);
187ae115bc7Smrj 
188ae115bc7Smrj 	ntss = cp->cpu_tss = &ct->ct_tss;
189ae115bc7Smrj 
190ae115bc7Smrj #if defined(__amd64)
191ae115bc7Smrj 
192ae115bc7Smrj 	/*
193ae115bc7Smrj 	 * #DF (double fault).
194ae115bc7Smrj 	 */
195ae115bc7Smrj 	ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];
196ae115bc7Smrj 
197ae115bc7Smrj #elif defined(__i386)
198ae115bc7Smrj 
199ae115bc7Smrj 	ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
200ae115bc7Smrj 	    (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];
201ae115bc7Smrj 
202ae115bc7Smrj 	ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;
203ae115bc7Smrj 
204ae115bc7Smrj 	ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;
205ae115bc7Smrj 
206ae115bc7Smrj 	ntss->tss_cs = KCS_SEL;
207ae115bc7Smrj 	ntss->tss_ds = ntss->tss_es = KDS_SEL;
208ae115bc7Smrj 	ntss->tss_fs = KFS_SEL;
209ae115bc7Smrj 	ntss->tss_gs = KGS_SEL;
210ae115bc7Smrj 
211ae115bc7Smrj #endif	/* __i386 */
212ae115bc7Smrj 
213ae115bc7Smrj 	/*
214ae115bc7Smrj 	 * Set I/O bit map offset equal to size of TSS segment limit
215ae115bc7Smrj 	 * for no I/O permission map. This will cause all user I/O
216ae115bc7Smrj 	 * instructions to generate #gp fault.
217ae115bc7Smrj 	 */
218ae115bc7Smrj 	ntss->tss_bitmapbase = sizeof (*ntss);
219ae115bc7Smrj 
220ae115bc7Smrj 	/*
221ae115bc7Smrj 	 * Setup kernel tss.
222ae115bc7Smrj 	 */
223ae115bc7Smrj 	set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
224ae115bc7Smrj 	    sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);
225ae115bc7Smrj 
226a3114836SGerry Liu 	return (ct);
227a3114836SGerry Liu }
228a3114836SGerry Liu 
229a3114836SGerry Liu void *
230a3114836SGerry Liu mach_cpucontext_xalloc(struct cpu *cp, int optype)
231a3114836SGerry Liu {
232a3114836SGerry Liu 	size_t len;
233a3114836SGerry Liu 	struct cpu_tables *ct;
234a3114836SGerry Liu 	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
235a3114836SGerry Liu 	static int cpu_halt_code_ready;
236a3114836SGerry Liu 
237a3114836SGerry Liu 	if (optype == MACH_CPUCONTEXT_OP_STOP) {
238a3114836SGerry Liu 		ASSERT(plat_dr_enabled());
239a3114836SGerry Liu 
240a3114836SGerry Liu 		/*
241a3114836SGerry Liu 		 * The WARM_RESET_VECTOR has a limitation that the physical
242a3114836SGerry Liu 		 * address written to it must be page-aligned. To work around
243a3114836SGerry Liu 		 * this limitation, the CPU stop code has been splitted into
244a3114836SGerry Liu 		 * two stages.
245a3114836SGerry Liu 		 * The stage 2 code, which implements the real logic to halt
246a3114836SGerry Liu 		 * CPUs, is copied to the rm_cpu_halt_code field in the real
247a3114836SGerry Liu 		 * mode platter. The stage 1 code, which simply jumps to the
248a3114836SGerry Liu 		 * stage 2 code in the rm_cpu_halt_code field, is copied to
249a3114836SGerry Liu 		 * rm_code field in the real mode platter and it may be
250a3114836SGerry Liu 		 * overwritten after the CPU has been stopped.
251a3114836SGerry Liu 		 */
252a3114836SGerry Liu 		if (!cpu_halt_code_ready) {
253a3114836SGerry Liu 			/*
254a3114836SGerry Liu 			 * The rm_cpu_halt_code field in the real mode platter
255a3114836SGerry Liu 			 * is used by the CPU stop code only. So only copy the
256a3114836SGerry Liu 			 * CPU stop stage 2 code into the rm_cpu_halt_code
257a3114836SGerry Liu 			 * field on the first call.
258a3114836SGerry Liu 			 */
259a3114836SGerry Liu 			len = (size_t)real_mode_stop_cpu_stage2_end -
260a3114836SGerry Liu 			    (size_t)real_mode_stop_cpu_stage2;
261a3114836SGerry Liu 			ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
262a3114836SGerry Liu 			bcopy((caddr_t)real_mode_stop_cpu_stage2,
263a3114836SGerry Liu 			    (caddr_t)rm->rm_cpu_halt_code, len);
264a3114836SGerry Liu 			cpu_halt_code_ready = 1;
265a3114836SGerry Liu 		}
266a3114836SGerry Liu 
267a3114836SGerry Liu 		/*
268a3114836SGerry Liu 		 * The rm_code field in the real mode platter is shared by
269a3114836SGerry Liu 		 * the CPU start, CPU stop, CPR and fast reboot code. So copy
270a3114836SGerry Liu 		 * the CPU stop stage 1 code into the rm_code field every time.
271a3114836SGerry Liu 		 */
272a3114836SGerry Liu 		len = (size_t)real_mode_stop_cpu_stage1_end -
273a3114836SGerry Liu 		    (size_t)real_mode_stop_cpu_stage1;
274a3114836SGerry Liu 		ASSERT(len <= RM_PLATTER_CODE_SIZE);
275a3114836SGerry Liu 		bcopy((caddr_t)real_mode_stop_cpu_stage1,
276a3114836SGerry Liu 		    (caddr_t)rm->rm_code, len);
277a3114836SGerry Liu 		rm->rm_cpu_halted = 0;
278a3114836SGerry Liu 
279a3114836SGerry Liu 		return (cp->cpu_m.mcpu_mach_ctx_ptr);
280a3114836SGerry Liu 	} else if (optype != MACH_CPUCONTEXT_OP_START) {
281a3114836SGerry Liu 		return (NULL);
282a3114836SGerry Liu 	}
283a3114836SGerry Liu 
284a3114836SGerry Liu 	/*
285a3114836SGerry Liu 	 * Only need to allocate tables when starting CPU.
286a3114836SGerry Liu 	 * Tables allocated when starting CPU will be reused when stopping CPU.
287a3114836SGerry Liu 	 */
288a3114836SGerry Liu 	ct = mach_cpucontext_alloc_tables(cp);
289a3114836SGerry Liu 	if (ct == NULL) {
290a3114836SGerry Liu 		return (NULL);
291a3114836SGerry Liu 	}
292a3114836SGerry Liu 
293a3114836SGerry Liu 	/* Copy CPU startup code to rm_platter for CPU hot-add operations. */
294a3114836SGerry Liu 	if (plat_dr_enabled()) {
295a3114836SGerry Liu 		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
296a3114836SGerry Liu 		    (size_t)real_mode_start_cpu_end -
297a3114836SGerry Liu 		    (size_t)real_mode_start_cpu);
298a3114836SGerry Liu 	}
299a3114836SGerry Liu 
300ae115bc7Smrj 	/*
301ae115bc7Smrj 	 * Now copy all that we've set up onto the real mode platter
302ae115bc7Smrj 	 * for the real mode code to digest as part of starting the cpu.
303ae115bc7Smrj 	 */
304ae115bc7Smrj 	rm->rm_idt_base = cp->cpu_idt;
3050cfdb603Sjosephb 	rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
306ae115bc7Smrj 	rm->rm_gdt_base = cp->cpu_gdt;
3070cfdb603Sjosephb 	rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;
308ae115bc7Smrj 
309a3114836SGerry Liu 	/*
310a3114836SGerry Liu 	 * CPU needs to access kernel address space after powering on.
311a3114836SGerry Liu 	 * When hot-adding CPU at runtime, directly use top level page table
312a3114836SGerry Liu 	 * of kas other than the return value of getcr3(). getcr3() returns
313a3114836SGerry Liu 	 * current process's top level page table, which may be different from
314a3114836SGerry Liu 	 * the one of kas.
315a3114836SGerry Liu 	 */
316a3114836SGerry Liu 	rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
317ae115bc7Smrj 	rm->rm_cpu = cp->cpu_id;
318a3114836SGerry Liu 
319a3114836SGerry Liu 	/*
320a3114836SGerry Liu 	 * For hot-adding CPU at runtime, Machine Check and Performance Counter
321a3114836SGerry Liu 	 * should be disabled. They will be enabled on demand after CPU powers
322a3114836SGerry Liu 	 * on successfully
323a3114836SGerry Liu 	 */
324ae115bc7Smrj 	rm->rm_cr4 = getcr4();
325a3114836SGerry Liu 	rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);
326ae115bc7Smrj 
3272df1fe9cSrandyf 	rmp_gdt_init(rm);
3282df1fe9cSrandyf 
3292df1fe9cSrandyf 	return (ct);
3302df1fe9cSrandyf }
3312df1fe9cSrandyf 
3322df1fe9cSrandyf void
333a3114836SGerry Liu mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
334ae115bc7Smrj {
335ae115bc7Smrj 	struct cpu_tables *ct = arg;
336ae115bc7Smrj 
337ae115bc7Smrj 	ASSERT(&ct->ct_tss == cp->cpu_tss);
338a3114836SGerry Liu 	if (optype == MACH_CPUCONTEXT_OP_START) {
339ae115bc7Smrj 		switch (err) {
340ae115bc7Smrj 		case 0:
341a3114836SGerry Liu 			/*
342a3114836SGerry Liu 			 * Save pointer for reuse when stopping CPU.
343a3114836SGerry Liu 			 */
344a3114836SGerry Liu 			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
345ae115bc7Smrj 			break;
346ae115bc7Smrj 		case ETIMEDOUT:
347ae115bc7Smrj 			/*
348ae115bc7Smrj 			 * The processor was poked, but failed to start before
349ae115bc7Smrj 			 * we gave up waiting for it.  In case it starts later,
350ae115bc7Smrj 			 * don't free anything.
351ae115bc7Smrj 			 */
352a3114836SGerry Liu 			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
353ae115bc7Smrj 			break;
354ae115bc7Smrj 		default:
355ae115bc7Smrj 			/*
356ae115bc7Smrj 			 * Some other, passive, error occurred.
357ae115bc7Smrj 			 */
3580cfdb603Sjosephb 			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
359ae115bc7Smrj 			cp->cpu_tss = NULL;
360ae115bc7Smrj 			break;
361ae115bc7Smrj 		}
362a3114836SGerry Liu 	} else if (optype == MACH_CPUCONTEXT_OP_STOP) {
363a3114836SGerry Liu 		switch (err) {
364a3114836SGerry Liu 		case 0:
365a3114836SGerry Liu 			/*
366a3114836SGerry Liu 			 * Free resources allocated when starting CPU.
367a3114836SGerry Liu 			 */
368a3114836SGerry Liu 			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
369a3114836SGerry Liu 			cp->cpu_tss = NULL;
370a3114836SGerry Liu 			cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
371a3114836SGerry Liu 			break;
372a3114836SGerry Liu 		default:
373a3114836SGerry Liu 			/*
374a3114836SGerry Liu 			 * Don't touch table pointer in case of failure.
375a3114836SGerry Liu 			 */
376a3114836SGerry Liu 			break;
377a3114836SGerry Liu 		}
378a3114836SGerry Liu 	} else {
379a3114836SGerry Liu 		ASSERT(0);
380a3114836SGerry Liu 	}
381a3114836SGerry Liu }
382a3114836SGerry Liu 
383a3114836SGerry Liu void *
384a3114836SGerry Liu mach_cpucontext_alloc(struct cpu *cp)
385a3114836SGerry Liu {
386a3114836SGerry Liu 	return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
387a3114836SGerry Liu }
388a3114836SGerry Liu 
389a3114836SGerry Liu void
390a3114836SGerry Liu mach_cpucontext_free(struct cpu *cp, void *arg, int err)
391a3114836SGerry Liu {
392a3114836SGerry Liu 	mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
393ae115bc7Smrj }
394ae115bc7Smrj 
395ae115bc7Smrj /*
396ae115bc7Smrj  * "Enter monitor."  Called via cross-call from stop_other_cpus().
397ae115bc7Smrj  */
398ae115bc7Smrj void
399ae115bc7Smrj mach_cpu_halt(char *msg)
400ae115bc7Smrj {
401ae115bc7Smrj 	if (msg)
402ae115bc7Smrj 		prom_printf("%s\n", msg);
403ae115bc7Smrj 
404ae115bc7Smrj 	/*CONSTANTCONDITION*/
405ae115bc7Smrj 	while (1)
406ae115bc7Smrj 		;
407ae115bc7Smrj }
408ae115bc7Smrj 
/*
 * Idle the current CPU by calling i86_halt().
 */
void
mach_cpu_idle(void)
{
	i86_halt();
}
414ae115bc7Smrj 
415ae115bc7Smrj void
416ae115bc7Smrj mach_cpu_pause(volatile char *safe)
417ae115bc7Smrj {
418ae115bc7Smrj 	/*
419ae115bc7Smrj 	 * This cpu is now safe.
420ae115bc7Smrj 	 */
421ae115bc7Smrj 	*safe = PAUSE_WAIT;
422ae115bc7Smrj 	membar_enter(); /* make sure stores are flushed */
423ae115bc7Smrj 
424ae115bc7Smrj 	/*
425ae115bc7Smrj 	 * Now we wait.  When we are allowed to continue, safe
426ae115bc7Smrj 	 * will be set to PAUSE_IDLE.
427ae115bc7Smrj 	 */
428ae115bc7Smrj 	while (*safe != PAUSE_IDLE)
429ae115bc7Smrj 		SMT_PAUSE();
430ae115bc7Smrj }
431ae115bc7Smrj 
432ae115bc7Smrj /*
433a3114836SGerry Liu  * Power on the target CPU.
434ae115bc7Smrj  */
435ae115bc7Smrj int
436ae115bc7Smrj mp_cpu_poweron(struct cpu *cp)
437ae115bc7Smrj {
438a3114836SGerry Liu 	int error;
439a3114836SGerry Liu 	cpuset_t tempset;
440a3114836SGerry Liu 	processorid_t cpuid;
441a3114836SGerry Liu 
442a3114836SGerry Liu 	ASSERT(cp != NULL);
443a3114836SGerry Liu 	cpuid = cp->cpu_id;
444a3114836SGerry Liu 	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
445a3114836SGerry Liu 		return (ENOTSUP);
446a3114836SGerry Liu 	} else if (cpuid < 0 || cpuid >= max_ncpus) {
447a3114836SGerry Liu 		return (EINVAL);
448ae115bc7Smrj 	}
449ae115bc7Smrj 
450ae115bc7Smrj 	/*
451a3114836SGerry Liu 	 * The currrent x86 implementaiton of mp_cpu_configure() and
452a3114836SGerry Liu 	 * mp_cpu_poweron() have a limitation that mp_cpu_poweron() could only
453a3114836SGerry Liu 	 * be called once after calling mp_cpu_configure() for a specific CPU.
454a3114836SGerry Liu 	 * It's because mp_cpu_poweron() will destroy data structure created
455a3114836SGerry Liu 	 * by mp_cpu_configure(). So reject the request if the CPU has already
456a3114836SGerry Liu 	 * been powered on once after calling mp_cpu_configure().
457a3114836SGerry Liu 	 * This limitaiton only affects the p_online syscall and the DR driver
458a3114836SGerry Liu 	 * won't be affected because the DR driver always invoke public CPU
459a3114836SGerry Liu 	 * management interfaces in the predefined order:
460a3114836SGerry Liu 	 * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
461ae115bc7Smrj 	 */
462a3114836SGerry Liu 	if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
463a3114836SGerry Liu 		return (ENOTSUP);
464a3114836SGerry Liu 	}
465a3114836SGerry Liu 
466a3114836SGerry Liu 	/*
467a3114836SGerry Liu 	 * Check if there's at least a Mbyte of kmem available
468a3114836SGerry Liu 	 * before attempting to start the cpu.
469a3114836SGerry Liu 	 */
470a3114836SGerry Liu 	if (kmem_avail() < 1024 * 1024) {
471a3114836SGerry Liu 		/*
472a3114836SGerry Liu 		 * Kick off a reap in case that helps us with
473a3114836SGerry Liu 		 * later attempts ..
474a3114836SGerry Liu 		 */
475a3114836SGerry Liu 		kmem_reap();
476a3114836SGerry Liu 		return (ENOMEM);
477a3114836SGerry Liu 	}
478a3114836SGerry Liu 
479a3114836SGerry Liu 	affinity_set(CPU->cpu_id);
480a3114836SGerry Liu 
481a3114836SGerry Liu 	/*
482a3114836SGerry Liu 	 * Start the target CPU. No need to call mach_cpucontext_fini()
483a3114836SGerry Liu 	 * if mach_cpucontext_init() fails.
484a3114836SGerry Liu 	 */
485a3114836SGerry Liu 	if ((error = mach_cpucontext_init()) == 0) {
486a3114836SGerry Liu 		error = mp_start_cpu_common(cp, B_FALSE);
487a3114836SGerry Liu 		mach_cpucontext_fini();
488a3114836SGerry Liu 	}
489a3114836SGerry Liu 	if (error != 0) {
490a3114836SGerry Liu 		affinity_clear();
491a3114836SGerry Liu 		return (error);
492a3114836SGerry Liu 	}
493a3114836SGerry Liu 
494a3114836SGerry Liu 	/* Wait for the target cpu to reach READY state. */
495a3114836SGerry Liu 	tempset = cpu_ready_set;
496a3114836SGerry Liu 	while (!CPU_IN_SET(tempset, cpuid)) {
497a3114836SGerry Liu 		delay(1);
498a3114836SGerry Liu 		tempset = *((volatile cpuset_t *)&cpu_ready_set);
499a3114836SGerry Liu 	}
500a3114836SGerry Liu 
501a3114836SGerry Liu 	/* Mark the target CPU as available for mp operation. */
502a3114836SGerry Liu 	CPUSET_ATOMIC_ADD(mp_cpus, cpuid);
503a3114836SGerry Liu 
504a3114836SGerry Liu 	/* Free the space allocated to hold the microcode file */
505a3114836SGerry Liu 	ucode_cleanup();
506a3114836SGerry Liu 
507a3114836SGerry Liu 	affinity_clear();
508a3114836SGerry Liu 
509a3114836SGerry Liu 	return (0);
510a3114836SGerry Liu }
511a3114836SGerry Liu 
512a3114836SGerry Liu #define	MP_CPU_DETACH_MAX_TRIES		5
513a3114836SGerry Liu #define	MP_CPU_DETACH_DELAY		100
514a3114836SGerry Liu 
515a3114836SGerry Liu static int
516a3114836SGerry Liu mp_cpu_detach_driver(dev_info_t *dip)
517a3114836SGerry Liu {
518a3114836SGerry Liu 	int i;
519a3114836SGerry Liu 	int rv = EBUSY;
520a3114836SGerry Liu 	dev_info_t *pdip;
521a3114836SGerry Liu 
522a3114836SGerry Liu 	pdip = ddi_get_parent(dip);
523a3114836SGerry Liu 	ASSERT(pdip != NULL);
524a3114836SGerry Liu 	/*
525a3114836SGerry Liu 	 * Check if caller holds pdip busy - can cause deadlocks in
526a3114836SGerry Liu 	 * e_ddi_branch_unconfigure(), which calls devfs_clean().
527a3114836SGerry Liu 	 */
528a3114836SGerry Liu 	if (DEVI_BUSY_OWNED(pdip)) {
529a3114836SGerry Liu 		return (EDEADLOCK);
530a3114836SGerry Liu 	}
531a3114836SGerry Liu 
532a3114836SGerry Liu 	for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
533a3114836SGerry Liu 		if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
534a3114836SGerry Liu 			rv = 0;
535a3114836SGerry Liu 			break;
536a3114836SGerry Liu 		}
537a3114836SGerry Liu 		DELAY(MP_CPU_DETACH_DELAY);
538a3114836SGerry Liu 	}
539a3114836SGerry Liu 
540a3114836SGerry Liu 	return (rv);
541a3114836SGerry Liu }
542a3114836SGerry Liu 
543a3114836SGerry Liu /*
544a3114836SGerry Liu  * Power off the target CPU.
545a3114836SGerry Liu  * Note: cpu_lock will be released and then reacquired.
546a3114836SGerry Liu  */
547ae115bc7Smrj int
548ae115bc7Smrj mp_cpu_poweroff(struct cpu *cp)
549ae115bc7Smrj {
550a3114836SGerry Liu 	int rv = 0;
551a3114836SGerry Liu 	void *ctx;
552a3114836SGerry Liu 	dev_info_t *dip = NULL;
553a3114836SGerry Liu 	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
554a3114836SGerry Liu 	extern void cpupm_start(cpu_t *);
555a3114836SGerry Liu 	extern void cpupm_stop(cpu_t *);
556a3114836SGerry Liu 
557a3114836SGerry Liu 	ASSERT(cp != NULL);
558a3114836SGerry Liu 	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
559a3114836SGerry Liu 	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
560a3114836SGerry Liu 
561a3114836SGerry Liu 	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
562a3114836SGerry Liu 		return (ENOTSUP);
563a3114836SGerry Liu 	}
564a3114836SGerry Liu 	/*
565a3114836SGerry Liu 	 * There is no support for powering off cpu0 yet.
566a3114836SGerry Liu 	 * There are many pieces of code which have a hard dependency on cpu0.
567a3114836SGerry Liu 	 */
568a3114836SGerry Liu 	if (cp->cpu_id == 0) {
569a3114836SGerry Liu 		return (ENOTSUP);
570a3114836SGerry Liu 	};
571a3114836SGerry Liu 
572a3114836SGerry Liu 	if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
573a3114836SGerry Liu 		return (ENXIO);
574a3114836SGerry Liu 	}
575a3114836SGerry Liu 	ASSERT(dip != NULL);
576a3114836SGerry Liu 	if (mp_cpu_detach_driver(dip) != 0) {
577a3114836SGerry Liu 		rv = EBUSY;
578a3114836SGerry Liu 		goto out_online;
579a3114836SGerry Liu 	}
580a3114836SGerry Liu 
581a3114836SGerry Liu 	/* Allocate CPU context for stopping */
582a3114836SGerry Liu 	if (mach_cpucontext_init() != 0) {
583a3114836SGerry Liu 		rv = ENXIO;
584a3114836SGerry Liu 		goto out_online;
585a3114836SGerry Liu 	}
586a3114836SGerry Liu 	ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
587a3114836SGerry Liu 	if (ctx == NULL) {
588a3114836SGerry Liu 		rv = ENXIO;
589a3114836SGerry Liu 		goto out_context_fini;
590a3114836SGerry Liu 	}
591a3114836SGerry Liu 
592a3114836SGerry Liu 	cpupm_stop(cp);
593a3114836SGerry Liu 	cpu_event_fini_cpu(cp);
594a3114836SGerry Liu 
595a3114836SGerry Liu 	if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
596a3114836SGerry Liu 		cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
597a3114836SGerry Liu 		cp->cpu_m.mcpu_cmi_hdl = NULL;
598a3114836SGerry Liu 	}
599a3114836SGerry Liu 
600a3114836SGerry Liu 	rv = mach_cpu_stop(cp, ctx);
601a3114836SGerry Liu 	if (rv != 0) {
602a3114836SGerry Liu 		goto out_enable_cmi;
603a3114836SGerry Liu 	}
604a3114836SGerry Liu 
605a3114836SGerry Liu 	/* Wait until the target CPU has been halted. */
606a3114836SGerry Liu 	while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
607a3114836SGerry Liu 		delay(1);
608a3114836SGerry Liu 	}
609a3114836SGerry Liu 	rm->rm_cpu_halted = 0xffff;
610a3114836SGerry Liu 
611a3114836SGerry Liu 	/* CPU_READY has been cleared by mach_cpu_stop. */
612a3114836SGerry Liu 	ASSERT((cp->cpu_flags & CPU_READY) == 0);
613a3114836SGerry Liu 	ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
614a3114836SGerry Liu 	cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
615a3114836SGerry Liu 	CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);
616a3114836SGerry Liu 
617a3114836SGerry Liu 	mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
618a3114836SGerry Liu 	mach_cpucontext_fini();
619a3114836SGerry Liu 
620a3114836SGerry Liu 	return (0);
621a3114836SGerry Liu 
622a3114836SGerry Liu out_enable_cmi:
623a3114836SGerry Liu 	{
624a3114836SGerry Liu 		cmi_hdl_t hdl;
625a3114836SGerry Liu 
626a3114836SGerry Liu 		if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
627a3114836SGerry Liu 		    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
6287417cfdeSKuriakose Kuruvilla 			if (is_x86_feature(x86_featureset, X86FSET_MCA))
629a3114836SGerry Liu 				cmi_mca_init(hdl);
630a3114836SGerry Liu 			cp->cpu_m.mcpu_cmi_hdl = hdl;
631a3114836SGerry Liu 		}
632a3114836SGerry Liu 	}
633a3114836SGerry Liu 	cpu_event_init_cpu(cp);
634a3114836SGerry Liu 	cpupm_start(cp);
635a3114836SGerry Liu 	mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);
636a3114836SGerry Liu 
637a3114836SGerry Liu out_context_fini:
638a3114836SGerry Liu 	mach_cpucontext_fini();
639a3114836SGerry Liu 
640a3114836SGerry Liu out_online:
641a3114836SGerry Liu 	(void) e_ddi_branch_configure(dip, NULL, 0);
642a3114836SGerry Liu 
643a3114836SGerry Liu 	if (rv != EAGAIN && rv != ETIME) {
644a3114836SGerry Liu 		rv = ENXIO;
645a3114836SGerry Liu 	}
646a3114836SGerry Liu 
647a3114836SGerry Liu 	return (rv);
648ae115bc7Smrj }
649b9bc7f78Ssmaybe 
650b9bc7f78Ssmaybe /*
651b9bc7f78Ssmaybe  * Return vcpu state, since this could be a virtual environment that we
652b9bc7f78Ssmaybe  * are unaware of, return "unknown".
653b9bc7f78Ssmaybe  */
654b9bc7f78Ssmaybe /* ARGSUSED */
655b9bc7f78Ssmaybe int
656b9bc7f78Ssmaybe vcpu_on_pcpu(processorid_t cpu)
657b9bc7f78Ssmaybe {
658b9bc7f78Ssmaybe 	return (VCPU_STATE_UNKNOWN);
659b9bc7f78Ssmaybe }
660