xref: /illumos-gate/usr/src/uts/i86pc/os/cpr_impl.c (revision 9d6ca3965c3358c32eb68544fe91ff8ad9c3fcde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Platform specific implementation code
27  * Currently only suspend to RAM is supported (ACPI S3)
28  */
29 
30 #define	SUNDDI_IMPL
31 
32 #include <sys/types.h>
33 #include <sys/promif.h>
34 #include <sys/prom_isa.h>
35 #include <sys/prom_plat.h>
36 #include <sys/cpuvar.h>
37 #include <sys/pte.h>
38 #include <vm/hat.h>
39 #include <vm/page.h>
40 #include <vm/as.h>
41 #include <sys/cpr.h>
42 #include <sys/kmem.h>
43 #include <sys/clock.h>
44 #include <sys/kmem.h>
45 #include <sys/panic.h>
46 #include <vm/seg_kmem.h>
47 #include <sys/cpu_module.h>
48 #include <sys/callb.h>
49 #include <sys/machsystm.h>
50 #include <sys/vmsystm.h>
51 #include <sys/systm.h>
52 #include <sys/archsystm.h>
53 #include <sys/stack.h>
54 #include <sys/fs/ufs_fs.h>
55 #include <sys/memlist.h>
56 #include <sys/bootconf.h>
57 #include <sys/thread.h>
58 #include <sys/x_call.h>
59 #include <sys/smp_impldefs.h>
60 #include <vm/vm_dep.h>
61 #include <sys/psm.h>
62 #include <sys/epm.h>
63 #include <sys/cpr_wakecode.h>
64 #include <sys/x86_archext.h>
65 #include <sys/reboot.h>
66 #include <sys/acpi/acpi.h>
67 #include <sys/acpica.h>
68 #include <sys/fp.h>
69 #include <sys/sysmacros.h>
70 
/* Conversion spec used for the kgsbase value printed by prt_other_cpus(). */
#define	AFMT	"%lx"

/* Set to 1 by i_cpr_pre_resume_cpus() before the other CPUs are woken. */
extern int	flushes_require_xcalls;
/* Set of CPUs that have finished their resume path (see i_cpr_save_context). */
extern cpuset_t	cpu_ready_set;

/* Symbol in the wake code; its offset from wc_rm_start is used below. */
extern void	*wc_long_mode_64(void);
/* When non-zero, TSCs are re-synchronized while CPUs are restarted. */
extern int	tsc_gethrtime_enable;
extern	void	i_cpr_start_cpu(void);

ushort_t	cpr_mach_type = CPR_MACHTYPE_X86;
void		(*cpr_start_cpu_func)(void) = i_cpr_start_cpu;

/*
 * Array of max_ncpus saved CPU contexts, allocated by i_cpr_alloc_cpus()
 * and released by i_cpr_free_cpus().
 */
static wc_cpu_t	*wc_other_cpus = NULL;
/*
 * CPUs that have checked in from i_cpr_start_cpu(); polled by
 * i_cpr_pre_resume_cpus() via wait_for_set().
 */
static cpuset_t procset;

static void
init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt);

static int i_cpr_platform_alloc(psm_state_request_t *req);
static void i_cpr_platform_free(psm_state_request_t *req);
static int i_cpr_save_apic(psm_state_request_t *req);
static int i_cpr_restore_apic(psm_state_request_t *req);
static int wait_for_set(cpuset_t *set, int who);

static	void i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu);
void i_cpr_restore_stack(kthread_t *t, greg_t *save_stack);

/*
 * The two ends of a thread's stack, ordered so that
 * CPR_GET_STACK_END(t) - CPR_GET_STACK_START(t) is the size copied by
 * i_cpr_save_stack() and i_cpr_restore_stack().
 */
#ifdef STACK_GROWTH_DOWN
#define	CPR_GET_STACK_START(t) ((t)->t_stkbase)
#define	CPR_GET_STACK_END(t) ((t)->t_stk)
#else
#define	CPR_GET_STACK_START(t) ((t)->t_stk)
#define	CPR_GET_STACK_END(t) ((t)->t_stkbase)
#endif	/* STACK_GROWTH_DOWN */
105 
106 /*
107  * restart paused slave cpus
108  */
109 void
110 i_cpr_machdep_setup(void)
111 {
112 	if (ncpus > 1) {
113 		CPR_DEBUG(CPR_DEBUG1, ("MP restarted...\n"));
114 		mutex_enter(&cpu_lock);
115 		start_cpus();
116 		mutex_exit(&cpu_lock);
117 	}
118 }
119 
120 
/*
 * Quiesce interrupt activity by calling spl7().
 */
void
i_cpr_stop_intr(void)
{
	/* the value returned by spl7() is deliberately discarded */
	(void) spl7();
}
129 
/*
 * Allow the machine to take interrupts again by calling spl0().
 */
void
i_cpr_enable_intr(void)
{
	/* the value returned by spl0() is deliberately discarded */
	(void) spl0();
}
138 
/*
 * Save miscellaneous information which needs to be written to the
 * state file.  This information is required to re-initialize
 * kernel/prom handshaking.
 *
 * In this file the function only contains an assertion that always
 * fails, i.e. it is not expected to be called on this platform
 * (the file header states that only suspend to RAM is supported).
 */
void
i_cpr_save_machdep_info(void)
{
	/* always zero, so the ASSERT below fires whenever this is reached */
	int notcalled = 0;
	ASSERT(notcalled);
}
150 
151 
/*
 * Part of the machine-dependent cpr interface; intentionally a no-op here.
 */
void
i_cpr_set_tbr(void)
{
	/* nothing to do */
}
156 
157 
158 processorid_t
159 i_cpr_bootcpuid(void)
160 {
161 	return (0);
162 }
163 
164 /*
165  * cpu0 should contain bootcpu info
166  */
167 cpu_t *
168 i_cpr_bootcpu(void)
169 {
170 	ASSERT(MUTEX_HELD(&cpu_lock));
171 
172 	return (cpu_get(i_cpr_bootcpuid()));
173 }
174 
/*
 * Save context for the specified CPU.
 *
 * arg is the index of this CPU's slot in wc_other_cpus, passed as a
 * pointer-sized integer.  The function saves the APIC state and the
 * current thread's stack into that slot and then calls wc_save_context(),
 * which returns twice: once on the way down (suspend) and once on the
 * way back up (resume).  Always returns NULL.
 */
void *
i_cpr_save_context(void *arg)
{
	long	index = (long)arg;
	psm_state_request_t *papic_state;
	int resuming;
	int	ret;
	wc_cpu_t	*wc_cpu = wc_other_cpus + index;

	PMD(PMD_SX, ("i_cpr_save_context() index = %ld\n", index))

	ASSERT(index < NCPU);

	papic_state = &(wc_cpu)->wc_apic_state;

	/* failures below are only caught by ASSERT, i.e. on DEBUG kernels */
	ret = i_cpr_platform_alloc(papic_state);
	ASSERT(ret == 0);

	ret = i_cpr_save_apic(papic_state);
	ASSERT(ret == 0);

	i_cpr_save_stack(curthread, wc_cpu);

	/*
	 * wc_save_context() returns twice, once when suspending and
	 * once when resuming.  As used here (and in i_cpr_power_down()),
	 * it returns non-zero when suspending and 0 upon resume.
	 */
	resuming = (wc_save_context(wc_cpu) == 0);

	/*
	 * do NOT call any functions after this point, because doing so
	 * will modify the stack that we are running on
	 *
	 * NOTE(review): the code below does make function calls on both
	 * paths; presumably the warning concerns the window before the
	 * stack has been restored -- confirm against the wake code.
	 */

	if (resuming) {

		ret = i_cpr_restore_apic(papic_state);
		ASSERT(ret == 0);

		i_cpr_platform_free(papic_state);

		/*
		 * Enable interrupts on this cpu.
		 * Do not bind interrupts to this CPU's local APIC until
		 * the CPU is ready to receive interrupts.
		 */
		ASSERT(CPU->cpu_id != i_cpr_bootcpuid());
		mutex_enter(&cpu_lock);
		cpu_enable_intr(CPU);
		mutex_exit(&cpu_lock);

		/*
		 * Setting the bit in cpu_ready_set must be the last operation
		 * in processor initialization; the boot CPU will continue to
		 * boot once it sees this bit set for all active CPUs.
		 */
		CPUSET_ATOMIC_ADD(cpu_ready_set, CPU->cpu_id);

		PMD(PMD_SX,
		    ("i_cpr_save_context() resuming cpu %d in cpu_ready_set\n",
		    CPU->cpu_id))
	} else {
		/*
		 * Disable interrupts on this CPU so that PSM knows not to bind
		 * interrupts here on resume until the CPU has executed
		 * cpu_enable_intr() (above) in the resume path.
		 * We explicitly do not grab cpu_lock here because at this point
		 * in the suspend process, the boot cpu owns cpu_lock and all
		 * other cpus are also executing in the pause thread (only
		 * modifying their respective CPU structure).
		 */
		(void) cpu_disable_intr(CPU);
	}

	/* note: the value logged here is the derived "resuming" flag */
	PMD(PMD_SX, ("i_cpr_save_context: wc_save_context returns %d\n",
	    resuming))

	return (NULL);
}
258 
259 static ushort_t *warm_reset_vector = NULL;
260 
261 static ushort_t *
262 map_warm_reset_vector()
263 {
264 	/*LINTED*/
265 	if (!(warm_reset_vector = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
266 	    sizeof (ushort_t *), PROT_READ|PROT_WRITE)))
267 		return (NULL);
268 
269 	/*
270 	 * setup secondary cpu bios boot up vector
271 	 */
272 	*warm_reset_vector = (ushort_t)((caddr_t)
273 	    /*LINTED*/
274 	    ((struct rm_platter *)rm_platter_va)->rm_code - rm_platter_va
275 	    + ((ulong_t)rm_platter_va & 0xf));
276 	warm_reset_vector++;
277 	*warm_reset_vector = (ushort_t)(rm_platter_pa >> 4);
278 
279 	--warm_reset_vector;
280 	return (warm_reset_vector);
281 }
282 
283 void
284 i_cpr_pre_resume_cpus()
285 {
286 	/*
287 	 * this is a cut down version of start_other_cpus()
288 	 * just do the initialization to wake the other cpus
289 	 */
290 	unsigned who;
291 	int boot_cpuid = i_cpr_bootcpuid();
292 	uint32_t		code_length = 0;
293 	caddr_t			wakevirt = rm_platter_va;
294 	/*LINTED*/
295 	wakecode_t		*wp = (wakecode_t *)wakevirt;
296 	char *str = "i_cpr_pre_resume_cpus";
297 	extern int get_tsc_ready();
298 	int err;
299 
300 	/*LINTED*/
301 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
302 
303 	/*
304 	 * If startup wasn't able to find a page under 1M, we cannot
305 	 * proceed.
306 	 */
307 	if (rm_platter_va == 0) {
308 		cmn_err(CE_WARN, "Cannot suspend the system because no "
309 		    "memory below 1M could be found for processor startup");
310 		return;
311 	}
312 
313 	/*
314 	 * Copy the real mode code at "real_mode_start" to the
315 	 * page at rm_platter_va.
316 	 */
317 	warm_reset_vector = map_warm_reset_vector();
318 	if (warm_reset_vector == NULL) {
319 		PMD(PMD_SX, ("i_cpr_pre_resume_cpus() returning #2\n"))
320 		return;
321 	}
322 
323 	flushes_require_xcalls = 1;
324 
325 	/*
326 	 * We lock our affinity to the master CPU to ensure that all slave CPUs
327 	 * do their TSC syncs with the same CPU.
328 	 */
329 
330 	affinity_set(CPU_CURRENT);
331 
332 	/*
333 	 * Mark the boot cpu as being ready and in the procset, since we are
334 	 * running on that cpu.
335 	 */
336 	CPUSET_ONLY(cpu_ready_set, boot_cpuid);
337 	CPUSET_ONLY(procset, boot_cpuid);
338 
339 	for (who = 0; who < max_ncpus; who++) {
340 
341 		wc_cpu_t	*cpup = wc_other_cpus + who;
342 		wc_desctbr_t	gdt;
343 
344 		if (who == boot_cpuid)
345 			continue;
346 
347 		if (!CPU_IN_SET(mp_cpus, who))
348 			continue;
349 
350 		PMD(PMD_SX, ("%s() waking up %d cpu\n", str, who))
351 
352 		bcopy(cpup, &(wp->wc_cpu), sizeof (wc_cpu_t));
353 
354 		gdt.base = cpup->wc_gdt_base;
355 		gdt.limit = cpup->wc_gdt_limit;
356 
357 		code_length = (uint32_t)((uintptr_t)wc_long_mode_64 -
358 		    (uintptr_t)wc_rm_start);
359 
360 		init_real_mode_platter(who, code_length, cpup->wc_cr4, gdt);
361 
362 		mutex_enter(&cpu_lock);
363 		err = mach_cpuid_start(who, rm_platter_va);
364 		mutex_exit(&cpu_lock);
365 		if (err != 0) {
366 			cmn_err(CE_WARN, "cpu%d: failed to start during "
367 			    "suspend/resume error %d", who, err);
368 			continue;
369 		}
370 
371 		PMD(PMD_SX, ("%s() #1 waiting for %d in procset\n", str, who))
372 
373 		if (!wait_for_set(&procset, who))
374 			continue;
375 
376 		PMD(PMD_SX, ("%s() %d cpu started\n", str, who))
377 
378 		PMD(PMD_SX, ("%s() tsc_ready = %d\n", str, get_tsc_ready()))
379 
380 		if (tsc_gethrtime_enable) {
381 			PMD(PMD_SX, ("%s() calling tsc_sync_master\n", str))
382 			tsc_sync_master(who);
383 		}
384 
385 		PMD(PMD_SX, ("%s() waiting for %d in cpu_ready_set\n", str,
386 		    who))
387 		/*
388 		 * Wait for cpu to declare that it is ready, we want the
389 		 * cpus to start serially instead of in parallel, so that
390 		 * they do not contend with each other in wc_rm_start()
391 		 */
392 		if (!wait_for_set(&cpu_ready_set, who))
393 			continue;
394 
395 		/*
396 		 * do not need to re-initialize dtrace using dtrace_cpu_init
397 		 * function
398 		 */
399 		PMD(PMD_SX, ("%s() cpu %d now ready\n", str, who))
400 	}
401 
402 	affinity_clear();
403 
404 	PMD(PMD_SX, ("%s() all cpus now ready\n", str))
405 
406 }
407 
408 static void
409 unmap_warm_reset_vector(ushort_t *warm_reset_vector)
410 {
411 	psm_unmap_phys((caddr_t)warm_reset_vector, sizeof (ushort_t *));
412 }
413 
414 /*
415  * We need to setup a 1:1 (virtual to physical) mapping for the
416  * page containing the wakeup code.
417  */
418 static struct as *save_as;	/* when switching to kas */
419 
420 static void
421 unmap_wakeaddr_1to1(uint64_t wakephys)
422 {
423 	uintptr_t	wp = (uintptr_t)wakephys;
424 	hat_setup(save_as->a_hat, 0);	/* switch back from kernel hat */
425 	hat_unload(kas.a_hat, (caddr_t)wp, PAGESIZE, HAT_UNLOAD);
426 }
427 
428 void
429 i_cpr_post_resume_cpus()
430 {
431 	uint64_t	wakephys = rm_platter_pa;
432 
433 	if (warm_reset_vector != NULL)
434 		unmap_warm_reset_vector(warm_reset_vector);
435 
436 	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
437 	    HAT_UNLOAD);
438 
439 	/*
440 	 * cmi_post_mpstartup() is only required upon boot not upon
441 	 * resume from RAM
442 	 */
443 
444 	PT(PT_UNDO1to1);
445 	/* Tear down 1:1 mapping for wakeup code */
446 	unmap_wakeaddr_1to1(wakephys);
447 }
448 
/*
 * Part of the machine-dependent cpr interface; intentionally a no-op
 * here, so flag is ignored.
 */
/* ARGSUSED */
void
i_cpr_handle_xc(int flag)
{
	/* nothing to do */
}
454 
/*
 * Always reports 0 (not supported) on this platform.
 */
int
i_cpr_reusable_supported(void)
{
	int	supported = 0;

	return (supported);
}
460 static void
461 map_wakeaddr_1to1(uint64_t wakephys)
462 {
463 	uintptr_t	wp = (uintptr_t)wakephys;
464 	hat_devload(kas.a_hat, (caddr_t)wp, PAGESIZE, btop(wakephys),
465 	    (PROT_READ|PROT_WRITE|PROT_EXEC|HAT_STORECACHING_OK|HAT_NOSYNC),
466 	    HAT_LOAD);
467 	save_as = curthread->t_procp->p_as;
468 	hat_setup(kas.a_hat, 0);	/* switch to kernel-only hat */
469 }
470 
471 
472 void
473 prt_other_cpus()
474 {
475 	int	who;
476 
477 	if (ncpus == 1) {
478 		PMD(PMD_SX, ("prt_other_cpus() other cpu table empty for "
479 		    "uniprocessor machine\n"))
480 		return;
481 	}
482 
483 	for (who = 0; who < max_ncpus; who++) {
484 
485 		wc_cpu_t	*cpup = wc_other_cpus + who;
486 
487 		if (!CPU_IN_SET(mp_cpus, who))
488 			continue;
489 
490 		PMD(PMD_SX, ("prt_other_cpus() who = %d, gdt=%p:%x, "
491 		    "idt=%p:%x, ldt=%lx, tr=%lx, kgsbase="
492 		    AFMT ", sp=%lx\n", who,
493 		    (void *)cpup->wc_gdt_base, cpup->wc_gdt_limit,
494 		    (void *)cpup->wc_idt_base, cpup->wc_idt_limit,
495 		    (long)cpup->wc_ldt, (long)cpup->wc_tr,
496 		    (long)cpup->wc_kgsbase, (long)cpup->wc_rsp))
497 	}
498 }
499 
500 /*
501  * Power down the system.
502  */
503 int
504 i_cpr_power_down(int sleeptype)
505 {
506 	caddr_t		wakevirt = rm_platter_va;
507 	uint64_t	wakephys = rm_platter_pa;
508 	ulong_t		saved_intr;
509 	uint32_t	code_length = 0;
510 	wc_desctbr_t	gdt;
511 	/*LINTED*/
512 	wakecode_t	*wp = (wakecode_t *)wakevirt;
513 	/*LINTED*/
514 	rm_platter_t	*wcpp = (rm_platter_t *)wakevirt;
515 	wc_cpu_t	*cpup = &(wp->wc_cpu);
516 	dev_info_t	*ppm;
517 	int		ret = 0;
518 	power_req_t	power_req;
519 	char *str =	"i_cpr_power_down";
520 	/*LINTED*/
521 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
522 	extern int	cpr_suspend_succeeded;
523 	extern void	kernel_wc_code();
524 
525 	ASSERT(sleeptype == CPR_TORAM);
526 	ASSERT(CPU->cpu_id == 0);
527 
528 	if ((ppm = PPM(ddi_root_node())) == NULL) {
529 		PMD(PMD_SX, ("%s: root node not claimed\n", str))
530 		return (ENOTTY);
531 	}
532 
533 	PMD(PMD_SX, ("Entering %s()\n", str))
534 
535 	PT(PT_IC);
536 	saved_intr = intr_clear();
537 
538 	PT(PT_1to1);
539 	/* Setup 1:1 mapping for wakeup code */
540 	map_wakeaddr_1to1(wakephys);
541 
542 	PMD(PMD_SX, ("ncpus=%d\n", ncpus))
543 
544 	PMD(PMD_SX, ("wc_rm_end - wc_rm_start=%lx WC_CODESIZE=%x\n",
545 	    ((size_t)((uintptr_t)wc_rm_end - (uintptr_t)wc_rm_start)),
546 	    WC_CODESIZE))
547 
548 	PMD(PMD_SX, ("wakevirt=%p, wakephys=%x\n",
549 	    (void *)wakevirt, (uint_t)wakephys))
550 
551 	ASSERT(((size_t)((uintptr_t)wc_rm_end - (uintptr_t)wc_rm_start)) <
552 	    WC_CODESIZE);
553 
554 	bzero(wakevirt, PAGESIZE);
555 
556 	/* Copy code to rm_platter */
557 	bcopy((caddr_t)wc_rm_start, wakevirt,
558 	    (size_t)((uintptr_t)wc_rm_end - (uintptr_t)wc_rm_start));
559 
560 	prt_other_cpus();
561 
562 
563 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
564 	    (ulong_t)real_mode_platter->rm_cr4, (ulong_t)getcr4()))
565 
566 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
567 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
568 
569 	real_mode_platter->rm_cr4 = getcr4();
570 	real_mode_platter->rm_pdbr = getcr3();
571 
572 	rmp_gdt_init(real_mode_platter);
573 
574 	/*
575 	 * Since the CPU needs to jump to protected mode using an identity
576 	 * mapped address, we need to calculate it here.
577 	 */
578 	real_mode_platter->rm_longmode64_addr = rm_platter_pa +
579 	    (uint32_t)((uintptr_t)wc_long_mode_64 - (uintptr_t)wc_rm_start);
580 
581 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
582 	    (ulong_t)real_mode_platter->rm_cr4, getcr4()))
583 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
584 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
585 
586 	PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
587 	    (ulong_t)real_mode_platter->rm_longmode64_addr))
588 
589 
590 	PT(PT_SC);
591 	if (wc_save_context(cpup)) {
592 
593 		ret = i_cpr_platform_alloc(&(wc_other_cpus->wc_apic_state));
594 		if (ret != 0)
595 			return (ret);
596 
597 		ret = i_cpr_save_apic(&(wc_other_cpus->wc_apic_state));
598 		PMD(PMD_SX, ("%s: i_cpr_save_apic() returned %d\n", str, ret))
599 		if (ret != 0)
600 			return (ret);
601 
602 		PMD(PMD_SX, ("wakephys=%x, kernel_wc_code=%p\n",
603 		    (uint_t)wakephys, (void *)&kernel_wc_code))
604 		PMD(PMD_SX, ("virtaddr=%lx, retaddr=%lx\n",
605 		    (long)cpup->wc_virtaddr, (long)cpup->wc_retaddr))
606 		PMD(PMD_SX, ("ebx=%x, edi=%x, esi=%x, ebp=%x, esp=%x\n",
607 		    cpup->wc_ebx, cpup->wc_edi, cpup->wc_esi, cpup->wc_ebp,
608 		    cpup->wc_esp))
609 		PMD(PMD_SX, ("cr0=%lx, cr3=%lx, cr4=%lx\n",
610 		    (long)cpup->wc_cr0, (long)cpup->wc_cr3,
611 		    (long)cpup->wc_cr4))
612 		PMD(PMD_SX, ("cs=%x, ds=%x, es=%x, ss=%x, fs=%lx, gs=%lx, "
613 		    "flgs=%lx\n", cpup->wc_cs, cpup->wc_ds, cpup->wc_es,
614 		    cpup->wc_ss, (long)cpup->wc_fs, (long)cpup->wc_gs,
615 		    (long)cpup->wc_eflags))
616 
617 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
618 		    "kgbase=%lx\n", (void *)cpup->wc_gdt_base,
619 		    cpup->wc_gdt_limit, (void *)cpup->wc_idt_base,
620 		    cpup->wc_idt_limit, (long)cpup->wc_ldt,
621 		    (long)cpup->wc_tr, (long)cpup->wc_kgsbase))
622 
623 		gdt.base = cpup->wc_gdt_base;
624 		gdt.limit = cpup->wc_gdt_limit;
625 
626 		code_length = (uint32_t)((uintptr_t)wc_long_mode_64 -
627 		    (uintptr_t)wc_rm_start);
628 
629 		init_real_mode_platter(0, code_length, cpup->wc_cr4, gdt);
630 
631 		PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
632 		    (ulong_t)wcpp->rm_cr4, getcr4()))
633 
634 		PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
635 		    (ulong_t)wcpp->rm_pdbr, getcr3()))
636 
637 		PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
638 		    (ulong_t)wcpp->rm_longmode64_addr))
639 
640 		PMD(PMD_SX,
641 		    ("real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64]=%lx\n",
642 		    (ulong_t)wcpp->rm_temp_gdt[TEMPGDT_KCODE64]))
643 
644 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
645 		    "kgsbase=%lx\n", (void *)wcpp->rm_gdt_base,
646 		    wcpp->rm_gdt_lim, (void *)wcpp->rm_idt_base,
647 		    wcpp->rm_idt_lim, (long)cpup->wc_ldt, (long)cpup->wc_tr,
648 		    (long)cpup->wc_kgsbase))
649 
650 		power_req.request_type = PMR_PPM_ENTER_SX;
651 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
652 		power_req.req.ppm_power_enter_sx_req.test_point =
653 		    cpr_test_point;
654 		power_req.req.ppm_power_enter_sx_req.wakephys = wakephys;
655 
656 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_ENTER_SX\n", str))
657 		PT(PT_PPMCTLOP);
658 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
659 		    &power_req, &ret);
660 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
661 
662 		/*
663 		 * If it works, we get control back to the else branch below
664 		 * If we get control back here, it didn't work.
665 		 * XXX return EINVAL here?
666 		 */
667 
668 		unmap_wakeaddr_1to1(wakephys);
669 		intr_restore(saved_intr);
670 
671 		return (ret);
672 	} else {
673 		cpr_suspend_succeeded = 1;
674 
675 		power_req.request_type = PMR_PPM_EXIT_SX;
676 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
677 
678 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_EXIT_SX\n", str))
679 		PT(PT_PPMCTLOP);
680 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
681 		    &power_req, &ret);
682 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
683 
684 		ret = i_cpr_restore_apic(&(wc_other_cpus->wc_apic_state));
685 		/*
686 		 * the restore should never fail, if the saved suceeded
687 		 */
688 		ASSERT(ret == 0);
689 
690 		i_cpr_platform_free(&(wc_other_cpus->wc_apic_state));
691 
692 		/*
693 		 * Enable interrupts on boot cpu.
694 		 */
695 		ASSERT(CPU->cpu_id == i_cpr_bootcpuid());
696 		mutex_enter(&cpu_lock);
697 		cpu_enable_intr(CPU);
698 		mutex_exit(&cpu_lock);
699 
700 		PT(PT_INTRRESTORE);
701 		intr_restore(saved_intr);
702 		PT(PT_CPU);
703 
704 		return (ret);
705 	}
706 }
707 
708 /*
709  * Stop all other cpu's before halting or rebooting. We pause the cpu's
710  * instead of sending a cross call.
711  * Stolen from sun4/os/mp_states.c
712  */
713 
714 static int cpu_are_paused;	/* sic */
715 
716 void
717 i_cpr_stop_other_cpus(void)
718 {
719 	mutex_enter(&cpu_lock);
720 	if (cpu_are_paused) {
721 		mutex_exit(&cpu_lock);
722 		return;
723 	}
724 	pause_cpus(NULL, NULL);
725 	cpu_are_paused = 1;
726 
727 	mutex_exit(&cpu_lock);
728 }
729 
730 int
731 i_cpr_is_supported(int sleeptype)
732 {
733 	extern int cpr_supported_override;
734 	extern int cpr_platform_enable;
735 	extern int pm_S3_enabled;
736 
737 	if (sleeptype != CPR_TORAM)
738 		return (0);
739 
740 	/*
741 	 * The next statement tests if a specific platform has turned off
742 	 * cpr support.
743 	 */
744 	if (cpr_supported_override)
745 		return (0);
746 
747 	/*
748 	 * If a platform has specifically turned on cpr support ...
749 	 */
750 	if (cpr_platform_enable)
751 		return (1);
752 
753 	return (pm_S3_enabled);
754 }
755 
/*
 * Part of the machine-dependent cpr interface; intentionally a no-op here.
 */
void
i_cpr_bitmap_cleanup(void)
{
	/* nothing to do */
}
760 
/*
 * Part of the machine-dependent cpr interface; intentionally a no-op here.
 */
void
i_cpr_free_memory_resources(void)
{
	/* nothing to do */
}
765 
766 /*
767  * Needed only for S3 so far
768  */
769 static int
770 i_cpr_platform_alloc(psm_state_request_t *req)
771 {
772 #ifdef DEBUG
773 	char	*str = "i_cpr_platform_alloc";
774 #endif
775 
776 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
777 
778 	if (psm_state == NULL) {
779 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
780 		return (0);
781 	}
782 
783 	req->psr_cmd = PSM_STATE_ALLOC;
784 	return ((*psm_state)(req));
785 }
786 
787 /*
788  * Needed only for S3 so far
789  */
790 static void
791 i_cpr_platform_free(psm_state_request_t *req)
792 {
793 #ifdef DEBUG
794 	char	*str = "i_cpr_platform_free";
795 #endif
796 
797 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
798 
799 	if (psm_state == NULL) {
800 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
801 		return;
802 	}
803 
804 	req->psr_cmd = PSM_STATE_FREE;
805 	(void) (*psm_state)(req);
806 }
807 
808 static int
809 i_cpr_save_apic(psm_state_request_t *req)
810 {
811 #ifdef DEBUG
812 	char	*str = "i_cpr_save_apic";
813 #endif
814 
815 	if (psm_state == NULL) {
816 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
817 		return (0);
818 	}
819 
820 	req->psr_cmd = PSM_STATE_SAVE;
821 	return ((*psm_state)(req));
822 }
823 
824 static int
825 i_cpr_restore_apic(psm_state_request_t *req)
826 {
827 #ifdef DEBUG
828 	char	*str = "i_cpr_restore_apic";
829 #endif
830 
831 	if (psm_state == NULL) {
832 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
833 		return (0);
834 	}
835 
836 	req->psr_cmd = PSM_STATE_RESTORE;
837 	return ((*psm_state)(req));
838 }
839 
840 static void
841 init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt)
842 {
843 	/*LINTED*/
844 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
845 
846 	/*
847 	 * Fill up the real mode platter to make it easy for real mode code to
848 	 * kick it off. This area should really be one passed by boot to kernel
849 	 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
850 	 * have identical physical and virtual address in paged mode.
851 	 */
852 
853 	real_mode_platter->rm_pdbr = getcr3();
854 	real_mode_platter->rm_cpu = cpun;
855 	real_mode_platter->rm_cr4 = cr4;
856 
857 	real_mode_platter->rm_gdt_base = gdt.base;
858 	real_mode_platter->rm_gdt_lim = gdt.limit;
859 
860 	if (getcr3() > 0xffffffffUL)
861 		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
862 		    "located above 4G in physical memory (@ 0x%llx).",
863 		    (unsigned long long)getcr3());
864 
865 	/*
866 	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
867 	 * by code in real_mode_start():
868 	 *
869 	 * GDT[0]:  NULL selector
870 	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
871 	 *
872 	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
873 	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
874 	 * a course of action as any other, though it may cause the entire
875 	 * platform to reset in some cases...
876 	 */
877 	real_mode_platter->rm_temp_gdt[0] = 0ULL;
878 	real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
879 
880 	real_mode_platter->rm_temp_gdt_lim = (ushort_t)
881 	    (sizeof (real_mode_platter->rm_temp_gdt) - 1);
882 	real_mode_platter->rm_temp_gdt_base = rm_platter_pa +
883 	    offsetof(rm_platter_t, rm_temp_gdt);
884 
885 	real_mode_platter->rm_temp_idt_lim = 0;
886 	real_mode_platter->rm_temp_idt_base = 0;
887 
888 	/*
889 	 * Since the CPU needs to jump to protected mode using an identity
890 	 * mapped address, we need to calculate it here.
891 	 */
892 	real_mode_platter->rm_longmode64_addr = rm_platter_pa + offset;
893 
894 	/* return; */
895 }
896 
897 void
898 i_cpr_start_cpu(void)
899 {
900 
901 	struct cpu *cp = CPU;
902 
903 	char *str = "i_cpr_start_cpu";
904 	extern void init_cpu_syscall(struct cpu *cp);
905 
906 	PMD(PMD_SX, ("%s() called\n", str))
907 
908 	PMD(PMD_SX, ("%s() #0 cp->cpu_base_spl %d\n", str,
909 	    cp->cpu_base_spl))
910 
911 	mutex_enter(&cpu_lock);
912 	if (cp == i_cpr_bootcpu()) {
913 		mutex_exit(&cpu_lock);
914 		PMD(PMD_SX,
915 		    ("%s() called on bootcpu nothing to do!\n", str))
916 		return;
917 	}
918 	mutex_exit(&cpu_lock);
919 
920 	/*
921 	 * We need to Sync PAT with cpu0's PAT. We have to do
922 	 * this with interrupts disabled.
923 	 */
924 	pat_sync();
925 
926 	/*
927 	 * If we use XSAVE, we need to restore XFEATURE_ENABLE_MASK register.
928 	 */
929 	if (fp_save_mech == FP_XSAVE) {
930 		setup_xfem();
931 	}
932 
933 	/*
934 	 * Initialize this CPU's syscall handlers
935 	 */
936 	init_cpu_syscall(cp);
937 
938 	PMD(PMD_SX, ("%s() #1 cp->cpu_base_spl %d\n", str, cp->cpu_base_spl))
939 
940 	/*
941 	 * Do not need to call cpuid_pass2(), cpuid_pass3(), cpuid_pass4() or
942 	 * init_cpu_info(), since the work that they do is only needed to
943 	 * be done once at boot time
944 	 */
945 
946 
947 	mutex_enter(&cpu_lock);
948 	CPUSET_ADD(procset, cp->cpu_id);
949 	mutex_exit(&cpu_lock);
950 
951 	PMD(PMD_SX, ("%s() #2 cp->cpu_base_spl %d\n", str,
952 	    cp->cpu_base_spl))
953 
954 	if (tsc_gethrtime_enable) {
955 		PMD(PMD_SX, ("%s() calling tsc_sync_slave\n", str))
956 		tsc_sync_slave();
957 	}
958 
959 	PMD(PMD_SX, ("%s() cp->cpu_id %d, cp->cpu_intr_actv %d\n", str,
960 	    cp->cpu_id, cp->cpu_intr_actv))
961 	PMD(PMD_SX, ("%s() #3 cp->cpu_base_spl %d\n", str,
962 	    cp->cpu_base_spl))
963 
964 	(void) spl0();		/* enable interrupts */
965 
966 	PMD(PMD_SX, ("%s() #4 cp->cpu_base_spl %d\n", str,
967 	    cp->cpu_base_spl))
968 
969 	/*
970 	 * Set up the CPU module for this CPU.  This can't be done before
971 	 * this CPU is made CPU_READY, because we may (in heterogeneous systems)
972 	 * need to go load another CPU module.  The act of attempting to load
973 	 * a module may trigger a cross-call, which will ASSERT unless this
974 	 * cpu is CPU_READY.
975 	 */
976 
977 	/*
978 	 * cmi already been init'd (during boot), so do not need to do it again
979 	 */
980 #ifdef PM_REINITMCAONRESUME
981 	if (is_x86_feature(x86_featureset, X86FSET_MCA))
982 		cmi_mca_init();
983 #endif
984 
985 	PMD(PMD_SX, ("%s() returning\n", str))
986 
987 	/* return; */
988 }
989 
990 void
991 i_cpr_alloc_cpus(void)
992 {
993 	char *str = "i_cpr_alloc_cpus";
994 
995 	PMD(PMD_SX, ("%s() CPU->cpu_id %d\n", str, CPU->cpu_id))
996 	/*
997 	 * we allocate this only when we actually need it to save on
998 	 * kernel memory
999 	 */
1000 
1001 	if (wc_other_cpus == NULL) {
1002 		wc_other_cpus = kmem_zalloc(max_ncpus * sizeof (wc_cpu_t),
1003 		    KM_SLEEP);
1004 	}
1005 
1006 }
1007 
1008 void
1009 i_cpr_free_cpus(void)
1010 {
1011 	int index;
1012 	wc_cpu_t *wc_cpu;
1013 
1014 	if (wc_other_cpus != NULL) {
1015 		for (index = 0; index < max_ncpus; index++) {
1016 			wc_cpu = wc_other_cpus + index;
1017 			if (wc_cpu->wc_saved_stack != NULL) {
1018 				kmem_free(wc_cpu->wc_saved_stack,
1019 				    wc_cpu->wc_saved_stack_size);
1020 			}
1021 		}
1022 
1023 		kmem_free((void *) wc_other_cpus,
1024 		    max_ncpus * sizeof (wc_cpu_t));
1025 		wc_other_cpus = NULL;
1026 	}
1027 }
1028 
1029 /*
1030  * wrapper for acpica_ddi_save_resources()
1031  */
1032 void
1033 i_cpr_save_configuration(dev_info_t *dip)
1034 {
1035 	acpica_ddi_save_resources(dip);
1036 }
1037 
1038 /*
1039  * wrapper for acpica_ddi_restore_resources()
1040  */
1041 void
1042 i_cpr_restore_configuration(dev_info_t *dip)
1043 {
1044 	acpica_ddi_restore_resources(dip);
1045 }
1046 
1047 static int
1048 wait_for_set(cpuset_t *set, int who)
1049 {
1050 	int delays;
1051 	char *str = "wait_for_set";
1052 
1053 	for (delays = 0; !CPU_IN_SET(*set, who); delays++) {
1054 		if (delays == 500) {
1055 			/*
1056 			 * After five seconds, things are probably
1057 			 * looking a bit bleak - explain the hang.
1058 			 */
1059 			cmn_err(CE_NOTE, "cpu%d: started, "
1060 			    "but not running in the kernel yet", who);
1061 			PMD(PMD_SX, ("%s() %d cpu started "
1062 			    "but not running in the kernel yet\n",
1063 			    str, who))
1064 		} else if (delays > 2000) {
1065 			/*
1066 			 * We waited at least 20 seconds, bail ..
1067 			 */
1068 			cmn_err(CE_WARN, "cpu%d: timed out", who);
1069 			PMD(PMD_SX, ("%s() %d cpu timed out\n",
1070 			    str, who))
1071 			return (0);
1072 		}
1073 
1074 		/*
1075 		 * wait at least 10ms, then check again..
1076 		 */
1077 		drv_usecwait(10000);
1078 	}
1079 
1080 	return (1);
1081 }
1082 
1083 static	void
1084 i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu)
1085 {
1086 	size_t	stack_size;	/* size of stack */
1087 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1088 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1089 
1090 	stack_size = (size_t)end - (size_t)start;
1091 
1092 	if (wc_cpu->wc_saved_stack_size < stack_size) {
1093 		if (wc_cpu->wc_saved_stack != NULL) {
1094 			kmem_free(wc_cpu->wc_saved_stack,
1095 			    wc_cpu->wc_saved_stack_size);
1096 		}
1097 		wc_cpu->wc_saved_stack = kmem_zalloc(stack_size, KM_SLEEP);
1098 		wc_cpu->wc_saved_stack_size = stack_size;
1099 	}
1100 
1101 	bcopy(start, wc_cpu->wc_saved_stack, stack_size);
1102 }
1103 
1104 void
1105 i_cpr_restore_stack(kthread_t *t, greg_t *save_stack)
1106 {
1107 	size_t	stack_size;	/* size of stack */
1108 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1109 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1110 
1111 	stack_size = (size_t)end - (size_t)start;
1112 
1113 	bcopy(save_stack, start, stack_size);
1114 }
1115