xref: /illumos-gate/usr/src/uts/i86pc/os/cpr_impl.c (revision b7b0558ae6cf66a2c72f08f9104c1559d962bc84)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Platform specific implementation code
28  * Currently only suspend to RAM is supported (ACPI S3)
29  */
30 
31 #define	SUNDDI_IMPL
32 
33 #include <sys/types.h>
34 #include <sys/promif.h>
35 #include <sys/prom_isa.h>
36 #include <sys/prom_plat.h>
37 #include <sys/cpuvar.h>
38 #include <sys/pte.h>
39 #include <vm/hat.h>
40 #include <vm/page.h>
41 #include <vm/as.h>
42 #include <sys/cpr.h>
43 #include <sys/kmem.h>
44 #include <sys/clock.h>
45 #include <sys/kmem.h>
46 #include <sys/panic.h>
47 #include <vm/seg_kmem.h>
48 #include <sys/cpu_module.h>
49 #include <sys/callb.h>
50 #include <sys/machsystm.h>
51 #include <sys/vmsystm.h>
52 #include <sys/systm.h>
53 #include <sys/archsystm.h>
54 #include <sys/stack.h>
55 #include <sys/fs/ufs_fs.h>
56 #include <sys/memlist.h>
57 #include <sys/bootconf.h>
58 #include <sys/thread.h>
59 #include <sys/x_call.h>
60 #include <sys/smp_impldefs.h>
61 #include <vm/vm_dep.h>
62 #include <sys/psm.h>
63 #include <sys/epm.h>
64 #include <sys/cpr_wakecode.h>
65 #include <sys/x86_archext.h>
66 #include <sys/reboot.h>
67 #include <sys/acpi/acpi.h>
68 #include <sys/acpica.h>
69 
/* printf-style conversion used by prt_other_cpus() when printing kgsbase */
#define	AFMT	"%lx"

/* Kernel globals defined outside this file */
extern int	flushes_require_xcalls;
extern cpuset_t	cpu_ready_set;

#if defined(__amd64)
/* Referenced below only for its address, relative to wc_rm_start */
extern void	*wc_long_mode_64(void);
#endif	/* __amd64 */
extern int	tsc_gethrtime_enable;
extern	void	i_cpr_start_cpu(void);

ushort_t	cpr_mach_type = CPR_MACHTYPE_X86;
/* Hook initialized to i_cpr_start_cpu(); it is not invoked in this file */
void		(*cpr_start_cpu_func)(void) = i_cpr_start_cpu;

/* Per-CPU saved contexts, indexed by cpu id; see i_cpr_alloc_cpus() */
static wc_cpu_t	*wc_other_cpus = NULL;
/* CPUs that have added themselves in i_cpr_start_cpu() during resume */
static cpuset_t procset;

/* Forward declarations of helpers defined later in this file */
static void
init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt);

static int i_cpr_platform_alloc(psm_state_request_t *req);
static void i_cpr_platform_free(psm_state_request_t *req);
static int i_cpr_save_apic(psm_state_request_t *req);
static int i_cpr_restore_apic(psm_state_request_t *req);
static int wait_for_set(cpuset_t *set, int who);

static	void i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu);
void i_cpr_restore_stack(kthread_t *t, greg_t *save_stack);

/*
 * Bounds of a thread's stack as used by i_cpr_save_stack() and
 * i_cpr_restore_stack(), which compute the stack size as END - START.
 * Which of t_stk / t_stkbase plays which role depends on the direction
 * of stack growth.
 */
#ifdef STACK_GROWTH_DOWN
#define	CPR_GET_STACK_START(t) ((t)->t_stkbase)
#define	CPR_GET_STACK_END(t) ((t)->t_stk)
#else
#define	CPR_GET_STACK_START(t) ((t)->t_stk)
#define	CPR_GET_STACK_END(t) ((t)->t_stkbase)
#endif	/* STACK_GROWTH_DOWN */
106 
107 /*
108  * restart paused slave cpus
109  */
110 void
111 i_cpr_machdep_setup(void)
112 {
113 	if (ncpus > 1) {
114 		CPR_DEBUG(CPR_DEBUG1, ("MP restarted...\n"));
115 		mutex_enter(&cpu_lock);
116 		start_cpus();
117 		mutex_exit(&cpu_lock);
118 	}
119 }
120 
121 
/*
 * Stop all interrupt activities in the system
 *
 * Implemented as a single spl7() call; its return value is discarded.
 */
void
i_cpr_stop_intr(void)
{
	(void) spl7();
}
130 
/*
 * Set machine up to take interrupts
 *
 * Implemented as a single spl0() call; its return value is discarded.
 */
void
i_cpr_enable_intr(void)
{
	(void) spl0();
}
139 
/*
 * Save miscellaneous information which needs to be written to the
 * state file.  This information is required to re-initialize
 * kernel/prom handshaking.
 *
 * In this implementation the routine saves nothing: it only asserts on a
 * local that is always zero, so the ASSERT fires whenever assertions are
 * compiled in.  In other words, this entry point is not expected to be
 * called here.
 */
void
i_cpr_save_machdep_info(void)
{
	int notcalled = 0;	/* always false; see ASSERT below */
	ASSERT(notcalled);
}
151 
152 
/*
 * Intentionally empty: this implementation performs no work here.
 */
void
i_cpr_set_tbr(void)
{
}
157 
158 
/*
 * Return the processor id of the boot cpu, which is hard-wired to 0 here.
 */
processorid_t
i_cpr_bootcpuid(void)
{
	return (0);
}
164 
165 /*
166  * cpu0 should contain bootcpu info
167  */
168 cpu_t *
169 i_cpr_bootcpu(void)
170 {
171 	ASSERT(MUTEX_HELD(&cpu_lock));
172 
173 	return (cpu_get(i_cpr_bootcpuid()));
174 }
175 
/*
 *	Save context for the specified CPU
 *
 * arg is the CPU's index into the wc_other_cpus array, passed as a
 * pointer-sized integer.  The routine saves the CPU's APIC state and the
 * current thread's stack and then calls wc_save_context(), which returns
 * twice: once while suspending and once when this CPU is resumed.
 * Always returns NULL.
 */
void *
i_cpr_save_context(void *arg)
{
	long	index = (long)arg;
	psm_state_request_t *papic_state;
	int resuming;
	int	ret;
	wc_cpu_t	*wc_cpu = wc_other_cpus + index;

	PMD(PMD_SX, ("i_cpr_save_context() index = %ld\n", index))

	/*
	 * NOTE(review): wc_cpu has already been computed from index above,
	 * and i_cpr_alloc_cpus() sizes the array by ncpus rather than NCPU.
	 */
	ASSERT(index < NCPU);

	papic_state = &(wc_cpu)->wc_apic_state;

	/* Failures of alloc/save are only caught when ASSERTs are enabled */
	ret = i_cpr_platform_alloc(papic_state);
	ASSERT(ret == 0);

	ret = i_cpr_save_apic(papic_state);
	ASSERT(ret == 0);

	i_cpr_save_stack(curthread, wc_cpu);

	/*
	 * wc_save_context returns twice, once when suspending and
	 * once when resuming.  It returns non-zero when suspending and
	 * 0 upon resume, which is why "resuming" is derived by comparing
	 * the return value against 0 (i_cpr_power_down() tests the return
	 * value the same way).
	 */
	resuming = (wc_save_context(wc_cpu) == 0);

	/*
	 * do NOT call any functions after this point, because doing so
	 * will modify the stack that we are running on
	 *
	 * NOTE(review): both branches below do call functions; presumably
	 * the warning applies to the window before the saved stack has been
	 * put back on resume -- confirm against the wakeup code.
	 */

	if (resuming) {

		ret = i_cpr_restore_apic(papic_state);
		ASSERT(ret == 0);

		i_cpr_platform_free(papic_state);

		/*
		 * Enable interrupts on this cpu.
		 * Do not bind interrupts to this CPU's local APIC until
		 * the CPU is ready to receive interrupts.
		 */
		ASSERT(CPU->cpu_id != i_cpr_bootcpuid());
		mutex_enter(&cpu_lock);
		cpu_enable_intr(CPU);
		mutex_exit(&cpu_lock);

		/*
		 * Setting the bit in cpu_ready_set must be the last operation
		 * in processor initialization; the boot CPU will continue to
		 * boot once it sees this bit set for all active CPUs.
		 */
		CPUSET_ATOMIC_ADD(cpu_ready_set, CPU->cpu_id);

		PMD(PMD_SX,
		    ("i_cpr_save_context() resuming cpu %d in cpu_ready_set\n",
		    CPU->cpu_id))
	} else {
		/*
		 * Disable interrupts on this CPU so that PSM knows not to bind
		 * interrupts here on resume until the CPU has executed
		 * cpu_enable_intr() (above) in the resume path.
		 * We explicitly do not grab cpu_lock here because at this point
		 * in the suspend process, the boot cpu owns cpu_lock and all
		 * other cpus are also executing in the pause thread (only
		 * modifying their respective CPU structure).
		 */
		(void) cpu_disable_intr(CPU);
	}

	/*
	 * Note that the value printed is "resuming" (1 on resume, 0 on
	 * suspend), not the raw return value of wc_save_context().
	 */
	PMD(PMD_SX, ("i_cpr_save_context: wc_save_context returns %d\n",
	    resuming))

	return (NULL);
}
259 
260 static ushort_t *warm_reset_vector = NULL;
261 
262 static ushort_t *
263 map_warm_reset_vector()
264 {
265 	/*LINTED*/
266 	if (!(warm_reset_vector = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
267 	    sizeof (ushort_t *), PROT_READ|PROT_WRITE)))
268 		return (NULL);
269 
270 	/*
271 	 * setup secondary cpu bios boot up vector
272 	 */
273 	*warm_reset_vector = (ushort_t)((caddr_t)
274 	    /*LINTED*/
275 	    ((struct rm_platter *)rm_platter_va)->rm_code - rm_platter_va
276 	    + ((ulong_t)rm_platter_va & 0xf));
277 	warm_reset_vector++;
278 	*warm_reset_vector = (ushort_t)(rm_platter_pa >> 4);
279 
280 	--warm_reset_vector;
281 	return (warm_reset_vector);
282 }
283 
284 void
285 i_cpr_pre_resume_cpus()
286 {
287 	/*
288 	 * this is a cut down version of start_other_cpus()
289 	 * just do the initialization to wake the other cpus
290 	 */
291 	unsigned who;
292 	int boot_cpuid = i_cpr_bootcpuid();
293 	uint32_t		code_length = 0;
294 	caddr_t			wakevirt = rm_platter_va;
295 	/*LINTED*/
296 	wakecode_t		*wp = (wakecode_t *)wakevirt;
297 	char *str = "i_cpr_pre_resume_cpus";
298 	extern int get_tsc_ready();
299 	int err;
300 
301 	/*LINTED*/
302 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
303 
304 	/*
305 	 * If startup wasn't able to find a page under 1M, we cannot
306 	 * proceed.
307 	 */
308 	if (rm_platter_va == 0) {
309 		cmn_err(CE_WARN, "Cannot suspend the system because no "
310 		    "memory below 1M could be found for processor startup");
311 		return;
312 	}
313 
314 	/*
315 	 * Copy the real mode code at "real_mode_start" to the
316 	 * page at rm_platter_va.
317 	 */
318 	warm_reset_vector = map_warm_reset_vector();
319 	if (warm_reset_vector == NULL) {
320 		PMD(PMD_SX, ("i_cpr_pre_resume_cpus() returning #2\n"))
321 		return;
322 	}
323 
324 	flushes_require_xcalls = 1;
325 
326 	/*
327 	 * We lock our affinity to the master CPU to ensure that all slave CPUs
328 	 * do their TSC syncs with the same CPU.
329 	 */
330 
331 	affinity_set(CPU_CURRENT);
332 
333 	/*
334 	 * Mark the boot cpu as being ready and in the procset, since we are
335 	 * running on that cpu.
336 	 */
337 	CPUSET_ONLY(cpu_ready_set, boot_cpuid);
338 	CPUSET_ONLY(procset, boot_cpuid);
339 
340 	for (who = 0; who < ncpus; who++) {
341 
342 		wc_cpu_t	*cpup = wc_other_cpus + who;
343 		wc_desctbr_t	gdt;
344 
345 		if (who == boot_cpuid)
346 			continue;
347 
348 		if (!CPU_IN_SET(mp_cpus, who))
349 			continue;
350 
351 		PMD(PMD_SX, ("%s() waking up %d cpu\n", str, who))
352 
353 		bcopy(cpup, &(wp->wc_cpu), sizeof (wc_cpu_t));
354 
355 		gdt.base = cpup->wc_gdt_base;
356 		gdt.limit = cpup->wc_gdt_limit;
357 
358 #if defined(__amd64)
359 		code_length = (uint32_t)wc_long_mode_64 - (uint32_t)wc_rm_start;
360 #else
361 		code_length = 0;
362 #endif
363 
364 		init_real_mode_platter(who, code_length, cpup->wc_cr4, gdt);
365 
366 		if ((err = mach_cpuid_start(who, rm_platter_va)) != 0) {
367 			cmn_err(CE_WARN, "cpu%d: failed to start during "
368 			    "suspend/resume error %d", who, err);
369 			continue;
370 		}
371 
372 		PMD(PMD_SX, ("%s() #1 waiting for %d in procset\n", str, who))
373 
374 		if (!wait_for_set(&procset, who))
375 			continue;
376 
377 		PMD(PMD_SX, ("%s() %d cpu started\n", str, who))
378 
379 		PMD(PMD_SX, ("%s() tsc_ready = %d\n", str, get_tsc_ready()))
380 
381 		if (tsc_gethrtime_enable) {
382 			PMD(PMD_SX, ("%s() calling tsc_sync_master\n", str))
383 			tsc_sync_master(who);
384 		}
385 
386 		PMD(PMD_SX, ("%s() waiting for %d in cpu_ready_set\n", str,
387 		    who))
388 		/*
389 		 * Wait for cpu to declare that it is ready, we want the
390 		 * cpus to start serially instead of in parallel, so that
391 		 * they do not contend with each other in wc_rm_start()
392 		 */
393 		if (!wait_for_set(&cpu_ready_set, who))
394 			continue;
395 
396 		/*
397 		 * do not need to re-initialize dtrace using dtrace_cpu_init
398 		 * function
399 		 */
400 		PMD(PMD_SX, ("%s() cpu %d now ready\n", str, who))
401 	}
402 
403 	affinity_clear();
404 
405 	PMD(PMD_SX, ("%s() all cpus now ready\n", str))
406 
407 }
408 
/*
 * Release a mapping obtained by map_warm_reset_vector().  The size passed
 * to psm_unmap_phys() matches the size used when the mapping was created.
 * Note that the parameter shadows the file-scope variable of the same
 * name; only the argument is used, and the file-scope pointer is not
 * modified.
 */
static void
unmap_warm_reset_vector(ushort_t *warm_reset_vector)
{
	psm_unmap_phys((caddr_t)warm_reset_vector, sizeof (ushort_t *));
}
414 
415 /*
416  * We need to setup a 1:1 (virtual to physical) mapping for the
417  * page containing the wakeup code.
418  */
419 static struct as *save_as;	/* when switching to kas */
420 
421 static void
422 unmap_wakeaddr_1to1(uint64_t wakephys)
423 {
424 	uintptr_t	wp = (uintptr_t)wakephys;
425 	hat_setup(save_as->a_hat, 0);	/* switch back from kernel hat */
426 	hat_unload(kas.a_hat, (caddr_t)wp, PAGESIZE, HAT_UNLOAD);
427 }
428 
429 void
430 i_cpr_post_resume_cpus()
431 {
432 	uint64_t	wakephys = rm_platter_pa;
433 
434 	if (warm_reset_vector != NULL)
435 		unmap_warm_reset_vector(warm_reset_vector);
436 
437 	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
438 	    HAT_UNLOAD);
439 
440 	/*
441 	 * cmi_post_mpstartup() is only required upon boot not upon
442 	 * resume from RAM
443 	 */
444 
445 	PT(PT_UNDO1to1);
446 	/* Tear down 1:1 mapping for wakeup code */
447 	unmap_wakeaddr_1to1(wakephys);
448 }
449 
/*
 * Intentionally empty: the flag argument is ignored and no work is done.
 */
/* ARGSUSED */
void
i_cpr_handle_xc(int flag)
{
}
455 
/*
 * Report whether the "reusable" feature is supported.  This
 * implementation always answers no (0).
 */
int
i_cpr_reusable_supported(void)
{
	const int	supported = 0;

	return (supported);
}
461 static void
462 map_wakeaddr_1to1(uint64_t wakephys)
463 {
464 	uintptr_t	wp = (uintptr_t)wakephys;
465 	hat_devload(kas.a_hat, (caddr_t)wp, PAGESIZE, btop(wakephys),
466 	    (PROT_READ|PROT_WRITE|PROT_EXEC|HAT_STORECACHING_OK|HAT_NOSYNC),
467 	    HAT_LOAD);
468 	save_as = curthread->t_procp->p_as;
469 	hat_setup(kas.a_hat, 0);	/* switch to kernel-only hat */
470 }
471 
472 
473 void
474 prt_other_cpus()
475 {
476 	int	who;
477 
478 	if (ncpus == 1) {
479 		PMD(PMD_SX, ("prt_other_cpus() other cpu table empty for "
480 		    "uniprocessor machine\n"))
481 		return;
482 	}
483 
484 	for (who = 0; who < ncpus; who++) {
485 
486 		wc_cpu_t	*cpup = wc_other_cpus + who;
487 
488 		PMD(PMD_SX, ("prt_other_cpus() who = %d, gdt=%p:%x, "
489 		    "idt=%p:%x, ldt=%lx, tr=%lx, kgsbase="
490 		    AFMT ", sp=%lx\n", who,
491 		    (void *)cpup->wc_gdt_base, cpup->wc_gdt_limit,
492 		    (void *)cpup->wc_idt_base, cpup->wc_idt_limit,
493 		    (long)cpup->wc_ldt, (long)cpup->wc_tr,
494 		    (long)cpup->wc_kgsbase, (long)cpup->wc_rsp))
495 	}
496 }
497 
498 /*
499  * Power down the system.
500  */
501 int
502 i_cpr_power_down(int sleeptype)
503 {
504 	caddr_t		wakevirt = rm_platter_va;
505 	uint64_t	wakephys = rm_platter_pa;
506 	ulong_t		saved_intr;
507 	uint32_t	code_length = 0;
508 	wc_desctbr_t	gdt;
509 	/*LINTED*/
510 	wakecode_t	*wp = (wakecode_t *)wakevirt;
511 	/*LINTED*/
512 	rm_platter_t	*wcpp = (rm_platter_t *)wakevirt;
513 	wc_cpu_t	*cpup = &(wp->wc_cpu);
514 	dev_info_t	*ppm;
515 	int		ret = 0;
516 	power_req_t	power_req;
517 	char *str =	"i_cpr_power_down";
518 #if defined(__amd64)
519 	/*LINTED*/
520 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
521 #endif
522 	extern int	cpr_suspend_succeeded;
523 	extern void	kernel_wc_code();
524 
525 	ASSERT(sleeptype == CPR_TORAM);
526 	ASSERT(CPU->cpu_id == 0);
527 
528 	if ((ppm = PPM(ddi_root_node())) == NULL) {
529 		PMD(PMD_SX, ("%s: root node not claimed\n", str))
530 		return (ENOTTY);
531 	}
532 
533 	PMD(PMD_SX, ("Entering %s()\n", str))
534 
535 	PT(PT_IC);
536 	saved_intr = intr_clear();
537 
538 	PT(PT_1to1);
539 	/* Setup 1:1 mapping for wakeup code */
540 	map_wakeaddr_1to1(wakephys);
541 
542 	PMD(PMD_SX, ("ncpus=%d\n", ncpus))
543 
544 	PMD(PMD_SX, ("wc_rm_end - wc_rm_start=%lx WC_CODESIZE=%x\n",
545 	    ((size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start)), WC_CODESIZE))
546 
547 	PMD(PMD_SX, ("wakevirt=%p, wakephys=%x\n",
548 	    (void *)wakevirt, (uint_t)wakephys))
549 
550 	ASSERT(((size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start)) <
551 	    WC_CODESIZE);
552 
553 	bzero(wakevirt, PAGESIZE);
554 
555 	/* Copy code to rm_platter */
556 	bcopy((caddr_t)wc_rm_start, wakevirt,
557 	    (size_t)((uint_t)wc_rm_end - (uint_t)wc_rm_start));
558 
559 	prt_other_cpus();
560 
561 #if defined(__amd64)
562 
563 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
564 	    (ulong_t)real_mode_platter->rm_cr4, (ulong_t)getcr4()))
565 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
566 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
567 
568 	real_mode_platter->rm_cr4 = getcr4();
569 	real_mode_platter->rm_pdbr = getcr3();
570 
571 	rmp_gdt_init(real_mode_platter);
572 
573 	/*
574 	 * Since the CPU needs to jump to protected mode using an identity
575 	 * mapped address, we need to calculate it here.
576 	 */
577 	real_mode_platter->rm_longmode64_addr = rm_platter_pa +
578 	    ((uint32_t)wc_long_mode_64 - (uint32_t)wc_rm_start);
579 
580 	PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
581 	    (ulong_t)real_mode_platter->rm_cr4, getcr4()))
582 
583 	PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
584 	    (ulong_t)real_mode_platter->rm_pdbr, getcr3()))
585 
586 	PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
587 	    (ulong_t)real_mode_platter->rm_longmode64_addr))
588 
589 #endif
590 
591 	PT(PT_SC);
592 	if (wc_save_context(cpup)) {
593 
594 		ret = i_cpr_platform_alloc(&(wc_other_cpus->wc_apic_state));
595 		if (ret != 0)
596 			return (ret);
597 
598 		ret = i_cpr_save_apic(&(wc_other_cpus->wc_apic_state));
599 		PMD(PMD_SX, ("%s: i_cpr_save_apic() returned %d\n", str, ret))
600 		if (ret != 0)
601 			return (ret);
602 
603 		PMD(PMD_SX, ("wakephys=%x, kernel_wc_code=%p\n",
604 		    (uint_t)wakephys, (void *)&kernel_wc_code))
605 		PMD(PMD_SX, ("virtaddr=%lx, retaddr=%lx\n",
606 		    (long)cpup->wc_virtaddr, (long)cpup->wc_retaddr))
607 		PMD(PMD_SX, ("ebx=%x, edi=%x, esi=%x, ebp=%x, esp=%x\n",
608 		    cpup->wc_ebx, cpup->wc_edi, cpup->wc_esi, cpup->wc_ebp,
609 		    cpup->wc_esp))
610 		PMD(PMD_SX, ("cr0=%lx, cr3=%lx, cr4=%lx\n",
611 		    (long)cpup->wc_cr0, (long)cpup->wc_cr3,
612 		    (long)cpup->wc_cr4))
613 		PMD(PMD_SX, ("cs=%x, ds=%x, es=%x, ss=%x, fs=%lx, gs=%lx, "
614 		    "flgs=%lx\n", cpup->wc_cs, cpup->wc_ds, cpup->wc_es,
615 		    cpup->wc_ss, (long)cpup->wc_fs, (long)cpup->wc_gs,
616 		    (long)cpup->wc_eflags))
617 
618 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
619 		    "kgbase=%lx\n", (void *)cpup->wc_gdt_base,
620 		    cpup->wc_gdt_limit, (void *)cpup->wc_idt_base,
621 		    cpup->wc_idt_limit, (long)cpup->wc_ldt,
622 		    (long)cpup->wc_tr, (long)cpup->wc_kgsbase))
623 
624 		gdt.base = cpup->wc_gdt_base;
625 		gdt.limit = cpup->wc_gdt_limit;
626 
627 #if defined(__amd64)
628 		code_length = (uint32_t)wc_long_mode_64 -
629 		    (uint32_t)wc_rm_start;
630 #else
631 		code_length = 0;
632 #endif
633 
634 		init_real_mode_platter(0, code_length, cpup->wc_cr4, gdt);
635 
636 #if defined(__amd64)
637 		PMD(PMD_SX, ("real_mode_platter->rm_cr4=%lx, getcr4()=%lx\n",
638 		    (ulong_t)wcpp->rm_cr4, getcr4()))
639 
640 		PMD(PMD_SX, ("real_mode_platter->rm_pdbr=%lx, getcr3()=%lx\n",
641 		    (ulong_t)wcpp->rm_pdbr, getcr3()))
642 
643 		PMD(PMD_SX, ("real_mode_platter->rm_longmode64_addr=%lx\n",
644 		    (ulong_t)wcpp->rm_longmode64_addr))
645 
646 		PMD(PMD_SX,
647 		    ("real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64]=%lx\n",
648 		    (ulong_t)wcpp->rm_temp_gdt[TEMPGDT_KCODE64]))
649 #endif
650 
651 		PMD(PMD_SX, ("gdt=%p:%x, idt=%p:%x, ldt=%lx, tr=%lx, "
652 		    "kgsbase=%lx\n", (void *)wcpp->rm_gdt_base,
653 		    wcpp->rm_gdt_lim, (void *)wcpp->rm_idt_base,
654 		    wcpp->rm_idt_lim, (long)cpup->wc_ldt, (long)cpup->wc_tr,
655 		    (long)cpup->wc_kgsbase))
656 
657 		power_req.request_type = PMR_PPM_ENTER_SX;
658 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
659 		power_req.req.ppm_power_enter_sx_req.test_point =
660 		    cpr_test_point;
661 		power_req.req.ppm_power_enter_sx_req.wakephys = wakephys;
662 
663 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_ENTER_SX\n", str))
664 		PT(PT_PPMCTLOP);
665 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
666 		    &power_req, &ret);
667 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
668 
669 		/*
670 		 * If it works, we get control back to the else branch below
671 		 * If we get control back here, it didn't work.
672 		 * XXX return EINVAL here?
673 		 */
674 
675 		unmap_wakeaddr_1to1(wakephys);
676 		intr_restore(saved_intr);
677 
678 		return (ret);
679 	} else {
680 		cpr_suspend_succeeded = 1;
681 
682 		power_req.request_type = PMR_PPM_EXIT_SX;
683 		power_req.req.ppm_power_enter_sx_req.sx_state = S3;
684 
685 		PMD(PMD_SX, ("%s: pm_ctlops PMR_PPM_EXIT_SX\n", str))
686 		PT(PT_PPMCTLOP);
687 		(void) pm_ctlops(ppm, ddi_root_node(), DDI_CTLOPS_POWER,
688 		    &power_req, &ret);
689 		PMD(PMD_SX, ("%s: returns %d\n", str, ret))
690 
691 		ret = i_cpr_restore_apic(&(wc_other_cpus->wc_apic_state));
692 		/*
693 		 * the restore should never fail, if the saved suceeded
694 		 */
695 		ASSERT(ret == 0);
696 
697 		i_cpr_platform_free(&(wc_other_cpus->wc_apic_state));
698 
699 		/*
700 		 * Enable interrupts on boot cpu.
701 		 */
702 		ASSERT(CPU->cpu_id == i_cpr_bootcpuid());
703 		mutex_enter(&cpu_lock);
704 		cpu_enable_intr(CPU);
705 		mutex_exit(&cpu_lock);
706 
707 		PT(PT_INTRRESTORE);
708 		intr_restore(saved_intr);
709 		PT(PT_CPU);
710 
711 		return (ret);
712 	}
713 }
714 
715 /*
716  * Stop all other cpu's before halting or rebooting. We pause the cpu's
717  * instead of sending a cross call.
718  * Stolen from sun4/os/mp_states.c
719  */
720 
721 static int cpu_are_paused;	/* sic */
722 
723 void
724 i_cpr_stop_other_cpus(void)
725 {
726 	mutex_enter(&cpu_lock);
727 	if (cpu_are_paused) {
728 		mutex_exit(&cpu_lock);
729 		return;
730 	}
731 	pause_cpus(NULL);
732 	cpu_are_paused = 1;
733 
734 	mutex_exit(&cpu_lock);
735 }
736 
737 int
738 i_cpr_is_supported(int sleeptype)
739 {
740 	extern int cpr_supported_override;
741 	extern int cpr_platform_enable;
742 	extern int pm_S3_enabled;
743 
744 	if (sleeptype != CPR_TORAM)
745 		return (0);
746 
747 	/*
748 	 * The next statement tests if a specific platform has turned off
749 	 * cpr support.
750 	 */
751 	if (cpr_supported_override)
752 		return (0);
753 
754 	/*
755 	 * If a platform has specifically turned on cpr support ...
756 	 */
757 	if (cpr_platform_enable)
758 		return (1);
759 
760 	return (pm_S3_enabled);
761 }
762 
/*
 * Intentionally empty: this implementation performs no work here.
 */
void
i_cpr_bitmap_cleanup(void)
{
}
767 
/*
 * Intentionally empty: this implementation performs no work here.
 */
void
i_cpr_free_memory_resources(void)
{
}
772 
773 /*
774  * Needed only for S3 so far
775  */
776 static int
777 i_cpr_platform_alloc(psm_state_request_t *req)
778 {
779 #ifdef DEBUG
780 	char	*str = "i_cpr_platform_alloc";
781 #endif
782 
783 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
784 
785 	if (psm_state == NULL) {
786 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
787 		return (0);
788 	}
789 
790 	req->psr_cmd = PSM_STATE_ALLOC;
791 	return ((*psm_state)(req));
792 }
793 
794 /*
795  * Needed only for S3 so far
796  */
797 static void
798 i_cpr_platform_free(psm_state_request_t *req)
799 {
800 #ifdef DEBUG
801 	char	*str = "i_cpr_platform_free";
802 #endif
803 
804 	PMD(PMD_SX, ("cpu = %d, %s(%p) \n", CPU->cpu_id, str, (void *)req))
805 
806 	if (psm_state == NULL) {
807 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
808 		return;
809 	}
810 
811 	req->psr_cmd = PSM_STATE_FREE;
812 	(void) (*psm_state)(req);
813 }
814 
815 static int
816 i_cpr_save_apic(psm_state_request_t *req)
817 {
818 #ifdef DEBUG
819 	char	*str = "i_cpr_save_apic";
820 #endif
821 
822 	if (psm_state == NULL) {
823 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
824 		return (0);
825 	}
826 
827 	req->psr_cmd = PSM_STATE_SAVE;
828 	return ((*psm_state)(req));
829 }
830 
831 static int
832 i_cpr_restore_apic(psm_state_request_t *req)
833 {
834 #ifdef DEBUG
835 	char	*str = "i_cpr_restore_apic";
836 #endif
837 
838 	if (psm_state == NULL) {
839 		PMD(PMD_SX, ("%s() : psm_state == NULL\n", str))
840 		return (0);
841 	}
842 
843 	req->psr_cmd = PSM_STATE_RESTORE;
844 	return ((*psm_state)(req));
845 }
846 
847 
848 /* stop lint complaining about offset not being used in 32bit mode */
849 #if !defined(__amd64)
850 /*ARGSUSED*/
851 #endif
852 static void
853 init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt)
854 {
855 	/*LINTED*/
856 	rm_platter_t *real_mode_platter = (rm_platter_t *)rm_platter_va;
857 
858 	/*
859 	 * Fill up the real mode platter to make it easy for real mode code to
860 	 * kick it off. This area should really be one passed by boot to kernel
861 	 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
862 	 * have identical physical and virtual address in paged mode.
863 	 */
864 
865 	real_mode_platter->rm_pdbr = getcr3();
866 	real_mode_platter->rm_cpu = cpun;
867 	real_mode_platter->rm_cr4 = cr4;
868 
869 	real_mode_platter->rm_gdt_base = gdt.base;
870 	real_mode_platter->rm_gdt_lim = gdt.limit;
871 
872 #if defined(__amd64)
873 	real_mode_platter->rm_x86feature = x86_feature;
874 
875 	if (getcr3() > 0xffffffffUL)
876 		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
877 		    "located above 4G in physical memory (@ 0x%llx).",
878 		    (unsigned long long)getcr3());
879 
880 	/*
881 	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
882 	 * by code in real_mode_start():
883 	 *
884 	 * GDT[0]:  NULL selector
885 	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
886 	 *
887 	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
888 	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
889 	 * a course of action as any other, though it may cause the entire
890 	 * platform to reset in some cases...
891 	 */
892 	real_mode_platter->rm_temp_gdt[0] = 0ULL;
893 	real_mode_platter->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
894 
895 	real_mode_platter->rm_temp_gdt_lim = (ushort_t)
896 	    (sizeof (real_mode_platter->rm_temp_gdt) - 1);
897 	real_mode_platter->rm_temp_gdt_base = rm_platter_pa +
898 	    (uint32_t)(&((rm_platter_t *)0)->rm_temp_gdt);
899 
900 	real_mode_platter->rm_temp_idt_lim = 0;
901 	real_mode_platter->rm_temp_idt_base = 0;
902 
903 	/*
904 	 * Since the CPU needs to jump to protected mode using an identity
905 	 * mapped address, we need to calculate it here.
906 	 */
907 	real_mode_platter->rm_longmode64_addr = rm_platter_pa + offset;
908 #endif	/* __amd64 */
909 
910 	/* return; */
911 }
912 
/*
 * Per-cpu bring-up run on resume; installed as cpr_start_cpu_func.
 *
 * On the boot cpu this returns immediately.  On any other cpu it syncs
 * the PAT (when the feature is present), initializes the syscall
 * handlers, adds the cpu to procset (which i_cpr_pre_resume_cpus() waits
 * for), performs the slave side of the TSC sync when tsc_gethrtime_enable
 * is set, and finally lowers the priority level with spl0().
 */
void
i_cpr_start_cpu(void)
{

	struct cpu *cp = CPU;

	char *str = "i_cpr_start_cpu";
	extern void init_cpu_syscall(struct cpu *cp);

	PMD(PMD_SX, ("%s() called\n", str))

	PMD(PMD_SX, ("%s() #0 cp->cpu_base_spl %d\n", str,
	    cp->cpu_base_spl))

	/* i_cpr_bootcpu() requires cpu_lock to be held */
	mutex_enter(&cpu_lock);
	if (cp == i_cpr_bootcpu()) {
		mutex_exit(&cpu_lock);
		PMD(PMD_SX,
		    ("%s() called on bootcpu nothing to do!\n", str))
		return;
	}
	mutex_exit(&cpu_lock);

	/*
	 * We need to Sync PAT with cpu0's PAT. We have to do
	 * this with interrupts disabled.
	 */
	if (x86_feature & X86_PAT)
		pat_sync();

	/*
	 * Initialize this CPU's syscall handlers
	 */
	init_cpu_syscall(cp);

	PMD(PMD_SX, ("%s() #1 cp->cpu_base_spl %d\n", str, cp->cpu_base_spl))

	/*
	 * Do not need to call cpuid_pass2(), cpuid_pass3(), cpuid_pass4() or
	 * init_cpu_info(), since the work that they do is only needed to
	 * be done once at boot time
	 */


	/* Announce this cpu; the boot cpu polls procset in wait_for_set() */
	mutex_enter(&cpu_lock);
	CPUSET_ADD(procset, cp->cpu_id);
	mutex_exit(&cpu_lock);

	PMD(PMD_SX, ("%s() #2 cp->cpu_base_spl %d\n", str,
	    cp->cpu_base_spl))

	/* Counterpart of the tsc_sync_master() call made by the boot cpu */
	if (tsc_gethrtime_enable) {
		PMD(PMD_SX, ("%s() calling tsc_sync_slave\n", str))
		tsc_sync_slave();
	}

	PMD(PMD_SX, ("%s() cp->cpu_id %d, cp->cpu_intr_actv %d\n", str,
	    cp->cpu_id, cp->cpu_intr_actv))
	PMD(PMD_SX, ("%s() #3 cp->cpu_base_spl %d\n", str,
	    cp->cpu_base_spl))

	(void) spl0();		/* enable interrupts */

	PMD(PMD_SX, ("%s() #4 cp->cpu_base_spl %d\n", str,
	    cp->cpu_base_spl))

	/*
	 * Set up the CPU module for this CPU.  This can't be done before
	 * this CPU is made CPU_READY, because we may (in heterogeneous systems)
	 * need to go load another CPU module.  The act of attempting to load
	 * a module may trigger a cross-call, which will ASSERT unless this
	 * cpu is CPU_READY.
	 */

	/*
	 * cmi already been init'd (during boot), so do not need to do it again
	 */
#ifdef PM_REINITMCAONRESUME
	if (x86_feature & X86_MCA)
		cmi_mca_init();
#endif

	PMD(PMD_SX, ("%s() returning\n", str))

	/* return; */
}
999 
1000 void
1001 i_cpr_alloc_cpus(void)
1002 {
1003 	char *str = "i_cpr_alloc_cpus";
1004 
1005 	PMD(PMD_SX, ("%s() CPU->cpu_id %d\n", str, CPU->cpu_id))
1006 	/*
1007 	 * we allocate this only when we actually need it to save on
1008 	 * kernel memory
1009 	 */
1010 
1011 	if (wc_other_cpus == NULL) {
1012 		wc_other_cpus = kmem_zalloc(ncpus * sizeof (wc_cpu_t),
1013 		    KM_SLEEP);
1014 	}
1015 
1016 }
1017 
1018 void
1019 i_cpr_free_cpus(void)
1020 {
1021 	int index;
1022 	wc_cpu_t *wc_cpu;
1023 
1024 	if (wc_other_cpus != NULL) {
1025 		for (index = 0; index < ncpus; index++) {
1026 			wc_cpu = wc_other_cpus + index;
1027 			if (wc_cpu->wc_saved_stack != NULL) {
1028 				kmem_free(wc_cpu->wc_saved_stack,
1029 				    wc_cpu->wc_saved_stack_size);
1030 			}
1031 		}
1032 
1033 		kmem_free((void *) wc_other_cpus, ncpus * sizeof (wc_cpu_t));
1034 		wc_other_cpus = NULL;
1035 	}
1036 }
1037 
/*
 * wrapper for acpica_ddi_save_resources()
 *
 * Passes dip straight through; nothing else is done here.
 */
void
i_cpr_save_configuration(dev_info_t *dip)
{
	acpica_ddi_save_resources(dip);
}
1046 
/*
 * wrapper for acpica_ddi_restore_resources()
 *
 * Passes dip straight through; nothing else is done here.
 */
void
i_cpr_restore_configuration(dev_info_t *dip)
{
	acpica_ddi_restore_resources(dip);
}
1055 
1056 static int
1057 wait_for_set(cpuset_t *set, int who)
1058 {
1059 	int delays;
1060 	char *str = "wait_for_set";
1061 
1062 	for (delays = 0; !CPU_IN_SET(*set, who); delays++) {
1063 		if (delays == 500) {
1064 			/*
1065 			 * After five seconds, things are probably
1066 			 * looking a bit bleak - explain the hang.
1067 			 */
1068 			cmn_err(CE_NOTE, "cpu%d: started, "
1069 			    "but not running in the kernel yet", who);
1070 			PMD(PMD_SX, ("%s() %d cpu started "
1071 			    "but not running in the kernel yet\n",
1072 			    str, who))
1073 		} else if (delays > 2000) {
1074 			/*
1075 			 * We waited at least 20 seconds, bail ..
1076 			 */
1077 			cmn_err(CE_WARN, "cpu%d: timed out", who);
1078 			PMD(PMD_SX, ("%s() %d cpu timed out\n",
1079 			    str, who))
1080 			return (0);
1081 		}
1082 
1083 		/*
1084 		 * wait at least 10ms, then check again..
1085 		 */
1086 		drv_usecwait(10000);
1087 	}
1088 
1089 	return (1);
1090 }
1091 
1092 static	void
1093 i_cpr_save_stack(kthread_t *t, wc_cpu_t *wc_cpu)
1094 {
1095 	size_t	stack_size;	/* size of stack */
1096 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1097 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1098 
1099 	stack_size = (size_t)end - (size_t)start;
1100 
1101 	if (wc_cpu->wc_saved_stack_size < stack_size) {
1102 		if (wc_cpu->wc_saved_stack != NULL) {
1103 			kmem_free(wc_cpu->wc_saved_stack,
1104 			    wc_cpu->wc_saved_stack_size);
1105 		}
1106 		wc_cpu->wc_saved_stack = kmem_zalloc(stack_size, KM_SLEEP);
1107 		wc_cpu->wc_saved_stack_size = stack_size;
1108 	}
1109 
1110 	bcopy(start, wc_cpu->wc_saved_stack, stack_size);
1111 }
1112 
1113 void
1114 i_cpr_restore_stack(kthread_t *t, greg_t *save_stack)
1115 {
1116 	size_t	stack_size;	/* size of stack */
1117 	caddr_t	start = CPR_GET_STACK_START(t);	/* stack start */
1118 	caddr_t	end = CPR_GET_STACK_END(t);	/* stack end  */
1119 
1120 	stack_size = (size_t)end - (size_t)start;
1121 
1122 	bcopy(save_stack, start, stack_size);
1123 }
1124