xref: /titanic_52/usr/src/uts/sun4v/os/suspend.c (revision 183ef8a1713ca188e24d970f22c6f9cc333007fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/mutex.h>
27 #include <sys/cpuvar.h>
28 #include <sys/cyclic.h>
29 #include <sys/disp.h>
30 #include <sys/ddi.h>
31 #include <sys/wdt.h>
32 #include <sys/callb.h>
33 #include <sys/cmn_err.h>
34 #include <sys/hypervisor_api.h>
35 #include <sys/membar.h>
36 #include <sys/x_call.h>
37 #include <sys/promif.h>
38 #include <sys/systm.h>
39 #include <sys/mach_descrip.h>
40 #include <sys/cpu_module.h>
41 #include <sys/pg.h>
42 #include <sys/lgrp.h>
43 #include <sys/sysmacros.h>
44 #include <sys/sunddi.h>
45 #include <sys/cpupart.h>
46 #include <sys/hsvc.h>
47 #include <sys/mpo.h>
48 #include <vm/hat_sfmmu.h>
49 
50 /*
51  * Sun4v OS Suspend
52  *
53  * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
54  * calling into the HV to initiate a suspension. Suspension is sequenced
55  * externally by calling suspend_pre, suspend_start, and suspend_post.
56  * suspend_pre and suspend_post are meant to perform any special operations
57  * that should be done before or after a suspend/resume operation. e.g.,
58  * callbacks to cluster software to disable heartbeat monitoring before the
59  * system is suspended. suspend_start prepares kernel services to be suspended
60  * and then suspends the domain by calling hv_guest_suspend.
61  *
62  * Special Handling for %tick and %stick Registers
63  *
64  * After a suspend/resume operation, the %tick and %stick registers may have
65  * jumped forwards or backwards. The delta is assumed to be consistent across
66  * all CPUs, within the negligible level of %tick and %stick variation
67  * acceptable on a cold boot. In order to maintain increasing %tick and %stick
68  * counter values without exposing large positive or negative jumps to kernel
69  * or user code, a %tick and %stick offset is used. Kernel reads of these
70  * counters return the sum of the hardware register counter and offset
71  * variable. After a suspend/resume operation, user reads of %tick or %stick
72  * are emulated. Suspend code enables emulation by setting the
73  * %{tick,stick}.NPT fields which trigger a privileged instruction access
74  * trap whenever the registers are read from user mode. If emulation has been
75  * enabled, the trap handler emulates the instruction. Emulation is only
76  * enabled during a successful suspend/resume operation. When emulation is
77  * enabled, CPUs that are DR'd into the system will have their
78  * %{tick,stick}.NPT bits set to 1 as well.
79  */
80 
81 extern u_longlong_t gettick(void);	/* returns %stick */
82 extern uint64_t gettick_counter(void);	/* returns %tick */
83 extern uint64_t gettick_npt(void);
84 extern uint64_t getstick_npt(void);
85 extern int mach_descrip_update(void);
86 extern cpuset_t cpu_ready_set;
87 extern uint64_t native_tick_offset;
88 extern uint64_t native_stick_offset;
89 
90 /*
91  * Global Sun Cluster pre/post callbacks.
92  */
93 const char *(*cl_suspend_error_decode)(int);
94 int (*cl_suspend_pre_callback)(void);
95 int (*cl_suspend_post_callback)(void);
96 #define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
97 #define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
98 #define	SC_FAIL_STR_MAX		256
99 
100 /*
101  * The minimum major and minor version of the HSVC_GROUP_CORE API group
102  * required in order to use OS suspend.
103  */
104 #define	SUSPEND_CORE_MAJOR	1
105 #define	SUSPEND_CORE_MINOR	2
106 
107 /*
108  * By default, sun4v OS suspend is supported if the required HV version
109  * is present. suspend_disabled should be set on platforms that do not
110  * allow OS suspend regardless of whether or not the HV supports it.
111  * It can also be set in /etc/system.
112  */
113 static int suspend_disabled = 0;
114 
115 /*
116  * Controls whether or not user-land tick and stick register emulation
117  * will be enabled following a successful suspend operation.
118  */
119 static int enable_user_tick_stick_emulation = 1;
120 
121 /*
122  * Indicates whether or not tick and stick emulation is currently active.
123  * After a successful suspend operation, if emulation is enabled, this
124  * variable is set to B_TRUE. Global scope to allow emulation code to
125  * check if emulation is active.
126  */
127 boolean_t tick_stick_emulation_active = B_FALSE;
128 
129 /*
130  * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
131  * sharing data structures, and processor groups will be updated using
132  * information from the updated MD.
133  */
134 static int suspend_update_cpu_mappings = 1;
135 
/*
 * DBG() and DBG_PROM() debug-output macros.
 *
 * On DEBUG kernels, DBG(fmt, ...) sends a printf-style message through
 * suspend_debug() and DBG_PROM(fmt, ...) sends it to prom_printf(); both
 * are silent unless suspend_debug_flag is non-zero. On non-DEBUG kernels
 * both macros expand to nothing.
 *
 * NOTE: the DEBUG forms expand to an unbraced "if", so never use them as
 * the sole body of an if/else without enclosing braces.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

/*
 * Format a printf-style message and log it with cmn_err(CE_NOTE).
 *
 * The message is formatted into a fixed-size stack buffer. vsnprintf()
 * is used so that an over-long message is truncated to the buffer size
 * rather than written past the end of the buffer.
 */
static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */
170 
171 /*
172  * Return true if the HV supports OS suspend and if suspend has not been
173  * disabled on this platform.
174  */
175 boolean_t
176 suspend_supported(void)
177 {
178 	uint64_t major, minor;
179 
180 	if (suspend_disabled)
181 		return (B_FALSE);
182 
183 	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
184 		return (B_FALSE);
185 
186 	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
187 	    (major > SUSPEND_CORE_MAJOR));
188 }
189 
190 /*
191  * Given a source tick and stick value, set the tick and stick offsets such
192  * that the (current physical register value + offset == source value).
193  */
194 static void
195 set_tick_offsets(uint64_t source_tick, uint64_t source_stick)
196 {
197 	uint64_t target_tick;
198 	uint64_t target_stick;
199 
200 	native_tick_offset = 0;
201 	native_stick_offset = 0;
202 
203 	target_tick = gettick_counter();	/* returns %tick */
204 	target_stick = gettick();		/* returns %stick */
205 
206 	native_tick_offset = source_tick - target_tick;
207 	native_stick_offset = source_stick - target_stick;
208 }
209 
/*
 * Ask the hypervisor to set both the %stick.NPT and the %tick.NPT field
 * to 1 on the calling CPU. The hypervisor return values are ignored.
 * suspend_start() runs this on every CPU via xc_all().
 */
static void
enable_tick_stick_npt(void)
{
	(void) hv_stick_set_npt(1);
	(void) hv_tick_set_npt(1);
}
219 
220 /*
221  * Synchronize a CPU's {tick,stick}.NPT fields with the current state
222  * of the system. This is used when a CPU is DR'd into the system.
223  */
224 void
225 suspend_sync_tick_stick_npt(void)
226 {
227 	if (tick_stick_emulation_active) {
228 		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
229 		(void) hv_stick_set_npt(1);
230 		(void) hv_tick_set_npt(1);
231 	} else {
232 		ASSERT(gettick_npt() == 0);
233 		ASSERT(getstick_npt() == 0);
234 	}
235 }
236 
237 /*
238  * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
239  * sharing data structures, and processor groups.
240  */
241 static void
242 update_cpu_mappings(void)
243 {
244 	md_t		*mdp;
245 	processorid_t	id;
246 	cpu_t		*cp;
247 	cpu_pg_t	*pgps[NCPU];
248 
249 	if ((mdp = md_get_handle()) == NULL) {
250 		DBG("suspend: md_get_handle failed");
251 		return;
252 	}
253 
254 	DBG("suspend: updating CPU mappings");
255 
256 	mutex_enter(&cpu_lock);
257 
258 	setup_chip_mappings(mdp);
259 	setup_exec_unit_mappings(mdp);
260 	for (id = 0; id < NCPU; id++) {
261 		if ((cp = cpu_get(id)) == NULL)
262 			continue;
263 		cpu_map_exec_units(cp);
264 	}
265 
266 	/*
267 	 * Re-calculate processor groups.
268 	 *
269 	 * First tear down all PG information before adding any new PG
270 	 * information derived from the MD we just downloaded. We must
271 	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
272 	 * we want to minimize the number of times pause_cpus is called.
273 	 * Inactivating all CPUs would leave PGs without any active CPUs,
274 	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
275 	 * bootstrap PG structure saving the original PG structure to be
276 	 * fini'd afterwards. This prevents the dispatcher from encountering
277 	 * PGs in which all CPUs are inactive.
278 	 */
279 	pause_cpus(NULL);
280 	for (id = 0; id < NCPU; id++) {
281 		if ((cp = cpu_get(id)) == NULL)
282 			continue;
283 		pg_cpu_inactive(cp);
284 		pgps[id] = cp->cpu_pg;
285 		pg_cpu_bootstrap(cp);
286 	}
287 	start_cpus();
288 
289 	/*
290 	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
291 	 * not paused. Use two separate loops here so that we do not
292 	 * initialize PG data for CPUs until all the old PG data structures
293 	 * are torn down.
294 	 */
295 	for (id = 0; id < NCPU; id++) {
296 		if ((cp = cpu_get(id)) == NULL)
297 			continue;
298 		pg_cpu_fini(cp, pgps[id]);
299 		mpo_cpu_remove(id);
300 	}
301 
302 	/*
303 	 * Initialize PG data for each CPU, but leave the bootstrapped
304 	 * PG structure in place to avoid running with any PGs containing
305 	 * nothing but inactive CPUs.
306 	 */
307 	for (id = 0; id < NCPU; id++) {
308 		if ((cp = cpu_get(id)) == NULL)
309 			continue;
310 		mpo_cpu_add(mdp, id);
311 		pgps[id] = pg_cpu_init(cp, B_TRUE);
312 	}
313 
314 	/*
315 	 * Now that PG data has been initialized for all CPUs in the
316 	 * system, replace the bootstrapped PG structure with the
317 	 * initialized PG structure and call pg_cpu_active for each CPU.
318 	 */
319 	pause_cpus(NULL);
320 	for (id = 0; id < NCPU; id++) {
321 		if ((cp = cpu_get(id)) == NULL)
322 			continue;
323 		cp->cpu_pg = pgps[id];
324 		pg_cpu_active(cp);
325 	}
326 	start_cpus();
327 
328 	mutex_exit(&cpu_lock);
329 
330 	(void) md_fini_handle(mdp);
331 }
332 
333 /*
334  * Wrapper for the Sun Cluster error decoding function.
335  */
336 static int
337 cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
338 {
339 	const char	*decoded;
340 	size_t		decoded_len;
341 
342 	ASSERT(error_reason != NULL);
343 	ASSERT(max_reason_len > 0);
344 
345 	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);
346 
347 	if (cl_suspend_error_decode == NULL)
348 		return (-1);
349 
350 	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
351 		return (-1);
352 
353 	/* Get number of non-NULL bytes */
354 	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
355 		return (-1);
356 
357 	bcopy(decoded, error_reason, decoded_len);
358 
359 	/*
360 	 * The error string returned from cl_suspend_error_decode
361 	 * should be NULL-terminated, but set the terminator here
362 	 * because we only copied non-NULL bytes. If the decoded
363 	 * string was not NULL-terminated, this guarantees that
364 	 * error_reason will be.
365 	 */
366 	error_reason[decoded_len] = '\0';
367 
368 	return (0);
369 }
370 
371 /*
372  * Wrapper for the Sun Cluster pre-suspend callback.
373  */
374 static int
375 cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
376 {
377 	int rv = 0;
378 
379 	if (cl_suspend_pre_callback != NULL) {
380 		rv = (*cl_suspend_pre_callback)();
381 		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
382 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
383 			if (cluster_error_decode(rv, error_reason,
384 			    max_reason_len)) {
385 				(void) snprintf(error_reason, max_reason_len,
386 				    SC_PRE_FAIL_STR_FMT, rv);
387 			}
388 		}
389 	}
390 
391 	return (rv);
392 }
393 
394 /*
395  * Wrapper for the Sun Cluster post-suspend callback.
396  */
397 static int
398 cluster_post_wrapper(char *error_reason, size_t max_reason_len)
399 {
400 	int rv = 0;
401 
402 	if (cl_suspend_post_callback != NULL) {
403 		rv = (*cl_suspend_post_callback)();
404 		DBG("suspend: cl_suspend_post_callback returned %d", rv);
405 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
406 			if (cluster_error_decode(rv, error_reason,
407 			    max_reason_len)) {
408 				(void) snprintf(error_reason,
409 				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
410 			}
411 		}
412 	}
413 
414 	return (rv);
415 }
416 
417 /*
418  * Execute pre-suspend callbacks preparing the system for a suspend operation.
419  * Returns zero on success, non-zero on failure. Sets the recovered argument
420  * to indicate whether or not callbacks could be undone in the event of a
421  * failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
422  * otherwise *recovered is set to B_FALSE. Must be called successfully before
423  * suspend_start can be called. Callers should first call suspend_support to
424  * determine if OS suspend is supported.
425  */
426 int
427 suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
428 {
429 	int rv;
430 
431 	ASSERT(recovered != NULL);
432 
433 	/*
434 	 * Return an error if suspend_pre is erreoneously called
435 	 * when OS suspend is not supported.
436 	 */
437 	ASSERT(suspend_supported());
438 	if (!suspend_supported()) {
439 		DBG("suspend: suspend_pre called without suspend support");
440 		*recovered = B_TRUE;
441 		return (ENOTSUP);
442 	}
443 	DBG("suspend: %s", __func__);
444 
445 	rv = cluster_pre_wrapper(error_reason, max_reason_len);
446 
447 	/*
448 	 * At present, only one pre-suspend operation exists.
449 	 * If it fails, no recovery needs to be done.
450 	 */
451 	if (rv != 0 && recovered != NULL)
452 		*recovered = B_TRUE;
453 
454 	return (rv);
455 }
456 
/*
 * Run the post-suspend callbacks. This must be called after
 * suspend_start has been called, whether or not suspend_start succeeded.
 *
 * error_reason		- optional buffer that receives a failure description
 * max_reason_len	- size of error_reason in bytes
 *
 * Returns zero on success, non-zero on failure.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	int rv;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	rv = cluster_post_wrapper(error_reason, max_reason_len);
	return (rv);
}
469 
470 /*
471  * Suspends the OS by pausing CPUs and calling into the HV to initiate
472  * the suspend. When the HV routine hv_guest_suspend returns, the system
473  * will be resumed. Must be called after a successful call to suspend_pre.
474  * suspend_post must be called after suspend_start, whether or not
475  * suspend_start returns an error.
476  */
477 /*ARGSUSED*/
478 int
479 suspend_start(char *error_reason, size_t max_reason_len)
480 {
481 	uint64_t	source_tick;
482 	uint64_t	source_stick;
483 	uint64_t	rv;
484 	timestruc_t	source_tod;
485 	int		spl;
486 
487 	ASSERT(suspend_supported());
488 	DBG("suspend: %s", __func__);
489 
490 	sfmmu_ctxdoms_lock();
491 
492 	mutex_enter(&cpu_lock);
493 
494 	/* Suspend the watchdog */
495 	watchdog_suspend();
496 
497 	/* Record the TOD */
498 	mutex_enter(&tod_lock);
499 	source_tod = tod_get();
500 	mutex_exit(&tod_lock);
501 
502 	/* Pause all other CPUs */
503 	pause_cpus(NULL);
504 	DBG_PROM("suspend: CPUs paused\n");
505 
506 	/* Suspend cyclics and disable interrupts */
507 	cyclic_suspend();
508 	DBG_PROM("suspend: cyclics suspended\n");
509 	spl = spl8();
510 
511 	source_tick = gettick_counter();
512 	source_stick = gettick();
513 	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
514 	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);
515 
516 	/*
517 	 * Call into the HV to initiate the suspend.
518 	 * hv_guest_suspend() returns after the guest has been
519 	 * resumed or if the suspend operation failed or was
520 	 * cancelled. After a successful suspend, the %tick and
521 	 * %stick registers may have changed by an amount that is
522 	 * not proportional to the amount of time that has passed.
523 	 * They may have jumped forwards or backwards. This jump
524 	 * must be uniform across all CPUs and we operate under
525 	 * the assumption that it is (maintaining two global offset
526 	 * variables--one for %tick and one for %stick.)
527 	 */
528 	DBG_PROM("suspend: suspending... \n");
529 	rv = hv_guest_suspend();
530 	if (rv != 0) {
531 		splx(spl);
532 		cyclic_resume();
533 		start_cpus();
534 		watchdog_resume();
535 		mutex_exit(&cpu_lock);
536 		sfmmu_ctxdoms_unlock();
537 		DBG("suspend: failed, rv: %ld\n", rv);
538 		return (rv);
539 	}
540 
541 	/* Update the global tick and stick offsets */
542 	set_tick_offsets(source_tick, source_stick);
543 
544 	/* Ensure new offsets are globally visible before resuming CPUs */
545 	membar_sync();
546 
547 	/* Enable interrupts */
548 	splx(spl);
549 
550 	/* Set the {%tick,%stick}.NPT bits on all CPUs */
551 	if (enable_user_tick_stick_emulation) {
552 		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
553 		xt_sync(cpu_ready_set);
554 		ASSERT(gettick_npt() != 0);
555 		ASSERT(getstick_npt() != 0);
556 	}
557 
558 	/* If emulation is enabled, but not currently active, enable it */
559 	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
560 		tick_stick_emulation_active = B_TRUE;
561 	}
562 
563 	sfmmu_ctxdoms_remove();
564 
565 	/* Resume cyclics, unpause CPUs */
566 	cyclic_resume();
567 	start_cpus();
568 
569 	/* Set the TOD */
570 	mutex_enter(&tod_lock);
571 	tod_set(source_tod);
572 	mutex_exit(&tod_lock);
573 
574 	/* Re-enable the watchdog */
575 	watchdog_resume();
576 
577 	mutex_exit(&cpu_lock);
578 
579 	/* Download the latest MD */
580 	if ((rv = mach_descrip_update()) != 0)
581 		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
582 		    rv);
583 
584 	sfmmu_ctxdoms_update();
585 	sfmmu_ctxdoms_unlock();
586 
587 	/* Get new MD, update CPU mappings/relationships */
588 	if (suspend_update_cpu_mappings)
589 		update_cpu_mappings();
590 
591 	DBG("suspend: target tick: 0x%lx", gettick_counter());
592 	DBG("suspend: target stick: 0x%llx", gettick());
593 	DBG("suspend: user %%tick/%%stick emulation is %d",
594 	    tick_stick_emulation_active);
595 	DBG("suspend: finished");
596 
597 	return (0);
598 }
599