xref: /titanic_44/usr/src/uts/sun4v/os/suspend.c (revision 2f172c55ef76964744bc62b4500ece87f3089b4d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/mutex.h>
27 #include <sys/cpuvar.h>
28 #include <sys/cyclic.h>
29 #include <sys/disp.h>
30 #include <sys/ddi.h>
31 #include <sys/wdt.h>
32 #include <sys/callb.h>
33 #include <sys/cmn_err.h>
34 #include <sys/hypervisor_api.h>
35 #include <sys/membar.h>
36 #include <sys/x_call.h>
37 #include <sys/promif.h>
38 #include <sys/systm.h>
39 #include <sys/mach_descrip.h>
40 #include <sys/cpu_module.h>
41 #include <sys/pg.h>
42 #include <sys/lgrp.h>
43 #include <sys/sysmacros.h>
44 #include <sys/sunddi.h>
45 #include <sys/cpupart.h>
46 #include <sys/hsvc.h>
47 
48 /*
49  * Sun4v OS Suspend
50  *
51  * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
52  * calling into the HV to initiate a suspension. Suspension is sequenced
53  * externally by calling suspend_pre, suspend_start, and suspend_post.
54  * suspend_pre and suspend_post are meant to perform any special operations
55  * that should be done before or after a suspend/resume operation. e.g.,
56  * callbacks to cluster software to disable heartbeat monitoring before the
57  * system is suspended. suspend_start prepares kernel services to be suspended
58  * and then suspends the domain by calling hv_guest_suspend.
59  *
60  * Special Handling for %tick and %stick Registers
61  *
62  * After a suspend/resume operation, the %tick and %stick registers may have
63  * jumped forwards or backwards. The delta is assumed to be consistent across
64  * all CPUs, within the negligible level of %tick and %stick variation
65  * acceptable on a cold boot. In order to maintain increasing %tick and %stick
66  * counter values without exposing large positive or negative jumps to kernel
67  * or user code, a %tick and %stick offset is used. Kernel reads of these
68  * counters return the sum of the hardware register counter and offset
69  * variable. After a suspend/resume operation, user reads of %tick or %stick
70  * are emulated. Suspend code enables emulation by setting the
71  * %{tick,stick}.NPT fields which trigger a privileged instruction access
72  * trap whenever the registers are read from user mode. If emulation has been
73  * enabled, the trap handler emulates the instruction. Emulation is only
74  * enabled during a successful suspend/resume operation. When emulation is
75  * enabled, CPUs that are DR'd into the system will have their
76  * %{tick,stick}.NPT bits set to 1 as well.
77  */
78 
79 extern u_longlong_t gettick(void);	/* returns %stick */
80 extern uint64_t gettick_counter(void);	/* returns %tick */
81 extern uint64_t gettick_npt(void);
82 extern uint64_t getstick_npt(void);
83 extern int mach_descrip_update(void);
84 extern cpuset_t cpu_ready_set;
85 extern uint64_t native_tick_offset;
86 extern uint64_t native_stick_offset;
87 
88 /*
89  * Global Sun Cluster pre/post callbacks.
90  */
91 const char *(*cl_suspend_error_decode)(int);
92 int (*cl_suspend_pre_callback)(void);
93 int (*cl_suspend_post_callback)(void);
94 #define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
95 #define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
96 #define	SC_FAIL_STR_MAX		256
97 
98 /*
99  * The minimum major and minor version of the HSVC_GROUP_CORE API group
100  * required in order to use OS suspend.
101  */
102 #define	SUSPEND_CORE_MAJOR	1
103 #define	SUSPEND_CORE_MINOR	2
104 
105 /*
106  * By default, sun4v OS suspend is supported if the required HV version
107  * is present. suspend_disabled should be set on platforms that do not
108  * allow OS suspend regardless of whether or not the HV supports it.
109  * It can also be set in /etc/system.
110  */
111 static int suspend_disabled = 0;
112 
113 /*
114  * Controls whether or not user-land tick and stick register emulation
115  * will be enabled following a successful suspend operation.
116  */
117 static int enable_user_tick_stick_emulation = 1;
118 
119 /*
120  * Indicates whether or not tick and stick emulation is currently active.
121  * After a successful suspend operation, if emulation is enabled, this
122  * variable is set to B_TRUE. Global scope to allow emulation code to
123  * check if emulation is active.
124  */
125 boolean_t tick_stick_emulation_active = B_FALSE;
126 
127 /*
128  * Controls whether or not MD information is refreshed after a
129  * successful suspend and resume. When non-zero, after a successful
130  * suspend and resume, the MD will be downloaded, cpunodes updated,
131  * and processor grouping information recalculated.
132  */
133 static int suspend_update_cpu_mappings = 1;
134 
/*
 * DBG() and DBG_PROM() macros.
 *
 * On DEBUG kernels each macro expands to an unbraced
 * "if (suspend_debug_flag) <print function>" prefix that is completed by
 * the argument list written at the point of use. Because of the bare "if",
 * a use must always be a complete statement of its own; never make it the
 * unbraced body of an "if" that is followed by an "else".
 *
 * On non-DEBUG kernels both macros expand to nothing, which leaves the
 * parenthesized argument list behind as an expression statement.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

/*
 * printf-style debug logger behind the DBG() macro. The message is
 * formatted into a fixed-size stack buffer and then passed to cmn_err()
 * at CE_NOTE level. Formatting is bounded by the buffer size, so an
 * over-long message is truncated rather than overrunning the stack.
 */
static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */
169 
170 /*
171  * Return true if the HV supports OS suspend and if suspend has not been
172  * disabled on this platform.
173  */
174 boolean_t
175 suspend_supported(void)
176 {
177 	uint64_t major, minor;
178 
179 	if (suspend_disabled)
180 		return (B_FALSE);
181 
182 	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
183 		return (B_FALSE);
184 
185 	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
186 	    (major > SUSPEND_CORE_MAJOR));
187 }
188 
189 /*
190  * Given a source tick and stick value, set the tick and stick offsets such
191  * that the (current physical register value + offset == source value).
192  */
193 static void
194 set_tick_offsets(uint64_t source_tick, uint64_t source_stick)
195 {
196 	uint64_t target_tick;
197 	uint64_t target_stick;
198 
199 	native_tick_offset = 0;
200 	native_stick_offset = 0;
201 
202 	target_tick = gettick_counter();	/* returns %tick */
203 	target_stick = gettick();		/* returns %stick */
204 
205 	native_tick_offset = source_tick - target_tick;
206 	native_stick_offset = source_stick - target_stick;
207 }
208 
209 /*
210  * Set the {tick,stick}.NPT field to 1 on this CPU.
211  */
212 static void
213 enable_tick_stick_npt(void)
214 {
215 	hv_stick_set_npt(1);
216 	hv_tick_set_npt(1);
217 }
218 
219 /*
220  * Synchronize a CPU's {tick,stick}.NPT fields with the current state
221  * of the system. This is used when a CPU is DR'd into the system.
222  */
223 void
224 suspend_sync_tick_stick_npt(void)
225 {
226 	if (tick_stick_emulation_active) {
227 		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
228 		hv_stick_set_npt(1);
229 		hv_tick_set_npt(1);
230 	} else {
231 		ASSERT(gettick_npt() == 0);
232 		ASSERT(getstick_npt() == 0);
233 	}
234 }
235 
236 /*
237  * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
238  * sharing data structures, and processor groups.
239  */
240 static void
241 update_cpu_mappings(void)
242 {
243 	md_t		*mdp;
244 	processorid_t	id;
245 	cpu_t		*cp;
246 	int		rv;
247 	cpu_pg_t	*pgps[NCPU];
248 
249 	/* Download the latest MD */
250 	if ((rv = mach_descrip_update()) != 0) {
251 		DBG("suspend: mach_descrip_update error: %d", rv);
252 		return;
253 	}
254 
255 	if ((mdp = md_get_handle()) == NULL) {
256 		DBG("suspend: md_get_handle failed");
257 		return;
258 	}
259 
260 	DBG("suspend: updating CPU mappings");
261 
262 	mutex_enter(&cpu_lock);
263 
264 	setup_chip_mappings(mdp);
265 	setup_exec_unit_mappings(mdp);
266 	for (id = 0; id < NCPU; id++) {
267 		if ((cp = cpu_get(id)) == NULL)
268 			continue;
269 		cpu_map_exec_units(cp);
270 	}
271 
272 	/*
273 	 * Re-calculate processor groups.
274 	 *
275 	 * First tear down all PG information before adding any new PG
276 	 * information derived from the MD we just downloaded. We must
277 	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
278 	 * we want to minimize the number of times pause_cpus is called.
279 	 * Inactivating all CPUs would leave PGs without any active CPUs,
280 	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
281 	 * bootstrap PG structure saving the original PG structure to be
282 	 * fini'd afterwards. This prevents the dispatcher from encountering
283 	 * PGs in which all CPUs are inactive.
284 	 */
285 	pause_cpus(NULL);
286 	for (id = 0; id < NCPU; id++) {
287 		if ((cp = cpu_get(id)) == NULL)
288 			continue;
289 		pg_cpu_inactive(cp);
290 		pgps[id] = cp->cpu_pg;
291 		pg_cpu_bootstrap(cp);
292 	}
293 	start_cpus();
294 
295 	/*
296 	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
297 	 * not paused. Use two separate loops here so that we do not
298 	 * initialize PG data for CPUs until all the old PG data structures
299 	 * are torn down.
300 	 */
301 	for (id = 0; id < NCPU; id++) {
302 		if ((cp = cpu_get(id)) == NULL)
303 			continue;
304 		pg_cpu_fini(cp, pgps[id]);
305 	}
306 
307 	/*
308 	 * Initialize PG data for each CPU, but leave the bootstrapped
309 	 * PG structure in place to avoid running with any PGs containing
310 	 * nothing but inactive CPUs.
311 	 */
312 	for (id = 0; id < NCPU; id++) {
313 		if ((cp = cpu_get(id)) == NULL)
314 			continue;
315 		pgps[id] = pg_cpu_init(cp, B_TRUE);
316 	}
317 
318 	/*
319 	 * Now that PG data has been initialized for all CPUs in the
320 	 * system, replace the bootstrapped PG structure with the
321 	 * initialized PG structure and call pg_cpu_active for each CPU.
322 	 */
323 	pause_cpus(NULL);
324 	for (id = 0; id < NCPU; id++) {
325 		if ((cp = cpu_get(id)) == NULL)
326 			continue;
327 		cp->cpu_pg = pgps[id];
328 		pg_cpu_active(cp);
329 	}
330 	start_cpus();
331 
332 	mutex_exit(&cpu_lock);
333 
334 	(void) md_fini_handle(mdp);
335 }
336 
337 /*
338  * Wrapper for the Sun Cluster error decoding function.
339  */
340 static int
341 cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
342 {
343 	const char	*decoded;
344 	size_t		decoded_len;
345 
346 	ASSERT(error_reason != NULL);
347 	ASSERT(max_reason_len > 0);
348 
349 	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);
350 
351 	if (cl_suspend_error_decode == NULL)
352 		return (-1);
353 
354 	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
355 		return (-1);
356 
357 	/* Get number of non-NULL bytes */
358 	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
359 		return (-1);
360 
361 	bcopy(decoded, error_reason, decoded_len);
362 
363 	/*
364 	 * The error string returned from cl_suspend_error_decode
365 	 * should be NULL-terminated, but set the terminator here
366 	 * because we only copied non-NULL bytes. If the decoded
367 	 * string was not NULL-terminated, this guarantees that
368 	 * error_reason will be.
369 	 */
370 	error_reason[decoded_len] = '\0';
371 
372 	return (0);
373 }
374 
375 /*
376  * Wrapper for the Sun Cluster pre-suspend callback.
377  */
378 static int
379 cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
380 {
381 	int rv = 0;
382 
383 	if (cl_suspend_pre_callback != NULL) {
384 		rv = (*cl_suspend_pre_callback)();
385 		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
386 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
387 			if (cluster_error_decode(rv, error_reason,
388 			    max_reason_len)) {
389 				(void) snprintf(error_reason, max_reason_len,
390 				    SC_PRE_FAIL_STR_FMT, rv);
391 			}
392 		}
393 	}
394 
395 	return (rv);
396 }
397 
398 /*
399  * Wrapper for the Sun Cluster post-suspend callback.
400  */
401 static int
402 cluster_post_wrapper(char *error_reason, size_t max_reason_len)
403 {
404 	int rv = 0;
405 
406 	if (cl_suspend_post_callback != NULL) {
407 		rv = (*cl_suspend_post_callback)();
408 		DBG("suspend: cl_suspend_post_callback returned %d", rv);
409 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
410 			if (cluster_error_decode(rv, error_reason,
411 			    max_reason_len)) {
412 				(void) snprintf(error_reason,
413 				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
414 			}
415 		}
416 	}
417 
418 	return (rv);
419 }
420 
421 /*
422  * Execute pre-suspend callbacks preparing the system for a suspend operation.
423  * Returns zero on success, non-zero on failure. Sets the recovered argument
424  * to indicate whether or not callbacks could be undone in the event of a
425  * failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
426  * otherwise *recovered is set to B_FALSE. Must be called successfully before
427  * suspend_start can be called. Callers should first call suspend_support to
428  * determine if OS suspend is supported.
429  */
430 int
431 suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
432 {
433 	int rv;
434 
435 	ASSERT(recovered != NULL);
436 
437 	/*
438 	 * Return an error if suspend_pre is erreoneously called
439 	 * when OS suspend is not supported.
440 	 */
441 	ASSERT(suspend_supported());
442 	if (!suspend_supported()) {
443 		DBG("suspend: suspend_pre called without suspend support");
444 		*recovered = B_TRUE;
445 		return (ENOTSUP);
446 	}
447 	DBG("suspend: %s", __func__);
448 
449 	rv = cluster_pre_wrapper(error_reason, max_reason_len);
450 
451 	/*
452 	 * At present, only one pre-suspend operation exists.
453 	 * If it fails, no recovery needs to be done.
454 	 */
455 	if (rv != 0 && recovered != NULL)
456 		*recovered = B_TRUE;
457 
458 	return (rv);
459 }
460 
/*
 * Execute post-suspend callbacks. Must be called after suspend_start is
 * called, regardless of whether or not suspend_start is successful.
 *
 * error_reason		- optional buffer that receives a failure description
 * max_reason_len	- size of error_reason in bytes
 *
 * Returns zero on success, non-zero on failure.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	int rv;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	rv = cluster_post_wrapper(error_reason, max_reason_len);

	return (rv);
}
473 
474 /*
475  * Suspends the OS by pausing CPUs and calling into the HV to initiate
476  * the suspend. When the HV routine hv_guest_suspend returns, the system
477  * will be resumed. Must be called after a successful call to suspend_pre.
478  * suspend_post must be called after suspend_start, whether or not
479  * suspend_start returns an error.
480  */
481 /*ARGSUSED*/
482 int
483 suspend_start(char *error_reason, size_t max_reason_len)
484 {
485 	uint64_t	source_tick;
486 	uint64_t	source_stick;
487 	uint64_t	rv;
488 	timestruc_t	source_tod;
489 	int		spl;
490 
491 	ASSERT(suspend_supported());
492 	DBG("suspend: %s", __func__);
493 
494 	mutex_enter(&cpu_lock);
495 
496 	/* Suspend the watchdog */
497 	watchdog_suspend();
498 
499 	/* Record the TOD */
500 	mutex_enter(&tod_lock);
501 	source_tod = tod_get();
502 	mutex_exit(&tod_lock);
503 
504 	/* Pause all other CPUs */
505 	pause_cpus(NULL);
506 	DBG_PROM("suspend: CPUs paused\n");
507 
508 	/* Suspend cyclics and disable interrupts */
509 	cyclic_suspend();
510 	DBG_PROM("suspend: cyclics suspended\n");
511 	spl = spl8();
512 
513 	source_tick = gettick_counter();
514 	source_stick = gettick();
515 	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
516 	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);
517 
518 	/*
519 	 * Call into the HV to initiate the suspend.
520 	 * hv_guest_suspend() returns after the guest has been
521 	 * resumed or if the suspend operation failed or was
522 	 * cancelled. After a successful suspend, the %tick and
523 	 * %stick registers may have changed by an amount that is
524 	 * not proportional to the amount of time that has passed.
525 	 * They may have jumped forwards or backwards. This jump
526 	 * must be uniform across all CPUs and we operate under
527 	 * the assumption that it is (maintaining two global offset
528 	 * variables--one for %tick and one for %stick.)
529 	 */
530 	DBG_PROM("suspend: suspending... \n");
531 	rv = hv_guest_suspend();
532 	if (rv != 0) {
533 		splx(spl);
534 		cyclic_resume();
535 		start_cpus();
536 		watchdog_resume();
537 		mutex_exit(&cpu_lock);
538 		DBG("suspend: failed, rv: %ld\n", rv);
539 		return (rv);
540 	}
541 
542 	/* Update the global tick and stick offsets */
543 	set_tick_offsets(source_tick, source_stick);
544 
545 	/* Ensure new offsets are globally visible before resuming CPUs */
546 	membar_sync();
547 
548 	/* Enable interrupts */
549 	splx(spl);
550 
551 	/* Set the {%tick,%stick}.NPT bits on all CPUs */
552 	if (enable_user_tick_stick_emulation) {
553 		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
554 		xt_sync(cpu_ready_set);
555 		ASSERT(gettick_npt() != 0);
556 		ASSERT(getstick_npt() != 0);
557 	}
558 
559 	/* If emulation is enabled, but not currently active, enable it */
560 	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
561 		tick_stick_emulation_active = B_TRUE;
562 	}
563 
564 	/* Resume cyclics, unpause CPUs */
565 	cyclic_resume();
566 	start_cpus();
567 
568 	/* Set the TOD */
569 	mutex_enter(&tod_lock);
570 	tod_set(source_tod);
571 	mutex_exit(&tod_lock);
572 
573 	/* Re-enable the watchdog */
574 	watchdog_resume();
575 
576 	mutex_exit(&cpu_lock);
577 
578 	/* Get new MD, update CPU mappings/relationships */
579 	if (suspend_update_cpu_mappings)
580 		update_cpu_mappings();
581 
582 	DBG("suspend: target tick: 0x%lx", gettick_counter());
583 	DBG("suspend: target stick: 0x%llx", gettick());
584 	DBG("suspend: user %%tick/%%stick emulation is %d",
585 	    tick_stick_emulation_active);
586 	DBG("suspend: finished");
587 
588 	return (0);
589 }
590