/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/mutex.h>
#include <sys/cpuvar.h>
#include <sys/cyclic.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/wdt.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/hypervisor_api.h>
#include <sys/membar.h>
#include <sys/x_call.h>
#include <sys/promif.h>
#include <sys/systm.h>
#include <sys/mach_descrip.h>
#include <sys/cpu_module.h>
#include <sys/pg.h>
#include <sys/lgrp.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/cpupart.h>
#include <sys/hsvc.h>

/*
 * Sun4v OS Suspend
 *
 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
 * calling into the HV to initiate a suspension. Suspension is sequenced
 * externally by calling suspend_pre, suspend_start, and suspend_post.
 * suspend_pre and suspend_post are meant to perform any special operations
 * that should be done before or after a suspend/resume operation. e.g.,
 * callbacks to cluster software to disable heartbeat monitoring before the
 * system is suspended. suspend_start prepares kernel services to be suspended
 * and then suspends the domain by calling hv_guest_suspend.
 *
 * Special Handling for %tick and %stick Registers
 *
 * After a suspend/resume operation, the %tick and %stick registers may have
 * jumped forwards or backwards. The delta is assumed to be consistent across
 * all CPUs, within the negligible level of %tick and %stick variation
 * acceptable on a cold boot. In order to maintain increasing %tick and %stick
 * counter values without exposing large positive or negative jumps to kernel
 * or user code, a %tick and %stick offset is used. Kernel reads of these
 * counters return the sum of the hardware register counter and offset
 * variable. After a suspend/resume operation, user reads of %tick or %stick
 * are emulated. Suspend code enables emulation by setting the
 * %{tick,stick}.NPT fields which trigger a privileged instruction access
 * trap whenever the registers are read from user mode. If emulation has been
 * enabled, the trap handler emulates the instruction. Emulation is only
 * enabled during a successful suspend/resume operation. When emulation is
 * enabled, CPUs that are DR'd into the system will have their
 * %{tick,stick}.NPT bits set to 1 as well.
 */

/* Low-level counter/NPT accessors and MD refresh, defined elsewhere. */
extern u_longlong_t gettick(void);	/* returns %stick */
extern uint64_t gettick_counter(void);	/* returns %tick */
extern uint64_t gettick_npt(void);
extern uint64_t getstick_npt(void);
extern int mach_descrip_update(void);
extern cpuset_t cpu_ready_set;
extern uint64_t native_tick_offset;
extern uint64_t native_stick_offset;

/*
 * Global Sun Cluster pre/post callbacks.  NULL when cluster software is
 * not installed; set by the cluster framework at runtime.
 */
const char *(*cl_suspend_error_decode)(int);
int (*cl_suspend_pre_callback)(void);
int (*cl_suspend_post_callback)(void);
#define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
#define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
#define	SC_FAIL_STR_MAX		256

/*
 * The minimum major and minor version of the HSVC_GROUP_CORE API group
 * required in order to use OS suspend.
 */
#define	SUSPEND_CORE_MAJOR	1
#define	SUSPEND_CORE_MINOR	2

/*
 * By default, sun4v OS suspend is supported if the required HV version
 * is present. suspend_disabled should be set on platforms that do not
 * allow OS suspend regardless of whether or not the HV supports it.
 * It can also be set in /etc/system.
 */
static int suspend_disabled = 0;

/*
 * Controls whether or not user-land tick and stick register emulation
 * will be enabled following a successful suspend operation.
 */
static int enable_user_tick_stick_emulation = 1;

/*
 * Indicates whether or not tick and stick emulation is currently active.
 * After a successful suspend operation, if emulation is enabled, this
 * variable is set to B_TRUE. Global scope to allow emulation code to
 * check if emulation is active.
 */
boolean_t tick_stick_emulation_active = B_FALSE;

/*
 * Controls whether or not MD information is refreshed after a
 * successful suspend and resume. When non-zero, after a successful
 * suspend and resume, the MD will be downloaded, cpunodes updated,
 * and processor grouping information recalculated.
 */
static int suspend_update_cpu_mappings = 1;

/*
 * DBG and DBG_PROM() macro.
137 */ 138 #ifdef DEBUG 139 140 static int suspend_debug_flag = 0; 141 142 #define DBG_PROM \ 143 if (suspend_debug_flag) \ 144 prom_printf 145 146 #define DBG \ 147 if (suspend_debug_flag) \ 148 suspend_debug 149 150 static void 151 suspend_debug(const char *fmt, ...) 152 { 153 char buf[512]; 154 va_list ap; 155 156 va_start(ap, fmt); 157 (void) vsprintf(buf, fmt, ap); 158 va_end(ap); 159 160 cmn_err(CE_NOTE, "%s", buf); 161 } 162 163 #else /* DEBUG */ 164 165 #define DBG_PROM 166 #define DBG 167 168 #endif /* DEBUG */ 169 170 /* 171 * Return true if the HV supports OS suspend and if suspend has not been 172 * disabled on this platform. 173 */ 174 boolean_t 175 suspend_supported(void) 176 { 177 uint64_t major, minor; 178 179 if (suspend_disabled) 180 return (B_FALSE); 181 182 if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) 183 return (B_FALSE); 184 185 return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || 186 (major > SUSPEND_CORE_MAJOR)); 187 } 188 189 /* 190 * Given a source tick and stick value, set the tick and stick offsets such 191 * that the (current physical register value + offset == source value). 192 */ 193 static void 194 set_tick_offsets(uint64_t source_tick, uint64_t source_stick) 195 { 196 uint64_t target_tick; 197 uint64_t target_stick; 198 199 native_tick_offset = 0; 200 native_stick_offset = 0; 201 202 target_tick = gettick_counter(); /* returns %tick */ 203 target_stick = gettick(); /* returns %stick */ 204 205 native_tick_offset = source_tick - target_tick; 206 native_stick_offset = source_stick - target_stick; 207 } 208 209 /* 210 * Set the {tick,stick}.NPT field to 1 on this CPU. 211 */ 212 static void 213 enable_tick_stick_npt(void) 214 { 215 (void) hv_stick_set_npt(1); 216 (void) hv_tick_set_npt(1); 217 } 218 219 /* 220 * Synchronize a CPU's {tick,stick}.NPT fields with the current state 221 * of the system. This is used when a CPU is DR'd into the system. 
222 */ 223 void 224 suspend_sync_tick_stick_npt(void) 225 { 226 if (tick_stick_emulation_active) { 227 DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id); 228 (void) hv_stick_set_npt(1); 229 (void) hv_tick_set_npt(1); 230 } else { 231 ASSERT(gettick_npt() == 0); 232 ASSERT(getstick_npt() == 0); 233 } 234 } 235 236 /* 237 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW 238 * sharing data structures, and processor groups. 239 */ 240 static void 241 update_cpu_mappings(void) 242 { 243 md_t *mdp; 244 processorid_t id; 245 cpu_t *cp; 246 int rv; 247 cpu_pg_t *pgps[NCPU]; 248 249 /* Download the latest MD */ 250 if ((rv = mach_descrip_update()) != 0) { 251 DBG("suspend: mach_descrip_update error: %d", rv); 252 return; 253 } 254 255 if ((mdp = md_get_handle()) == NULL) { 256 DBG("suspend: md_get_handle failed"); 257 return; 258 } 259 260 DBG("suspend: updating CPU mappings"); 261 262 mutex_enter(&cpu_lock); 263 264 setup_chip_mappings(mdp); 265 setup_exec_unit_mappings(mdp); 266 for (id = 0; id < NCPU; id++) { 267 if ((cp = cpu_get(id)) == NULL) 268 continue; 269 cpu_map_exec_units(cp); 270 } 271 272 /* 273 * Re-calculate processor groups. 274 * 275 * First tear down all PG information before adding any new PG 276 * information derived from the MD we just downloaded. We must 277 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and 278 * we want to minimize the number of times pause_cpus is called. 279 * Inactivating all CPUs would leave PGs without any active CPUs, 280 * so while CPUs are paused, call pg_cpu_inactive and swap in the 281 * bootstrap PG structure saving the original PG structure to be 282 * fini'd afterwards. This prevents the dispatcher from encountering 283 * PGs in which all CPUs are inactive. 
284 */ 285 pause_cpus(NULL); 286 for (id = 0; id < NCPU; id++) { 287 if ((cp = cpu_get(id)) == NULL) 288 continue; 289 pg_cpu_inactive(cp); 290 pgps[id] = cp->cpu_pg; 291 pg_cpu_bootstrap(cp); 292 } 293 start_cpus(); 294 295 /* 296 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are 297 * not paused. Use two separate loops here so that we do not 298 * initialize PG data for CPUs until all the old PG data structures 299 * are torn down. 300 */ 301 for (id = 0; id < NCPU; id++) { 302 if ((cp = cpu_get(id)) == NULL) 303 continue; 304 pg_cpu_fini(cp, pgps[id]); 305 } 306 307 /* 308 * Initialize PG data for each CPU, but leave the bootstrapped 309 * PG structure in place to avoid running with any PGs containing 310 * nothing but inactive CPUs. 311 */ 312 for (id = 0; id < NCPU; id++) { 313 if ((cp = cpu_get(id)) == NULL) 314 continue; 315 pgps[id] = pg_cpu_init(cp, B_TRUE); 316 } 317 318 /* 319 * Now that PG data has been initialized for all CPUs in the 320 * system, replace the bootstrapped PG structure with the 321 * initialized PG structure and call pg_cpu_active for each CPU. 322 */ 323 pause_cpus(NULL); 324 for (id = 0; id < NCPU; id++) { 325 if ((cp = cpu_get(id)) == NULL) 326 continue; 327 cp->cpu_pg = pgps[id]; 328 pg_cpu_active(cp); 329 } 330 start_cpus(); 331 332 mutex_exit(&cpu_lock); 333 334 (void) md_fini_handle(mdp); 335 } 336 337 /* 338 * Wrapper for the Sun Cluster error decoding function. 
339 */ 340 static int 341 cluster_error_decode(int error, char *error_reason, size_t max_reason_len) 342 { 343 const char *decoded; 344 size_t decoded_len; 345 346 ASSERT(error_reason != NULL); 347 ASSERT(max_reason_len > 0); 348 349 max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX); 350 351 if (cl_suspend_error_decode == NULL) 352 return (-1); 353 354 if ((decoded = (*cl_suspend_error_decode)(error)) == NULL) 355 return (-1); 356 357 /* Get number of non-NULL bytes */ 358 if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0) 359 return (-1); 360 361 bcopy(decoded, error_reason, decoded_len); 362 363 /* 364 * The error string returned from cl_suspend_error_decode 365 * should be NULL-terminated, but set the terminator here 366 * because we only copied non-NULL bytes. If the decoded 367 * string was not NULL-terminated, this guarantees that 368 * error_reason will be. 369 */ 370 error_reason[decoded_len] = '\0'; 371 372 return (0); 373 } 374 375 /* 376 * Wrapper for the Sun Cluster pre-suspend callback. 377 */ 378 static int 379 cluster_pre_wrapper(char *error_reason, size_t max_reason_len) 380 { 381 int rv = 0; 382 383 if (cl_suspend_pre_callback != NULL) { 384 rv = (*cl_suspend_pre_callback)(); 385 DBG("suspend: cl_suspend_pre_callback returned %d", rv); 386 if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 387 if (cluster_error_decode(rv, error_reason, 388 max_reason_len)) { 389 (void) snprintf(error_reason, max_reason_len, 390 SC_PRE_FAIL_STR_FMT, rv); 391 } 392 } 393 } 394 395 return (rv); 396 } 397 398 /* 399 * Wrapper for the Sun Cluster post-suspend callback. 
400 */ 401 static int 402 cluster_post_wrapper(char *error_reason, size_t max_reason_len) 403 { 404 int rv = 0; 405 406 if (cl_suspend_post_callback != NULL) { 407 rv = (*cl_suspend_post_callback)(); 408 DBG("suspend: cl_suspend_post_callback returned %d", rv); 409 if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 410 if (cluster_error_decode(rv, error_reason, 411 max_reason_len)) { 412 (void) snprintf(error_reason, 413 max_reason_len, SC_POST_FAIL_STR_FMT, rv); 414 } 415 } 416 } 417 418 return (rv); 419 } 420 421 /* 422 * Execute pre-suspend callbacks preparing the system for a suspend operation. 423 * Returns zero on success, non-zero on failure. Sets the recovered argument 424 * to indicate whether or not callbacks could be undone in the event of a 425 * failure--if callbacks were successfully undone, *recovered is set to B_TRUE, 426 * otherwise *recovered is set to B_FALSE. Must be called successfully before 427 * suspend_start can be called. Callers should first call suspend_support to 428 * determine if OS suspend is supported. 429 */ 430 int 431 suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered) 432 { 433 int rv; 434 435 ASSERT(recovered != NULL); 436 437 /* 438 * Return an error if suspend_pre is erreoneously called 439 * when OS suspend is not supported. 440 */ 441 ASSERT(suspend_supported()); 442 if (!suspend_supported()) { 443 DBG("suspend: suspend_pre called without suspend support"); 444 *recovered = B_TRUE; 445 return (ENOTSUP); 446 } 447 DBG("suspend: %s", __func__); 448 449 rv = cluster_pre_wrapper(error_reason, max_reason_len); 450 451 /* 452 * At present, only one pre-suspend operation exists. 453 * If it fails, no recovery needs to be done. 454 */ 455 if (rv != 0 && recovered != NULL) 456 *recovered = B_TRUE; 457 458 return (rv); 459 } 460 461 /* 462 * Execute post-suspend callbacks. Returns zero on success, non-zero on 463 * failure. 
 * Must be called after suspend_start is called, regardless of
 * whether or not suspend_start is successful.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);
	return (cluster_post_wrapper(error_reason, max_reason_len));
}

/*
 * Suspends the OS by pausing CPUs and calling into the HV to initiate
 * the suspend. When the HV routine hv_guest_suspend returns, the system
 * will be resumed. Must be called after a successful call to suspend_pre.
 * suspend_post must be called after suspend_start, whether or not
 * suspend_start returns an error.
 */
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
	uint64_t source_tick;
	uint64_t source_stick;
	uint64_t rv;
	timestruc_t source_tod;
	int spl;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	mutex_enter(&cpu_lock);

	/* Suspend the watchdog */
	watchdog_suspend();

	/* Record the TOD so it can be restored after the resume */
	mutex_enter(&tod_lock);
	source_tod = tod_get();
	mutex_exit(&tod_lock);

	/* Pause all other CPUs */
	pause_cpus(NULL);
	DBG_PROM("suspend: CPUs paused\n");

	/* Suspend cyclics and disable interrupts */
	cyclic_suspend();
	DBG_PROM("suspend: cyclics suspended\n");
	spl = spl8();

	/*
	 * Sample %tick and %stick as late as possible before the HV call
	 * so the post-resume offsets track elapsed time accurately.
	 */
	source_tick = gettick_counter();
	source_stick = gettick();
	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

	/*
	 * Call into the HV to initiate the suspend.
	 * hv_guest_suspend() returns after the guest has been
	 * resumed or if the suspend operation failed or was
	 * cancelled. After a successful suspend, the %tick and
	 * %stick registers may have changed by an amount that is
	 * not proportional to the amount of time that has passed.
	 * They may have jumped forwards or backwards. This jump
	 * must be uniform across all CPUs and we operate under
	 * the assumption that it is (maintaining two global offset
	 * variables--one for %tick and one for %stick.)
	 */
	DBG_PROM("suspend: suspending... \n");
	rv = hv_guest_suspend();
	if (rv != 0) {
		/* Undo the preparation steps above in reverse order */
		splx(spl);
		cyclic_resume();
		start_cpus();
		watchdog_resume();
		mutex_exit(&cpu_lock);
		DBG("suspend: failed, rv: %ld\n", rv);
		/*
		 * NOTE(review): rv is a uint64_t HV status narrowed to the
		 * int return -- presumably HV error codes are small enough
		 * that no truncation occurs; confirm against the HV API.
		 */
		return (rv);
	}

	/* Update the global tick and stick offsets */
	set_tick_offsets(source_tick, source_stick);

	/* Ensure new offsets are globally visible before resuming CPUs */
	membar_sync();

	/* Enable interrupts */
	splx(spl);

	/* Set the {%tick,%stick}.NPT bits on all CPUs */
	if (enable_user_tick_stick_emulation) {
		/* Cross-call every ready CPU, then wait for completion */
		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
		xt_sync(cpu_ready_set);
		ASSERT(gettick_npt() != 0);
		ASSERT(getstick_npt() != 0);
	}

	/* If emulation is enabled, but not currently active, enable it */
	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
		tick_stick_emulation_active = B_TRUE;
	}

	/* Resume cyclics, unpause CPUs */
	cyclic_resume();
	start_cpus();

	/* Set the TOD to the value recorded before the suspend */
	mutex_enter(&tod_lock);
	tod_set(source_tod);
	mutex_exit(&tod_lock);

	/* Re-enable the watchdog */
	watchdog_resume();

	mutex_exit(&cpu_lock);

	/* Get new MD, update CPU mappings/relationships */
	if (suspend_update_cpu_mappings)
		update_cpu_mappings();

	DBG("suspend: target tick: 0x%lx", gettick_counter());
	DBG("suspend: target stick: 0x%llx", gettick());
	DBG("suspend: user %%tick/%%stick emulation is %d",
	    tick_stick_emulation_active);
	DBG("suspend: finished");

	return (0);
}