1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/mutex.h> 27 #include <sys/cpuvar.h> 28 #include <sys/cyclic.h> 29 #include <sys/disp.h> 30 #include <sys/ddi.h> 31 #include <sys/wdt.h> 32 #include <sys/callb.h> 33 #include <sys/cmn_err.h> 34 #include <sys/hypervisor_api.h> 35 #include <sys/membar.h> 36 #include <sys/x_call.h> 37 #include <sys/promif.h> 38 #include <sys/systm.h> 39 #include <sys/mach_descrip.h> 40 #include <sys/cpu_module.h> 41 #include <sys/pg.h> 42 #include <sys/lgrp.h> 43 #include <sys/sysmacros.h> 44 #include <sys/sunddi.h> 45 #include <sys/cpupart.h> 46 #include <sys/hsvc.h> 47 #include <vm/hat_sfmmu.h> 48 49 /* 50 * Sun4v OS Suspend 51 * 52 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then 53 * calling into the HV to initiate a suspension. Suspension is sequenced 54 * externally by calling suspend_pre, suspend_start, and suspend_post. 55 * suspend_pre and suspend_post are meant to perform any special operations 56 * that should be done before or after a suspend/resume operation. e.g., 57 * callbacks to cluster software to disable heartbeat monitoring before the 58 * system is suspended. suspend_start prepares kernel services to be suspended 59 * and then suspends the domain by calling hv_guest_suspend. 60 * 61 * Special Handling for %tick and %stick Registers 62 * 63 * After a suspend/resume operation, the %tick and %stick registers may have 64 * jumped forwards or backwards. The delta is assumed to be consistent across 65 * all CPUs, within the negligible level of %tick and %stick variation 66 * acceptable on a cold boot. In order to maintain increasing %tick and %stick 67 * counter values without exposing large positive or negative jumps to kernel 68 * or user code, a %tick and %stick offset is used. Kernel reads of these 69 * counters return the sum of the hardware register counter and offset 70 * variable. After a suspend/resume operation, user reads of %tick or %stick 71 * are emulated. Suspend code enables emulation by setting the 72 * %{tick,stick}.NPT fields which trigger a privileged instruction access 73 * trap whenever the registers are read from user mode. If emulation has been 74 * enabled, the trap handler emulates the instruction. Emulation is only 75 * enabled during a successful suspend/resume operation. When emulation is 76 * enabled, CPUs that are DR'd into the system will have their 77 * %{tick,stick}.NPT bits set to 1 as well. 78 */ 79 80 extern u_longlong_t gettick(void); /* returns %stick */ 81 extern uint64_t gettick_counter(void); /* returns %tick */ 82 extern uint64_t gettick_npt(void); 83 extern uint64_t getstick_npt(void); 84 extern int mach_descrip_update(void); 85 extern cpuset_t cpu_ready_set; 86 extern uint64_t native_tick_offset; 87 extern uint64_t native_stick_offset; 88 89 /* 90 * Global Sun Cluster pre/post callbacks. 91 */ 92 const char *(*cl_suspend_error_decode)(int); 93 int (*cl_suspend_pre_callback)(void); 94 int (*cl_suspend_post_callback)(void); 95 #define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d" 96 #define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d" 97 #define SC_FAIL_STR_MAX 256 98 99 /* 100 * The minimum major and minor version of the HSVC_GROUP_CORE API group 101 * required in order to use OS suspend. 102 */ 103 #define SUSPEND_CORE_MAJOR 1 104 #define SUSPEND_CORE_MINOR 2 105 106 /* 107 * By default, sun4v OS suspend is supported if the required HV version 108 * is present. suspend_disabled should be set on platforms that do not 109 * allow OS suspend regardless of whether or not the HV supports it. 110 * It can also be set in /etc/system. 111 */ 112 static int suspend_disabled = 0; 113 114 /* 115 * Controls whether or not user-land tick and stick register emulation 116 * will be enabled following a successful suspend operation. 117 */ 118 static int enable_user_tick_stick_emulation = 1; 119 120 /* 121 * Indicates whether or not tick and stick emulation is currently active. 122 * After a successful suspend operation, if emulation is enabled, this 123 * variable is set to B_TRUE. Global scope to allow emulation code to 124 * check if emulation is active. 125 */ 126 boolean_t tick_stick_emulation_active = B_FALSE; 127 128 /* 129 * When non-zero, after a successful suspend and resume, cpunodes, CPU HW 130 * sharing data structures, and processor groups will be updated using 131 * information from the updated MD. 132 */ 133 static int suspend_update_cpu_mappings = 1; 134 135 /* 136 * DBG and DBG_PROM() macro. 137 */ 138 #ifdef DEBUG 139 140 static int suspend_debug_flag = 0; 141 142 #define DBG_PROM \ 143 if (suspend_debug_flag) \ 144 prom_printf 145 146 #define DBG \ 147 if (suspend_debug_flag) \ 148 suspend_debug 149 150 static void 151 suspend_debug(const char *fmt, ...) 152 { 153 char buf[512]; 154 va_list ap; 155 156 va_start(ap, fmt); 157 (void) vsprintf(buf, fmt, ap); 158 va_end(ap); 159 160 cmn_err(CE_NOTE, "%s", buf); 161 } 162 163 #else /* DEBUG */ 164 165 #define DBG_PROM 166 #define DBG 167 168 #endif /* DEBUG */ 169 170 /* 171 * Return true if the HV supports OS suspend and if suspend has not been 172 * disabled on this platform. 173 */ 174 boolean_t 175 suspend_supported(void) 176 { 177 uint64_t major, minor; 178 179 if (suspend_disabled) 180 return (B_FALSE); 181 182 if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) 183 return (B_FALSE); 184 185 return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || 186 (major > SUSPEND_CORE_MAJOR)); 187 } 188 189 /* 190 * Given a source tick and stick value, set the tick and stick offsets such 191 * that the (current physical register value + offset == source value). 192 */ 193 static void 194 set_tick_offsets(uint64_t source_tick, uint64_t source_stick) 195 { 196 uint64_t target_tick; 197 uint64_t target_stick; 198 199 native_tick_offset = 0; 200 native_stick_offset = 0; 201 202 target_tick = gettick_counter(); /* returns %tick */ 203 target_stick = gettick(); /* returns %stick */ 204 205 native_tick_offset = source_tick - target_tick; 206 native_stick_offset = source_stick - target_stick; 207 } 208 209 /* 210 * Set the {tick,stick}.NPT field to 1 on this CPU. 211 */ 212 static void 213 enable_tick_stick_npt(void) 214 { 215 (void) hv_stick_set_npt(1); 216 (void) hv_tick_set_npt(1); 217 } 218 219 /* 220 * Synchronize a CPU's {tick,stick}.NPT fields with the current state 221 * of the system. This is used when a CPU is DR'd into the system. 222 */ 223 void 224 suspend_sync_tick_stick_npt(void) 225 { 226 if (tick_stick_emulation_active) { 227 DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id); 228 (void) hv_stick_set_npt(1); 229 (void) hv_tick_set_npt(1); 230 } else { 231 ASSERT(gettick_npt() == 0); 232 ASSERT(getstick_npt() == 0); 233 } 234 } 235 236 /* 237 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW 238 * sharing data structures, and processor groups. 239 */ 240 static void 241 update_cpu_mappings(void) 242 { 243 md_t *mdp; 244 processorid_t id; 245 cpu_t *cp; 246 cpu_pg_t *pgps[NCPU]; 247 248 if ((mdp = md_get_handle()) == NULL) { 249 DBG("suspend: md_get_handle failed"); 250 return; 251 } 252 253 DBG("suspend: updating CPU mappings"); 254 255 mutex_enter(&cpu_lock); 256 257 setup_chip_mappings(mdp); 258 setup_exec_unit_mappings(mdp); 259 for (id = 0; id < NCPU; id++) { 260 if ((cp = cpu_get(id)) == NULL) 261 continue; 262 cpu_map_exec_units(cp); 263 } 264 265 /* 266 * Re-calculate processor groups. 267 * 268 * First tear down all PG information before adding any new PG 269 * information derived from the MD we just downloaded. We must 270 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and 271 * we want to minimize the number of times pause_cpus is called. 272 * Inactivating all CPUs would leave PGs without any active CPUs, 273 * so while CPUs are paused, call pg_cpu_inactive and swap in the 274 * bootstrap PG structure saving the original PG structure to be 275 * fini'd afterwards. This prevents the dispatcher from encountering 276 * PGs in which all CPUs are inactive. 277 */ 278 pause_cpus(NULL); 279 for (id = 0; id < NCPU; id++) { 280 if ((cp = cpu_get(id)) == NULL) 281 continue; 282 pg_cpu_inactive(cp); 283 pgps[id] = cp->cpu_pg; 284 pg_cpu_bootstrap(cp); 285 } 286 start_cpus(); 287 288 /* 289 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are 290 * not paused. Use two separate loops here so that we do not 291 * initialize PG data for CPUs until all the old PG data structures 292 * are torn down. 293 */ 294 for (id = 0; id < NCPU; id++) { 295 if ((cp = cpu_get(id)) == NULL) 296 continue; 297 pg_cpu_fini(cp, pgps[id]); 298 } 299 300 /* 301 * Initialize PG data for each CPU, but leave the bootstrapped 302 * PG structure in place to avoid running with any PGs containing 303 * nothing but inactive CPUs. 304 */ 305 for (id = 0; id < NCPU; id++) { 306 if ((cp = cpu_get(id)) == NULL) 307 continue; 308 pgps[id] = pg_cpu_init(cp, B_TRUE); 309 } 310 311 /* 312 * Now that PG data has been initialized for all CPUs in the 313 * system, replace the bootstrapped PG structure with the 314 * initialized PG structure and call pg_cpu_active for each CPU. 315 */ 316 pause_cpus(NULL); 317 for (id = 0; id < NCPU; id++) { 318 if ((cp = cpu_get(id)) == NULL) 319 continue; 320 cp->cpu_pg = pgps[id]; 321 pg_cpu_active(cp); 322 } 323 start_cpus(); 324 325 mutex_exit(&cpu_lock); 326 327 (void) md_fini_handle(mdp); 328 } 329 330 /* 331 * Wrapper for the Sun Cluster error decoding function. 332 */ 333 static int 334 cluster_error_decode(int error, char *error_reason, size_t max_reason_len) 335 { 336 const char *decoded; 337 size_t decoded_len; 338 339 ASSERT(error_reason != NULL); 340 ASSERT(max_reason_len > 0); 341 342 max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX); 343 344 if (cl_suspend_error_decode == NULL) 345 return (-1); 346 347 if ((decoded = (*cl_suspend_error_decode)(error)) == NULL) 348 return (-1); 349 350 /* Get number of non-NULL bytes */ 351 if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0) 352 return (-1); 353 354 bcopy(decoded, error_reason, decoded_len); 355 356 /* 357 * The error string returned from cl_suspend_error_decode 358 * should be NULL-terminated, but set the terminator here 359 * because we only copied non-NULL bytes. If the decoded 360 * string was not NULL-terminated, this guarantees that 361 * error_reason will be. 362 */ 363 error_reason[decoded_len] = '\0'; 364 365 return (0); 366 } 367 368 /* 369 * Wrapper for the Sun Cluster pre-suspend callback. 370 */ 371 static int 372 cluster_pre_wrapper(char *error_reason, size_t max_reason_len) 373 { 374 int rv = 0; 375 376 if (cl_suspend_pre_callback != NULL) { 377 rv = (*cl_suspend_pre_callback)(); 378 DBG("suspend: cl_suspend_pre_callback returned %d", rv); 379 if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 380 if (cluster_error_decode(rv, error_reason, 381 max_reason_len)) { 382 (void) snprintf(error_reason, max_reason_len, 383 SC_PRE_FAIL_STR_FMT, rv); 384 } 385 } 386 } 387 388 return (rv); 389 } 390 391 /* 392 * Wrapper for the Sun Cluster post-suspend callback. 393 */ 394 static int 395 cluster_post_wrapper(char *error_reason, size_t max_reason_len) 396 { 397 int rv = 0; 398 399 if (cl_suspend_post_callback != NULL) { 400 rv = (*cl_suspend_post_callback)(); 401 DBG("suspend: cl_suspend_post_callback returned %d", rv); 402 if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 403 if (cluster_error_decode(rv, error_reason, 404 max_reason_len)) { 405 (void) snprintf(error_reason, 406 max_reason_len, SC_POST_FAIL_STR_FMT, rv); 407 } 408 } 409 } 410 411 return (rv); 412 } 413 414 /* 415 * Execute pre-suspend callbacks preparing the system for a suspend operation. 416 * Returns zero on success, non-zero on failure. Sets the recovered argument 417 * to indicate whether or not callbacks could be undone in the event of a 418 * failure--if callbacks were successfully undone, *recovered is set to B_TRUE, 419 * otherwise *recovered is set to B_FALSE. Must be called successfully before 420 * suspend_start can be called. Callers should first call suspend_support to 421 * determine if OS suspend is supported. 422 */ 423 int 424 suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered) 425 { 426 int rv; 427 428 ASSERT(recovered != NULL); 429 430 /* 431 * Return an error if suspend_pre is erreoneously called 432 * when OS suspend is not supported. 433 */ 434 ASSERT(suspend_supported()); 435 if (!suspend_supported()) { 436 DBG("suspend: suspend_pre called without suspend support"); 437 *recovered = B_TRUE; 438 return (ENOTSUP); 439 } 440 DBG("suspend: %s", __func__); 441 442 rv = cluster_pre_wrapper(error_reason, max_reason_len); 443 444 /* 445 * At present, only one pre-suspend operation exists. 446 * If it fails, no recovery needs to be done. 447 */ 448 if (rv != 0 && recovered != NULL) 449 *recovered = B_TRUE; 450 451 return (rv); 452 } 453 454 /* 455 * Execute post-suspend callbacks. Returns zero on success, non-zero on 456 * failure. Must be called after suspend_start is called, regardless of 457 * whether or not suspend_start is successful. 458 */ 459 int 460 suspend_post(char *error_reason, size_t max_reason_len) 461 { 462 ASSERT(suspend_supported()); 463 DBG("suspend: %s", __func__); 464 return (cluster_post_wrapper(error_reason, max_reason_len)); 465 } 466 467 /* 468 * Suspends the OS by pausing CPUs and calling into the HV to initiate 469 * the suspend. When the HV routine hv_guest_suspend returns, the system 470 * will be resumed. Must be called after a successful call to suspend_pre. 471 * suspend_post must be called after suspend_start, whether or not 472 * suspend_start returns an error. 473 */ 474 /*ARGSUSED*/ 475 int 476 suspend_start(char *error_reason, size_t max_reason_len) 477 { 478 uint64_t source_tick; 479 uint64_t source_stick; 480 uint64_t rv; 481 timestruc_t source_tod; 482 int spl; 483 484 ASSERT(suspend_supported()); 485 DBG("suspend: %s", __func__); 486 487 sfmmu_ctxdoms_lock(); 488 489 mutex_enter(&cpu_lock); 490 491 /* Suspend the watchdog */ 492 watchdog_suspend(); 493 494 /* Record the TOD */ 495 mutex_enter(&tod_lock); 496 source_tod = tod_get(); 497 mutex_exit(&tod_lock); 498 499 /* Pause all other CPUs */ 500 pause_cpus(NULL); 501 DBG_PROM("suspend: CPUs paused\n"); 502 503 /* Suspend cyclics and disable interrupts */ 504 cyclic_suspend(); 505 DBG_PROM("suspend: cyclics suspended\n"); 506 spl = spl8(); 507 508 source_tick = gettick_counter(); 509 source_stick = gettick(); 510 DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick); 511 DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick); 512 513 /* 514 * Call into the HV to initiate the suspend. 515 * hv_guest_suspend() returns after the guest has been 516 * resumed or if the suspend operation failed or was 517 * cancelled. After a successful suspend, the %tick and 518 * %stick registers may have changed by an amount that is 519 * not proportional to the amount of time that has passed. 520 * They may have jumped forwards or backwards. This jump 521 * must be uniform across all CPUs and we operate under 522 * the assumption that it is (maintaining two global offset 523 * variables--one for %tick and one for %stick.) 524 */ 525 DBG_PROM("suspend: suspending... \n"); 526 rv = hv_guest_suspend(); 527 if (rv != 0) { 528 splx(spl); 529 cyclic_resume(); 530 start_cpus(); 531 watchdog_resume(); 532 mutex_exit(&cpu_lock); 533 sfmmu_ctxdoms_unlock(); 534 DBG("suspend: failed, rv: %ld\n", rv); 535 return (rv); 536 } 537 538 /* Update the global tick and stick offsets */ 539 set_tick_offsets(source_tick, source_stick); 540 541 /* Ensure new offsets are globally visible before resuming CPUs */ 542 membar_sync(); 543 544 /* Enable interrupts */ 545 splx(spl); 546 547 /* Set the {%tick,%stick}.NPT bits on all CPUs */ 548 if (enable_user_tick_stick_emulation) { 549 xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL); 550 xt_sync(cpu_ready_set); 551 ASSERT(gettick_npt() != 0); 552 ASSERT(getstick_npt() != 0); 553 } 554 555 /* If emulation is enabled, but not currently active, enable it */ 556 if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) { 557 tick_stick_emulation_active = B_TRUE; 558 } 559 560 sfmmu_ctxdoms_remove(); 561 562 /* Resume cyclics, unpause CPUs */ 563 cyclic_resume(); 564 start_cpus(); 565 566 /* Set the TOD */ 567 mutex_enter(&tod_lock); 568 tod_set(source_tod); 569 mutex_exit(&tod_lock); 570 571 /* Re-enable the watchdog */ 572 watchdog_resume(); 573 574 mutex_exit(&cpu_lock); 575 576 /* Download the latest MD */ 577 if ((rv = mach_descrip_update()) != 0) 578 cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld", 579 rv); 580 581 sfmmu_ctxdoms_update(); 582 sfmmu_ctxdoms_unlock(); 583 584 /* Get new MD, update CPU mappings/relationships */ 585 if (suspend_update_cpu_mappings) 586 update_cpu_mappings(); 587 588 DBG("suspend: target tick: 0x%lx", gettick_counter()); 589 DBG("suspend: target stick: 0x%llx", gettick()); 590 DBG("suspend: user %%tick/%%stick emulation is %d", 591 tick_stick_emulation_active); 592 DBG("suspend: finished"); 593 594 return (0); 595 } 596