1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/mutex.h> 27 #include <sys/cpuvar.h> 28 #include <sys/cyclic.h> 29 #include <sys/disp.h> 30 #include <sys/ddi.h> 31 #include <sys/wdt.h> 32 #include <sys/callb.h> 33 #include <sys/cmn_err.h> 34 #include <sys/hypervisor_api.h> 35 #include <sys/membar.h> 36 #include <sys/x_call.h> 37 #include <sys/promif.h> 38 #include <sys/systm.h> 39 #include <sys/mach_descrip.h> 40 #include <sys/cpu_module.h> 41 #include <sys/pg.h> 42 #include <sys/lgrp.h> 43 #include <sys/sysmacros.h> 44 #include <sys/sunddi.h> 45 #include <sys/cpupart.h> 46 #include <sys/hsvc.h> 47 #include <sys/mpo.h> 48 #include <vm/hat_sfmmu.h> 49 50 /* 51 * Sun4v OS Suspend 52 * 53 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then 54 * calling into the HV to initiate a suspension. Suspension is sequenced 55 * externally by calling suspend_pre, suspend_start, and suspend_post. 
56 * suspend_pre and suspend_post are meant to perform any special operations 57 * that should be done before or after a suspend/resume operation. e.g., 58 * callbacks to cluster software to disable heartbeat monitoring before the 59 * system is suspended. suspend_start prepares kernel services to be suspended 60 * and then suspends the domain by calling hv_guest_suspend. 61 * 62 * Special Handling for %tick and %stick Registers 63 * 64 * After a suspend/resume operation, the %tick and %stick registers may have 65 * jumped forwards or backwards. The delta is assumed to be consistent across 66 * all CPUs, within the negligible level of %tick and %stick variation 67 * acceptable on a cold boot. In order to maintain increasing %tick and %stick 68 * counter values without exposing large positive or negative jumps to kernel 69 * or user code, a %tick and %stick offset is used. Kernel reads of these 70 * counters return the sum of the hardware register counter and offset 71 * variable. After a suspend/resume operation, user reads of %tick or %stick 72 * are emulated. Suspend code enables emulation by setting the 73 * %{tick,stick}.NPT fields which trigger a privileged instruction access 74 * trap whenever the registers are read from user mode. If emulation has been 75 * enabled, the trap handler emulates the instruction. Emulation is only 76 * enabled during a successful suspend/resume operation. When emulation is 77 * enabled, CPUs that are DR'd into the system will have their 78 * %{tick,stick}.NPT bits set to 1 as well. 79 */ 80 81 extern u_longlong_t gettick(void); /* returns %stick */ 82 extern uint64_t gettick_counter(void); /* returns %tick */ 83 extern uint64_t gettick_npt(void); 84 extern uint64_t getstick_npt(void); 85 extern int mach_descrip_update(void); 86 extern cpuset_t cpu_ready_set; 87 extern uint64_t native_tick_offset; 88 extern uint64_t native_stick_offset; 89 90 /* 91 * Global Sun Cluster pre/post callbacks. 
92 */ 93 const char *(*cl_suspend_error_decode)(int); 94 int (*cl_suspend_pre_callback)(void); 95 int (*cl_suspend_post_callback)(void); 96 #define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d" 97 #define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d" 98 #define SC_FAIL_STR_MAX 256 99 100 /* 101 * The minimum major and minor version of the HSVC_GROUP_CORE API group 102 * required in order to use OS suspend. 103 */ 104 #define SUSPEND_CORE_MAJOR 1 105 #define SUSPEND_CORE_MINOR 2 106 107 /* 108 * By default, sun4v OS suspend is supported if the required HV version 109 * is present. suspend_disabled should be set on platforms that do not 110 * allow OS suspend regardless of whether or not the HV supports it. 111 * It can also be set in /etc/system. 112 */ 113 static int suspend_disabled = 0; 114 115 /* 116 * Controls whether or not user-land tick and stick register emulation 117 * will be enabled following a successful suspend operation. 118 */ 119 static int enable_user_tick_stick_emulation = 1; 120 121 /* 122 * Indicates whether or not tick and stick emulation is currently active. 123 * After a successful suspend operation, if emulation is enabled, this 124 * variable is set to B_TRUE. Global scope to allow emulation code to 125 * check if emulation is active. 126 */ 127 boolean_t tick_stick_emulation_active = B_FALSE; 128 129 /* 130 * When non-zero, after a successful suspend and resume, cpunodes, CPU HW 131 * sharing data structures, and processor groups will be updated using 132 * information from the updated MD. 133 */ 134 static int suspend_update_cpu_mappings = 1; 135 136 /* 137 * DBG and DBG_PROM() macro. 138 */ 139 #ifdef DEBUG 140 141 static int suspend_debug_flag = 0; 142 143 #define DBG_PROM \ 144 if (suspend_debug_flag) \ 145 prom_printf 146 147 #define DBG \ 148 if (suspend_debug_flag) \ 149 suspend_debug 150 151 static void 152 suspend_debug(const char *fmt, ...) 
153 { 154 char buf[512]; 155 va_list ap; 156 157 va_start(ap, fmt); 158 (void) vsprintf(buf, fmt, ap); 159 va_end(ap); 160 161 cmn_err(CE_NOTE, "%s", buf); 162 } 163 164 #else /* DEBUG */ 165 166 #define DBG_PROM 167 #define DBG 168 169 #endif /* DEBUG */ 170 171 /* 172 * Return true if the HV supports OS suspend and if suspend has not been 173 * disabled on this platform. 174 */ 175 boolean_t 176 suspend_supported(void) 177 { 178 uint64_t major, minor; 179 180 if (suspend_disabled) 181 return (B_FALSE); 182 183 if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) 184 return (B_FALSE); 185 186 return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || 187 (major > SUSPEND_CORE_MAJOR)); 188 } 189 190 /* 191 * Given a source tick and stick value, set the tick and stick offsets such 192 * that the (current physical register value + offset == source value). 193 */ 194 static void 195 set_tick_offsets(uint64_t source_tick, uint64_t source_stick) 196 { 197 uint64_t target_tick; 198 uint64_t target_stick; 199 200 native_tick_offset = 0; 201 native_stick_offset = 0; 202 203 target_tick = gettick_counter(); /* returns %tick */ 204 target_stick = gettick(); /* returns %stick */ 205 206 native_tick_offset = source_tick - target_tick; 207 native_stick_offset = source_stick - target_stick; 208 } 209 210 /* 211 * Set the {tick,stick}.NPT field to 1 on this CPU. 212 */ 213 static void 214 enable_tick_stick_npt(void) 215 { 216 (void) hv_stick_set_npt(1); 217 (void) hv_tick_set_npt(1); 218 } 219 220 /* 221 * Synchronize a CPU's {tick,stick}.NPT fields with the current state 222 * of the system. This is used when a CPU is DR'd into the system. 
 */
void
suspend_sync_tick_stick_npt(void)
{
	if (tick_stick_emulation_active) {
		/* Match the NPT state of the already-running CPUs */
		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
		(void) hv_stick_set_npt(1);
		(void) hv_tick_set_npt(1);
	} else {
		/* Emulation inactive: NPT must already be clear */
		ASSERT(gettick_npt() == 0);
		ASSERT(getstick_npt() == 0);
	}
}

/*
 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
 * sharing data structures, and processor groups.
 *
 * Called (with suspend_update_cpu_mappings set) after a resume, since the
 * machine description may describe different CPU relationships than the
 * one the guest booted with. All PG rework is done under cpu_lock.
 */
static void
update_cpu_mappings(void)
{
	md_t *mdp;
	processorid_t id;
	cpu_t *cp;
	cpu_pg_t *pgps[NCPU];

	if ((mdp = md_get_handle()) == NULL) {
		DBG("suspend: md_get_handle failed");
		return;
	}

	DBG("suspend: updating CPU mappings");

	mutex_enter(&cpu_lock);

	/* Refresh cpunode/exec-unit sharing data from the new MD */
	setup_chip_mappings(mdp);
	setup_exec_unit_mappings(mdp);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cpu_map_exec_units(cp);
	}

	/*
	 * Re-calculate processor groups.
	 *
	 * First tear down all PG information before adding any new PG
	 * information derived from the MD we just downloaded. We must
	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
	 * we want to minimize the number of times pause_cpus is called.
	 * Inactivating all CPUs would leave PGs without any active CPUs,
	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
	 * bootstrap PG structure saving the original PG structure to be
	 * fini'd afterwards. This prevents the dispatcher from encountering
	 * PGs in which all CPUs are inactive.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_inactive(cp);
		pgps[id] = cp->cpu_pg;	/* save old PG data for fini below */
		pg_cpu_bootstrap(cp);
	}
	start_cpus();

	/*
	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
	 * not paused. Use two separate loops here so that we do not
	 * initialize PG data for CPUs until all the old PG data structures
	 * are torn down.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_fini(cp, pgps[id]);
		mpo_cpu_remove(id);
	}

	/*
	 * Initialize PG data for each CPU, but leave the bootstrapped
	 * PG structure in place to avoid running with any PGs containing
	 * nothing but inactive CPUs.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		mpo_cpu_add(mdp, id);
		pgps[id] = pg_cpu_init(cp, B_TRUE);	/* deferred install */
	}

	/*
	 * Now that PG data has been initialized for all CPUs in the
	 * system, replace the bootstrapped PG structure with the
	 * initialized PG structure and call pg_cpu_active for each CPU.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cp->cpu_pg = pgps[id];
		pg_cpu_active(cp);
	}
	start_cpus();

	mutex_exit(&cpu_lock);

	(void) md_fini_handle(mdp);
}

/*
 * Wrapper for the Sun Cluster error decoding function.
 *
 * Copies a human-readable decoding of 'error' into error_reason (at most
 * max_reason_len bytes, capped at SC_FAIL_STR_MAX, always NUL-terminated).
 * Returns 0 on success, -1 if no decode function is registered or no
 * usable string was produced.
 */
static int
cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
{
	const char *decoded;
	size_t decoded_len;

	ASSERT(error_reason != NULL);
	ASSERT(max_reason_len > 0);

	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);

	if (cl_suspend_error_decode == NULL)
		return (-1);

	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
		return (-1);

	/* Get number of non-NULL bytes */
	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
		return (-1);

	bcopy(decoded, error_reason, decoded_len);

	/*
	 * The error string returned from cl_suspend_error_decode
	 * should be NULL-terminated, but set the terminator here
	 * because we only copied non-NULL bytes. If the decoded
	 * string was not NULL-terminated, this guarantees that
	 * error_reason will be.
	 */
	error_reason[decoded_len] = '\0';

	return (0);
}

/*
 * Wrapper for the Sun Cluster pre-suspend callback.
 *
 * Returns the callback's return value (0 if no callback is registered).
 * On failure, fills error_reason with a decoded reason, falling back to
 * SC_PRE_FAIL_STR_FMT when no decode is available.
 */
static int
cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_pre_callback != NULL) {
		rv = (*cl_suspend_pre_callback)();
		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason, max_reason_len,
				    SC_PRE_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Wrapper for the Sun Cluster post-suspend callback.
 *
 * Mirrors cluster_pre_wrapper: returns the callback's return value
 * (0 if no callback is registered) and decodes failures into
 * error_reason, falling back to SC_POST_FAIL_STR_FMT.
 */
static int
cluster_post_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_post_callback != NULL) {
		rv = (*cl_suspend_post_callback)();
		DBG("suspend: cl_suspend_post_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason,
				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Execute pre-suspend callbacks preparing the system for a suspend operation.
 * Returns zero on success, non-zero on failure. Sets the recovered argument
 * to indicate whether or not callbacks could be undone in the event of a
 * failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
 * otherwise *recovered is set to B_FALSE. Must be called successfully before
 * suspend_start can be called. Callers should first call suspend_supported to
 * determine if OS suspend is supported.
 */
int
suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
{
	int rv;

	ASSERT(recovered != NULL);

	/*
	 * Return an error if suspend_pre is erroneously called
	 * when OS suspend is not supported.
	 */
	ASSERT(suspend_supported());
	if (!suspend_supported()) {
		DBG("suspend: suspend_pre called without suspend support");
		*recovered = B_TRUE;
		return (ENOTSUP);
	}
	DBG("suspend: %s", __func__);

	rv = cluster_pre_wrapper(error_reason, max_reason_len);

	/*
	 * At present, only one pre-suspend operation exists.
	 * If it fails, no recovery needs to be done.
	 *
	 * Note: *recovered is not written when rv == 0; callers should
	 * only consult it on failure. The recovered != NULL re-check
	 * guards non-DEBUG builds where the ASSERT above compiles away.
	 */
	if (rv != 0 && recovered != NULL)
		*recovered = B_TRUE;

	return (rv);
}

/*
 * Execute post-suspend callbacks. Returns zero on success, non-zero on
 * failure. Must be called after suspend_start is called, regardless of
 * whether or not suspend_start is successful.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);
	return (cluster_post_wrapper(error_reason, max_reason_len));
}

/*
 * Suspends the OS by pausing CPUs and calling into the HV to initiate
 * the suspend. When the HV routine hv_guest_suspend returns, the system
 * will be resumed. Must be called after a successful call to suspend_pre.
 * suspend_post must be called after suspend_start, whether or not
 * suspend_start returns an error.
 */
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
	uint64_t source_tick;
	uint64_t source_stick;
	/*
	 * rv holds both the hv_guest_suspend() HV error code and, later,
	 * the int result of mach_descrip_update(); it is narrowed to int
	 * on return. NOTE(review): presumably HV error codes fit in an
	 * int -- confirm against the sun4v HV API error ranges.
	 */
	uint64_t rv;
	timestruc_t source_tod;
	int spl;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	/* Hold the ctxdoms lock across the suspend and the MD refresh */
	sfmmu_ctxdoms_lock();

	mutex_enter(&cpu_lock);

	/* Suspend the watchdog */
	watchdog_suspend();

	/* Record the TOD */
	mutex_enter(&tod_lock);
	source_tod = tod_get();
	mutex_exit(&tod_lock);

	/* Pause all other CPUs */
	pause_cpus(NULL);
	DBG_PROM("suspend: CPUs paused\n");

	/* Suspend cyclics and disable interrupts */
	cyclic_suspend();
	DBG_PROM("suspend: cyclics suspended\n");
	spl = spl8();

	/* Sample the pre-suspend counters for set_tick_offsets() below */
	source_tick = gettick_counter();
	source_stick = gettick();
	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

	/*
	 * Call into the HV to initiate the suspend.
	 * hv_guest_suspend() returns after the guest has been
	 * resumed or if the suspend operation failed or was
	 * cancelled. After a successful suspend, the %tick and
	 * %stick registers may have changed by an amount that is
	 * not proportional to the amount of time that has passed.
	 * They may have jumped forwards or backwards. This jump
	 * must be uniform across all CPUs and we operate under
	 * the assumption that it is (maintaining two global offset
	 * variables--one for %tick and one for %stick.)
	 */
	DBG_PROM("suspend: suspending... \n");
	rv = hv_guest_suspend();
	if (rv != 0) {
		/* Unwind in reverse order of the setup above */
		splx(spl);
		cyclic_resume();
		start_cpus();
		watchdog_resume();
		mutex_exit(&cpu_lock);
		sfmmu_ctxdoms_unlock();
		DBG("suspend: failed, rv: %ld\n", rv);
		return (rv);
	}

	/* Update the global tick and stick offsets */
	set_tick_offsets(source_tick, source_stick);

	/* Ensure new offsets are globally visible before resuming CPUs */
	membar_sync();

	/* Enable interrupts */
	splx(spl);

	/* Set the {%tick,%stick}.NPT bits on all CPUs */
	if (enable_user_tick_stick_emulation) {
		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
		xt_sync(cpu_ready_set);
		ASSERT(gettick_npt() != 0);
		ASSERT(getstick_npt() != 0);
	}

	/* If emulation is enabled, but not currently active, enable it */
	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
		tick_stick_emulation_active = B_TRUE;
	}

	sfmmu_ctxdoms_remove();

	/* Resume cyclics, unpause CPUs */
	cyclic_resume();
	start_cpus();

	/* Set the TOD */
	mutex_enter(&tod_lock);
	tod_set(source_tod);
	mutex_exit(&tod_lock);

	/* Re-enable the watchdog */
	watchdog_resume();

	mutex_exit(&cpu_lock);

	/*
	 * Download the latest MD. A failure here leaves the system with
	 * stale machine-description state, hence the panic.
	 */
	if ((rv = mach_descrip_update()) != 0)
		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
		    rv);

	sfmmu_ctxdoms_update();
	sfmmu_ctxdoms_unlock();

	/* Get new MD, update CPU mappings/relationships */
	if (suspend_update_cpu_mappings)
		update_cpu_mappings();

	DBG("suspend: target tick: 0x%lx", gettick_counter());
	DBG("suspend: target stick: 0x%llx", gettick());
	DBG("suspend: user %%tick/%%stick emulation is %d",
	    tick_stick_emulation_active);
	DBG("suspend: finished");

	return (0);
}