1023e71deSHaik Aftandilian /* 2023e71deSHaik Aftandilian * CDDL HEADER START 3023e71deSHaik Aftandilian * 4023e71deSHaik Aftandilian * The contents of this file are subject to the terms of the 5023e71deSHaik Aftandilian * Common Development and Distribution License (the "License"). 6023e71deSHaik Aftandilian * You may not use this file except in compliance with the License. 7023e71deSHaik Aftandilian * 8023e71deSHaik Aftandilian * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9023e71deSHaik Aftandilian * or http://www.opensolaris.org/os/licensing. 10023e71deSHaik Aftandilian * See the License for the specific language governing permissions 11023e71deSHaik Aftandilian * and limitations under the License. 12023e71deSHaik Aftandilian * 13023e71deSHaik Aftandilian * When distributing Covered Code, include this CDDL HEADER in each 14023e71deSHaik Aftandilian * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15023e71deSHaik Aftandilian * If applicable, add the following below this CDDL HEADER, with the 16023e71deSHaik Aftandilian * fields enclosed by brackets "[]" replaced with your own identifying 17023e71deSHaik Aftandilian * information: Portions Copyright [yyyy] [name of copyright owner] 18023e71deSHaik Aftandilian * 19023e71deSHaik Aftandilian * CDDL HEADER END 20023e71deSHaik Aftandilian */ 21023e71deSHaik Aftandilian /* 22d2365b01SPavel Tatashin * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23023e71deSHaik Aftandilian * Use is subject to license terms. 24023e71deSHaik Aftandilian */ 25023e71deSHaik Aftandilian 26023e71deSHaik Aftandilian #include <sys/mutex.h> 27023e71deSHaik Aftandilian #include <sys/cpuvar.h> 28023e71deSHaik Aftandilian #include <sys/cyclic.h> 29023e71deSHaik Aftandilian #include <sys/disp.h> 30023e71deSHaik Aftandilian #include <sys/ddi.h> 31023e71deSHaik Aftandilian #include <sys/wdt.h> 32023e71deSHaik Aftandilian #include <sys/callb.h> 33023e71deSHaik Aftandilian #include <sys/cmn_err.h> 34023e71deSHaik Aftandilian #include <sys/hypervisor_api.h> 35023e71deSHaik Aftandilian #include <sys/membar.h> 36023e71deSHaik Aftandilian #include <sys/x_call.h> 37023e71deSHaik Aftandilian #include <sys/promif.h> 38023e71deSHaik Aftandilian #include <sys/systm.h> 39023e71deSHaik Aftandilian #include <sys/mach_descrip.h> 40023e71deSHaik Aftandilian #include <sys/cpu_module.h> 41023e71deSHaik Aftandilian #include <sys/pg.h> 42023e71deSHaik Aftandilian #include <sys/lgrp.h> 43023e71deSHaik Aftandilian #include <sys/sysmacros.h> 44023e71deSHaik Aftandilian #include <sys/sunddi.h> 45023e71deSHaik Aftandilian #include <sys/cpupart.h> 46023e71deSHaik Aftandilian #include <sys/hsvc.h> 47183ef8a1SHaik Aftandilian #include <sys/mpo.h> 48d2365b01SPavel Tatashin #include <vm/hat_sfmmu.h> 49*00a57bdfSHaik Aftandilian #include <sys/time.h> 50*00a57bdfSHaik Aftandilian #include <sys/clock.h> 51023e71deSHaik Aftandilian 52023e71deSHaik Aftandilian /* 53023e71deSHaik Aftandilian * Sun4v OS Suspend 54023e71deSHaik Aftandilian * 55023e71deSHaik Aftandilian * Provides a means to suspend a sun4v guest domain by pausing CPUs and then 56023e71deSHaik Aftandilian * calling into the HV to initiate a suspension. Suspension is sequenced 57023e71deSHaik Aftandilian * externally by calling suspend_pre, suspend_start, and suspend_post. 58023e71deSHaik Aftandilian * suspend_pre and suspend_post are meant to perform any special operations 59023e71deSHaik Aftandilian * that should be done before or after a suspend/resume operation. e.g., 60023e71deSHaik Aftandilian * callbacks to cluster software to disable heartbeat monitoring before the 61023e71deSHaik Aftandilian * system is suspended. suspend_start prepares kernel services to be suspended 62023e71deSHaik Aftandilian * and then suspends the domain by calling hv_guest_suspend. 63023e71deSHaik Aftandilian * 64023e71deSHaik Aftandilian * Special Handling for %tick and %stick Registers 65023e71deSHaik Aftandilian * 66023e71deSHaik Aftandilian * After a suspend/resume operation, the %tick and %stick registers may have 67023e71deSHaik Aftandilian * jumped forwards or backwards. The delta is assumed to be consistent across 68023e71deSHaik Aftandilian * all CPUs, within the negligible level of %tick and %stick variation 69023e71deSHaik Aftandilian * acceptable on a cold boot. In order to maintain increasing %tick and %stick 70023e71deSHaik Aftandilian * counter values without exposing large positive or negative jumps to kernel 71023e71deSHaik Aftandilian * or user code, a %tick and %stick offset is used. Kernel reads of these 72023e71deSHaik Aftandilian * counters return the sum of the hardware register counter and offset 73023e71deSHaik Aftandilian * variable. After a suspend/resume operation, user reads of %tick or %stick 74023e71deSHaik Aftandilian * are emulated. Suspend code enables emulation by setting the 75023e71deSHaik Aftandilian * %{tick,stick}.NPT fields which trigger a privileged instruction access 76023e71deSHaik Aftandilian * trap whenever the registers are read from user mode. If emulation has been 77023e71deSHaik Aftandilian * enabled, the trap handler emulates the instruction. Emulation is only 78023e71deSHaik Aftandilian * enabled during a successful suspend/resume operation. When emulation is 79023e71deSHaik Aftandilian * enabled, CPUs that are DR'd into the system will have their 80023e71deSHaik Aftandilian * %{tick,stick}.NPT bits set to 1 as well. 81023e71deSHaik Aftandilian */ 82023e71deSHaik Aftandilian 83023e71deSHaik Aftandilian extern u_longlong_t gettick(void); /* returns %stick */ 84023e71deSHaik Aftandilian extern uint64_t gettick_counter(void); /* returns %tick */ 85023e71deSHaik Aftandilian extern uint64_t gettick_npt(void); 86023e71deSHaik Aftandilian extern uint64_t getstick_npt(void); 87023e71deSHaik Aftandilian extern int mach_descrip_update(void); 88023e71deSHaik Aftandilian extern cpuset_t cpu_ready_set; 89023e71deSHaik Aftandilian extern uint64_t native_tick_offset; 90023e71deSHaik Aftandilian extern uint64_t native_stick_offset; 91*00a57bdfSHaik Aftandilian extern uint64_t sys_tick_freq; 92023e71deSHaik Aftandilian 93023e71deSHaik Aftandilian /* 94023e71deSHaik Aftandilian * Global Sun Cluster pre/post callbacks. 95023e71deSHaik Aftandilian */ 96023e71deSHaik Aftandilian const char *(*cl_suspend_error_decode)(int); 97023e71deSHaik Aftandilian int (*cl_suspend_pre_callback)(void); 98023e71deSHaik Aftandilian int (*cl_suspend_post_callback)(void); 99023e71deSHaik Aftandilian #define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d" 100023e71deSHaik Aftandilian #define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d" 101023e71deSHaik Aftandilian #define SC_FAIL_STR_MAX 256 102023e71deSHaik Aftandilian 103023e71deSHaik Aftandilian /* 104023e71deSHaik Aftandilian * The minimum major and minor version of the HSVC_GROUP_CORE API group 105023e71deSHaik Aftandilian * required in order to use OS suspend. 106023e71deSHaik Aftandilian */ 107023e71deSHaik Aftandilian #define SUSPEND_CORE_MAJOR 1 108023e71deSHaik Aftandilian #define SUSPEND_CORE_MINOR 2 109023e71deSHaik Aftandilian 110023e71deSHaik Aftandilian /* 111023e71deSHaik Aftandilian * By default, sun4v OS suspend is supported if the required HV version 112023e71deSHaik Aftandilian * is present. suspend_disabled should be set on platforms that do not 113023e71deSHaik Aftandilian * allow OS suspend regardless of whether or not the HV supports it. 114023e71deSHaik Aftandilian * It can also be set in /etc/system. 115023e71deSHaik Aftandilian */ 116023e71deSHaik Aftandilian static int suspend_disabled = 0; 117023e71deSHaik Aftandilian 118023e71deSHaik Aftandilian /* 119023e71deSHaik Aftandilian * Controls whether or not user-land tick and stick register emulation 120023e71deSHaik Aftandilian * will be enabled following a successful suspend operation. 121023e71deSHaik Aftandilian */ 122023e71deSHaik Aftandilian static int enable_user_tick_stick_emulation = 1; 123023e71deSHaik Aftandilian 124023e71deSHaik Aftandilian /* 125023e71deSHaik Aftandilian * Indicates whether or not tick and stick emulation is currently active. 126023e71deSHaik Aftandilian * After a successful suspend operation, if emulation is enabled, this 127023e71deSHaik Aftandilian * variable is set to B_TRUE. Global scope to allow emulation code to 128023e71deSHaik Aftandilian * check if emulation is active. 129023e71deSHaik Aftandilian */ 130023e71deSHaik Aftandilian boolean_t tick_stick_emulation_active = B_FALSE; 131023e71deSHaik Aftandilian 132023e71deSHaik Aftandilian /* 133d2365b01SPavel Tatashin * When non-zero, after a successful suspend and resume, cpunodes, CPU HW 134d2365b01SPavel Tatashin * sharing data structures, and processor groups will be updated using 135d2365b01SPavel Tatashin * information from the updated MD. 136023e71deSHaik Aftandilian */ 137023e71deSHaik Aftandilian static int suspend_update_cpu_mappings = 1; 138023e71deSHaik Aftandilian 139023e71deSHaik Aftandilian /* 140*00a57bdfSHaik Aftandilian * The maximum number of microseconds by which the %tick or %stick register 141*00a57bdfSHaik Aftandilian * can vary between any two CPUs in the system. To calculate the 142*00a57bdfSHaik Aftandilian * native_stick_offset and native_tick_offset, we measure the change in these 143*00a57bdfSHaik Aftandilian * registers on one CPU over a suspend/resume. Other CPUs may experience 144*00a57bdfSHaik Aftandilian * slightly larger or smaller changes. %tick and %stick should be synchronized 145*00a57bdfSHaik Aftandilian * between CPUs, but there may be some variation. So we add an additional value 146*00a57bdfSHaik Aftandilian * derived from this variable to ensure that these registers always increase 147*00a57bdfSHaik Aftandilian * over a suspend/resume operation, assuming all %tick and %stick registers 148*00a57bdfSHaik Aftandilian * are synchronized (within a certain limit) across CPUs in the system. The 149*00a57bdfSHaik Aftandilian * delta between %sticks on different CPUs should be a small number of cycles, 150*00a57bdfSHaik Aftandilian * not perceptible to readers of %stick that migrate between CPUs. We set this 151*00a57bdfSHaik Aftandilian * value to 1 millisecond which means that over a suspend/resume operation, 152*00a57bdfSHaik Aftandilian * all CPU's %tick and %stick will advance forwards as long as, across all 153*00a57bdfSHaik Aftandilian * CPUs, the %tick and %stick are synchronized to within 1 ms. This applies to 154*00a57bdfSHaik Aftandilian * CPUs before the suspend and CPUs after the resume. 1 ms is conservative, 155*00a57bdfSHaik Aftandilian * but small enough to not trigger TOD faults. 156*00a57bdfSHaik Aftandilian */ 157*00a57bdfSHaik Aftandilian static uint64_t suspend_tick_stick_max_delta = 1000; /* microseconds */ 158*00a57bdfSHaik Aftandilian 159*00a57bdfSHaik Aftandilian /* 160023e71deSHaik Aftandilian * DBG and DBG_PROM() macro. 161023e71deSHaik Aftandilian */ 162023e71deSHaik Aftandilian #ifdef DEBUG 163023e71deSHaik Aftandilian 164023e71deSHaik Aftandilian static int suspend_debug_flag = 0; 165023e71deSHaik Aftandilian 166023e71deSHaik Aftandilian #define DBG_PROM \ 167023e71deSHaik Aftandilian if (suspend_debug_flag) \ 168023e71deSHaik Aftandilian prom_printf 169023e71deSHaik Aftandilian 170023e71deSHaik Aftandilian #define DBG \ 171023e71deSHaik Aftandilian if (suspend_debug_flag) \ 172023e71deSHaik Aftandilian suspend_debug 173023e71deSHaik Aftandilian 174023e71deSHaik Aftandilian static void 175023e71deSHaik Aftandilian suspend_debug(const char *fmt, ...) 176023e71deSHaik Aftandilian { 177023e71deSHaik Aftandilian char buf[512]; 178023e71deSHaik Aftandilian va_list ap; 179023e71deSHaik Aftandilian 180023e71deSHaik Aftandilian va_start(ap, fmt); 181023e71deSHaik Aftandilian (void) vsprintf(buf, fmt, ap); 182023e71deSHaik Aftandilian va_end(ap); 183023e71deSHaik Aftandilian 184023e71deSHaik Aftandilian cmn_err(CE_NOTE, "%s", buf); 185023e71deSHaik Aftandilian } 186023e71deSHaik Aftandilian 187023e71deSHaik Aftandilian #else /* DEBUG */ 188023e71deSHaik Aftandilian 189023e71deSHaik Aftandilian #define DBG_PROM 190023e71deSHaik Aftandilian #define DBG 191023e71deSHaik Aftandilian 192023e71deSHaik Aftandilian #endif /* DEBUG */ 193023e71deSHaik Aftandilian 194023e71deSHaik Aftandilian /* 195023e71deSHaik Aftandilian * Return true if the HV supports OS suspend and if suspend has not been 196023e71deSHaik Aftandilian * disabled on this platform. 197023e71deSHaik Aftandilian */ 198023e71deSHaik Aftandilian boolean_t 199023e71deSHaik Aftandilian suspend_supported(void) 200023e71deSHaik Aftandilian { 201023e71deSHaik Aftandilian uint64_t major, minor; 202023e71deSHaik Aftandilian 203023e71deSHaik Aftandilian if (suspend_disabled) 204023e71deSHaik Aftandilian return (B_FALSE); 205023e71deSHaik Aftandilian 206023e71deSHaik Aftandilian if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) 207023e71deSHaik Aftandilian return (B_FALSE); 208023e71deSHaik Aftandilian 209023e71deSHaik Aftandilian return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || 210023e71deSHaik Aftandilian (major > SUSPEND_CORE_MAJOR)); 211023e71deSHaik Aftandilian } 212023e71deSHaik Aftandilian 213023e71deSHaik Aftandilian /* 214*00a57bdfSHaik Aftandilian * Given a source tick, stick, and tod value, set the tick and stick offsets 215*00a57bdfSHaik Aftandilian * such that the (current physical register value) + offset == (source value) 216*00a57bdfSHaik Aftandilian * and in addition account for some variation between the %tick/%stick on 217*00a57bdfSHaik Aftandilian * different CPUs. We account for this variation by adding in double the value 218*00a57bdfSHaik Aftandilian * of suspend_tick_stick_max_delta. The following is an explanation of why 219*00a57bdfSHaik Aftandilian * suspend_tick_stick_max_delta must be multplied by two and added to 220*00a57bdfSHaik Aftandilian * native_stick_offset. 221*00a57bdfSHaik Aftandilian * 222*00a57bdfSHaik Aftandilian * Consider a guest instance that is yet to be suspended with CPUs p0 and p1 223*00a57bdfSHaik Aftandilian * with physical "source" %stick values s0 and s1 respectively. When the guest 224*00a57bdfSHaik Aftandilian * is first resumed, the physical "target" %stick values are t0 and t1 225*00a57bdfSHaik Aftandilian * respectively. The virtual %stick values after the resume are v0 and v1 226*00a57bdfSHaik Aftandilian * respectively. Let x be the maximum difference between any two CPU's %stick 227*00a57bdfSHaik Aftandilian * register at a given point in time and let the %stick values be assigned 228*00a57bdfSHaik Aftandilian * such that 229*00a57bdfSHaik Aftandilian * 230*00a57bdfSHaik Aftandilian * s1 = s0 + x and 231*00a57bdfSHaik Aftandilian * t1 = t0 - x 232*00a57bdfSHaik Aftandilian * 233*00a57bdfSHaik Aftandilian * Let us assume that p0 is driving the suspend and resume. Then, we will 234*00a57bdfSHaik Aftandilian * calculate the stick offset f and the virtual %stick on p0 after the 235*00a57bdfSHaik Aftandilian * resume as follows. 236*00a57bdfSHaik Aftandilian * 237*00a57bdfSHaik Aftandilian * f = s0 - t0 and 238*00a57bdfSHaik Aftandilian * v0 = t0 + f 239*00a57bdfSHaik Aftandilian * 240*00a57bdfSHaik Aftandilian * We calculate the virtual %stick v1 on p1 after the resume as 241*00a57bdfSHaik Aftandilian * 242*00a57bdfSHaik Aftandilian * v1 = t1 + f 243*00a57bdfSHaik Aftandilian * 244*00a57bdfSHaik Aftandilian * Substitution yields 245*00a57bdfSHaik Aftandilian * 246*00a57bdfSHaik Aftandilian * v1 = t1 + (s0 - t0) 247*00a57bdfSHaik Aftandilian * v1 = (t0 - x) + (s0 - t0) 248*00a57bdfSHaik Aftandilian * v1 = -x + s0 249*00a57bdfSHaik Aftandilian * v1 = s0 - x 250*00a57bdfSHaik Aftandilian * v1 = (s1 - x) - x 251*00a57bdfSHaik Aftandilian * v1 = s1 - 2x 252*00a57bdfSHaik Aftandilian * 253*00a57bdfSHaik Aftandilian * Therefore, in this scenario, without accounting for %stick variation in 254*00a57bdfSHaik Aftandilian * the calculation of the native_stick_offset f, the virtual %stick on p1 255*00a57bdfSHaik Aftandilian * is less than the value of the %stick on p1 before the suspend which is 256*00a57bdfSHaik Aftandilian * unacceptable. By adding 2x to v1, we guarantee it will be equal to s1 257*00a57bdfSHaik Aftandilian * which means the %stick on p1 after the resume will always be greater 258*00a57bdfSHaik Aftandilian * than or equal to the %stick on p1 before the suspend. Since v1 = t1 + f 259*00a57bdfSHaik Aftandilian * at any point in time, we can accomplish this by adding 2x to f. This 260*00a57bdfSHaik Aftandilian * guarantees any processes bound to CPU P0 or P1 will not see a %stick 261*00a57bdfSHaik Aftandilian * decrease across a suspend/resume. Hence, in the code below, we multiply 262*00a57bdfSHaik Aftandilian * suspend_tick_stick_max_delta by two in the calculation for 263*00a57bdfSHaik Aftandilian * native_stick_offset, native_tick_offset, and target_hrtime. 264023e71deSHaik Aftandilian */ 265023e71deSHaik Aftandilian static void 266*00a57bdfSHaik Aftandilian set_tick_offsets(uint64_t source_tick, uint64_t source_stick, timestruc_t *tsp) 267023e71deSHaik Aftandilian { 268023e71deSHaik Aftandilian uint64_t target_tick; 269023e71deSHaik Aftandilian uint64_t target_stick; 270*00a57bdfSHaik Aftandilian hrtime_t source_hrtime; 271*00a57bdfSHaik Aftandilian hrtime_t target_hrtime; 272023e71deSHaik Aftandilian 273*00a57bdfSHaik Aftandilian /* 274*00a57bdfSHaik Aftandilian * Temporarily set the offsets to zero so that the following reads 275*00a57bdfSHaik Aftandilian * of the registers will yield physical unadjusted counter values. 276*00a57bdfSHaik Aftandilian */ 277023e71deSHaik Aftandilian native_tick_offset = 0; 278023e71deSHaik Aftandilian native_stick_offset = 0; 279023e71deSHaik Aftandilian 280023e71deSHaik Aftandilian target_tick = gettick_counter(); /* returns %tick */ 281023e71deSHaik Aftandilian target_stick = gettick(); /* returns %stick */ 282023e71deSHaik Aftandilian 283*00a57bdfSHaik Aftandilian /* 284*00a57bdfSHaik Aftandilian * Calculate the new offsets. In addition to the delta observed on 285*00a57bdfSHaik Aftandilian * this CPU, add an additional value. Multiply the %tick/%stick 286*00a57bdfSHaik Aftandilian * frequency by suspend_tick_stick_max_delta (us). Then, multiply by 2 287*00a57bdfSHaik Aftandilian * to account for a delta between CPUs before the suspend and a 288*00a57bdfSHaik Aftandilian * delta between CPUs after the resume. 289*00a57bdfSHaik Aftandilian */ 290*00a57bdfSHaik Aftandilian native_tick_offset = (source_tick - target_tick) + 291*00a57bdfSHaik Aftandilian (CPU->cpu_curr_clock * suspend_tick_stick_max_delta * 2 / MICROSEC); 292*00a57bdfSHaik Aftandilian native_stick_offset = (source_stick - target_stick) + 293*00a57bdfSHaik Aftandilian (sys_tick_freq * suspend_tick_stick_max_delta * 2 / MICROSEC); 294*00a57bdfSHaik Aftandilian 295*00a57bdfSHaik Aftandilian /* 296*00a57bdfSHaik Aftandilian * We've effectively increased %stick and %tick by twice the value 297*00a57bdfSHaik Aftandilian * of suspend_tick_stick_max_delta to account for variation across 298*00a57bdfSHaik Aftandilian * CPUs. Now adjust the preserved TOD by the same amount. 299*00a57bdfSHaik Aftandilian */ 300*00a57bdfSHaik Aftandilian source_hrtime = ts2hrt(tsp); 301*00a57bdfSHaik Aftandilian target_hrtime = source_hrtime + 302*00a57bdfSHaik Aftandilian (suspend_tick_stick_max_delta * 2 * (NANOSEC/MICROSEC)); 303*00a57bdfSHaik Aftandilian hrt2ts(target_hrtime, tsp); 304023e71deSHaik Aftandilian } 305023e71deSHaik Aftandilian 306023e71deSHaik Aftandilian /* 307023e71deSHaik Aftandilian * Set the {tick,stick}.NPT field to 1 on this CPU. 308023e71deSHaik Aftandilian */ 309023e71deSHaik Aftandilian static void 310023e71deSHaik Aftandilian enable_tick_stick_npt(void) 311023e71deSHaik Aftandilian { 312c1374a13SSurya Prakki (void) hv_stick_set_npt(1); 313c1374a13SSurya Prakki (void) hv_tick_set_npt(1); 314023e71deSHaik Aftandilian } 315023e71deSHaik Aftandilian 316023e71deSHaik Aftandilian /* 317023e71deSHaik Aftandilian * Synchronize a CPU's {tick,stick}.NPT fields with the current state 318023e71deSHaik Aftandilian * of the system. This is used when a CPU is DR'd into the system. 319023e71deSHaik Aftandilian */ 320023e71deSHaik Aftandilian void 321023e71deSHaik Aftandilian suspend_sync_tick_stick_npt(void) 322023e71deSHaik Aftandilian { 323023e71deSHaik Aftandilian if (tick_stick_emulation_active) { 324023e71deSHaik Aftandilian DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id); 325c1374a13SSurya Prakki (void) hv_stick_set_npt(1); 326c1374a13SSurya Prakki (void) hv_tick_set_npt(1); 327023e71deSHaik Aftandilian } else { 328023e71deSHaik Aftandilian ASSERT(gettick_npt() == 0); 329023e71deSHaik Aftandilian ASSERT(getstick_npt() == 0); 330023e71deSHaik Aftandilian } 331023e71deSHaik Aftandilian } 332023e71deSHaik Aftandilian 333023e71deSHaik Aftandilian /* 334023e71deSHaik Aftandilian * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW 335023e71deSHaik Aftandilian * sharing data structures, and processor groups. 336023e71deSHaik Aftandilian */ 337023e71deSHaik Aftandilian static void 338023e71deSHaik Aftandilian update_cpu_mappings(void) 339023e71deSHaik Aftandilian { 340023e71deSHaik Aftandilian md_t *mdp; 341023e71deSHaik Aftandilian processorid_t id; 342023e71deSHaik Aftandilian cpu_t *cp; 343023e71deSHaik Aftandilian cpu_pg_t *pgps[NCPU]; 344023e71deSHaik Aftandilian 345023e71deSHaik Aftandilian if ((mdp = md_get_handle()) == NULL) { 346023e71deSHaik Aftandilian DBG("suspend: md_get_handle failed"); 347023e71deSHaik Aftandilian return; 348023e71deSHaik Aftandilian } 349023e71deSHaik Aftandilian 350023e71deSHaik Aftandilian DBG("suspend: updating CPU mappings"); 351023e71deSHaik Aftandilian 352023e71deSHaik Aftandilian mutex_enter(&cpu_lock); 353023e71deSHaik Aftandilian 354023e71deSHaik Aftandilian setup_chip_mappings(mdp); 355023e71deSHaik Aftandilian setup_exec_unit_mappings(mdp); 356023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 357023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 358023e71deSHaik Aftandilian continue; 359023e71deSHaik Aftandilian cpu_map_exec_units(cp); 360023e71deSHaik Aftandilian } 361023e71deSHaik Aftandilian 362023e71deSHaik Aftandilian /* 363023e71deSHaik Aftandilian * Re-calculate processor groups. 364023e71deSHaik Aftandilian * 365023e71deSHaik Aftandilian * First tear down all PG information before adding any new PG 366023e71deSHaik Aftandilian * information derived from the MD we just downloaded. We must 367023e71deSHaik Aftandilian * call pg_cpu_inactive and pg_cpu_active with CPUs paused and 368023e71deSHaik Aftandilian * we want to minimize the number of times pause_cpus is called. 369023e71deSHaik Aftandilian * Inactivating all CPUs would leave PGs without any active CPUs, 370023e71deSHaik Aftandilian * so while CPUs are paused, call pg_cpu_inactive and swap in the 371023e71deSHaik Aftandilian * bootstrap PG structure saving the original PG structure to be 372023e71deSHaik Aftandilian * fini'd afterwards. This prevents the dispatcher from encountering 373023e71deSHaik Aftandilian * PGs in which all CPUs are inactive. 374023e71deSHaik Aftandilian */ 375023e71deSHaik Aftandilian pause_cpus(NULL); 376023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 377023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 378023e71deSHaik Aftandilian continue; 379023e71deSHaik Aftandilian pg_cpu_inactive(cp); 380023e71deSHaik Aftandilian pgps[id] = cp->cpu_pg; 381023e71deSHaik Aftandilian pg_cpu_bootstrap(cp); 382023e71deSHaik Aftandilian } 383023e71deSHaik Aftandilian start_cpus(); 384023e71deSHaik Aftandilian 385023e71deSHaik Aftandilian /* 386023e71deSHaik Aftandilian * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are 387023e71deSHaik Aftandilian * not paused. Use two separate loops here so that we do not 388023e71deSHaik Aftandilian * initialize PG data for CPUs until all the old PG data structures 389023e71deSHaik Aftandilian * are torn down. 390023e71deSHaik Aftandilian */ 391023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 392023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 393023e71deSHaik Aftandilian continue; 394023e71deSHaik Aftandilian pg_cpu_fini(cp, pgps[id]); 395183ef8a1SHaik Aftandilian mpo_cpu_remove(id); 396023e71deSHaik Aftandilian } 397023e71deSHaik Aftandilian 398023e71deSHaik Aftandilian /* 399023e71deSHaik Aftandilian * Initialize PG data for each CPU, but leave the bootstrapped 400023e71deSHaik Aftandilian * PG structure in place to avoid running with any PGs containing 401023e71deSHaik Aftandilian * nothing but inactive CPUs. 402023e71deSHaik Aftandilian */ 403023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 404023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 405023e71deSHaik Aftandilian continue; 406183ef8a1SHaik Aftandilian mpo_cpu_add(mdp, id); 407023e71deSHaik Aftandilian pgps[id] = pg_cpu_init(cp, B_TRUE); 408023e71deSHaik Aftandilian } 409023e71deSHaik Aftandilian 410023e71deSHaik Aftandilian /* 411023e71deSHaik Aftandilian * Now that PG data has been initialized for all CPUs in the 412023e71deSHaik Aftandilian * system, replace the bootstrapped PG structure with the 413023e71deSHaik Aftandilian * initialized PG structure and call pg_cpu_active for each CPU. 414023e71deSHaik Aftandilian */ 415023e71deSHaik Aftandilian pause_cpus(NULL); 416023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 417023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 418023e71deSHaik Aftandilian continue; 419023e71deSHaik Aftandilian cp->cpu_pg = pgps[id]; 420023e71deSHaik Aftandilian pg_cpu_active(cp); 421023e71deSHaik Aftandilian } 422023e71deSHaik Aftandilian start_cpus(); 423023e71deSHaik Aftandilian 424023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 425023e71deSHaik Aftandilian 426023e71deSHaik Aftandilian (void) md_fini_handle(mdp); 427023e71deSHaik Aftandilian } 428023e71deSHaik Aftandilian 429023e71deSHaik Aftandilian /* 430023e71deSHaik Aftandilian * Wrapper for the Sun Cluster error decoding function. 431023e71deSHaik Aftandilian */ 432023e71deSHaik Aftandilian static int 433023e71deSHaik Aftandilian cluster_error_decode(int error, char *error_reason, size_t max_reason_len) 434023e71deSHaik Aftandilian { 435023e71deSHaik Aftandilian const char *decoded; 436023e71deSHaik Aftandilian size_t decoded_len; 437023e71deSHaik Aftandilian 438023e71deSHaik Aftandilian ASSERT(error_reason != NULL); 439023e71deSHaik Aftandilian ASSERT(max_reason_len > 0); 440023e71deSHaik Aftandilian 441023e71deSHaik Aftandilian max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX); 442023e71deSHaik Aftandilian 443023e71deSHaik Aftandilian if (cl_suspend_error_decode == NULL) 444023e71deSHaik Aftandilian return (-1); 445023e71deSHaik Aftandilian 446023e71deSHaik Aftandilian if ((decoded = (*cl_suspend_error_decode)(error)) == NULL) 447023e71deSHaik Aftandilian return (-1); 448023e71deSHaik Aftandilian 449023e71deSHaik Aftandilian /* Get number of non-NULL bytes */ 450023e71deSHaik Aftandilian if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0) 451023e71deSHaik Aftandilian return (-1); 452023e71deSHaik Aftandilian 453023e71deSHaik Aftandilian bcopy(decoded, error_reason, decoded_len); 454023e71deSHaik Aftandilian 455023e71deSHaik Aftandilian /* 456023e71deSHaik Aftandilian * The error string returned from cl_suspend_error_decode 457023e71deSHaik Aftandilian * should be NULL-terminated, but set the terminator here 458023e71deSHaik Aftandilian * because we only copied non-NULL bytes. If the decoded 459023e71deSHaik Aftandilian * string was not NULL-terminated, this guarantees that 460023e71deSHaik Aftandilian * error_reason will be. 461023e71deSHaik Aftandilian */ 462023e71deSHaik Aftandilian error_reason[decoded_len] = '\0'; 463023e71deSHaik Aftandilian 464023e71deSHaik Aftandilian return (0); 465023e71deSHaik Aftandilian } 466023e71deSHaik Aftandilian 467023e71deSHaik Aftandilian /* 468023e71deSHaik Aftandilian * Wrapper for the Sun Cluster pre-suspend callback. 469023e71deSHaik Aftandilian */ 470023e71deSHaik Aftandilian static int 471023e71deSHaik Aftandilian cluster_pre_wrapper(char *error_reason, size_t max_reason_len) 472023e71deSHaik Aftandilian { 473023e71deSHaik Aftandilian int rv = 0; 474023e71deSHaik Aftandilian 475023e71deSHaik Aftandilian if (cl_suspend_pre_callback != NULL) { 476023e71deSHaik Aftandilian rv = (*cl_suspend_pre_callback)(); 477023e71deSHaik Aftandilian DBG("suspend: cl_suspend_pre_callback returned %d", rv); 478023e71deSHaik Aftandilian if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 479023e71deSHaik Aftandilian if (cluster_error_decode(rv, error_reason, 480023e71deSHaik Aftandilian max_reason_len)) { 481023e71deSHaik Aftandilian (void) snprintf(error_reason, max_reason_len, 482023e71deSHaik Aftandilian SC_PRE_FAIL_STR_FMT, rv); 483023e71deSHaik Aftandilian } 484023e71deSHaik Aftandilian } 485023e71deSHaik Aftandilian } 486023e71deSHaik Aftandilian 487023e71deSHaik Aftandilian return (rv); 488023e71deSHaik Aftandilian } 489023e71deSHaik Aftandilian 490023e71deSHaik Aftandilian /* 491023e71deSHaik Aftandilian * Wrapper for the Sun Cluster post-suspend callback. 492023e71deSHaik Aftandilian */ 493023e71deSHaik Aftandilian static int 494023e71deSHaik Aftandilian cluster_post_wrapper(char *error_reason, size_t max_reason_len) 495023e71deSHaik Aftandilian { 496023e71deSHaik Aftandilian int rv = 0; 497023e71deSHaik Aftandilian 498023e71deSHaik Aftandilian if (cl_suspend_post_callback != NULL) { 499023e71deSHaik Aftandilian rv = (*cl_suspend_post_callback)(); 500023e71deSHaik Aftandilian DBG("suspend: cl_suspend_post_callback returned %d", rv); 501023e71deSHaik Aftandilian if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 502023e71deSHaik Aftandilian if (cluster_error_decode(rv, error_reason, 503023e71deSHaik Aftandilian max_reason_len)) { 504023e71deSHaik Aftandilian (void) snprintf(error_reason, 505023e71deSHaik Aftandilian max_reason_len, SC_POST_FAIL_STR_FMT, rv); 506023e71deSHaik Aftandilian } 507023e71deSHaik Aftandilian } 508023e71deSHaik Aftandilian } 509023e71deSHaik Aftandilian 510023e71deSHaik Aftandilian return (rv); 511023e71deSHaik Aftandilian } 512023e71deSHaik Aftandilian 513023e71deSHaik Aftandilian /* 514023e71deSHaik Aftandilian * Execute pre-suspend callbacks preparing the system for a suspend operation. 515023e71deSHaik Aftandilian * Returns zero on success, non-zero on failure. Sets the recovered argument 516023e71deSHaik Aftandilian * to indicate whether or not callbacks could be undone in the event of a 517023e71deSHaik Aftandilian * failure--if callbacks were successfully undone, *recovered is set to B_TRUE, 518023e71deSHaik Aftandilian * otherwise *recovered is set to B_FALSE. Must be called successfully before 519023e71deSHaik Aftandilian * suspend_start can be called. Callers should first call suspend_support to 520023e71deSHaik Aftandilian * determine if OS suspend is supported. 521023e71deSHaik Aftandilian */ 522023e71deSHaik Aftandilian int 523023e71deSHaik Aftandilian suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered) 524023e71deSHaik Aftandilian { 525023e71deSHaik Aftandilian int rv; 526023e71deSHaik Aftandilian 527023e71deSHaik Aftandilian ASSERT(recovered != NULL); 528023e71deSHaik Aftandilian 529023e71deSHaik Aftandilian /* 530023e71deSHaik Aftandilian * Return an error if suspend_pre is erreoneously called 531023e71deSHaik Aftandilian * when OS suspend is not supported. 532023e71deSHaik Aftandilian */ 533023e71deSHaik Aftandilian ASSERT(suspend_supported()); 534023e71deSHaik Aftandilian if (!suspend_supported()) { 535023e71deSHaik Aftandilian DBG("suspend: suspend_pre called without suspend support"); 536023e71deSHaik Aftandilian *recovered = B_TRUE; 537023e71deSHaik Aftandilian return (ENOTSUP); 538023e71deSHaik Aftandilian } 539023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 540023e71deSHaik Aftandilian 541023e71deSHaik Aftandilian rv = cluster_pre_wrapper(error_reason, max_reason_len); 542023e71deSHaik Aftandilian 543023e71deSHaik Aftandilian /* 544023e71deSHaik Aftandilian * At present, only one pre-suspend operation exists. 545023e71deSHaik Aftandilian * If it fails, no recovery needs to be done. 546023e71deSHaik Aftandilian */ 547023e71deSHaik Aftandilian if (rv != 0 && recovered != NULL) 548023e71deSHaik Aftandilian *recovered = B_TRUE; 549023e71deSHaik Aftandilian 550023e71deSHaik Aftandilian return (rv); 551023e71deSHaik Aftandilian } 552023e71deSHaik Aftandilian 553023e71deSHaik Aftandilian /* 554023e71deSHaik Aftandilian * Execute post-suspend callbacks. Returns zero on success, non-zero on 555023e71deSHaik Aftandilian * failure. Must be called after suspend_start is called, regardless of 556023e71deSHaik Aftandilian * whether or not suspend_start is successful. 557023e71deSHaik Aftandilian */ 558023e71deSHaik Aftandilian int 559023e71deSHaik Aftandilian suspend_post(char *error_reason, size_t max_reason_len) 560023e71deSHaik Aftandilian { 561023e71deSHaik Aftandilian ASSERT(suspend_supported()); 562023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 563023e71deSHaik Aftandilian return (cluster_post_wrapper(error_reason, max_reason_len)); 564023e71deSHaik Aftandilian } 565023e71deSHaik Aftandilian 566023e71deSHaik Aftandilian /* 567023e71deSHaik Aftandilian * Suspends the OS by pausing CPUs and calling into the HV to initiate 568023e71deSHaik Aftandilian * the suspend. When the HV routine hv_guest_suspend returns, the system 569023e71deSHaik Aftandilian * will be resumed. Must be called after a successful call to suspend_pre. 570023e71deSHaik Aftandilian * suspend_post must be called after suspend_start, whether or not 571023e71deSHaik Aftandilian * suspend_start returns an error. 572023e71deSHaik Aftandilian */ 573023e71deSHaik Aftandilian /*ARGSUSED*/ 574023e71deSHaik Aftandilian int 575023e71deSHaik Aftandilian suspend_start(char *error_reason, size_t max_reason_len) 576023e71deSHaik Aftandilian { 577023e71deSHaik Aftandilian uint64_t source_tick; 578023e71deSHaik Aftandilian uint64_t source_stick; 579023e71deSHaik Aftandilian uint64_t rv; 580023e71deSHaik Aftandilian timestruc_t source_tod; 581023e71deSHaik Aftandilian int spl; 582023e71deSHaik Aftandilian 583023e71deSHaik Aftandilian ASSERT(suspend_supported()); 584023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 585023e71deSHaik Aftandilian 586d2365b01SPavel Tatashin sfmmu_ctxdoms_lock(); 587d2365b01SPavel Tatashin 588023e71deSHaik Aftandilian mutex_enter(&cpu_lock); 589023e71deSHaik Aftandilian 590023e71deSHaik Aftandilian /* Suspend the watchdog */ 591023e71deSHaik Aftandilian watchdog_suspend(); 592023e71deSHaik Aftandilian 593023e71deSHaik Aftandilian /* Record the TOD */ 594023e71deSHaik Aftandilian mutex_enter(&tod_lock); 595023e71deSHaik Aftandilian source_tod = tod_get(); 596023e71deSHaik Aftandilian mutex_exit(&tod_lock); 597023e71deSHaik Aftandilian 598023e71deSHaik Aftandilian /* Pause all other CPUs */ 599023e71deSHaik Aftandilian pause_cpus(NULL); 600023e71deSHaik Aftandilian DBG_PROM("suspend: CPUs paused\n"); 601023e71deSHaik Aftandilian 602*00a57bdfSHaik Aftandilian /* Suspend cyclics */ 603023e71deSHaik Aftandilian cyclic_suspend(); 604023e71deSHaik Aftandilian DBG_PROM("suspend: cyclics suspended\n"); 605*00a57bdfSHaik Aftandilian 606*00a57bdfSHaik Aftandilian /* Disable interrupts */ 607023e71deSHaik Aftandilian spl = spl8(); 608*00a57bdfSHaik Aftandilian DBG_PROM("suspend: spl8()\n"); 609023e71deSHaik Aftandilian 610023e71deSHaik Aftandilian source_tick = gettick_counter(); 611023e71deSHaik Aftandilian source_stick = gettick(); 612023e71deSHaik Aftandilian DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick); 613023e71deSHaik Aftandilian DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick); 614023e71deSHaik Aftandilian 615023e71deSHaik Aftandilian /* 616*00a57bdfSHaik Aftandilian * Call into the HV to initiate the suspend. hv_guest_suspend() 617*00a57bdfSHaik Aftandilian * returns after the guest has been resumed or if the suspend 618*00a57bdfSHaik Aftandilian * operation failed or was cancelled. After a successful suspend, 619*00a57bdfSHaik Aftandilian * the %tick and %stick registers may have changed by an amount 620*00a57bdfSHaik Aftandilian * that is not proportional to the amount of time that has passed. 621*00a57bdfSHaik Aftandilian * They may have jumped forwards or backwards. Some variation is 622*00a57bdfSHaik Aftandilian * allowed and accounted for using suspend_tick_stick_max_delta, 623*00a57bdfSHaik Aftandilian * but otherwise this jump must be uniform across all CPUs and we 624*00a57bdfSHaik Aftandilian * operate under the assumption that it is (maintaining two global 625*00a57bdfSHaik Aftandilian * offset variables--one for %tick and one for %stick.) 626023e71deSHaik Aftandilian */ 627023e71deSHaik Aftandilian DBG_PROM("suspend: suspending... \n"); 628023e71deSHaik Aftandilian rv = hv_guest_suspend(); 629023e71deSHaik Aftandilian if (rv != 0) { 630023e71deSHaik Aftandilian splx(spl); 631023e71deSHaik Aftandilian cyclic_resume(); 632023e71deSHaik Aftandilian start_cpus(); 633023e71deSHaik Aftandilian watchdog_resume(); 634023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 635d2365b01SPavel Tatashin sfmmu_ctxdoms_unlock(); 636023e71deSHaik Aftandilian DBG("suspend: failed, rv: %ld\n", rv); 637023e71deSHaik Aftandilian return (rv); 638023e71deSHaik Aftandilian } 639023e71deSHaik Aftandilian 640*00a57bdfSHaik Aftandilian /* Update the global tick and stick offsets and the preserved TOD */ 641*00a57bdfSHaik Aftandilian set_tick_offsets(source_tick, source_stick, &source_tod); 642023e71deSHaik Aftandilian 643023e71deSHaik Aftandilian /* Ensure new offsets are globally visible before resuming CPUs */ 644023e71deSHaik Aftandilian membar_sync(); 645023e71deSHaik Aftandilian 646023e71deSHaik Aftandilian /* Enable interrupts */ 647023e71deSHaik Aftandilian splx(spl); 648023e71deSHaik Aftandilian 649023e71deSHaik Aftandilian /* Set the {%tick,%stick}.NPT bits on all CPUs */ 650023e71deSHaik Aftandilian if (enable_user_tick_stick_emulation) { 651023e71deSHaik Aftandilian xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL); 652023e71deSHaik Aftandilian xt_sync(cpu_ready_set); 653023e71deSHaik Aftandilian ASSERT(gettick_npt() != 0); 654023e71deSHaik Aftandilian ASSERT(getstick_npt() != 0); 655023e71deSHaik Aftandilian } 656023e71deSHaik Aftandilian 657023e71deSHaik Aftandilian /* If emulation is enabled, but not currently active, enable it */ 658023e71deSHaik Aftandilian if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) { 659023e71deSHaik Aftandilian tick_stick_emulation_active = B_TRUE; 660023e71deSHaik Aftandilian } 661023e71deSHaik Aftandilian 662d2365b01SPavel Tatashin sfmmu_ctxdoms_remove(); 663d2365b01SPavel Tatashin 664023e71deSHaik Aftandilian /* Resume cyclics, unpause CPUs */ 665023e71deSHaik Aftandilian cyclic_resume(); 666023e71deSHaik Aftandilian start_cpus(); 667023e71deSHaik Aftandilian 668023e71deSHaik Aftandilian /* Set the TOD */ 669023e71deSHaik Aftandilian mutex_enter(&tod_lock); 670023e71deSHaik Aftandilian tod_set(source_tod); 671023e71deSHaik Aftandilian mutex_exit(&tod_lock); 672023e71deSHaik Aftandilian 673023e71deSHaik Aftandilian /* Re-enable the watchdog */ 674023e71deSHaik Aftandilian watchdog_resume(); 675023e71deSHaik Aftandilian 676023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 677023e71deSHaik Aftandilian 678d2365b01SPavel Tatashin /* Download the latest MD */ 679d2365b01SPavel Tatashin if ((rv = mach_descrip_update()) != 0) 680d2365b01SPavel Tatashin cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld", 681d2365b01SPavel Tatashin rv); 682d2365b01SPavel Tatashin 683d2365b01SPavel Tatashin sfmmu_ctxdoms_update(); 684d2365b01SPavel Tatashin sfmmu_ctxdoms_unlock(); 685d2365b01SPavel Tatashin 686023e71deSHaik Aftandilian /* Get new MD, update CPU mappings/relationships */ 687023e71deSHaik Aftandilian if (suspend_update_cpu_mappings) 688023e71deSHaik Aftandilian update_cpu_mappings(); 689023e71deSHaik Aftandilian 690023e71deSHaik Aftandilian DBG("suspend: target tick: 0x%lx", gettick_counter()); 691023e71deSHaik Aftandilian DBG("suspend: target stick: 0x%llx", gettick()); 692023e71deSHaik Aftandilian DBG("suspend: user %%tick/%%stick emulation is %d", 693023e71deSHaik Aftandilian tick_stick_emulation_active); 694023e71deSHaik Aftandilian DBG("suspend: finished"); 695023e71deSHaik Aftandilian 696023e71deSHaik Aftandilian return (0); 697023e71deSHaik Aftandilian } 698