1023e71deSHaik Aftandilian /* 2023e71deSHaik Aftandilian * CDDL HEADER START 3023e71deSHaik Aftandilian * 4023e71deSHaik Aftandilian * The contents of this file are subject to the terms of the 5023e71deSHaik Aftandilian * Common Development and Distribution License (the "License"). 6023e71deSHaik Aftandilian * You may not use this file except in compliance with the License. 7023e71deSHaik Aftandilian * 8023e71deSHaik Aftandilian * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9023e71deSHaik Aftandilian * or http://www.opensolaris.org/os/licensing. 10023e71deSHaik Aftandilian * See the License for the specific language governing permissions 11023e71deSHaik Aftandilian * and limitations under the License. 12023e71deSHaik Aftandilian * 13023e71deSHaik Aftandilian * When distributing Covered Code, include this CDDL HEADER in each 14023e71deSHaik Aftandilian * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15023e71deSHaik Aftandilian * If applicable, add the following below this CDDL HEADER, with the 16023e71deSHaik Aftandilian * fields enclosed by brackets "[]" replaced with your own identifying 17023e71deSHaik Aftandilian * information: Portions Copyright [yyyy] [name of copyright owner] 18023e71deSHaik Aftandilian * 19023e71deSHaik Aftandilian * CDDL HEADER END 20023e71deSHaik Aftandilian */ 21023e71deSHaik Aftandilian /* 22*02b4e56cSHaik Aftandilian * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
23023e71deSHaik Aftandilian */ 24023e71deSHaik Aftandilian 25023e71deSHaik Aftandilian #include <sys/mutex.h> 26023e71deSHaik Aftandilian #include <sys/cpuvar.h> 27023e71deSHaik Aftandilian #include <sys/cyclic.h> 28023e71deSHaik Aftandilian #include <sys/disp.h> 29023e71deSHaik Aftandilian #include <sys/ddi.h> 30023e71deSHaik Aftandilian #include <sys/wdt.h> 31023e71deSHaik Aftandilian #include <sys/callb.h> 32023e71deSHaik Aftandilian #include <sys/cmn_err.h> 33023e71deSHaik Aftandilian #include <sys/hypervisor_api.h> 34023e71deSHaik Aftandilian #include <sys/membar.h> 35023e71deSHaik Aftandilian #include <sys/x_call.h> 36023e71deSHaik Aftandilian #include <sys/promif.h> 37023e71deSHaik Aftandilian #include <sys/systm.h> 38023e71deSHaik Aftandilian #include <sys/mach_descrip.h> 39023e71deSHaik Aftandilian #include <sys/cpu_module.h> 40023e71deSHaik Aftandilian #include <sys/pg.h> 41023e71deSHaik Aftandilian #include <sys/lgrp.h> 42023e71deSHaik Aftandilian #include <sys/sysmacros.h> 43023e71deSHaik Aftandilian #include <sys/sunddi.h> 44023e71deSHaik Aftandilian #include <sys/cpupart.h> 45023e71deSHaik Aftandilian #include <sys/hsvc.h> 46183ef8a1SHaik Aftandilian #include <sys/mpo.h> 47d2365b01SPavel Tatashin #include <vm/hat_sfmmu.h> 4800a57bdfSHaik Aftandilian #include <sys/time.h> 4900a57bdfSHaik Aftandilian #include <sys/clock.h> 50023e71deSHaik Aftandilian 51023e71deSHaik Aftandilian /* 52023e71deSHaik Aftandilian * Sun4v OS Suspend 53023e71deSHaik Aftandilian * 54023e71deSHaik Aftandilian * Provides a means to suspend a sun4v guest domain by pausing CPUs and then 55023e71deSHaik Aftandilian * calling into the HV to initiate a suspension. Suspension is sequenced 56023e71deSHaik Aftandilian * externally by calling suspend_pre, suspend_start, and suspend_post. 57023e71deSHaik Aftandilian * suspend_pre and suspend_post are meant to perform any special operations 58023e71deSHaik Aftandilian * that should be done before or after a suspend/resume operation. 
e.g., 59023e71deSHaik Aftandilian * callbacks to cluster software to disable heartbeat monitoring before the 60023e71deSHaik Aftandilian * system is suspended. suspend_start prepares kernel services to be suspended 61023e71deSHaik Aftandilian * and then suspends the domain by calling hv_guest_suspend. 62023e71deSHaik Aftandilian * 63023e71deSHaik Aftandilian * Special Handling for %tick and %stick Registers 64023e71deSHaik Aftandilian * 65023e71deSHaik Aftandilian * After a suspend/resume operation, the %tick and %stick registers may have 66023e71deSHaik Aftandilian * jumped forwards or backwards. The delta is assumed to be consistent across 67023e71deSHaik Aftandilian * all CPUs, within the negligible level of %tick and %stick variation 68023e71deSHaik Aftandilian * acceptable on a cold boot. In order to maintain increasing %tick and %stick 69023e71deSHaik Aftandilian * counter values without exposing large positive or negative jumps to kernel 70023e71deSHaik Aftandilian * or user code, a %tick and %stick offset is used. Kernel reads of these 71023e71deSHaik Aftandilian * counters return the sum of the hardware register counter and offset 72023e71deSHaik Aftandilian * variable. After a suspend/resume operation, user reads of %tick or %stick 73023e71deSHaik Aftandilian * are emulated. Suspend code enables emulation by setting the 74023e71deSHaik Aftandilian * %{tick,stick}.NPT fields which trigger a privileged instruction access 75023e71deSHaik Aftandilian * trap whenever the registers are read from user mode. If emulation has been 76023e71deSHaik Aftandilian * enabled, the trap handler emulates the instruction. Emulation is only 77023e71deSHaik Aftandilian * enabled during a successful suspend/resume operation. When emulation is 78023e71deSHaik Aftandilian * enabled, CPUs that are DR'd into the system will have their 79023e71deSHaik Aftandilian * %{tick,stick}.NPT bits set to 1 as well. 
80023e71deSHaik Aftandilian */ 81023e71deSHaik Aftandilian 82023e71deSHaik Aftandilian extern u_longlong_t gettick(void); /* returns %stick */ 83023e71deSHaik Aftandilian extern uint64_t gettick_counter(void); /* returns %tick */ 84023e71deSHaik Aftandilian extern uint64_t gettick_npt(void); 85023e71deSHaik Aftandilian extern uint64_t getstick_npt(void); 86023e71deSHaik Aftandilian extern int mach_descrip_update(void); 87023e71deSHaik Aftandilian extern cpuset_t cpu_ready_set; 88023e71deSHaik Aftandilian extern uint64_t native_tick_offset; 89023e71deSHaik Aftandilian extern uint64_t native_stick_offset; 9000a57bdfSHaik Aftandilian extern uint64_t sys_tick_freq; 91023e71deSHaik Aftandilian 92023e71deSHaik Aftandilian /* 93023e71deSHaik Aftandilian * Global Sun Cluster pre/post callbacks. 94023e71deSHaik Aftandilian */ 95023e71deSHaik Aftandilian const char *(*cl_suspend_error_decode)(int); 96023e71deSHaik Aftandilian int (*cl_suspend_pre_callback)(void); 97023e71deSHaik Aftandilian int (*cl_suspend_post_callback)(void); 98023e71deSHaik Aftandilian #define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d" 99023e71deSHaik Aftandilian #define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d" 100023e71deSHaik Aftandilian #define SC_FAIL_STR_MAX 256 101023e71deSHaik Aftandilian 102023e71deSHaik Aftandilian /* 103023e71deSHaik Aftandilian * The minimum major and minor version of the HSVC_GROUP_CORE API group 104023e71deSHaik Aftandilian * required in order to use OS suspend. 105023e71deSHaik Aftandilian */ 106023e71deSHaik Aftandilian #define SUSPEND_CORE_MAJOR 1 107023e71deSHaik Aftandilian #define SUSPEND_CORE_MINOR 2 108023e71deSHaik Aftandilian 109023e71deSHaik Aftandilian /* 110023e71deSHaik Aftandilian * By default, sun4v OS suspend is supported if the required HV version 111023e71deSHaik Aftandilian * is present. 
suspend_disabled should be set on platforms that do not 112023e71deSHaik Aftandilian * allow OS suspend regardless of whether or not the HV supports it. 113023e71deSHaik Aftandilian * It can also be set in /etc/system. 114023e71deSHaik Aftandilian */ 115023e71deSHaik Aftandilian static int suspend_disabled = 0; 116023e71deSHaik Aftandilian 117023e71deSHaik Aftandilian /* 118023e71deSHaik Aftandilian * Controls whether or not user-land tick and stick register emulation 119023e71deSHaik Aftandilian * will be enabled following a successful suspend operation. 120023e71deSHaik Aftandilian */ 121023e71deSHaik Aftandilian static int enable_user_tick_stick_emulation = 1; 122023e71deSHaik Aftandilian 123023e71deSHaik Aftandilian /* 124023e71deSHaik Aftandilian * Indicates whether or not tick and stick emulation is currently active. 125023e71deSHaik Aftandilian * After a successful suspend operation, if emulation is enabled, this 126023e71deSHaik Aftandilian * variable is set to B_TRUE. Global scope to allow emulation code to 127023e71deSHaik Aftandilian * check if emulation is active. 128023e71deSHaik Aftandilian */ 129023e71deSHaik Aftandilian boolean_t tick_stick_emulation_active = B_FALSE; 130023e71deSHaik Aftandilian 131023e71deSHaik Aftandilian /* 132d2365b01SPavel Tatashin * When non-zero, after a successful suspend and resume, cpunodes, CPU HW 133d2365b01SPavel Tatashin * sharing data structures, and processor groups will be updated using 134d2365b01SPavel Tatashin * information from the updated MD. 135023e71deSHaik Aftandilian */ 136023e71deSHaik Aftandilian static int suspend_update_cpu_mappings = 1; 137023e71deSHaik Aftandilian 138023e71deSHaik Aftandilian /* 13900a57bdfSHaik Aftandilian * The maximum number of microseconds by which the %tick or %stick register 14000a57bdfSHaik Aftandilian * can vary between any two CPUs in the system. 
To calculate the 14100a57bdfSHaik Aftandilian * native_stick_offset and native_tick_offset, we measure the change in these 14200a57bdfSHaik Aftandilian * registers on one CPU over a suspend/resume. Other CPUs may experience 14300a57bdfSHaik Aftandilian * slightly larger or smaller changes. %tick and %stick should be synchronized 14400a57bdfSHaik Aftandilian * between CPUs, but there may be some variation. So we add an additional value 14500a57bdfSHaik Aftandilian * derived from this variable to ensure that these registers always increase 14600a57bdfSHaik Aftandilian * over a suspend/resume operation, assuming all %tick and %stick registers 14700a57bdfSHaik Aftandilian * are synchronized (within a certain limit) across CPUs in the system. The 14800a57bdfSHaik Aftandilian * delta between %sticks on different CPUs should be a small number of cycles, 14900a57bdfSHaik Aftandilian * not perceptible to readers of %stick that migrate between CPUs. We set this 15000a57bdfSHaik Aftandilian * value to 1 millisecond which means that over a suspend/resume operation, 15100a57bdfSHaik Aftandilian * all CPU's %tick and %stick will advance forwards as long as, across all 15200a57bdfSHaik Aftandilian * CPUs, the %tick and %stick are synchronized to within 1 ms. This applies to 15300a57bdfSHaik Aftandilian * CPUs before the suspend and CPUs after the resume. 1 ms is conservative, 15400a57bdfSHaik Aftandilian * but small enough to not trigger TOD faults. 15500a57bdfSHaik Aftandilian */ 15600a57bdfSHaik Aftandilian static uint64_t suspend_tick_stick_max_delta = 1000; /* microseconds */ 15700a57bdfSHaik Aftandilian 15800a57bdfSHaik Aftandilian /* 159*02b4e56cSHaik Aftandilian * The number of times the system has been suspended and resumed. 160*02b4e56cSHaik Aftandilian */ 161*02b4e56cSHaik Aftandilian static uint64_t suspend_count = 0; 162*02b4e56cSHaik Aftandilian 163*02b4e56cSHaik Aftandilian /* 164023e71deSHaik Aftandilian * DBG and DBG_PROM() macro. 
165023e71deSHaik Aftandilian */ 166023e71deSHaik Aftandilian #ifdef DEBUG 167023e71deSHaik Aftandilian 168023e71deSHaik Aftandilian static int suspend_debug_flag = 0; 169023e71deSHaik Aftandilian 170023e71deSHaik Aftandilian #define DBG_PROM \ 171023e71deSHaik Aftandilian if (suspend_debug_flag) \ 172023e71deSHaik Aftandilian prom_printf 173023e71deSHaik Aftandilian 174023e71deSHaik Aftandilian #define DBG \ 175023e71deSHaik Aftandilian if (suspend_debug_flag) \ 176023e71deSHaik Aftandilian suspend_debug 177023e71deSHaik Aftandilian 178023e71deSHaik Aftandilian static void 179023e71deSHaik Aftandilian suspend_debug(const char *fmt, ...) 180023e71deSHaik Aftandilian { 181023e71deSHaik Aftandilian char buf[512]; 182023e71deSHaik Aftandilian va_list ap; 183023e71deSHaik Aftandilian 184023e71deSHaik Aftandilian va_start(ap, fmt); 185023e71deSHaik Aftandilian (void) vsprintf(buf, fmt, ap); 186023e71deSHaik Aftandilian va_end(ap); 187023e71deSHaik Aftandilian 188023e71deSHaik Aftandilian cmn_err(CE_NOTE, "%s", buf); 189023e71deSHaik Aftandilian } 190023e71deSHaik Aftandilian 191023e71deSHaik Aftandilian #else /* DEBUG */ 192023e71deSHaik Aftandilian 193023e71deSHaik Aftandilian #define DBG_PROM 194023e71deSHaik Aftandilian #define DBG 195023e71deSHaik Aftandilian 196023e71deSHaik Aftandilian #endif /* DEBUG */ 197023e71deSHaik Aftandilian 198023e71deSHaik Aftandilian /* 199023e71deSHaik Aftandilian * Return true if the HV supports OS suspend and if suspend has not been 200023e71deSHaik Aftandilian * disabled on this platform. 
201023e71deSHaik Aftandilian */ 202023e71deSHaik Aftandilian boolean_t 203023e71deSHaik Aftandilian suspend_supported(void) 204023e71deSHaik Aftandilian { 205023e71deSHaik Aftandilian uint64_t major, minor; 206023e71deSHaik Aftandilian 207023e71deSHaik Aftandilian if (suspend_disabled) 208023e71deSHaik Aftandilian return (B_FALSE); 209023e71deSHaik Aftandilian 210023e71deSHaik Aftandilian if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) 211023e71deSHaik Aftandilian return (B_FALSE); 212023e71deSHaik Aftandilian 213023e71deSHaik Aftandilian return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || 214023e71deSHaik Aftandilian (major > SUSPEND_CORE_MAJOR)); 215023e71deSHaik Aftandilian } 216023e71deSHaik Aftandilian 217023e71deSHaik Aftandilian /* 218*02b4e56cSHaik Aftandilian * Memory DR is not permitted if the system has been suspended and resumed. 219*02b4e56cSHaik Aftandilian * It is the responsibility of the caller of suspend_start and the DR 220*02b4e56cSHaik Aftandilian * subsystem to serialize DR operations and suspend_memdr_allowed() checks. 221*02b4e56cSHaik Aftandilian */ 222*02b4e56cSHaik Aftandilian boolean_t 223*02b4e56cSHaik Aftandilian suspend_memdr_allowed(void) 224*02b4e56cSHaik Aftandilian { 225*02b4e56cSHaik Aftandilian return (suspend_count == 0); 226*02b4e56cSHaik Aftandilian } 227*02b4e56cSHaik Aftandilian 228*02b4e56cSHaik Aftandilian /* 22900a57bdfSHaik Aftandilian * Given a source tick, stick, and tod value, set the tick and stick offsets 23000a57bdfSHaik Aftandilian * such that the (current physical register value) + offset == (source value) 23100a57bdfSHaik Aftandilian * and in addition account for some variation between the %tick/%stick on 23200a57bdfSHaik Aftandilian * different CPUs. We account for this variation by adding in double the value 23300a57bdfSHaik Aftandilian * of suspend_tick_stick_max_delta. 
 * The following is an explanation of why
 * suspend_tick_stick_max_delta must be multiplied by two and added to
 * native_stick_offset.
 *
 * Consider a guest instance that is yet to be suspended with CPUs p0 and p1
 * with physical "source" %stick values s0 and s1 respectively. When the guest
 * is first resumed, the physical "target" %stick values are t0 and t1
 * respectively. The virtual %stick values after the resume are v0 and v1
 * respectively. Let x be the maximum difference between any two CPUs' %stick
 * registers at a given point in time and let the %stick values be assigned
 * such that
 *
 *	s1 = s0 + x and
 *	t1 = t0 - x
 *
 * Let us assume that p0 is driving the suspend and resume. Then, we will
 * calculate the stick offset f and the virtual %stick on p0 after the
 * resume as follows.
25100a57bdfSHaik Aftandilian * 25200a57bdfSHaik Aftandilian * f = s0 - t0 and 25300a57bdfSHaik Aftandilian * v0 = t0 + f 25400a57bdfSHaik Aftandilian * 25500a57bdfSHaik Aftandilian * We calculate the virtual %stick v1 on p1 after the resume as 25600a57bdfSHaik Aftandilian * 25700a57bdfSHaik Aftandilian * v1 = t1 + f 25800a57bdfSHaik Aftandilian * 25900a57bdfSHaik Aftandilian * Substitution yields 26000a57bdfSHaik Aftandilian * 26100a57bdfSHaik Aftandilian * v1 = t1 + (s0 - t0) 26200a57bdfSHaik Aftandilian * v1 = (t0 - x) + (s0 - t0) 26300a57bdfSHaik Aftandilian * v1 = -x + s0 26400a57bdfSHaik Aftandilian * v1 = s0 - x 26500a57bdfSHaik Aftandilian * v1 = (s1 - x) - x 26600a57bdfSHaik Aftandilian * v1 = s1 - 2x 26700a57bdfSHaik Aftandilian * 26800a57bdfSHaik Aftandilian * Therefore, in this scenario, without accounting for %stick variation in 26900a57bdfSHaik Aftandilian * the calculation of the native_stick_offset f, the virtual %stick on p1 27000a57bdfSHaik Aftandilian * is less than the value of the %stick on p1 before the suspend which is 27100a57bdfSHaik Aftandilian * unacceptable. By adding 2x to v1, we guarantee it will be equal to s1 27200a57bdfSHaik Aftandilian * which means the %stick on p1 after the resume will always be greater 27300a57bdfSHaik Aftandilian * than or equal to the %stick on p1 before the suspend. Since v1 = t1 + f 27400a57bdfSHaik Aftandilian * at any point in time, we can accomplish this by adding 2x to f. This 27500a57bdfSHaik Aftandilian * guarantees any processes bound to CPU P0 or P1 will not see a %stick 27600a57bdfSHaik Aftandilian * decrease across a suspend/resume. Hence, in the code below, we multiply 27700a57bdfSHaik Aftandilian * suspend_tick_stick_max_delta by two in the calculation for 27800a57bdfSHaik Aftandilian * native_stick_offset, native_tick_offset, and target_hrtime. 
279023e71deSHaik Aftandilian */ 280023e71deSHaik Aftandilian static void 28100a57bdfSHaik Aftandilian set_tick_offsets(uint64_t source_tick, uint64_t source_stick, timestruc_t *tsp) 282023e71deSHaik Aftandilian { 283023e71deSHaik Aftandilian uint64_t target_tick; 284023e71deSHaik Aftandilian uint64_t target_stick; 28500a57bdfSHaik Aftandilian hrtime_t source_hrtime; 28600a57bdfSHaik Aftandilian hrtime_t target_hrtime; 287023e71deSHaik Aftandilian 28800a57bdfSHaik Aftandilian /* 28900a57bdfSHaik Aftandilian * Temporarily set the offsets to zero so that the following reads 29000a57bdfSHaik Aftandilian * of the registers will yield physical unadjusted counter values. 29100a57bdfSHaik Aftandilian */ 292023e71deSHaik Aftandilian native_tick_offset = 0; 293023e71deSHaik Aftandilian native_stick_offset = 0; 294023e71deSHaik Aftandilian 295023e71deSHaik Aftandilian target_tick = gettick_counter(); /* returns %tick */ 296023e71deSHaik Aftandilian target_stick = gettick(); /* returns %stick */ 297023e71deSHaik Aftandilian 29800a57bdfSHaik Aftandilian /* 29900a57bdfSHaik Aftandilian * Calculate the new offsets. In addition to the delta observed on 30000a57bdfSHaik Aftandilian * this CPU, add an additional value. Multiply the %tick/%stick 30100a57bdfSHaik Aftandilian * frequency by suspend_tick_stick_max_delta (us). Then, multiply by 2 30200a57bdfSHaik Aftandilian * to account for a delta between CPUs before the suspend and a 30300a57bdfSHaik Aftandilian * delta between CPUs after the resume. 
30400a57bdfSHaik Aftandilian */ 30500a57bdfSHaik Aftandilian native_tick_offset = (source_tick - target_tick) + 30600a57bdfSHaik Aftandilian (CPU->cpu_curr_clock * suspend_tick_stick_max_delta * 2 / MICROSEC); 30700a57bdfSHaik Aftandilian native_stick_offset = (source_stick - target_stick) + 30800a57bdfSHaik Aftandilian (sys_tick_freq * suspend_tick_stick_max_delta * 2 / MICROSEC); 30900a57bdfSHaik Aftandilian 31000a57bdfSHaik Aftandilian /* 31100a57bdfSHaik Aftandilian * We've effectively increased %stick and %tick by twice the value 31200a57bdfSHaik Aftandilian * of suspend_tick_stick_max_delta to account for variation across 31300a57bdfSHaik Aftandilian * CPUs. Now adjust the preserved TOD by the same amount. 31400a57bdfSHaik Aftandilian */ 31500a57bdfSHaik Aftandilian source_hrtime = ts2hrt(tsp); 31600a57bdfSHaik Aftandilian target_hrtime = source_hrtime + 31700a57bdfSHaik Aftandilian (suspend_tick_stick_max_delta * 2 * (NANOSEC/MICROSEC)); 31800a57bdfSHaik Aftandilian hrt2ts(target_hrtime, tsp); 319023e71deSHaik Aftandilian } 320023e71deSHaik Aftandilian 321023e71deSHaik Aftandilian /* 322023e71deSHaik Aftandilian * Set the {tick,stick}.NPT field to 1 on this CPU. 323023e71deSHaik Aftandilian */ 324023e71deSHaik Aftandilian static void 325023e71deSHaik Aftandilian enable_tick_stick_npt(void) 326023e71deSHaik Aftandilian { 327c1374a13SSurya Prakki (void) hv_stick_set_npt(1); 328c1374a13SSurya Prakki (void) hv_tick_set_npt(1); 329023e71deSHaik Aftandilian } 330023e71deSHaik Aftandilian 331023e71deSHaik Aftandilian /* 332023e71deSHaik Aftandilian * Synchronize a CPU's {tick,stick}.NPT fields with the current state 333023e71deSHaik Aftandilian * of the system. This is used when a CPU is DR'd into the system. 
334023e71deSHaik Aftandilian */ 335023e71deSHaik Aftandilian void 336023e71deSHaik Aftandilian suspend_sync_tick_stick_npt(void) 337023e71deSHaik Aftandilian { 338023e71deSHaik Aftandilian if (tick_stick_emulation_active) { 339023e71deSHaik Aftandilian DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id); 340c1374a13SSurya Prakki (void) hv_stick_set_npt(1); 341c1374a13SSurya Prakki (void) hv_tick_set_npt(1); 342023e71deSHaik Aftandilian } else { 343023e71deSHaik Aftandilian ASSERT(gettick_npt() == 0); 344023e71deSHaik Aftandilian ASSERT(getstick_npt() == 0); 345023e71deSHaik Aftandilian } 346023e71deSHaik Aftandilian } 347023e71deSHaik Aftandilian 348023e71deSHaik Aftandilian /* 349023e71deSHaik Aftandilian * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW 350023e71deSHaik Aftandilian * sharing data structures, and processor groups. 351023e71deSHaik Aftandilian */ 352023e71deSHaik Aftandilian static void 353023e71deSHaik Aftandilian update_cpu_mappings(void) 354023e71deSHaik Aftandilian { 355023e71deSHaik Aftandilian md_t *mdp; 356023e71deSHaik Aftandilian processorid_t id; 357023e71deSHaik Aftandilian cpu_t *cp; 358023e71deSHaik Aftandilian cpu_pg_t *pgps[NCPU]; 359023e71deSHaik Aftandilian 360023e71deSHaik Aftandilian if ((mdp = md_get_handle()) == NULL) { 361023e71deSHaik Aftandilian DBG("suspend: md_get_handle failed"); 362023e71deSHaik Aftandilian return; 363023e71deSHaik Aftandilian } 364023e71deSHaik Aftandilian 365023e71deSHaik Aftandilian DBG("suspend: updating CPU mappings"); 366023e71deSHaik Aftandilian 367023e71deSHaik Aftandilian mutex_enter(&cpu_lock); 368023e71deSHaik Aftandilian 369023e71deSHaik Aftandilian setup_chip_mappings(mdp); 370023e71deSHaik Aftandilian setup_exec_unit_mappings(mdp); 371023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 372023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 373023e71deSHaik Aftandilian continue; 374023e71deSHaik Aftandilian cpu_map_exec_units(cp); 375023e71deSHaik 
Aftandilian } 376023e71deSHaik Aftandilian 377023e71deSHaik Aftandilian /* 378023e71deSHaik Aftandilian * Re-calculate processor groups. 379023e71deSHaik Aftandilian * 380023e71deSHaik Aftandilian * First tear down all PG information before adding any new PG 381023e71deSHaik Aftandilian * information derived from the MD we just downloaded. We must 382023e71deSHaik Aftandilian * call pg_cpu_inactive and pg_cpu_active with CPUs paused and 383023e71deSHaik Aftandilian * we want to minimize the number of times pause_cpus is called. 384023e71deSHaik Aftandilian * Inactivating all CPUs would leave PGs without any active CPUs, 385023e71deSHaik Aftandilian * so while CPUs are paused, call pg_cpu_inactive and swap in the 386023e71deSHaik Aftandilian * bootstrap PG structure saving the original PG structure to be 387023e71deSHaik Aftandilian * fini'd afterwards. This prevents the dispatcher from encountering 388023e71deSHaik Aftandilian * PGs in which all CPUs are inactive. 389023e71deSHaik Aftandilian */ 390023e71deSHaik Aftandilian pause_cpus(NULL); 391023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 392023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 393023e71deSHaik Aftandilian continue; 394023e71deSHaik Aftandilian pg_cpu_inactive(cp); 395023e71deSHaik Aftandilian pgps[id] = cp->cpu_pg; 396023e71deSHaik Aftandilian pg_cpu_bootstrap(cp); 397023e71deSHaik Aftandilian } 398023e71deSHaik Aftandilian start_cpus(); 399023e71deSHaik Aftandilian 400023e71deSHaik Aftandilian /* 401023e71deSHaik Aftandilian * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are 402023e71deSHaik Aftandilian * not paused. Use two separate loops here so that we do not 403023e71deSHaik Aftandilian * initialize PG data for CPUs until all the old PG data structures 404023e71deSHaik Aftandilian * are torn down. 
405023e71deSHaik Aftandilian */ 406023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 407023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 408023e71deSHaik Aftandilian continue; 409023e71deSHaik Aftandilian pg_cpu_fini(cp, pgps[id]); 410183ef8a1SHaik Aftandilian mpo_cpu_remove(id); 411023e71deSHaik Aftandilian } 412023e71deSHaik Aftandilian 413023e71deSHaik Aftandilian /* 414023e71deSHaik Aftandilian * Initialize PG data for each CPU, but leave the bootstrapped 415023e71deSHaik Aftandilian * PG structure in place to avoid running with any PGs containing 416023e71deSHaik Aftandilian * nothing but inactive CPUs. 417023e71deSHaik Aftandilian */ 418023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 419023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 420023e71deSHaik Aftandilian continue; 421183ef8a1SHaik Aftandilian mpo_cpu_add(mdp, id); 422023e71deSHaik Aftandilian pgps[id] = pg_cpu_init(cp, B_TRUE); 423023e71deSHaik Aftandilian } 424023e71deSHaik Aftandilian 425023e71deSHaik Aftandilian /* 426023e71deSHaik Aftandilian * Now that PG data has been initialized for all CPUs in the 427023e71deSHaik Aftandilian * system, replace the bootstrapped PG structure with the 428023e71deSHaik Aftandilian * initialized PG structure and call pg_cpu_active for each CPU. 
429023e71deSHaik Aftandilian */ 430023e71deSHaik Aftandilian pause_cpus(NULL); 431023e71deSHaik Aftandilian for (id = 0; id < NCPU; id++) { 432023e71deSHaik Aftandilian if ((cp = cpu_get(id)) == NULL) 433023e71deSHaik Aftandilian continue; 434023e71deSHaik Aftandilian cp->cpu_pg = pgps[id]; 435023e71deSHaik Aftandilian pg_cpu_active(cp); 436023e71deSHaik Aftandilian } 437023e71deSHaik Aftandilian start_cpus(); 438023e71deSHaik Aftandilian 439023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 440023e71deSHaik Aftandilian 441023e71deSHaik Aftandilian (void) md_fini_handle(mdp); 442023e71deSHaik Aftandilian } 443023e71deSHaik Aftandilian 444023e71deSHaik Aftandilian /* 445023e71deSHaik Aftandilian * Wrapper for the Sun Cluster error decoding function. 446023e71deSHaik Aftandilian */ 447023e71deSHaik Aftandilian static int 448023e71deSHaik Aftandilian cluster_error_decode(int error, char *error_reason, size_t max_reason_len) 449023e71deSHaik Aftandilian { 450023e71deSHaik Aftandilian const char *decoded; 451023e71deSHaik Aftandilian size_t decoded_len; 452023e71deSHaik Aftandilian 453023e71deSHaik Aftandilian ASSERT(error_reason != NULL); 454023e71deSHaik Aftandilian ASSERT(max_reason_len > 0); 455023e71deSHaik Aftandilian 456023e71deSHaik Aftandilian max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX); 457023e71deSHaik Aftandilian 458023e71deSHaik Aftandilian if (cl_suspend_error_decode == NULL) 459023e71deSHaik Aftandilian return (-1); 460023e71deSHaik Aftandilian 461023e71deSHaik Aftandilian if ((decoded = (*cl_suspend_error_decode)(error)) == NULL) 462023e71deSHaik Aftandilian return (-1); 463023e71deSHaik Aftandilian 464023e71deSHaik Aftandilian /* Get number of non-NULL bytes */ 465023e71deSHaik Aftandilian if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0) 466023e71deSHaik Aftandilian return (-1); 467023e71deSHaik Aftandilian 468023e71deSHaik Aftandilian bcopy(decoded, error_reason, decoded_len); 469023e71deSHaik Aftandilian 470023e71deSHaik 
Aftandilian /* 471023e71deSHaik Aftandilian * The error string returned from cl_suspend_error_decode 472023e71deSHaik Aftandilian * should be NULL-terminated, but set the terminator here 473023e71deSHaik Aftandilian * because we only copied non-NULL bytes. If the decoded 474023e71deSHaik Aftandilian * string was not NULL-terminated, this guarantees that 475023e71deSHaik Aftandilian * error_reason will be. 476023e71deSHaik Aftandilian */ 477023e71deSHaik Aftandilian error_reason[decoded_len] = '\0'; 478023e71deSHaik Aftandilian 479023e71deSHaik Aftandilian return (0); 480023e71deSHaik Aftandilian } 481023e71deSHaik Aftandilian 482023e71deSHaik Aftandilian /* 483023e71deSHaik Aftandilian * Wrapper for the Sun Cluster pre-suspend callback. 484023e71deSHaik Aftandilian */ 485023e71deSHaik Aftandilian static int 486023e71deSHaik Aftandilian cluster_pre_wrapper(char *error_reason, size_t max_reason_len) 487023e71deSHaik Aftandilian { 488023e71deSHaik Aftandilian int rv = 0; 489023e71deSHaik Aftandilian 490023e71deSHaik Aftandilian if (cl_suspend_pre_callback != NULL) { 491023e71deSHaik Aftandilian rv = (*cl_suspend_pre_callback)(); 492023e71deSHaik Aftandilian DBG("suspend: cl_suspend_pre_callback returned %d", rv); 493023e71deSHaik Aftandilian if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 494023e71deSHaik Aftandilian if (cluster_error_decode(rv, error_reason, 495023e71deSHaik Aftandilian max_reason_len)) { 496023e71deSHaik Aftandilian (void) snprintf(error_reason, max_reason_len, 497023e71deSHaik Aftandilian SC_PRE_FAIL_STR_FMT, rv); 498023e71deSHaik Aftandilian } 499023e71deSHaik Aftandilian } 500023e71deSHaik Aftandilian } 501023e71deSHaik Aftandilian 502023e71deSHaik Aftandilian return (rv); 503023e71deSHaik Aftandilian } 504023e71deSHaik Aftandilian 505023e71deSHaik Aftandilian /* 506023e71deSHaik Aftandilian * Wrapper for the Sun Cluster post-suspend callback. 
507023e71deSHaik Aftandilian */ 508023e71deSHaik Aftandilian static int 509023e71deSHaik Aftandilian cluster_post_wrapper(char *error_reason, size_t max_reason_len) 510023e71deSHaik Aftandilian { 511023e71deSHaik Aftandilian int rv = 0; 512023e71deSHaik Aftandilian 513023e71deSHaik Aftandilian if (cl_suspend_post_callback != NULL) { 514023e71deSHaik Aftandilian rv = (*cl_suspend_post_callback)(); 515023e71deSHaik Aftandilian DBG("suspend: cl_suspend_post_callback returned %d", rv); 516023e71deSHaik Aftandilian if (rv != 0 && error_reason != NULL && max_reason_len > 0) { 517023e71deSHaik Aftandilian if (cluster_error_decode(rv, error_reason, 518023e71deSHaik Aftandilian max_reason_len)) { 519023e71deSHaik Aftandilian (void) snprintf(error_reason, 520023e71deSHaik Aftandilian max_reason_len, SC_POST_FAIL_STR_FMT, rv); 521023e71deSHaik Aftandilian } 522023e71deSHaik Aftandilian } 523023e71deSHaik Aftandilian } 524023e71deSHaik Aftandilian 525023e71deSHaik Aftandilian return (rv); 526023e71deSHaik Aftandilian } 527023e71deSHaik Aftandilian 528023e71deSHaik Aftandilian /* 529023e71deSHaik Aftandilian * Execute pre-suspend callbacks preparing the system for a suspend operation. 530023e71deSHaik Aftandilian * Returns zero on success, non-zero on failure. Sets the recovered argument 531023e71deSHaik Aftandilian * to indicate whether or not callbacks could be undone in the event of a 532023e71deSHaik Aftandilian * failure--if callbacks were successfully undone, *recovered is set to B_TRUE, 533023e71deSHaik Aftandilian * otherwise *recovered is set to B_FALSE. Must be called successfully before 534023e71deSHaik Aftandilian * suspend_start can be called. Callers should first call suspend_support to 535023e71deSHaik Aftandilian * determine if OS suspend is supported. 
536023e71deSHaik Aftandilian */ 537023e71deSHaik Aftandilian int 538023e71deSHaik Aftandilian suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered) 539023e71deSHaik Aftandilian { 540023e71deSHaik Aftandilian int rv; 541023e71deSHaik Aftandilian 542023e71deSHaik Aftandilian ASSERT(recovered != NULL); 543023e71deSHaik Aftandilian 544023e71deSHaik Aftandilian /* 545023e71deSHaik Aftandilian * Return an error if suspend_pre is erreoneously called 546023e71deSHaik Aftandilian * when OS suspend is not supported. 547023e71deSHaik Aftandilian */ 548023e71deSHaik Aftandilian ASSERT(suspend_supported()); 549023e71deSHaik Aftandilian if (!suspend_supported()) { 550023e71deSHaik Aftandilian DBG("suspend: suspend_pre called without suspend support"); 551023e71deSHaik Aftandilian *recovered = B_TRUE; 552023e71deSHaik Aftandilian return (ENOTSUP); 553023e71deSHaik Aftandilian } 554023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 555023e71deSHaik Aftandilian 556023e71deSHaik Aftandilian rv = cluster_pre_wrapper(error_reason, max_reason_len); 557023e71deSHaik Aftandilian 558023e71deSHaik Aftandilian /* 559023e71deSHaik Aftandilian * At present, only one pre-suspend operation exists. 560023e71deSHaik Aftandilian * If it fails, no recovery needs to be done. 561023e71deSHaik Aftandilian */ 562023e71deSHaik Aftandilian if (rv != 0 && recovered != NULL) 563023e71deSHaik Aftandilian *recovered = B_TRUE; 564023e71deSHaik Aftandilian 565023e71deSHaik Aftandilian return (rv); 566023e71deSHaik Aftandilian } 567023e71deSHaik Aftandilian 568023e71deSHaik Aftandilian /* 569023e71deSHaik Aftandilian * Execute post-suspend callbacks. Returns zero on success, non-zero on 570023e71deSHaik Aftandilian * failure. Must be called after suspend_start is called, regardless of 571023e71deSHaik Aftandilian * whether or not suspend_start is successful. 
572023e71deSHaik Aftandilian */ 573023e71deSHaik Aftandilian int 574023e71deSHaik Aftandilian suspend_post(char *error_reason, size_t max_reason_len) 575023e71deSHaik Aftandilian { 576023e71deSHaik Aftandilian ASSERT(suspend_supported()); 577023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 578023e71deSHaik Aftandilian return (cluster_post_wrapper(error_reason, max_reason_len)); 579023e71deSHaik Aftandilian } 580023e71deSHaik Aftandilian 581023e71deSHaik Aftandilian /* 582023e71deSHaik Aftandilian * Suspends the OS by pausing CPUs and calling into the HV to initiate 583023e71deSHaik Aftandilian * the suspend. When the HV routine hv_guest_suspend returns, the system 584023e71deSHaik Aftandilian * will be resumed. Must be called after a successful call to suspend_pre. 585023e71deSHaik Aftandilian * suspend_post must be called after suspend_start, whether or not 586023e71deSHaik Aftandilian * suspend_start returns an error. 587023e71deSHaik Aftandilian */ 588023e71deSHaik Aftandilian /*ARGSUSED*/ 589023e71deSHaik Aftandilian int 590023e71deSHaik Aftandilian suspend_start(char *error_reason, size_t max_reason_len) 591023e71deSHaik Aftandilian { 592023e71deSHaik Aftandilian uint64_t source_tick; 593023e71deSHaik Aftandilian uint64_t source_stick; 594023e71deSHaik Aftandilian uint64_t rv; 595023e71deSHaik Aftandilian timestruc_t source_tod; 596023e71deSHaik Aftandilian int spl; 597023e71deSHaik Aftandilian 598023e71deSHaik Aftandilian ASSERT(suspend_supported()); 599023e71deSHaik Aftandilian DBG("suspend: %s", __func__); 600023e71deSHaik Aftandilian 601d2365b01SPavel Tatashin sfmmu_ctxdoms_lock(); 602d2365b01SPavel Tatashin 603023e71deSHaik Aftandilian mutex_enter(&cpu_lock); 604023e71deSHaik Aftandilian 605023e71deSHaik Aftandilian /* Suspend the watchdog */ 606023e71deSHaik Aftandilian watchdog_suspend(); 607023e71deSHaik Aftandilian 608023e71deSHaik Aftandilian /* Record the TOD */ 609023e71deSHaik Aftandilian mutex_enter(&tod_lock); 610023e71deSHaik Aftandilian 
source_tod = tod_get(); 611023e71deSHaik Aftandilian mutex_exit(&tod_lock); 612023e71deSHaik Aftandilian 613023e71deSHaik Aftandilian /* Pause all other CPUs */ 614023e71deSHaik Aftandilian pause_cpus(NULL); 615023e71deSHaik Aftandilian DBG_PROM("suspend: CPUs paused\n"); 616023e71deSHaik Aftandilian 61700a57bdfSHaik Aftandilian /* Suspend cyclics */ 618023e71deSHaik Aftandilian cyclic_suspend(); 619023e71deSHaik Aftandilian DBG_PROM("suspend: cyclics suspended\n"); 62000a57bdfSHaik Aftandilian 62100a57bdfSHaik Aftandilian /* Disable interrupts */ 622023e71deSHaik Aftandilian spl = spl8(); 62300a57bdfSHaik Aftandilian DBG_PROM("suspend: spl8()\n"); 624023e71deSHaik Aftandilian 625023e71deSHaik Aftandilian source_tick = gettick_counter(); 626023e71deSHaik Aftandilian source_stick = gettick(); 627023e71deSHaik Aftandilian DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick); 628023e71deSHaik Aftandilian DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick); 629023e71deSHaik Aftandilian 630023e71deSHaik Aftandilian /* 63100a57bdfSHaik Aftandilian * Call into the HV to initiate the suspend. hv_guest_suspend() 63200a57bdfSHaik Aftandilian * returns after the guest has been resumed or if the suspend 63300a57bdfSHaik Aftandilian * operation failed or was cancelled. After a successful suspend, 63400a57bdfSHaik Aftandilian * the %tick and %stick registers may have changed by an amount 63500a57bdfSHaik Aftandilian * that is not proportional to the amount of time that has passed. 63600a57bdfSHaik Aftandilian * They may have jumped forwards or backwards. Some variation is 63700a57bdfSHaik Aftandilian * allowed and accounted for using suspend_tick_stick_max_delta, 63800a57bdfSHaik Aftandilian * but otherwise this jump must be uniform across all CPUs and we 63900a57bdfSHaik Aftandilian * operate under the assumption that it is (maintaining two global 64000a57bdfSHaik Aftandilian * offset variables--one for %tick and one for %stick.) 
641023e71deSHaik Aftandilian */ 642023e71deSHaik Aftandilian DBG_PROM("suspend: suspending... \n"); 643023e71deSHaik Aftandilian rv = hv_guest_suspend(); 644023e71deSHaik Aftandilian if (rv != 0) { 645023e71deSHaik Aftandilian splx(spl); 646023e71deSHaik Aftandilian cyclic_resume(); 647023e71deSHaik Aftandilian start_cpus(); 648023e71deSHaik Aftandilian watchdog_resume(); 649023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 650d2365b01SPavel Tatashin sfmmu_ctxdoms_unlock(); 651023e71deSHaik Aftandilian DBG("suspend: failed, rv: %ld\n", rv); 652023e71deSHaik Aftandilian return (rv); 653023e71deSHaik Aftandilian } 654023e71deSHaik Aftandilian 655*02b4e56cSHaik Aftandilian suspend_count++; 656*02b4e56cSHaik Aftandilian 65700a57bdfSHaik Aftandilian /* Update the global tick and stick offsets and the preserved TOD */ 65800a57bdfSHaik Aftandilian set_tick_offsets(source_tick, source_stick, &source_tod); 659023e71deSHaik Aftandilian 660023e71deSHaik Aftandilian /* Ensure new offsets are globally visible before resuming CPUs */ 661023e71deSHaik Aftandilian membar_sync(); 662023e71deSHaik Aftandilian 663023e71deSHaik Aftandilian /* Enable interrupts */ 664023e71deSHaik Aftandilian splx(spl); 665023e71deSHaik Aftandilian 666023e71deSHaik Aftandilian /* Set the {%tick,%stick}.NPT bits on all CPUs */ 667023e71deSHaik Aftandilian if (enable_user_tick_stick_emulation) { 668023e71deSHaik Aftandilian xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL); 669023e71deSHaik Aftandilian xt_sync(cpu_ready_set); 670023e71deSHaik Aftandilian ASSERT(gettick_npt() != 0); 671023e71deSHaik Aftandilian ASSERT(getstick_npt() != 0); 672023e71deSHaik Aftandilian } 673023e71deSHaik Aftandilian 674023e71deSHaik Aftandilian /* If emulation is enabled, but not currently active, enable it */ 675023e71deSHaik Aftandilian if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) { 676023e71deSHaik Aftandilian tick_stick_emulation_active = B_TRUE; 677023e71deSHaik Aftandilian } 
678023e71deSHaik Aftandilian 679d2365b01SPavel Tatashin sfmmu_ctxdoms_remove(); 680d2365b01SPavel Tatashin 681023e71deSHaik Aftandilian /* Resume cyclics, unpause CPUs */ 682023e71deSHaik Aftandilian cyclic_resume(); 683023e71deSHaik Aftandilian start_cpus(); 684023e71deSHaik Aftandilian 685023e71deSHaik Aftandilian /* Set the TOD */ 686023e71deSHaik Aftandilian mutex_enter(&tod_lock); 687023e71deSHaik Aftandilian tod_set(source_tod); 688023e71deSHaik Aftandilian mutex_exit(&tod_lock); 689023e71deSHaik Aftandilian 690023e71deSHaik Aftandilian /* Re-enable the watchdog */ 691023e71deSHaik Aftandilian watchdog_resume(); 692023e71deSHaik Aftandilian 693023e71deSHaik Aftandilian mutex_exit(&cpu_lock); 694023e71deSHaik Aftandilian 695d2365b01SPavel Tatashin /* Download the latest MD */ 696d2365b01SPavel Tatashin if ((rv = mach_descrip_update()) != 0) 697d2365b01SPavel Tatashin cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld", 698d2365b01SPavel Tatashin rv); 699d2365b01SPavel Tatashin 700d2365b01SPavel Tatashin sfmmu_ctxdoms_update(); 701d2365b01SPavel Tatashin sfmmu_ctxdoms_unlock(); 702d2365b01SPavel Tatashin 703023e71deSHaik Aftandilian /* Get new MD, update CPU mappings/relationships */ 704023e71deSHaik Aftandilian if (suspend_update_cpu_mappings) 705023e71deSHaik Aftandilian update_cpu_mappings(); 706023e71deSHaik Aftandilian 707023e71deSHaik Aftandilian DBG("suspend: target tick: 0x%lx", gettick_counter()); 708023e71deSHaik Aftandilian DBG("suspend: target stick: 0x%llx", gettick()); 709023e71deSHaik Aftandilian DBG("suspend: user %%tick/%%stick emulation is %d", 710023e71deSHaik Aftandilian tick_stick_emulation_active); 711023e71deSHaik Aftandilian DBG("suspend: finished"); 712023e71deSHaik Aftandilian 713023e71deSHaik Aftandilian return (0); 714023e71deSHaik Aftandilian } 715