1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/conf.h> 31 #include <sys/ddi.h> 32 #include <sys/systm.h> 33 #include <sys/sysmacros.h> 34 #include <sys/param.h> 35 #include <sys/mutex.h> 36 #include <sys/kmem.h> 37 #include <sys/machparam.h> 38 #include <sys/machsystm.h> 39 #include <sys/machthread.h> 40 #include <sys/cpu.h> 41 #include <sys/cpuvar.h> 42 #include <vm/page.h> 43 #include <vm/hat.h> 44 #include <vm/seg.h> 45 #include <vm/seg_kmem.h> 46 #include <sys/vmsystm.h> 47 #include <sys/vmem.h> 48 #include <sys/mman.h> 49 #include <sys/cmn_err.h> 50 #include <sys/time.h> 51 #include <sys/async.h> 52 #include <sys/spl.h> 53 #include <sys/trap.h> 54 #include <sys/machtrap.h> 55 #include <sys/promif.h> 56 #include <sys/prom_plat.h> 57 #include <sys/debug.h> 58 #include <sys/x_call.h> 59 #include <sys/membar.h> 60 #include <sys/ivintr.h> 61 #include <sys/cred.h> 62 #include <sys/cpu_module.h> 63 #include <sys/ontrap.h> 64 #include <sys/sdt.h> 65 #include <sys/errorq.h> 66 67 #define MAX_CE_FLTS 10 68 #define MAX_ASYNC_FLTS 6 69 70 errorq_t *ue_queue; /* queue of uncorrectable errors */ 71 errorq_t *ce_queue; /* queue of correctable errors */ 72 73 /* 74 * ce_verbose_memory - covers CEs in DIMMs 75 * ce_verbose_other - covers "others" (ecache, IO, etc.) 76 * 77 * If the value is 0, nothing is logged. 78 * If the value is 1, the error is logged to the log file, but not console. 79 * If the value is 2, the error is logged to the log file and console. 80 */ 81 int ce_verbose_memory = 1; 82 int ce_verbose_other = 1; 83 84 int ce_show_data = 0; 85 int ce_debug = 0; 86 int ue_debug = 0; 87 int reset_debug = 0; 88 89 /* 90 * Tunables for controlling the handling of asynchronous faults (AFTs). Setting 91 * these to non-default values on a non-DEBUG kernel is NOT supported. 92 */ 93 int aft_verbose = 0; /* log AFT messages > 1 to log only */ 94 int aft_panic = 0; /* panic (not reboot) on fatal usermode AFLT */ 95 int aft_testfatal = 0; /* force all AFTs to panic immediately */ 96 97 /* 98 * Panic_* variables specific to the AFT code. These are used to record 99 * information that the platform-specific code will need once we panic. 100 */ 101 struct async_flt panic_aflt; 102 103 /* 104 * Defined in bus_func.c but initialised in error_init 105 */ 106 extern kmutex_t bfd_lock; 107 108 /* 109 * Common bus driver async error logging routine. This routine can be shared 110 * by all sun4u CPUs (unlike cpu_async_log_err) because we are assuming that 111 * if an i/o bus error required a panic, the error interrupt handler will 112 * enqueue the error and call panic itself. 113 */ 114 void 115 bus_async_log_err(struct async_flt *aflt) 116 { 117 char unum[UNUM_NAMLEN]; 118 int len; 119 120 /* 121 * Call back into the processor specific routine 122 * to check for cpu related errors that may 123 * have resulted in this error. (E.g. copyout trap) 124 */ 125 if (aflt->flt_in_memory) 126 cpu_check_allcpus(aflt); 127 128 /* 129 * Note that aflt->flt_stat is not the CPU afsr. 130 */ 131 (void) cpu_get_mem_unum_aflt(AFLT_STAT_INVALID, aflt, 132 unum, UNUM_NAMLEN, &len); 133 aflt->flt_func(aflt, unum); 134 } 135 136 /* 137 * ecc_cpu_call called from bus drain functions to run cpu 138 * specific functions to check other cpus and get the unum. 139 */ 140 void 141 ecc_cpu_call(struct async_flt *ecc, char *unum, int err_type) 142 { 143 int len; 144 145 /* 146 * Call back into the processor 147 * specific routine to check for cpu related errors 148 * that may have resulted in this error. 149 * (E.g. copyout trap) 150 */ 151 if (ecc->flt_in_memory) 152 cpu_check_allcpus(ecc); 153 154 (void) cpu_get_mem_unum(AFLT_STAT_VALID, ecc->flt_synd, 155 (uint64_t)-1, ecc->flt_addr, 156 ecc->flt_bus_id, ecc->flt_in_memory, 157 ecc->flt_status, unum, 158 UNUM_NAMLEN, &len); 159 160 if (err_type == ECC_IO_CE) 161 cpu_ce_count_unum(ecc, len, unum); 162 } 163 164 /* 165 * Handler to process a fatal error. This routine can be called from a 166 * softint, called from trap()'s AST handling, or called from the panic flow. 167 */ 168 /*ARGSUSED*/ 169 static void 170 ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep) 171 { 172 cpu_ue_log_err(aflt); 173 } 174 175 /* 176 * Handler to process a correctable error. This routine can be called from a 177 * softint. We just call the CPU module's logging routine. 178 */ 179 /*ARGSUSED*/ 180 static void 181 ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep) 182 { 183 cpu_ce_log_err(aflt, eqep); 184 } 185 186 /* 187 * Scrub a non-fatal correctable ecc error. 188 */ 189 void 190 ce_scrub(struct async_flt *aflt) 191 { 192 if (aflt->flt_in_memory) 193 cpu_ce_scrub_mem_err(aflt, B_FALSE); 194 } 195 196 /* 197 * Allocate error queue sizes based on max_ncpus. max_ncpus is set just 198 * after ncpunode has been determined. ncpus is set in start_other_cpus 199 * which is called after error_init() but may change dynamically. 200 */ 201 void 202 error_init(void) 203 { 204 char tmp_name[MAXSYSNAME]; 205 pnode_t node; 206 size_t size = cpu_aflt_size(); 207 208 /* 209 * Initialize the correctable and uncorrectable error queues. 210 */ 211 ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL, 212 MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL); 213 214 ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL, 215 MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0); 216 217 if (ue_queue == NULL || ce_queue == NULL) 218 panic("failed to create required system error queue"); 219 220 /* 221 * Initialize the busfunc list mutex. This must be a PIL_15 spin lock 222 * because we will need to acquire it from cpu_async_error(). 223 */ 224 mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15); 225 226 node = prom_rootnode(); 227 if ((node == OBP_NONODE) || (node == OBP_BADNODE)) { 228 cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node); 229 return; 230 } 231 232 if (((size = prom_getproplen(node, "reset-reason")) != -1) && 233 (size <= MAXSYSNAME) && 234 (prom_getprop(node, "reset-reason", tmp_name) != -1)) { 235 if (reset_debug) { 236 cmn_err(CE_CONT, "System booting after %s\n", tmp_name); 237 } else if (strncmp(tmp_name, "FATAL", 5) == 0) { 238 cmn_err(CE_CONT, 239 "System booting after fatal error %s\n", tmp_name); 240 } 241 } 242 243 if (&cpu_error_init) { 244 cpu_error_init((MAX_ASYNC_FLTS + MAX_CE_FLTS) * 245 (max_ncpus + 1)); 246 } 247 } 248 249 /* 250 * Flags for ecc_page_zero DTrace probe since ecc_page_zero() is called 251 * as a softint handler. 252 */ 253 #define PAGE_ZERO_SUCCESS 0 254 #define PAGE_ZERO_FAIL_NOLOCK 1 255 #define PAGE_ZERO_FAIL_ONTRAP 2 256 257 void 258 ecc_page_zero(void *arg) 259 { 260 uint64_t pa = (uint64_t)arg; 261 int ret, success_flag; 262 page_t *pp = page_numtopp_nolock(mmu_btop(pa)); 263 264 if (page_retire_check(pa, NULL) != 0) 265 return; 266 267 /* 268 * Must hold a lock on the page before calling pagezero() 269 * 270 * This will only fail if someone has or wants an exclusive lock on 271 * the page. Since it's a retired page, this shouldn't happen. 272 */ 273 ret = page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_NO_RECLAIM); 274 275 if (ret > 0) { 276 on_trap_data_t otd; 277 278 /* 279 * Protect pagezero() from async faults 280 */ 281 if (!on_trap(&otd, OT_DATA_EC)) { 282 pagezero(pp, 0, PAGESIZE); 283 success_flag = PAGE_ZERO_SUCCESS; 284 } else { 285 success_flag = PAGE_ZERO_FAIL_ONTRAP; 286 } 287 no_trap(); 288 page_unlock(pp); 289 } else { 290 success_flag = PAGE_ZERO_FAIL_NOLOCK; 291 } 292 DTRACE_PROBE2(page_zero_result, int, success_flag, uint64_t, pa); 293 } 294