17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 57c478bd9Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 67c478bd9Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 77c478bd9Sstevel@tonic-gate * with the License. 87c478bd9Sstevel@tonic-gate * 97c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 107c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 117c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 127c478bd9Sstevel@tonic-gate * and limitations under the License. 137c478bd9Sstevel@tonic-gate * 147c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 157c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 167c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 177c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 187c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 197c478bd9Sstevel@tonic-gate * 207c478bd9Sstevel@tonic-gate * CDDL HEADER END 217c478bd9Sstevel@tonic-gate */ 227c478bd9Sstevel@tonic-gate /* 23*4fc2445aSelowe * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate * Use is subject to license terms. 257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 277c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate #include <sys/types.h> 307c478bd9Sstevel@tonic-gate #include <sys/conf.h> 317c478bd9Sstevel@tonic-gate #include <sys/ddi.h> 327c478bd9Sstevel@tonic-gate #include <sys/systm.h> 337c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h> 347c478bd9Sstevel@tonic-gate #include <sys/param.h> 357c478bd9Sstevel@tonic-gate #include <sys/mutex.h> 367c478bd9Sstevel@tonic-gate #include <sys/kmem.h> 377c478bd9Sstevel@tonic-gate #include <sys/machparam.h> 387c478bd9Sstevel@tonic-gate #include <sys/machsystm.h> 397c478bd9Sstevel@tonic-gate #include <sys/machthread.h> 407c478bd9Sstevel@tonic-gate #include <sys/cpu.h> 417c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 427c478bd9Sstevel@tonic-gate #include <vm/page.h> 437c478bd9Sstevel@tonic-gate #include <vm/hat.h> 447c478bd9Sstevel@tonic-gate #include <vm/seg.h> 457c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 467c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h> 477c478bd9Sstevel@tonic-gate #include <sys/vmem.h> 487c478bd9Sstevel@tonic-gate #include <sys/mman.h> 497c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 507c478bd9Sstevel@tonic-gate #include <sys/time.h> 517c478bd9Sstevel@tonic-gate #include <sys/async.h> 527c478bd9Sstevel@tonic-gate #include <sys/spl.h> 537c478bd9Sstevel@tonic-gate #include <sys/trap.h> 547c478bd9Sstevel@tonic-gate #include <sys/machtrap.h> 557c478bd9Sstevel@tonic-gate #include <sys/promif.h> 567c478bd9Sstevel@tonic-gate #include <sys/prom_plat.h> 577c478bd9Sstevel@tonic-gate #include <sys/debug.h> 587c478bd9Sstevel@tonic-gate #include <sys/x_call.h> 597c478bd9Sstevel@tonic-gate #include <sys/membar.h> 607c478bd9Sstevel@tonic-gate #include <sys/ivintr.h> 617c478bd9Sstevel@tonic-gate #include <sys/cred.h> 627c478bd9Sstevel@tonic-gate #include <sys/cpu_module.h> 637c478bd9Sstevel@tonic-gate #include <sys/ontrap.h> 647c478bd9Sstevel@tonic-gate #include <sys/sdt.h> 657c478bd9Sstevel@tonic-gate #include <sys/errorq.h> 667c478bd9Sstevel@tonic-gate 677c478bd9Sstevel@tonic-gate #define MAX_CE_FLTS 10 687c478bd9Sstevel@tonic-gate #define MAX_ASYNC_FLTS 6 697c478bd9Sstevel@tonic-gate 707c478bd9Sstevel@tonic-gate errorq_t *ue_queue; /* queue of uncorrectable errors */ 717c478bd9Sstevel@tonic-gate errorq_t *ce_queue; /* queue of correctable errors */ 727c478bd9Sstevel@tonic-gate 737c478bd9Sstevel@tonic-gate /* 747c478bd9Sstevel@tonic-gate * ce_verbose_memory - covers CEs in DIMMs 757c478bd9Sstevel@tonic-gate * ce_verbose_other - covers "others" (ecache, IO, etc.) 767c478bd9Sstevel@tonic-gate * 777c478bd9Sstevel@tonic-gate * If the value is 0, nothing is logged. 787c478bd9Sstevel@tonic-gate * If the value is 1, the error is logged to the log file, but not console. 797c478bd9Sstevel@tonic-gate * If the value is 2, the error is logged to the log file and console. 807c478bd9Sstevel@tonic-gate */ 817c478bd9Sstevel@tonic-gate int ce_verbose_memory = 1; 827c478bd9Sstevel@tonic-gate int ce_verbose_other = 1; 837c478bd9Sstevel@tonic-gate 847c478bd9Sstevel@tonic-gate int ce_show_data = 0; 857c478bd9Sstevel@tonic-gate int ce_debug = 0; 867c478bd9Sstevel@tonic-gate int ue_debug = 0; 877c478bd9Sstevel@tonic-gate int reset_debug = 0; 887c478bd9Sstevel@tonic-gate 897c478bd9Sstevel@tonic-gate /* 907c478bd9Sstevel@tonic-gate * Tunables for controlling the handling of asynchronous faults (AFTs). Setting 917c478bd9Sstevel@tonic-gate * these to non-default values on a non-DEBUG kernel is NOT supported. 927c478bd9Sstevel@tonic-gate */ 937c478bd9Sstevel@tonic-gate int aft_verbose = 0; /* log AFT messages > 1 to log only */ 947c478bd9Sstevel@tonic-gate int aft_panic = 0; /* panic (not reboot) on fatal usermode AFLT */ 957c478bd9Sstevel@tonic-gate int aft_testfatal = 0; /* force all AFTs to panic immediately */ 967c478bd9Sstevel@tonic-gate 977c478bd9Sstevel@tonic-gate /* 987c478bd9Sstevel@tonic-gate * Panic_* variables specific to the AFT code. These are used to record 997c478bd9Sstevel@tonic-gate * information that the platform-specific code will need once we panic. 1007c478bd9Sstevel@tonic-gate */ 1017c478bd9Sstevel@tonic-gate struct async_flt panic_aflt; 1027c478bd9Sstevel@tonic-gate 1037c478bd9Sstevel@tonic-gate /* 1047c478bd9Sstevel@tonic-gate * Defined in bus_func.c but initialised in error_init 1057c478bd9Sstevel@tonic-gate */ 1067c478bd9Sstevel@tonic-gate extern kmutex_t bfd_lock; 1077c478bd9Sstevel@tonic-gate 1087c478bd9Sstevel@tonic-gate /* 1097c478bd9Sstevel@tonic-gate * Common bus driver async error logging routine. This routine can be shared 1107c478bd9Sstevel@tonic-gate * by all sun4u CPUs (unlike cpu_async_log_err) because we are assuming that 1117c478bd9Sstevel@tonic-gate * if an i/o bus error required a panic, the error interrupt handler will 1127c478bd9Sstevel@tonic-gate * enqueue the error and call panic itself. 1137c478bd9Sstevel@tonic-gate */ 1147c478bd9Sstevel@tonic-gate void 1157c478bd9Sstevel@tonic-gate bus_async_log_err(struct async_flt *aflt) 1167c478bd9Sstevel@tonic-gate { 1177c478bd9Sstevel@tonic-gate char unum[UNUM_NAMLEN]; 1187c478bd9Sstevel@tonic-gate int len; 1197c478bd9Sstevel@tonic-gate 1207c478bd9Sstevel@tonic-gate /* 1217c478bd9Sstevel@tonic-gate * Call back into the processor specific routine 1227c478bd9Sstevel@tonic-gate * to check for cpu related errors that may 1237c478bd9Sstevel@tonic-gate * have resulted in this error. (E.g. copyout trap) 1247c478bd9Sstevel@tonic-gate */ 1257c478bd9Sstevel@tonic-gate if (aflt->flt_in_memory) 1267c478bd9Sstevel@tonic-gate cpu_check_allcpus(aflt); 1277c478bd9Sstevel@tonic-gate 1287c478bd9Sstevel@tonic-gate /* 1297c478bd9Sstevel@tonic-gate * Note that aflt->flt_stat is not the CPU afsr. 1307c478bd9Sstevel@tonic-gate */ 1317c478bd9Sstevel@tonic-gate (void) cpu_get_mem_unum_aflt(AFLT_STAT_INVALID, aflt, 1327c478bd9Sstevel@tonic-gate unum, UNUM_NAMLEN, &len); 1337c478bd9Sstevel@tonic-gate aflt->flt_func(aflt, unum); 1347c478bd9Sstevel@tonic-gate } 1357c478bd9Sstevel@tonic-gate 1367c478bd9Sstevel@tonic-gate /* 1377c478bd9Sstevel@tonic-gate * ecc_cpu_call called from bus drain functions to run cpu 1387c478bd9Sstevel@tonic-gate * specific functions to check other cpus and get the unum. 1397c478bd9Sstevel@tonic-gate */ 1407c478bd9Sstevel@tonic-gate void 1417c478bd9Sstevel@tonic-gate ecc_cpu_call(struct async_flt *ecc, char *unum, int err_type) 1427c478bd9Sstevel@tonic-gate { 1437c478bd9Sstevel@tonic-gate int len; 1447c478bd9Sstevel@tonic-gate 1457c478bd9Sstevel@tonic-gate /* 1467c478bd9Sstevel@tonic-gate * Call back into the processor 1477c478bd9Sstevel@tonic-gate * specific routine to check for cpu related errors 1487c478bd9Sstevel@tonic-gate * that may have resulted in this error. 1497c478bd9Sstevel@tonic-gate * (E.g. copyout trap) 1507c478bd9Sstevel@tonic-gate */ 1517c478bd9Sstevel@tonic-gate if (ecc->flt_in_memory) 1527c478bd9Sstevel@tonic-gate cpu_check_allcpus(ecc); 1537c478bd9Sstevel@tonic-gate 1547c478bd9Sstevel@tonic-gate (void) cpu_get_mem_unum(AFLT_STAT_VALID, ecc->flt_synd, 1557c478bd9Sstevel@tonic-gate (uint64_t)-1, ecc->flt_addr, 1567c478bd9Sstevel@tonic-gate ecc->flt_bus_id, ecc->flt_in_memory, 1577c478bd9Sstevel@tonic-gate ecc->flt_status, unum, 1587c478bd9Sstevel@tonic-gate UNUM_NAMLEN, &len); 1597c478bd9Sstevel@tonic-gate 1607c478bd9Sstevel@tonic-gate if (err_type == ECC_IO_CE) 1617c478bd9Sstevel@tonic-gate cpu_ce_count_unum(ecc, len, unum); 1627c478bd9Sstevel@tonic-gate } 1637c478bd9Sstevel@tonic-gate 1647c478bd9Sstevel@tonic-gate /* 1657c478bd9Sstevel@tonic-gate * Handler to process a fatal error. This routine can be called from a 1667c478bd9Sstevel@tonic-gate * softint, called from trap()'s AST handling, or called from the panic flow. 1677c478bd9Sstevel@tonic-gate */ 1687c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 1697c478bd9Sstevel@tonic-gate static void 1707c478bd9Sstevel@tonic-gate ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep) 1717c478bd9Sstevel@tonic-gate { 1727c478bd9Sstevel@tonic-gate cpu_ue_log_err(aflt); 1737c478bd9Sstevel@tonic-gate } 1747c478bd9Sstevel@tonic-gate 1757c478bd9Sstevel@tonic-gate /* 1767c478bd9Sstevel@tonic-gate * Handler to process a correctable error. This routine can be called from a 1777c478bd9Sstevel@tonic-gate * softint. We just call the CPU module's logging routine. 1787c478bd9Sstevel@tonic-gate */ 1797c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 1807c478bd9Sstevel@tonic-gate static void 1817c478bd9Sstevel@tonic-gate ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep) 1827c478bd9Sstevel@tonic-gate { 1837c478bd9Sstevel@tonic-gate cpu_ce_log_err(aflt, eqep); 1847c478bd9Sstevel@tonic-gate } 1857c478bd9Sstevel@tonic-gate 1867c478bd9Sstevel@tonic-gate /* 1877c478bd9Sstevel@tonic-gate * Scrub a non-fatal correctable ecc error. 1887c478bd9Sstevel@tonic-gate */ 1897c478bd9Sstevel@tonic-gate void 1907c478bd9Sstevel@tonic-gate ce_scrub(struct async_flt *aflt) 1917c478bd9Sstevel@tonic-gate { 1927c478bd9Sstevel@tonic-gate if (aflt->flt_in_memory) 1937c478bd9Sstevel@tonic-gate cpu_ce_scrub_mem_err(aflt, B_FALSE); 1947c478bd9Sstevel@tonic-gate } 1957c478bd9Sstevel@tonic-gate 1967c478bd9Sstevel@tonic-gate /* 1977c478bd9Sstevel@tonic-gate * Allocate error queue sizes based on max_ncpus. max_ncpus is set just 1987c478bd9Sstevel@tonic-gate * after ncpunode has been determined. ncpus is set in start_other_cpus 1997c478bd9Sstevel@tonic-gate * which is called after error_init() but may change dynamically. 2007c478bd9Sstevel@tonic-gate */ 2017c478bd9Sstevel@tonic-gate void 2027c478bd9Sstevel@tonic-gate error_init(void) 2037c478bd9Sstevel@tonic-gate { 2047c478bd9Sstevel@tonic-gate char tmp_name[MAXSYSNAME]; 205fa9e4066Sahrens pnode_t node; 2067c478bd9Sstevel@tonic-gate size_t size = cpu_aflt_size(); 2077c478bd9Sstevel@tonic-gate 2087c478bd9Sstevel@tonic-gate /* 2097c478bd9Sstevel@tonic-gate * Initialize the correctable and uncorrectable error queues. 2107c478bd9Sstevel@tonic-gate */ 2117c478bd9Sstevel@tonic-gate ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL, 2127c478bd9Sstevel@tonic-gate MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL); 2137c478bd9Sstevel@tonic-gate 2147c478bd9Sstevel@tonic-gate ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL, 2157c478bd9Sstevel@tonic-gate MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0); 2167c478bd9Sstevel@tonic-gate 2177c478bd9Sstevel@tonic-gate if (ue_queue == NULL || ce_queue == NULL) 2187c478bd9Sstevel@tonic-gate panic("failed to create required system error queue"); 2197c478bd9Sstevel@tonic-gate 2207c478bd9Sstevel@tonic-gate /* 2217c478bd9Sstevel@tonic-gate * Initialize the busfunc list mutex. This must be a PIL_15 spin lock 2227c478bd9Sstevel@tonic-gate * because we will need to acquire it from cpu_async_error(). 2237c478bd9Sstevel@tonic-gate */ 2247c478bd9Sstevel@tonic-gate mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15); 2257c478bd9Sstevel@tonic-gate 2267c478bd9Sstevel@tonic-gate node = prom_rootnode(); 2277c478bd9Sstevel@tonic-gate if ((node == OBP_NONODE) || (node == OBP_BADNODE)) { 2287c478bd9Sstevel@tonic-gate cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node); 2297c478bd9Sstevel@tonic-gate return; 2307c478bd9Sstevel@tonic-gate } 2317c478bd9Sstevel@tonic-gate 2327c478bd9Sstevel@tonic-gate if (((size = prom_getproplen(node, "reset-reason")) != -1) && 2337c478bd9Sstevel@tonic-gate (size <= MAXSYSNAME) && 2347c478bd9Sstevel@tonic-gate (prom_getprop(node, "reset-reason", tmp_name) != -1)) { 2357c478bd9Sstevel@tonic-gate if (reset_debug) { 2367c478bd9Sstevel@tonic-gate cmn_err(CE_CONT, "System booting after %s\n", tmp_name); 2377c478bd9Sstevel@tonic-gate } else if (strncmp(tmp_name, "FATAL", 5) == 0) { 2387c478bd9Sstevel@tonic-gate cmn_err(CE_CONT, 2397c478bd9Sstevel@tonic-gate "System booting after fatal error %s\n", tmp_name); 2407c478bd9Sstevel@tonic-gate } 2417c478bd9Sstevel@tonic-gate } 2427c478bd9Sstevel@tonic-gate 2437c478bd9Sstevel@tonic-gate if (&cpu_error_init) { 2447c478bd9Sstevel@tonic-gate cpu_error_init((MAX_ASYNC_FLTS + MAX_CE_FLTS) * 2457c478bd9Sstevel@tonic-gate (max_ncpus + 1)); 2467c478bd9Sstevel@tonic-gate } 2477c478bd9Sstevel@tonic-gate } 2487c478bd9Sstevel@tonic-gate 2497c478bd9Sstevel@tonic-gate /* 250db874c57Selowe * Flags for ecc_page_zero DTrace probe since ecc_page_zero() is called 251db874c57Selowe * as a softint handler. 2527c478bd9Sstevel@tonic-gate */ 2537c478bd9Sstevel@tonic-gate #define PAGE_ZERO_SUCCESS 0 2547c478bd9Sstevel@tonic-gate #define PAGE_ZERO_FAIL_NOLOCK 1 2557c478bd9Sstevel@tonic-gate #define PAGE_ZERO_FAIL_ONTRAP 2 2567c478bd9Sstevel@tonic-gate 2577c478bd9Sstevel@tonic-gate void 2587c478bd9Sstevel@tonic-gate ecc_page_zero(void *arg) 2597c478bd9Sstevel@tonic-gate { 2607c478bd9Sstevel@tonic-gate uint64_t pa = (uint64_t)arg; 2617c478bd9Sstevel@tonic-gate int ret, success_flag; 262db874c57Selowe page_t *pp = page_numtopp_nolock(mmu_btop(pa)); 2637c478bd9Sstevel@tonic-gate 264db874c57Selowe if (page_retire_check(pa, NULL) != 0) 2657c478bd9Sstevel@tonic-gate return; 2667c478bd9Sstevel@tonic-gate 2677c478bd9Sstevel@tonic-gate /* 2687c478bd9Sstevel@tonic-gate * Must hold a lock on the page before calling pagezero() 2697c478bd9Sstevel@tonic-gate * 2707c478bd9Sstevel@tonic-gate * This will only fail if someone has or wants an exclusive lock on 2717c478bd9Sstevel@tonic-gate * the page. Since it's a retired page, this shouldn't happen. 2727c478bd9Sstevel@tonic-gate */ 273*4fc2445aSelowe ret = page_lock_es(pp, SE_SHARED, (kmutex_t *)NULL, 274*4fc2445aSelowe P_NO_RECLAIM, SE_RETIRED); 2757c478bd9Sstevel@tonic-gate 2767c478bd9Sstevel@tonic-gate if (ret > 0) { 2777c478bd9Sstevel@tonic-gate on_trap_data_t otd; 2787c478bd9Sstevel@tonic-gate 2797c478bd9Sstevel@tonic-gate /* 2807c478bd9Sstevel@tonic-gate * Protect pagezero() from async faults 2817c478bd9Sstevel@tonic-gate */ 2827c478bd9Sstevel@tonic-gate if (!on_trap(&otd, OT_DATA_EC)) { 2837c478bd9Sstevel@tonic-gate pagezero(pp, 0, PAGESIZE); 2847c478bd9Sstevel@tonic-gate success_flag = PAGE_ZERO_SUCCESS; 2857c478bd9Sstevel@tonic-gate } else { 2867c478bd9Sstevel@tonic-gate success_flag = PAGE_ZERO_FAIL_ONTRAP; 2877c478bd9Sstevel@tonic-gate } 2887c478bd9Sstevel@tonic-gate no_trap(); 2897c478bd9Sstevel@tonic-gate page_unlock(pp); 2907c478bd9Sstevel@tonic-gate } else { 2917c478bd9Sstevel@tonic-gate success_flag = PAGE_ZERO_FAIL_NOLOCK; 2927c478bd9Sstevel@tonic-gate } 2937c478bd9Sstevel@tonic-gate DTRACE_PROBE2(page_zero_result, int, success_flag, uint64_t, pa); 2947c478bd9Sstevel@tonic-gate } 295