1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #ifndef _SYS_ASYNC_H 28 #define _SYS_ASYNC_H 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/privregs.h> 33 34 #ifdef __cplusplus 35 extern "C" { 36 #endif 37 38 #ifndef _ASM 39 40 #include <sys/errorq.h> 41 42 /* 43 * The async_flt structure is used to record all pertinent information about 44 * an asynchronous CPU or bus-related memory error. Typically, the structure 45 * is initialized by a high-level interrupt or trap handler, and then enqueued 46 * for later processing. Separate queues are maintained for correctable and 47 * uncorrectable errors. The current CPU module determines the size of the 48 * queue elements, so that it may declare a CPU-specific fault structure 49 * which contains a struct async_flt as its first member. Each async_flt also 50 * contains a callback function (flt_func) that is invoked by the processing 51 * code in order to actually log messages when the event is dequeued. This 52 * function may be called from a softint, from trap() as part of AST handling 53 * before the victim thread returns to userland, or as part of panic(). As 54 * such, the flt_func should basically only be calling cmn_err (but NOT with 55 * the CE_PANIC flag). It must not call panic(), acquire locks, or block. 56 * The owner of the event is responsible for determining whether the event is 57 * fatal; if so, the owner should set flt_panic and panic() after enqueuing 58 * the event. The event will then be dequeued and logged as part of panic 59 * processing. If flt_panic is not set, the queue function will schedule a 60 * soft interrupt to process the event. 61 */ 62 63 struct async_flt; 64 typedef void (*async_func_t)(struct async_flt *, char *); 65 66 struct async_flt { 67 uint64_t flt_id; /* gethrtime() at time of fault */ 68 uint64_t flt_stat; /* async fault status register */ 69 uint64_t flt_addr; /* async fault address register */ 70 caddr_t flt_pc; /* program counter from error trap */ 71 async_func_t flt_func; /* logging function */ 72 uint_t flt_bus_id; /* hardware bus id# of cpu/sbus/pci */ 73 uint_t flt_inst; /* software instance of cpu/sbus/pci */ 74 ushort_t flt_status; /* error information */ 75 ushort_t flt_synd; /* ECC syndrome */ 76 uchar_t flt_in_memory; /* fault occurred in memory if != 0 */ 77 uchar_t flt_class; /* fault class (cpu or bus) */ 78 uchar_t flt_prot; /* type of fault protection (if any) */ 79 uchar_t flt_priv; /* fault occurred in kernel if != 0 */ 80 uchar_t flt_panic; /* fault caused owner to panic() */ 81 uchar_t flt_tl; /* fault occurred at TL > 0 */ 82 uchar_t flt_core; /* fault occurred during core() dump */ 83 uchar_t flt_pad; /* reserved for future use */ 84 uint64_t flt_disp; /* error disposition information */ 85 uint64_t flt_payload; /* ereport payload information */ 86 char *flt_erpt_class; /* ereport class string */ 87 }; 88 89 /* 90 * Bus nexus drivers can use the bus_func_register() interface to register 91 * callback functions for error handling and panic handling. The handler 92 * functions should be registered and unregistered from driver attach and 93 * detach context, where it is safe to perform a sleeping allocation. The 94 * callbacks themselves can be invoked from panic, or from the CPU module's 95 * asynchronous trap handler at high PIL. As such, these routines may only 96 * test for errors and enqueue async_flt events. They may not grab adaptive 97 * locks, call panic(), or invoke bus_func_register() or bus_func_unregister(). 98 * Each callback function should return one of the BF_* return status values 99 * below. The bus_func_invoke() function calls all the registered handlers of 100 * the specified type, and returns the maximum of their return values (e.g. 101 * BF_FATAL if any callback returned BF_FATAL). If any callback returns 102 * BF_FATAL, the system will panic at the end of callback processing. 103 */ 104 105 typedef uint_t (*busfunc_t)(void *); 106 107 #define BF_TYPE_UE 1 /* check for uncorrectable errors */ 108 #define BF_TYPE_ERRDIS 2 /* disable error detection */ 109 #define BF_TYPE_RESINTR 3 /* reset interrupts */ 110 111 #define BF_NONE 0 /* no errors were detected */ 112 #define BF_NONFATAL 1 /* one or more non-fatal errors found */ 113 #define BF_FATAL 2 /* one or more fatal errors found */ 114 115 typedef struct bus_func_desc { 116 int bf_type; /* type of function (see above) */ 117 busfunc_t bf_func; /* function to call */ 118 void *bf_arg; /* function argument */ 119 struct bus_func_desc *bf_next; /* pointer to next registered desc */ 120 } bus_func_desc_t; 121 122 extern void bus_func_register(int, busfunc_t, void *); 123 extern void bus_func_unregister(int, busfunc_t, void *); 124 extern void bus_async_log_err(struct async_flt *); 125 extern uint_t bus_func_invoke(int); 126 127 extern void ecc_cpu_call(struct async_flt *, char *, int); 128 129 extern void ce_scrub(struct async_flt *); 130 extern void ecc_page_zero(void *); 131 132 extern void error_init(void); 133 134 extern int ce_verbose_memory; 135 extern int ce_verbose_other; 136 extern int ce_show_data; 137 extern int ce_debug; 138 extern int ue_debug; 139 140 extern int aft_verbose; 141 extern int aft_panic; 142 extern int aft_testfatal; 143 144 extern struct async_flt panic_aflt; 145 146 extern errorq_t *ce_queue; 147 extern errorq_t *ue_queue; 148 149 #endif /* !_ASM */ 150 151 /* 152 * ECC or parity error status for async_flt.flt_status. 153 */ 154 #define ECC_C_TRAP 0x0001 /* Trap 0x63 Corrected ECC Error */ 155 #define ECC_I_TRAP 0x0002 /* Trap 0x0A Instr Access Error */ 156 #define ECC_ECACHE 0x0004 /* Ecache ECC Error */ 157 #define ECC_IOBUS 0x0008 /* Pci or sysio ECC Error */ 158 #define ECC_INTERMITTENT 0x0010 /* Intermittent ECC Error */ 159 #define ECC_PERSISTENT 0x0020 /* Persistent ECC Error */ 160 #define ECC_STICKY 0x0040 /* Sticky ECC Error */ 161 #define ECC_D_TRAP 0x0080 /* Trap 0x32 Data Access Error */ 162 #define ECC_F_TRAP 0x0100 /* Cheetah Trap 0x70 Fast ECC Error */ 163 #define ECC_DP_TRAP 0x0200 /* Cheetah+ Trap 0x71 D$ Parity Error */ 164 #define ECC_IP_TRAP 0x0400 /* Cheetah+ Trap 0x72 I$ Parity Error */ 165 #define ECC_ITLB_TRAP 0x0800 /* Panther ITLB Parity Error */ 166 #define ECC_DTLB_TRAP 0x1000 /* Panther DTLB Parity Error */ 167 #define ECC_IO_CE 0x2000 /* Pci or sysio CE */ 168 #define ECC_IO_UE 0x4000 /* Pci or sysio UE */ 169 170 /* 171 * Trap type numbers corresponding to the fault types defined above. 172 */ 173 #define TRAP_TYPE_ECC_I 0x0A 174 #define TRAP_TYPE_ECC_D 0x32 175 #define TRAP_TYPE_ECC_F 0x70 176 #define TRAP_TYPE_ECC_C 0x63 177 #define TRAP_TYPE_ECC_DP 0x71 178 #define TRAP_TYPE_ECC_IP 0x72 179 #define TRAP_TYPE_ECC_ITLB 0x08 180 #define TRAP_TYPE_ECC_DTLB 0x30 181 #define TRAP_TYPE_UNKNOWN 0 182 183 /* 184 * Fault classes for async_flt.flt_class. 185 */ 186 #define BUS_FAULT 0 /* originating from bus drivers */ 187 #define CPU_FAULT 1 /* originating from CPUs */ 188 #define RECIRC_BUS_FAULT 2 /* scheduled diagnostic */ 189 #define RECIRC_CPU_FAULT 3 /* scheduled diagnostic */ 190 191 /* 192 * Invalid or unknown physical address for async_flt.flt_addr. 193 */ 194 #define AFLT_INV_ADDR (-1ULL) 195 196 /* 197 * Fault protection values for async_flt.flt_prot. The async error handling 198 * code may be able to recover from errors when kernel code has explicitly 199 * protected itself using one of the mechanisms specified here. 200 */ 201 #define AFLT_PROT_NONE 0 /* no protection active */ 202 #define AFLT_PROT_ACCESS 1 /* on_trap OT_DATA_ACCESS protection */ 203 #define AFLT_PROT_EC 2 /* on_trap OT_DATA_EC protection */ 204 #define AFLT_PROT_COPY 3 /* t_lofault protection (ucopy, etc.) */ 205 206 /* 207 * These flags are used to indicate the validity of certain data based on 208 * the various overwrite priority features of the AFSR/AFAR: 209 * AFAR, ESYND and MSYND, each of which have different overwrite priorities. 210 * 211 * Given a specific afsr error bit and the entire afsr, there are three cases: 212 * INVALID: The specified bit is lower overwrite priority than some other 213 * error bit which is on in the afsr (or IVU/IVC). 214 * VALID: The specified bit is higher priority than all other error bits 215 * which are on in the afsr. 216 * AMBIGUOUS: Another error bit (or bits) of equal priority to the specified 217 * bit is on in the afsr. 218 * 219 * NB: The domain-to-SC communications depend on these values. If they are 220 * changed, plat_ecc_unum.[ch] must be updated to match. 221 */ 222 #define AFLT_STAT_INVALID 0 /* higher priority afsr bit is on */ 223 #define AFLT_STAT_VALID 1 /* this is highest priority afsr bit */ 224 #define AFLT_STAT_AMBIGUOUS 2 /* two afsr bits of equal priority */ 225 226 /* 227 * Maximum length of unum string. 228 */ 229 #define UNUM_NAMLEN 60 230 231 /* 232 * Maximum length of a DIMM serial id string + null 233 */ 234 #define DIMM_SERIAL_ID_LEN 16 235 236 #ifdef __cplusplus 237 } 238 #endif 239 240 #endif /* _SYS_ASYNC_H */ 241