xref: /titanic_50/usr/src/uts/i86pc/cpu/amd_opteron/ao.h (revision 9404882939d18ddd3c94a5bd3da7a0449c195a5d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef _AO_H
28 #define	_AO_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/types.h>
33 #include <sys/mc.h>
34 #include <sys/mca_amd.h>
35 #include <sys/mc_amd.h>
36 #include <sys/cpu_module_impl.h>
37 #include <sys/nvpair.h>
38 #include <sys/cyclic.h>
39 #include <sys/errorq.h>
40 #include <sys/kobj.h>
41 #include <sys/fm/util.h>
42 
43 #ifdef __cplusplus
44 extern "C" {
45 #endif
46 
47 #define	AO_MCA_MAX_ERRORS	10	/* NOTE(review): not used in this header; presumably an error-count cap - confirm at users */
48 
49 typedef struct ao_data ao_data_t;	/* forward declaration; struct ao_data is defined later in this header */
50 
51 typedef struct ao_bank_regs {	/* three 32-bit values kept per MCA bank */
52 	uint32_t abr_status;	/* presumably the MSR number of MCi_STATUS - confirm */
53 	uint32_t abr_addr;	/* presumably the MSR number of MCi_ADDR - confirm */
54 	uint32_t abr_misc;	/* presumably the MSR number of MCi_MISC - confirm */
55 } ao_bank_regs_t;
56 
57 extern ao_bank_regs_t ao_bank_regs[AMD_MCA_BANK_COUNT];	/* one entry per bank; defined elsewhere */
58 
59 /*
60  * Rather than using torturous conditionals, we match errors using a table of
61  * ao_error_disp_t's.  The members in the ao_error_disp_t are matched against
62  * the value of MCi_STATUS, with a successful match indicating that the given
63  * error occurred.
64  *
65  * While aed_stat_code will match most of the status code bits, a few of the
66  * status code fields are either/or, and are treated separately so as to
67  * minimize the number of ao_error_disp_t structures that must be created.
68  * For example, the dc.tag_par error can have r4 values drd or dwr.  Rather
69  * than creating two ao_error_disp_t's, we use the separate aed_stat_r4_bits
70  * field to indicate both AO_MCA_R4_BIT_DRD and AO_MCA_R4_BIT_DWR.  As the
71  * matching r4 values are drawn from aed_stat_r4_bits, we don't use the r4
72  * bits in aed_stat_code for matching.  Similar reasoning lies behind the
73  * creation of the pp and ii fields.
74  */
75 #define	AO_AED_PANIC_NEVER	0x00	/* AO_AED_PANIC_*: presumably values for aed_panic_when - confirm */
76 #define	AO_AED_PANIC_IFMCE	0x01	/* NOTE(review): presumably "only if seen via #mc trap" - confirm */
77 #define	AO_AED_PANIC_ALWAYS	0x80
78 
79 #define	AO_AED_F_CORRECTABLE	0x01	/* AO_AED_F_*: bits for aed_flags; presumably "error is correctable" - confirm */
80 #define	AO_AED_F_LOFAULT_OK	0x02	/* NOTE(review): meaning not visible in this header - see users */
81 #define	AO_AED_F_LINEAR		0x04	/* MCi_ADDR is a linear address */
82 #define	AO_AED_F_PHYSICAL	0x08	/* MCi_ADDR is a physical address */
83 #define	AO_AED_F_PAGEALIGNED	0x10	/* MCi_ADDR aligns to page size */
84 #define	AO_AED_F_L2SETWAY	0x20	/* 3:0 = way, 15/14/13/12:6 = set */
85 
86 #define	AO_AED_FLAGS_ADDRTYPE	(AO_AED_F_LINEAR | AO_AED_F_PHYSICAL | \
87     AO_AED_F_PAGEALIGNED | AO_AED_F_L2SETWAY)	/* mask of the flags that describe MCi_ADDR; see abl_addr_type */
88 
89 typedef struct ao_error_disp {	/* one entry of the table matched against MCi_STATUS */
90 	const char *aed_class;		/* ereport class for use if match */
91 	uint64_t aed_ereport_members;	/* ereport contents flags if match */
92 	uint64_t aed_stat_mask;		/* status msr bits for match */
93 	uint64_t aed_stat_mask_res;	/* status mask result for match */
94 	uint16_t aed_stat_code;		/* status code for match */
95 	uint8_t aed_stat_extcode;	/* extended status code for match */
96 	uint8_t aed_stat_pp_bits:4;	/* AO_MCA_PP_BIT_* for pp matching */
97 	uint8_t aed_stat_ii_bits:4;	/* AO_MCA_II_BIT_* for ii matching */
98 	uint16_t aed_stat_r4_bits;	/* AO_MCA_R4_BIT_* for r4 matching */
99 	uint8_t aed_addrvalid_hi;	/* most significant valid addr bit */
100 	uint8_t aed_addrvalid_lo;	/* least significant valid addr bit */
101 	uint8_t aed_panic_when;		/* extra conditions for panic (presumably AO_AED_PANIC_*) */
102 	uint8_t aed_flags;		/* AO_AED_F_* */
103 } ao_error_disp_t;
104 
105 /*
106  * The poller has two parts.  First is the omni cyclic, which runs on all
107  * CPUs, and which polls the error MSRs at some fixed (long) interval.  This
108  * cyclic will run on all machines, all the time, and thus must have minimal
109  * runtime impact.  The second portion of the poller is manually-initiated, and
110  * is used by the error injector/synthesizer to request an immediate poll of the
111  * error state registers.
112  *
113  * With this number of moving parts, it is essential that we have some sort of
114  * audit log for post-mortem analysis.  A circular array of trace buffers
115  * (ao_mca_poll_trace_t structures) is kept to record this activity.  Whenever
116  * an event occurs that is of interest to the poller, an entry is made in
117  * the trace array describing that event.
118  */
119 #define	AO_MPT_WHAT_CYC_ERR		0	/* cyclic-induced poll */
120 #define	AO_MPT_WHAT_POKE_ERR		1	/* manually-induced poll */
121 #define	AO_MPT_WHAT_UNFAULTING		2	/* discarded error state */
122 
123 typedef struct ao_mca_poll_trace {	/* one entry of the circular poller trace array */
124 	hrtime_t mpt_when;		/* timestamp of event */
125 	uint8_t mpt_what;		/* AO_MPT_WHAT_* (which event?) */
126 	uint8_t mpt_nerr;		/* number of errors discovered */
127 	uint16_t mpt_pad1;		/* explicit padding (going by the name); no meaning given here */
128 	uint32_t mpt_pad2;		/* explicit padding; NOTE(review): presumably rounds the entry to 16 bytes - confirm hrtime_t is 8 bytes */
129 } ao_mca_poll_trace_t;
130 
131 /*
132  * Processor error state is saved in logout areas.  There are two separate
133  * logout areas, each used for a different purpose.  The logout areas are stored
134  * in an array (ao_mca_logout), indexed by the AO_MCA_LOGOUT_* macros.
135  *
136  * The save areas are:
137  *
138  * 1. Exception handler MSR save - Written to by the initial portion of the #mc
139  *    handler.  Read from by the main body of the exception handler.
140  *
141  * 2. Poller MSR save - Used by the poller to store error state MSR values.
142  *    While this logout area doesn't necessarily have to live in the ao_mca_t,
143  *    it does so to enhance observability.
144  *
145  * The logout areas contain both global error state (acl_ip, acl_timestamp,
146  * etc.), as well as a bank array.  The bank array contains one ao_bank_logout_t
147  * per error reporting bank.
148  */
149 
150 typedef struct ao_bank_logout {	/* saved state of one error-reporting bank */
151 	uint64_t abl_status;		/* Saved MCi_STATUS register */
152 	uint64_t abl_addr;		/* Saved MCi_ADDR register */
153 	uint64_t abl_misc;		/* Saved MCi_MISC register */
154 	uint8_t abl_addr_type;		/* flags & AO_AED_FLAGS_ADDRTYPE */
155 	uint8_t abl_addr_valid_hi;	/* most significant valid addr bit */
156 	uint8_t abl_addr_valid_lo;	/* least significant valid addr bit */
157 } ao_bank_logout_t;
158 
159 #define	AO_ACL_F_PRIV		0x1	/* #mc in kernel mode (else user) */
160 #define	AO_ACL_F_FATAL		0x2	/* logout detected fatal error(s) */
161 
162 typedef struct ao_cpu_logout {	/* one logout area: global error state plus a per-bank array */
163 	ao_data_t *acl_ao;		/* pointer to per-cpu ao_data_t */
164 	uintptr_t acl_ip;		/* instruction pointer if #mc trap */
165 	uint64_t acl_timestamp;		/* gethrtime() at time of logout */
166 	uint64_t acl_mcg_status;	/* MCG_STATUS register value */
167 	ao_bank_logout_t acl_banks[AMD_MCA_BANK_COUNT]; /* bank state saves */
168 	pc_t acl_stack[FM_STK_DEPTH];	/* saved stack trace (if any) */
169 	int acl_stackdepth;		/* saved stack trace depth */
170 	uint_t acl_flags;		/* flags (see AO_ACL_F_* above) */
171 } ao_cpu_logout_t;
172 
173 /* Index for ao_mca_logout, below */
174 #define	AO_MCA_LOGOUT_EXCEPTION		0	/* exception handler MSR save area */
175 #define	AO_MCA_LOGOUT_POLLER		1	/* poller MSR save area */
176 #define	AO_MCA_LOGOUT_NUM		2	/* number of logout areas; dimensions ao_mca_t's ao_mca_logout[] */
177 
178 #define	AO_MCA_F_UNFAULTING		0x1	/* CPU exiting faulted state */
179 
180 /*
181  * We store config as inherited from the BIOS to assist in troubleshooting.
182  * The NorthBridge config is stored in the chipshared structure below.
183  */
184 typedef struct ao_bios_cfg {	/* per-bank config values as inherited from the BIOS */
185 	uint64_t bcfg_bank_ctl[AMD_MCA_BANK_COUNT];	/* presumably BIOS value of MCi_CTL, per bank - confirm */
186 	uint64_t bcfg_bank_mask[AMD_MCA_BANK_COUNT];	/* presumably BIOS value of MCi_MASK, per bank - confirm */
187 	uint64_t bcfg_bank_misc[AMD_MCA_BANK_COUNT];	/* presumably BIOS value of MCi_MISC, per bank - confirm */
188 } ao_bios_cfg_t;
189 
190 /*
191  * The master data structure used to hold MCA-related state.
192  */
193 typedef struct ao_mca {
194 	ao_bios_cfg_t ao_mca_bios_cfg;	/* Bank config before our init (NB values are kept in ao_chipshared) */
195 	ao_cpu_logout_t ao_mca_logout[AO_MCA_LOGOUT_NUM]; /* save areas, indexed by AO_MCA_LOGOUT_* */
196 	kmutex_t ao_mca_poll_lock;	/* keep pollers from colliding */
197 	ao_mca_poll_trace_t *ao_mca_poll_trace; /* trace buffers for this cpu */
198 	uint_t ao_mca_poll_curtrace;	/* most recently-filled trace buffer */
199 	uint_t ao_mca_flags;		/* AO_MCA_F_* */
200 } ao_mca_t;
201 
202 /*
203  * Per-chip state
204  */
205 struct ao_chipshared {	/* reached from each ao_data via ao_shared */
206 	uint32_t aos_chiprev;		/* Chip revision */
207 	volatile ulong_t aos_cfgonce;	/* Config performed once per chip; bit numbers are enum ao_cfgonce_bitnum */
208 	kmutex_t aos_nb_poll_lock;	/* Keep NB pollers from colliding */
209 	uint64_t aos_nb_poll_timestamp;	/* Timestamp of last NB poll */
210 	int aos_nb_poll_owner;		/* The cpuid of current NB poller */
211 	uint64_t aos_bcfg_nb_ctl;	/* BIOS value of MC4_CTL */
212 	uint64_t aos_bcfg_nb_mask;	/* BIOS value of MC4_MASK */
213 	uint64_t aos_bcfg_nb_misc;	/* BIOS value of MC4_MISC */
214 	uint32_t aos_bcfg_nb_cfg;	/* BIOS value of NB MCA Config */
215 	uint32_t aos_bcfg_nb_sparectl;	/* BIOS value of Online Spare Control */
216 	uint32_t aos_bcfg_dcfg_lo;	/* BIOS value of DRAM Config Low */
217 	uint32_t aos_bcfg_dcfg_hi;	/* BIOS value of DRAM Config High */
218 };
219 
220 /* Bit numbers for aos_cfgonce */
221 enum ao_cfgonce_bitnum {	/* values are implicit: 0, 1 */
222 	AO_CFGONCE_NBMCA,	/* bit 0; presumably once-per-chip NB MCA configuration - confirm */
223 	AO_CFGONCE_DRAMCFG	/* bit 1; presumably once-per-chip DRAM config capture - confirm */
224 };
225 
226 /*
227  * Per-CPU state
228  */
229 struct ao_data {	/* typedef'd as ao_data_t; ao_cpu_logout_t.acl_ao points back here */
230 	ao_mca_t ao_mca;			/* MCA state for this CPU */
231 	cpu_t *ao_cpu;				/* link to CPU's cpu_t */
232 	const cmi_mc_ops_t *ao_mc_ops;		/* memory controller ops */
233 	void *ao_mc_data;			/* argument for MC ops */
234 	struct ao_chipshared *ao_shared;	/* Shared state for the chip */
235 };
236 
237 #ifdef _KERNEL
238 
239 struct regs;	/* forward declaration; only struct regs * is used below */
240 
241 extern errorq_t *ao_mca_queue;	/* error queue handle; defined elsewhere */
242 extern const cmi_ops_t _cmi_ops;	/* NOTE(review): presumably this module's cmi ops vector - confirm */
243 
244 extern void ao_faulted_enter(void *);	/* NOTE(review): the void * args in this section presumably carry the per-CPU ao_data_t - confirm */
245 extern void ao_faulted_exit(void *);
246 extern int ao_scrubber_enable(void *, uint64_t, uint64_t, int);
247 
248 extern void ao_mca_post_init(void *);
249 extern void ao_mca_init(void *);
250 extern int ao_mca_trap(void *, struct regs *);
251 extern int ao_mca_inject(void *, cmi_mca_regs_t *, uint_t);
252 extern void ao_mca_poke(void *);
253 extern void ao_mca_poll_init(ao_data_t *, int);
254 extern void ao_mca_poll_start(void);
255 
256 extern int ao_mca_logout(ao_cpu_logout_t *, struct regs *, int *, int,
257     uint32_t);	/* a function; it shares its name with the ao_mca_t member array ao_mca_logout[] */
258 extern void ao_mca_drain(void *, const void *, const errorq_elem_t *);
259 extern nvlist_t *ao_fmri_create(ao_data_t *, nv_alloc_t *);
260 
261 extern void ao_mc_register(void *, const cmi_mc_ops_t *, void *);	/* NOTE(review): argument types match ao_mc_ops/ao_mc_data in struct ao_data; presumably sets them - confirm */
262 extern const struct cmi_mc_ops *ao_mc_getops(void *);
263 extern int ao_mc_patounum(ao_data_t *, uint64_t, uint8_t, uint8_t, uint32_t,
264     int, mc_unum_t *);
265 extern int ao_mc_unumtopa(ao_data_t *, mc_unum_t *, nvlist_t *, uint64_t *);
266 
267 extern void ao_pcicfg_write(uint_t, uint_t, uint_t, uint32_t);
268 extern uint32_t ao_pcicfg_read(uint_t, uint_t, uint_t);
269 
270 extern int ao_chip_once(ao_data_t *, enum ao_cfgonce_bitnum);	/* second arg is a bit number in aos_cfgonce; return convention not shown here */
271 
272 #endif /* _KERNEL */
273 
274 #ifdef __cplusplus
275 }
276 #endif
277 
278 #endif /* _AO_H */
279