xref: /titanic_50/usr/src/uts/sun4v/io/px/px_err.c (revision d0f40dc6a997c84bacf5f9ba83d57a95495c399b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * sun4v Fire Error Handling
27  */
28 
29 #include <sys/types.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/sunndi.h>
33 #include <sys/fm/protocol.h>
34 #include <sys/fm/util.h>
35 #include <sys/membar.h>
36 #include "px_obj.h"
37 #include "px_err.h"
38 
39 static void px_err_fill_pfd(dev_info_t *dip, pf_data_t *pfd_p,
40     px_rc_err_t *epkt);
41 static uint_t px_err_intr(px_fault_t *fault_p, px_rc_err_t *epkt);
42 static int  px_err_epkt_severity(px_t *px_p, ddi_fm_error_t *derr,
43     px_rc_err_t *epkt, pf_data_t *pfd_p);
44 
45 static void px_err_log_handle(dev_info_t *dip, px_rc_err_t *epkt,
46     boolean_t is_block_pci, char *msg);
47 static void px_err_send_epkt_erpt(dev_info_t *dip, px_rc_err_t *epkt,
48     boolean_t is_block_pci, int err, ddi_fm_error_t *derr,
49     boolean_t is_valid_epkt);
50 static int px_cb_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr,
51     px_rc_err_t *epkt, pf_data_t *pfd_p);
52 static int px_mmu_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr,
53     px_rc_err_t *epkt, pf_data_t *pfd_p);
54 static int px_intr_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr,
55     px_rc_err_t *epkt, pf_data_t *pfd_p);
56 static int px_port_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr,
57     px_rc_err_t *epkt, pf_data_t *pfd_p);
58 static int px_pcie_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr,
59     px_rc_err_t *epkt, pf_data_t *pfd_p);
60 static int px_intr_handle_errors(dev_info_t *dip, ddi_fm_error_t *derr,
61     px_rc_err_t *epkt, pf_data_t *pfd_p);
62 static int px_port_handle_errors(dev_info_t *dip, ddi_fm_error_t *derr,
63     px_rc_err_t *epkt, pf_data_t *pfd_p);
64 static void px_fix_legacy_epkt(dev_info_t *dip, ddi_fm_error_t *derr,
65     px_rc_err_t *epkt);
66 static int px_mmu_handle_lookup(dev_info_t *dip, ddi_fm_error_t *derr,
67     px_rc_err_t *epkt);
68 
69 /* Include the code generated sun4v epkt checking code */
70 #include "px_err_gen.c"
71 
72 /*
73  * This variable indicates if we have a hypervisor that could potentially send
74  * incorrect epkts. We always set this to TRUE for now until we find a way to
75  * tell if this HV bug has been fixed.
76  */
77 boolean_t px_legacy_epkt = B_TRUE;
78 
79 /*
80  * px_err_cb_intr:
81  * Interrupt handler for the Host Bus Block.
82  */
83 uint_t
px_err_cb_intr(caddr_t arg)84 px_err_cb_intr(caddr_t arg)
85 {
86 	px_fault_t	*fault_p = (px_fault_t *)arg;
87 	px_rc_err_t	*epkt = (px_rc_err_t *)fault_p->px_intr_payload;
88 
89 	if (epkt != NULL) {
90 		return (px_err_intr(fault_p, epkt));
91 	}
92 
93 	return (DDI_INTR_UNCLAIMED);
94 }
95 
96 /*
97  * px_err_dmc_pec_intr:
98  * Interrupt handler for the DMC/PEC block.
99  */
100 uint_t
px_err_dmc_pec_intr(caddr_t arg)101 px_err_dmc_pec_intr(caddr_t arg)
102 {
103 	px_fault_t	*fault_p = (px_fault_t *)arg;
104 	px_rc_err_t	*epkt = (px_rc_err_t *)fault_p->px_intr_payload;
105 
106 	if (epkt != NULL) {
107 		return (px_err_intr(fault_p, epkt));
108 	}
109 
110 	return (DDI_INTR_UNCLAIMED);
111 }
112 
113 /*
114  * px_err_cmn_intr:
115  * Common function called by trap, mondo and fabric intr.
116  * This function is more meaningful in sun4u implementation.  Kept
117  * to mirror sun4u call stack.
118  * o check for safe access
119  * o create and queue RC info for later use in fabric scan.
120  *   o RUC/WUC, PTLP, MMU Errors(CA), UR
121  *
122  * @param px_p		leaf in which to check access
123  * @param derr		fm err data structure to be updated
124  * @param caller	PX_TRAP_CALL | PX_INTR_CALL
125  * @param chkjbc	whether to handle hostbus registers (ignored)
126  * @return err		PX_NO_PANIC | PX_PROTECTED |
127  *                      PX_PANIC | PX_HW_RESET | PX_EXPECTED
128  */
129 /* ARGSUSED */
130 int
px_err_cmn_intr(px_t * px_p,ddi_fm_error_t * derr,int caller,int block)131 px_err_cmn_intr(px_t *px_p, ddi_fm_error_t *derr, int caller, int block)
132 {
133 	px_err_safeacc_check(px_p, derr);
134 	return (PX_NO_ERROR);
135 }
136 
137 /*
138  * fills RC specific fault data
139  */
140 static void
px_err_fill_pfd(dev_info_t * dip,pf_data_t * pfd_p,px_rc_err_t * epkt)141 px_err_fill_pfd(dev_info_t *dip, pf_data_t *pfd_p, px_rc_err_t *epkt) {
142 	pf_pcie_adv_err_regs_t adv_reg;
143 	pcie_req_id_t	fault_bdf = PCIE_INVALID_BDF;
144 	uint64_t	fault_addr = 0;
145 	uint16_t	s_status = 0;
146 	px_pec_err_t	*pec_p;
147 	uint32_t	dir;
148 
149 	/* Add an PCIE PF_DATA Entry */
150 	switch (epkt->rc_descr.block) {
151 	case BLOCK_MMU:
152 		/* Only PIO Fault Addresses are valid, this is DMA */
153 		s_status = PCI_STAT_S_TARG_AB;
154 		fault_addr = NULL;
155 
156 		if (epkt->rc_descr.H) {
157 			fault_bdf = (pcie_req_id_t)(epkt->hdr[0] >> 16);
158 			PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags =
159 			    PF_AFFECTED_BDF;
160 			PFD_AFFECTED_DEV(pfd_p)->pe_affected_bdf =
161 			    fault_bdf;
162 		}
163 		break;
164 	case BLOCK_PCIE:
165 		pec_p = (px_pec_err_t *)epkt;
166 		dir = pec_p->pec_descr.dir;
167 
168 		/* translate RC UR/CA to legacy secondary errors */
169 		if ((dir == DIR_READ || dir == DIR_WRITE) &&
170 		    pec_p->pec_descr.U) {
171 			if (pec_p->ue_reg_status & PCIE_AER_UCE_UR)
172 				s_status |= PCI_STAT_R_MAST_AB;
173 			if (pec_p->ue_reg_status & PCIE_AER_UCE_CA)
174 				s_status |= PCI_STAT_R_TARG_AB;
175 		}
176 
177 		if (pec_p->ue_reg_status & PCIE_AER_UCE_PTLP)
178 			s_status |= PCI_STAT_PERROR;
179 
180 		if (pec_p->ue_reg_status & PCIE_AER_UCE_CA)
181 			s_status |= PCI_STAT_S_TARG_AB;
182 
183 		if (pec_p->pec_descr.H) {
184 			adv_reg.pcie_ue_hdr[0] = (uint32_t)(pec_p->hdr[0] >>32);
185 			adv_reg.pcie_ue_hdr[1] = (uint32_t)(pec_p->hdr[0]);
186 			adv_reg.pcie_ue_hdr[2] = (uint32_t)(pec_p->hdr[1] >>32);
187 			adv_reg.pcie_ue_hdr[3] = (uint32_t)(pec_p->hdr[1]);
188 
189 			if (pf_tlp_decode(PCIE_DIP2BUS(dip), &adv_reg) ==
190 			    DDI_SUCCESS) {
191 				fault_bdf = adv_reg.pcie_ue_tgt_bdf;
192 				fault_addr = adv_reg.pcie_ue_tgt_addr;
193 				/*
194 				 * affected BDF is to be filled in by
195 				 * px_scan_fabric
196 				 */
197 			}
198 		}
199 		break;
200 	case BLOCK_HOSTBUS:
201 	case BLOCK_INTR:
202 	case BLOCK_PORT:
203 		/*
204 		 *  If the affected device information is available then we
205 		 *  add the affected_bdf to the pfd, so the affected device
206 		 *  will be scanned and added to the error q. This will then
207 		 *  go through the pciev_eh code path and forgive the error
208 		 *  as needed.
209 		 */
210 		if (PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags ==
211 		    PF_AFFECTED_BDF)
212 			fault_bdf = PFD_AFFECTED_DEV(pfd_p)->pe_affected_bdf;
213 
214 		break;
215 	default:
216 		break;
217 	}
218 
219 	PCIE_ROOT_FAULT(pfd_p)->scan_bdf = fault_bdf;
220 	PCIE_ROOT_FAULT(pfd_p)->scan_addr = (uint64_t)fault_addr;
221 	PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = s_status;
222 }
223 
224 /*
225  * Convert error severity from PX internal values to PCIe Fabric values.  Most
226  * are self explanitory, except PX_PROTECTED.  PX_PROTECTED will never be
227  * returned as is if forgivable.
228  */
229 static int
px_err_to_fab_sev(int * rc_err)230 px_err_to_fab_sev(int *rc_err) {
231 	int fab_err = 0;
232 
233 	if (*rc_err & px_die) {
234 		/*
235 		 * Let fabric scan decide the final severity of the error.
236 		 * This is needed incase IOV code needs to forgive the error.
237 		 */
238 		*rc_err = PX_FABRIC_SCAN;
239 		fab_err |= PF_ERR_PANIC;
240 	}
241 
242 	if (*rc_err & (PX_EXPECTED | PX_NO_PANIC))
243 		fab_err |= PF_ERR_NO_PANIC;
244 
245 	if (*rc_err & PX_NO_ERROR)
246 		fab_err |= PF_ERR_NO_ERROR;
247 
248 	return (fab_err);
249 }
250 
251 /*
252  * px_err_intr:
253  * Interrupt handler for the JBC/DMC/PEC block.
254  * o lock
255  * o create derr
256  * o check safe access
257  * o px_err_check_severity(epkt)
258  * o pcie_scan_fabric
259  * o Idle intr state
260  * o unlock
261  * o handle error: fatal? fm_panic() : return INTR_CLAIMED)
262  */
263 static uint_t
px_err_intr(px_fault_t * fault_p,px_rc_err_t * epkt)264 px_err_intr(px_fault_t *fault_p, px_rc_err_t *epkt)
265 {
266 	px_t		*px_p = DIP_TO_STATE(fault_p->px_fh_dip);
267 	dev_info_t	*rpdip = px_p->px_dip;
268 	int		rc_err, tmp_rc_err, fab_err, msg;
269 	ddi_fm_error_t	derr;
270 	pf_data_t	*pfd_p;
271 
272 	if (px_fm_enter(px_p) != DDI_SUCCESS)
273 		goto done;
274 
275 	pfd_p = px_get_pfd(px_p);
276 	PCIE_ROOT_EH_SRC(pfd_p)->intr_type = PF_INTR_TYPE_INTERNAL;
277 	PCIE_ROOT_EH_SRC(pfd_p)->intr_data = epkt;
278 
279 	/* Create the derr */
280 	bzero(&derr, sizeof (ddi_fm_error_t));
281 	derr.fme_version = DDI_FME_VERSION;
282 	derr.fme_ena = fm_ena_generate(epkt->stick, FM_ENA_FMT1);
283 	derr.fme_flag = DDI_FM_ERR_UNEXPECTED;
284 
285 	/* Basically check for safe access */
286 	(void) px_err_cmn_intr(px_p, &derr, PX_INTR_CALL, PX_FM_BLOCK_ALL);
287 
288 	/* Check the severity of this error */
289 	rc_err = px_err_epkt_severity(px_p, &derr, epkt, pfd_p);
290 
291 	/* Pass the 'rc_err' severity to the fabric scan code. */
292 	tmp_rc_err = rc_err;
293 	pfd_p->pe_severity_flags = px_err_to_fab_sev(&rc_err);
294 
295 	/* Scan the fabric */
296 	if (!(fab_err = px_scan_fabric(px_p, rpdip, &derr))) {
297 		/*
298 		 * Fabric scan didn't occur because of some error condition
299 		 * such as Root Port being in drain state, so reset rc_err.
300 		 */
301 		rc_err = tmp_rc_err;
302 	}
303 
304 	/* Set the intr state to idle for the leaf that received the mondo */
305 	if (px_lib_intr_setstate(rpdip, fault_p->px_fh_sysino,
306 	    INTR_IDLE_STATE) != DDI_SUCCESS) {
307 		px_fm_exit(px_p);
308 		return (DDI_INTR_UNCLAIMED);
309 	}
310 
311 	switch (epkt->rc_descr.block) {
312 	case BLOCK_MMU: /* FALLTHROUGH */
313 	case BLOCK_INTR:
314 		msg = PX_RC;
315 		break;
316 	case BLOCK_PCIE:
317 		msg = PX_RP;
318 		break;
319 	case BLOCK_HOSTBUS: /* FALLTHROUGH */
320 	default:
321 		msg = PX_HB;
322 		break;
323 	}
324 
325 	px_err_panic(rc_err, msg, fab_err, B_TRUE);
326 	px_fm_exit(px_p);
327 	px_err_panic(rc_err, msg, fab_err, B_FALSE);
328 
329 done:
330 	return (DDI_INTR_CLAIMED);
331 }
332 
333 /*
334  * px_err_epkt_severity:
335  * Check the severity of the fire error based the epkt received
336  *
337  * @param px_p		leaf in which to take the snap shot.
338  * @param derr		fm err in which the ereport is to be based on
339  * @param epkt		epkt recevied from HV
340  */
341 static int
px_err_epkt_severity(px_t * px_p,ddi_fm_error_t * derr,px_rc_err_t * epkt,pf_data_t * pfd_p)342 px_err_epkt_severity(px_t *px_p, ddi_fm_error_t *derr, px_rc_err_t *epkt,
343     pf_data_t *pfd_p)
344 {
345 	px_pec_t 	*pec_p = px_p->px_pec_p;
346 	dev_info_t	*dip = px_p->px_dip;
347 	boolean_t	is_safeacc = B_FALSE;
348 	boolean_t	is_block_pci = B_FALSE;
349 	boolean_t	is_valid_epkt = B_FALSE;
350 	int		err = 0;
351 
352 	/* Cautious access error handling  */
353 	switch (derr->fme_flag) {
354 	case DDI_FM_ERR_EXPECTED:
355 		/*
356 		 * For ddi_caut_put treat all events as nonfatal. Here
357 		 * we have the handle and can call ndi_fm_acc_err_set().
358 		 */
359 		derr->fme_status = DDI_FM_NONFATAL;
360 		ndi_fm_acc_err_set(pec_p->pec_acc_hdl, derr);
361 		is_safeacc = B_TRUE;
362 		break;
363 	case DDI_FM_ERR_PEEK:
364 	case DDI_FM_ERR_POKE:
365 		/*
366 		 * For ddi_peek/poke treat all events as nonfatal.
367 		 */
368 		is_safeacc = B_TRUE;
369 		break;
370 	default:
371 		is_safeacc = B_FALSE;
372 	}
373 
374 	/*
375 	 * Older hypervisors in some cases send epkts with incorrect fields.
376 	 * We have to handle these "special" epkts correctly.
377 	 */
378 	if (px_legacy_epkt)
379 		px_fix_legacy_epkt(dip, derr, epkt);
380 
381 	/*
382 	 * The affected device by default is set to 'SELF'. The 'block'
383 	 * specific error handling below will update this as needed.
384 	 */
385 	PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_SELF;
386 
387 	switch (epkt->rc_descr.block) {
388 	case BLOCK_HOSTBUS:
389 		err = px_cb_epkt_severity(dip, derr, epkt, pfd_p);
390 		break;
391 	case BLOCK_MMU:
392 		err = px_mmu_epkt_severity(dip, derr, epkt, pfd_p);
393 		break;
394 	case BLOCK_INTR:
395 		err = px_intr_epkt_severity(dip, derr, epkt, pfd_p);
396 		break;
397 	case BLOCK_PORT:
398 		err = px_port_epkt_severity(dip, derr, epkt, pfd_p);
399 		break;
400 	case BLOCK_PCIE:
401 		is_block_pci = B_TRUE;
402 		err = px_pcie_epkt_severity(dip, derr, epkt, pfd_p);
403 		break;
404 	default:
405 		err = 0;
406 	}
407 
408 	px_err_fill_pfd(dip, pfd_p, epkt);
409 
410 	if ((err & PX_HW_RESET) || (err & PX_PANIC)) {
411 		if (px_log & PX_PANIC)
412 			px_err_log_handle(dip, epkt, is_block_pci, "PANIC");
413 		is_valid_epkt = B_TRUE;
414 	} else if (err & PX_PROTECTED) {
415 		if (px_log & PX_PROTECTED)
416 			px_err_log_handle(dip, epkt, is_block_pci, "PROTECTED");
417 		is_valid_epkt = B_TRUE;
418 	} else if (err & PX_NO_PANIC) {
419 		if (px_log & PX_NO_PANIC)
420 			px_err_log_handle(dip, epkt, is_block_pci, "NO PANIC");
421 		is_valid_epkt = B_TRUE;
422 	} else if (err & PX_NO_ERROR) {
423 		if (px_log & PX_NO_ERROR)
424 			px_err_log_handle(dip, epkt, is_block_pci, "NO ERROR");
425 		is_valid_epkt = B_TRUE;
426 	} else if (err == 0) {
427 		px_err_log_handle(dip, epkt, is_block_pci, "UNRECOGNIZED");
428 		is_valid_epkt = B_FALSE;
429 
430 		/* Panic on a unrecognized epkt */
431 		err = PX_PANIC;
432 	}
433 
434 	px_err_send_epkt_erpt(dip, epkt, is_block_pci, err, derr,
435 	    is_valid_epkt);
436 
437 	/* Readjust the severity as a result of safe access */
438 	if (is_safeacc && !(err & PX_PANIC) && !(px_die & PX_PROTECTED))
439 		err = PX_NO_PANIC;
440 
441 	return (err);
442 }
443 
444 static void
px_err_send_epkt_erpt(dev_info_t * dip,px_rc_err_t * epkt,boolean_t is_block_pci,int err,ddi_fm_error_t * derr,boolean_t is_valid_epkt)445 px_err_send_epkt_erpt(dev_info_t *dip, px_rc_err_t *epkt,
446     boolean_t is_block_pci, int err, ddi_fm_error_t *derr,
447     boolean_t is_valid_epkt)
448 {
449 	char buf[FM_MAX_CLASS], descr_buf[1024];
450 
451 	/* send ereport for debug purposes */
452 	(void) snprintf(buf, FM_MAX_CLASS, "%s", PX_FM_RC_UNRECOG);
453 
454 	if (is_block_pci) {
455 		px_pec_err_t *pec = (px_pec_err_t *)epkt;
456 		(void) snprintf(descr_buf, sizeof (descr_buf),
457 		    "%s Epkt contents:\n"
458 		    "Block: 0x%x, Dir: 0x%x, Flags: Z=%d, S=%d, R=%d\n"
459 		    "I=%d, H=%d, C=%d, U=%d, E=%d, P=%d\n"
460 		    "PCI Err Status: 0x%x, PCIe Err Status: 0x%x\n"
461 		    "CE Status Reg: 0x%x, UE Status Reg: 0x%x\n"
462 		    "HDR1: 0x%lx, HDR2: 0x%lx\n"
463 		    "Err Src Reg: 0x%x, Root Err Status: 0x%x\n"
464 		    "Err Severity: 0x%x\n",
465 		    is_valid_epkt ? "Valid" : "Invalid",
466 		    pec->pec_descr.block, pec->pec_descr.dir,
467 		    pec->pec_descr.Z, pec->pec_descr.S,
468 		    pec->pec_descr.R, pec->pec_descr.I,
469 		    pec->pec_descr.H, pec->pec_descr.C,
470 		    pec->pec_descr.U, pec->pec_descr.E,
471 		    pec->pec_descr.P, pec->pci_err_status,
472 		    pec->pcie_err_status, pec->ce_reg_status,
473 		    pec->ue_reg_status, pec->hdr[0],
474 		    pec->hdr[1], pec->err_src_reg,
475 		    pec->root_err_status, err);
476 
477 		ddi_fm_ereport_post(dip, buf, derr->fme_ena,
478 		    DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0,
479 		    EPKT_SYSINO, DATA_TYPE_UINT64,
480 		    is_valid_epkt ? pec->sysino : 0,
481 		    EPKT_EHDL, DATA_TYPE_UINT64,
482 		    is_valid_epkt ? pec->ehdl : 0,
483 		    EPKT_STICK, DATA_TYPE_UINT64,
484 		    is_valid_epkt ? pec->stick : 0,
485 		    EPKT_DW0, DATA_TYPE_UINT64, ((uint64_t *)pec)[3],
486 		    EPKT_DW1, DATA_TYPE_UINT64, ((uint64_t *)pec)[4],
487 		    EPKT_DW2, DATA_TYPE_UINT64, ((uint64_t *)pec)[5],
488 		    EPKT_DW3, DATA_TYPE_UINT64, ((uint64_t *)pec)[6],
489 		    EPKT_DW4, DATA_TYPE_UINT64, ((uint64_t *)pec)[7],
490 		    EPKT_PEC_DESCR, DATA_TYPE_STRING, descr_buf);
491 	} else {
492 		(void) snprintf(descr_buf, sizeof (descr_buf),
493 		    "%s Epkt contents:\n"
494 		    "Block: 0x%x, Op: 0x%x, Phase: 0x%x, Cond: 0x%x\n"
495 		    "Dir: 0x%x, Flags: STOP=%d, H=%d, R=%d, D=%d\n"
496 		    "M=%d, S=%d, Size: 0x%x, Addr: 0x%lx\n"
497 		    "Hdr1: 0x%lx, Hdr2: 0x%lx, Res: 0x%lx\n"
498 		    "Err Severity: 0x%x\n",
499 		    is_valid_epkt ? "Valid" : "Invalid",
500 		    epkt->rc_descr.block, epkt->rc_descr.op,
501 		    epkt->rc_descr.phase, epkt->rc_descr.cond,
502 		    epkt->rc_descr.dir, epkt->rc_descr.STOP,
503 		    epkt->rc_descr.H, epkt->rc_descr.R,
504 		    epkt->rc_descr.D, epkt->rc_descr.M,
505 		    epkt->rc_descr.S, epkt->size, epkt->addr,
506 		    epkt->hdr[0], epkt->hdr[1], epkt->reserved,
507 		    err);
508 
509 		ddi_fm_ereport_post(dip, buf, derr->fme_ena,
510 		    DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0,
511 		    EPKT_SYSINO, DATA_TYPE_UINT64,
512 		    is_valid_epkt ? epkt->sysino : 0,
513 		    EPKT_EHDL, DATA_TYPE_UINT64,
514 		    is_valid_epkt ? epkt->ehdl : 0,
515 		    EPKT_STICK, DATA_TYPE_UINT64,
516 		    is_valid_epkt ? epkt->stick : 0,
517 		    EPKT_DW0, DATA_TYPE_UINT64, ((uint64_t *)epkt)[3],
518 		    EPKT_DW1, DATA_TYPE_UINT64, ((uint64_t *)epkt)[4],
519 		    EPKT_DW2, DATA_TYPE_UINT64, ((uint64_t *)epkt)[5],
520 		    EPKT_DW3, DATA_TYPE_UINT64, ((uint64_t *)epkt)[6],
521 		    EPKT_DW4, DATA_TYPE_UINT64, ((uint64_t *)epkt)[7],
522 		    EPKT_RC_DESCR, DATA_TYPE_STRING, descr_buf);
523 	}
524 }
525 
526 static void
px_err_log_handle(dev_info_t * dip,px_rc_err_t * epkt,boolean_t is_block_pci,char * msg)527 px_err_log_handle(dev_info_t *dip, px_rc_err_t *epkt, boolean_t is_block_pci,
528     char *msg)
529 {
530 	if (is_block_pci) {
531 		px_pec_err_t *pec = (px_pec_err_t *)epkt;
532 		DBG(DBG_ERR_INTR, dip,
533 		    "A PCIe root port error has occured with a severity"
534 		    " \"%s\"\n"
535 		    "\tBlock: 0x%x, Dir: 0x%x, Flags: Z=%d, S=%d, R=%d, I=%d\n"
536 		    "\tH=%d, C=%d, U=%d, E=%d, P=%d\n"
537 		    "\tpci_err: 0x%x, pcie_err=0x%x, ce_reg: 0x%x\n"
538 		    "\tue_reg: 0x%x, Hdr1: 0x%p, Hdr2: 0x%p\n"
539 		    "\terr_src: 0x%x, root_err: 0x%x\n",
540 		    msg, pec->pec_descr.block, pec->pec_descr.dir,
541 		    pec->pec_descr.Z, pec->pec_descr.S, pec->pec_descr.R,
542 		    pec->pec_descr.I, pec->pec_descr.H, pec->pec_descr.C,
543 		    pec->pec_descr.U, pec->pec_descr.E, pec->pec_descr.P,
544 		    pec->pci_err_status, pec->pcie_err_status,
545 		    pec->ce_reg_status, pec->ue_reg_status, pec->hdr[0],
546 		    pec->hdr[1], pec->err_src_reg, pec->root_err_status);
547 	} else {
548 		DBG(DBG_ERR_INTR, dip,
549 		    "A PCIe root complex error has occured with a severity"
550 		    " \"%s\"\n"
551 		    "\tBlock: 0x%x, Op: 0x%x, Phase: 0x%x, Cond: 0x%x\n"
552 		    "\tDir: 0x%x, Flags: STOP=%d, H=%d, R=%d, D=%d, M=%d\n"
553 		    "\tS=%d, Size: 0x%x, Addr: 0x%p\n"
554 		    "\tHdr1: 0x%p, Hdr2: 0x%p, Res: 0x%p\n",
555 		    msg, epkt->rc_descr.block, epkt->rc_descr.op,
556 		    epkt->rc_descr.phase, epkt->rc_descr.cond,
557 		    epkt->rc_descr.dir, epkt->rc_descr.STOP, epkt->rc_descr.H,
558 		    epkt->rc_descr.R, epkt->rc_descr.D, epkt->rc_descr.M,
559 		    epkt->rc_descr.S, epkt->size, epkt->addr, epkt->hdr[0],
560 		    epkt->hdr[1], epkt->reserved);
561 	}
562 }
563 
564 /* ARGSUSED */
565 static void
px_fix_legacy_epkt(dev_info_t * dip,ddi_fm_error_t * derr,px_rc_err_t * epkt)566 px_fix_legacy_epkt(dev_info_t *dip, ddi_fm_error_t *derr, px_rc_err_t *epkt)
567 {
568 	/*
569 	 * We don't have a default case for any of the below switch statements
570 	 * since we are ok with the code falling through.
571 	 */
572 	switch (epkt->rc_descr.block) {
573 	case BLOCK_HOSTBUS:
574 		switch (epkt->rc_descr.op) {
575 		case OP_DMA:
576 			switch (epkt->rc_descr.phase) {
577 			case PH_UNKNOWN:
578 				switch (epkt->rc_descr.cond) {
579 				case CND_UNKNOWN:
580 					switch (epkt->rc_descr.dir) {
581 					case DIR_RESERVED:
582 						epkt->rc_descr.dir = DIR_READ;
583 						break;
584 					} /* DIR */
585 				} /* CND */
586 			} /* PH */
587 		} /* OP */
588 		break;
589 	case BLOCK_MMU:
590 		switch (epkt->rc_descr.op) {
591 		case OP_XLAT:
592 			switch (epkt->rc_descr.phase) {
593 			case PH_DATA:
594 				switch (epkt->rc_descr.cond) {
595 				case CND_PROT:
596 					switch (epkt->rc_descr.dir) {
597 					case DIR_UNKNOWN:
598 						epkt->rc_descr.dir = DIR_WRITE;
599 						break;
600 					} /* DIR */
601 				} /* CND */
602 				break;
603 			case PH_IRR:
604 				switch (epkt->rc_descr.cond) {
605 				case CND_RESERVED:
606 					switch (epkt->rc_descr.dir) {
607 					case DIR_IRR:
608 						epkt->rc_descr.phase = PH_ADDR;
609 						epkt->rc_descr.cond = CND_IRR;
610 					} /* DIR */
611 				} /* CND */
612 			} /* PH */
613 		} /* OP */
614 		break;
615 	case BLOCK_INTR:
616 		switch (epkt->rc_descr.op) {
617 		case OP_MSIQ:
618 			switch (epkt->rc_descr.phase) {
619 			case PH_UNKNOWN:
620 				switch (epkt->rc_descr.cond) {
621 				case CND_ILL:
622 					switch (epkt->rc_descr.dir) {
623 					case DIR_RESERVED:
624 						epkt->rc_descr.dir = DIR_IRR;
625 						break;
626 					} /* DIR */
627 					break;
628 				case CND_IRR:
629 					switch (epkt->rc_descr.dir) {
630 					case DIR_IRR:
631 						epkt->rc_descr.cond = CND_OV;
632 						break;
633 					} /* DIR */
634 				} /* CND */
635 			} /* PH */
636 			break;
637 		case OP_RESERVED:
638 			switch (epkt->rc_descr.phase) {
639 			case PH_UNKNOWN:
640 				switch (epkt->rc_descr.cond) {
641 				case CND_ILL:
642 					switch (epkt->rc_descr.dir) {
643 					case DIR_IRR:
644 						epkt->rc_descr.op = OP_MSI32;
645 						epkt->rc_descr.phase = PH_DATA;
646 						break;
647 					} /* DIR */
648 				} /* CND */
649 				break;
650 			case PH_DATA:
651 				switch (epkt->rc_descr.cond) {
652 				case CND_INT:
653 					switch (epkt->rc_descr.dir) {
654 					case DIR_UNKNOWN:
655 						epkt->rc_descr.op = OP_MSI32;
656 						break;
657 					} /* DIR */
658 				} /* CND */
659 			} /* PH */
660 		} /* OP */
661 	} /* BLOCK */
662 }
663 
664 /* ARGSUSED */
665 static int
px_intr_handle_errors(dev_info_t * dip,ddi_fm_error_t * derr,px_rc_err_t * epkt,pf_data_t * pfd_p)666 px_intr_handle_errors(dev_info_t *dip, ddi_fm_error_t *derr, px_rc_err_t *epkt,
667     pf_data_t *pfd_p)
668 {
669 	return (px_err_check_eq(dip));
670 }
671 
672 /* ARGSUSED */
673 static int
px_port_handle_errors(dev_info_t * dip,ddi_fm_error_t * derr,px_rc_err_t * epkt,pf_data_t * pfd_p)674 px_port_handle_errors(dev_info_t *dip, ddi_fm_error_t *derr, px_rc_err_t *epkt,
675     pf_data_t *pfd_p)
676 {
677 	pf_pcie_adv_err_regs_t	adv_reg;
678 	uint16_t		s_status;
679 	int			sts = PX_PANIC;
680 
681 	/*
682 	 * Check for failed non-posted writes, which are errors that are not
683 	 * defined in the PCIe spec.  If not return panic.
684 	 */
685 	if (!((epkt->rc_descr.op == OP_PIO) &&
686 	    (epkt->rc_descr.phase == PH_IRR))) {
687 		sts = (PX_PANIC);
688 		goto done;
689 	}
690 
691 	/*
692 	 * Gather the error logs, if they do not exist just return with no panic
693 	 * and let the fabric message take care of the error.
694 	 */
695 	if (!epkt->rc_descr.H) {
696 		sts = (PX_NO_PANIC);
697 		goto done;
698 	}
699 
700 	adv_reg.pcie_ue_hdr[0] = (uint32_t)(epkt->hdr[0] >> 32);
701 	adv_reg.pcie_ue_hdr[1] = (uint32_t)(epkt->hdr[0]);
702 	adv_reg.pcie_ue_hdr[2] = (uint32_t)(epkt->hdr[1] >> 32);
703 	adv_reg.pcie_ue_hdr[3] = (uint32_t)(epkt->hdr[1]);
704 
705 	sts = pf_tlp_decode(PCIE_DIP2BUS(dip), &adv_reg);
706 
707 	if (epkt->rc_descr.M)
708 		adv_reg.pcie_ue_tgt_addr = epkt->addr;
709 
710 	if (!((sts == DDI_SUCCESS) || (epkt->rc_descr.M))) {
711 		/* Let the fabric message take care of error */
712 		sts = PX_NO_PANIC;
713 		goto done;
714 	}
715 
716 	/* See if the failed transaction belonged to a hardened driver */
717 	if (pf_hdl_lookup(dip, derr->fme_ena,
718 	    adv_reg.pcie_ue_tgt_trans, adv_reg.pcie_ue_tgt_addr,
719 	    adv_reg.pcie_ue_tgt_bdf) == PF_HDL_FOUND)
720 		sts = (PX_NO_PANIC);
721 	else
722 		sts = (PX_PANIC);
723 
724 	/* Add pfd to cause a fabric scan */
725 	switch (epkt->rc_descr.cond) {
726 	case CND_RCA:
727 		s_status = PCI_STAT_R_TARG_AB;
728 		break;
729 	case CND_RUR:
730 		s_status = PCI_STAT_R_MAST_AB;
731 		break;
732 	}
733 	PCIE_ROOT_FAULT(pfd_p)->scan_bdf = adv_reg.pcie_ue_tgt_bdf;
734 	PCIE_ROOT_FAULT(pfd_p)->scan_addr = adv_reg.pcie_ue_tgt_addr;
735 	PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = s_status;
736 
737 	PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_BDF;
738 	PFD_AFFECTED_DEV(pfd_p)->pe_affected_bdf = adv_reg.pcie_ue_tgt_bdf;
739 
740 done:
741 	return (sts);
742 }
743 
744 /* ARGSUSED */
745 static int
px_pcie_epkt_severity(dev_info_t * dip,ddi_fm_error_t * derr,px_rc_err_t * epkt,pf_data_t * pfd_p)746 px_pcie_epkt_severity(dev_info_t *dip, ddi_fm_error_t *derr, px_rc_err_t *epkt,
747     pf_data_t *pfd_p)
748 {
749 	px_pec_err_t	*pec_p = (px_pec_err_t *)epkt;
750 	px_err_pcie_t	*pcie = (px_err_pcie_t *)epkt;
751 	pf_pcie_adv_err_regs_t adv_reg;
752 	int		sts;
753 	uint32_t	temp;
754 
755 	/*
756 	 * Check for failed PIO Read/Writes, which are errors that are not
757 	 * defined in the PCIe spec.
758 	 */
759 
760 	temp = PCIE_AER_UCE_UR | PCIE_AER_UCE_CA;
761 	if (((pec_p->pec_descr.dir == DIR_READ) ||
762 	    (pec_p->pec_descr.dir == DIR_WRITE)) &&
763 	    pec_p->pec_descr.U && (pec_p->ue_reg_status & temp)) {
764 
765 		adv_reg.pcie_ue_hdr[0] = (uint32_t)(pec_p->hdr[0] >> 32);
766 		adv_reg.pcie_ue_hdr[1] = (uint32_t)(pec_p->hdr[0]);
767 		adv_reg.pcie_ue_hdr[2] = (uint32_t)(pec_p->hdr[1] >> 32);
768 		adv_reg.pcie_ue_hdr[3] = (uint32_t)(pec_p->hdr[1]);
769 
770 		sts = pf_tlp_decode(PCIE_DIP2BUS(dip), &adv_reg);
771 
772 		if (sts == DDI_SUCCESS &&
773 		    pf_hdl_lookup(dip, derr->fme_ena,
774 		    adv_reg.pcie_ue_tgt_trans,
775 		    adv_reg.pcie_ue_tgt_addr,
776 		    adv_reg.pcie_ue_tgt_bdf) == PF_HDL_FOUND)
777 			return (PX_NO_PANIC);
778 		else
779 			return (PX_PANIC);
780 	}
781 
782 	if (!pec_p->pec_descr.C)
783 		pec_p->ce_reg_status = 0;
784 	if (!pec_p->pec_descr.U)
785 		pec_p->ue_reg_status = 0;
786 	if (!pec_p->pec_descr.H)
787 		pec_p->hdr[0] = 0;
788 	if (!pec_p->pec_descr.I)
789 		pec_p->hdr[1] = 0;
790 
791 	/*
792 	 * According to the PCIe spec, there is a first error pointer.  If there
793 	 * are header logs recorded and there are more than one error, the log
794 	 * will belong to the error that the first error pointer points to.
795 	 *
796 	 * The regs.primary_ue expects a bit number, go through the ue register
797 	 * and find the first error that occured.  Because the sun4v epkt spec
798 	 * does not define this value, the algorithm below gives the lower bit
799 	 * priority.
800 	 */
801 	temp = pcie->ue_reg;
802 	if (temp) {
803 		int x;
804 		for (x = 0; !(temp & 0x1); x++) {
805 			temp = temp >> 1;
806 		}
807 		pcie->primary_ue = 1 << x;
808 	} else {
809 		pcie->primary_ue = 0;
810 	}
811 
812 	/* Sun4v doesn't log the TX hdr except for CTOs */
813 	if (pcie->primary_ue == PCIE_AER_UCE_TO) {
814 		pcie->tx_hdr1 = pcie->rx_hdr1;
815 		pcie->tx_hdr2 = pcie->rx_hdr2;
816 		pcie->tx_hdr3 = pcie->rx_hdr3;
817 		pcie->tx_hdr4 = pcie->rx_hdr4;
818 		pcie->rx_hdr1 = 0;
819 		pcie->rx_hdr2 = 0;
820 		pcie->rx_hdr3 = 0;
821 		pcie->rx_hdr4 = 0;
822 	} else {
823 		pcie->tx_hdr1 = 0;
824 		pcie->tx_hdr2 = 0;
825 		pcie->tx_hdr3 = 0;
826 		pcie->tx_hdr4 = 0;
827 	}
828 
829 	return (px_err_check_pcie(dip, derr, pcie, PF_INTR_TYPE_INTERNAL));
830 }
831 
832 static int
px_mmu_handle_lookup(dev_info_t * dip,ddi_fm_error_t * derr,px_rc_err_t * epkt)833 px_mmu_handle_lookup(dev_info_t *dip, ddi_fm_error_t *derr, px_rc_err_t *epkt)
834 {
835 	uint64_t addr = (uint64_t)epkt->addr;
836 	pcie_req_id_t bdf = PCIE_INVALID_BDF;
837 
838 	if (epkt->rc_descr.H) {
839 		bdf = (uint32_t)((epkt->hdr[0] >> 16) && 0xFFFF);
840 	}
841 
842 	return (pf_hdl_lookup(dip, derr->fme_ena, PF_ADDR_DMA, addr,
843 	    bdf));
844 }
845