xref: /titanic_50/usr/src/uts/sun4v/os/error.c (revision 0990bc30bb018eab2d35d33f1f635e6ae0ee3ca1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/machsystm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/cpuvar.h>
32 #include <sys/async.h>
33 #include <sys/ontrap.h>
34 #include <sys/ddifm.h>
35 #include <sys/hypervisor_api.h>
36 #include <sys/errorq.h>
37 #include <sys/promif.h>
38 #include <sys/prom_plat.h>
39 #include <sys/x_call.h>
40 #include <sys/error.h>
41 #include <sys/fm/util.h>
42 #include <sys/ivintr.h>
43 #include <sys/archsystm.h>
44 
45 #define	MAX_CE_FLTS		10
46 #define	MAX_ASYNC_FLTS		6
47 
48 errorq_t *ue_queue;			/* queue of uncorrectable errors */
49 errorq_t *ce_queue;			/* queue of correctable errors */
50 
51 /*
52  * Being used by memory test driver.
53  * ce_verbose_memory - covers CEs in DIMMs
54  * ce_verbose_other - covers "others" (ecache, IO, etc.)
55  *
56  * If the value is 0, nothing is logged.
57  * If the value is 1, the error is logged to the log file, but not console.
58  * If the value is 2, the error is logged to the log file and console.
59  */
60 int	ce_verbose_memory = 1;
61 int	ce_verbose_other = 1;
62 
63 int	ce_show_data = 0;
64 int	ce_debug = 0;
65 int	ue_debug = 0;
66 int	reset_debug = 0;
67 
68 /*
69  * Tunables for controlling the handling of asynchronous faults (AFTs). Setting
70  * these to non-default values on a non-DEBUG kernel is NOT supported.
71  */
72 int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
73 int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
74 int	aft_testfatal = 0;	/* force all AFTs to panic immediately */
75 
76 /*
77  * Used for vbsc hostshutdown (power-off button)
78  */
79 int	err_shutdown_triggered = 0;	/* only once */
80 uint64_t err_shutdown_inum = 0;	/* used to pull the trigger */
81 
82 /*
83  * Defined in bus_func.c but initialised in error_init
84  */
85 extern kmutex_t bfd_lock;
86 
87 static uint32_t rq_overflow_count = 0;		/* counter for rq overflow */
88 
89 static void cpu_queue_one_event(errh_async_flt_t *);
90 static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
91 static void errh_page_retire(errh_async_flt_t *, uchar_t);
92 static int errh_error_protected(struct regs *, struct async_flt *, int *);
93 static void errh_rq_full(struct async_flt *);
94 static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
95 static void ce_drain(void *, struct async_flt *, errorq_elem_t *);
96 static void errh_handle_attr(errh_async_flt_t *);
97 static void errh_handle_asr(errh_async_flt_t *);
98 
99 /*ARGSUSED*/
100 void
101 process_resumable_error(struct regs *rp, uint32_t head_offset,
102     uint32_t tail_offset)
103 {
104 	struct machcpu *mcpup;
105 	struct async_flt *aflt;
106 	errh_async_flt_t errh_flt;
107 	errh_er_t *head_va;
108 
109 	mcpup = &(CPU->cpu_m);
110 
111 	while (head_offset != tail_offset) {
112 		/* kernel buffer starts right after the resumable queue */
113 		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
114 		    CPU_RQ_SIZE);
115 		/* Copy the error report to local buffer */
116 		bzero(&errh_flt, sizeof (errh_async_flt_t));
117 		bcopy((char *)head_va, &(errh_flt.errh_er),
118 		    sizeof (errh_er_t));
119 
120 		/* Increment the queue head */
121 		head_offset += Q_ENTRY_SIZE;
122 		/* Wrap around */
123 		head_offset &= (CPU_RQ_SIZE - 1);
124 
125 		/* set error handle to zero so it can hold new error report */
126 		head_va->ehdl = 0;
127 
128 		switch (errh_flt.errh_er.desc) {
129 		case ERRH_DESC_UCOR_RE:
130 			/*
131 			 * Check error attribute, handle individual error
132 			 * if it is needed.
133 			 */
134 			errh_handle_attr(&errh_flt);
135 			break;
136 
137 		case ERRH_DESC_WARN_RE:
138 			/*
139 			 * Power-off requested, but handle it one time only.
140 			 */
141 			if (!err_shutdown_triggered) {
142 				setsoftint(err_shutdown_inum);
143 				++err_shutdown_triggered;
144 			}
145 			continue;
146 
147 		default:
148 			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
149 			    " invalid in resumable error handler",
150 			    (long long) errh_flt.errh_er.desc);
151 			continue;
152 		}
153 
154 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
155 		aflt->flt_id = gethrtime();
156 		aflt->flt_bus_id = getprocessorid();
157 		aflt->flt_class = CPU_FAULT;
158 		aflt->flt_prot = AFLT_PROT_NONE;
159 		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
160 		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);
161 
162 		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
163 			/* If it is an error on other cpu */
164 			aflt->flt_panic = 1;
165 		else
166 			aflt->flt_panic = 0;
167 
168 		/*
169 		 * Handle resumable queue full case.
170 		 */
171 		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
172 			(void) errh_rq_full(aflt);
173 		}
174 
175 		/*
176 		 * Queue the error on ce or ue queue depend on flt_panic.
177 		 * Even if flt_panic is set, the code still keep processing
178 		 * the rest element on rq until the panic starts.
179 		 */
180 		(void) cpu_queue_one_event(&errh_flt);
181 
182 		/*
183 		 * Panic here if aflt->flt_panic has been set.
184 		 * Enqueued errors will be logged as part of the panic flow.
185 		 */
186 		if (aflt->flt_panic) {
187 			fm_panic("Unrecoverable error on another CPU");
188 		}
189 	}
190 }
191 
192 void
193 process_nonresumable_error(struct regs *rp, uint64_t flags,
194     uint32_t head_offset, uint32_t tail_offset)
195 {
196 	struct machcpu *mcpup;
197 	struct async_flt *aflt;
198 	errh_async_flt_t errh_flt;
199 	errh_er_t *head_va;
200 	int trampolined = 0;
201 	int expected = DDI_FM_ERR_UNEXPECTED;
202 	uint64_t exec_mode;
203 	uint8_t u_spill_fill;
204 
205 	mcpup = &(CPU->cpu_m);
206 
207 	while (head_offset != tail_offset) {
208 		/* kernel buffer starts right after the nonresumable queue */
209 		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
210 		    CPU_NRQ_SIZE);
211 
212 		/* Copy the error report to local buffer */
213 		bzero(&errh_flt, sizeof (errh_async_flt_t));
214 
215 		bcopy((char *)head_va, &(errh_flt.errh_er),
216 		    sizeof (errh_er_t));
217 
218 		/* Increment the queue head */
219 		head_offset += Q_ENTRY_SIZE;
220 		/* Wrap around */
221 		head_offset &= (CPU_NRQ_SIZE - 1);
222 
223 		/* set error handle to zero so it can hold new error report */
224 		head_va->ehdl = 0;
225 
226 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
227 
228 		trampolined = 0;
229 
230 		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
231 			aflt->flt_class = BUS_FAULT;
232 		else
233 			aflt->flt_class = CPU_FAULT;
234 
235 		aflt->flt_id = gethrtime();
236 		aflt->flt_bus_id = getprocessorid();
237 		aflt->flt_pc = (caddr_t)rp->r_pc;
238 		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
239 		    >> ERRH_MODE_SHIFT;
240 		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
241 		    exec_mode == ERRH_MODE_UNKNOWN);
242 		aflt->flt_prot = AFLT_PROT_NONE;
243 		aflt->flt_tl = (uchar_t)(flags & ERRH_TL_MASK);
244 		aflt->flt_panic = ((aflt->flt_tl != 0) ||
245 		    (aft_testfatal != 0));
246 
247 		/*
248 		 * For the first error packet on the queue, check if it
249 		 * happened in user fill/spill trap.
250 		 */
251 		if (flags & ERRH_U_SPILL_FILL) {
252 			u_spill_fill = 1;
253 			/* clear the user fill/spill flag in flags */
254 			flags = (uint64_t)aflt->flt_tl;
255 		} else
256 			u_spill_fill = 0;
257 
258 		switch (errh_flt.errh_er.desc) {
259 		case ERRH_DESC_PR_NRE:
260 			if (u_spill_fill) {
261 				aflt->flt_panic = 0;
262 				break;
263 			}
264 			/*
265 			 * Fall through, precise fault also need to check
266 			 * to see if it was protected.
267 			 */
268 			/*FALLTHRU*/
269 
270 		case ERRH_DESC_DEF_NRE:
271 			/*
272 			 * If the trap occurred in privileged mode at TL=0,
273 			 * we need to check to see if we were executing
274 			 * in kernel under on_trap() or t_lofault
275 			 * protection. If so, and if it was a PIO or MEM
276 			 * error, then modify the saved registers so that
277 			 * we return from the trap to the appropriate
278 			 * trampoline routine.
279 			 */
280 			if (aflt->flt_priv == 1 && aflt->flt_tl == 0 &&
281 			    ((errh_flt.errh_er.attr & ERRH_ATTR_PIO) ||
282 			    (errh_flt.errh_er.attr & ERRH_ATTR_MEM))) {
283 				trampolined =
284 				    errh_error_protected(rp, aflt, &expected);
285 			}
286 
287 			if (!aflt->flt_priv || aflt->flt_prot ==
288 			    AFLT_PROT_COPY) {
289 				aflt->flt_panic |= aft_panic;
290 			} else if (!trampolined &&
291 			    (aflt->flt_class != BUS_FAULT)) {
292 				aflt->flt_panic = 1;
293 			}
294 
295 			/*
296 			 * Check error attribute, handle individual error
297 			 * if it is needed.
298 			 */
299 			errh_handle_attr(&errh_flt);
300 
301 			/*
302 			 * If PIO error, we need to query the bus nexus
303 			 * for fatal errors.
304 			 */
305 			if (aflt->flt_class == BUS_FAULT) {
306 				aflt->flt_addr = errh_flt.errh_er.ra;
307 				errh_cpu_run_bus_error_handlers(aflt,
308 				    expected);
309 			}
310 
311 			break;
312 
313 		default:
314 			cmn_err(CE_WARN, "Panic - Error Descriptor 0x%llx "
315 			    " invalid in non-resumable error handler",
316 			    (long long) errh_flt.errh_er.desc);
317 			aflt->flt_panic = 1;
318 			break;
319 		}
320 
321 		/*
322 		 * Queue the error report for further processing. If
323 		 * flt_panic is set, code still process other errors
324 		 * in the queue until the panic routine stops the
325 		 * kernel.
326 		 */
327 		(void) cpu_queue_one_event(&errh_flt);
328 
329 		/*
330 		 * Panic here if aflt->flt_panic has been set.
331 		 * Enqueued errors will be logged as part of the panic flow.
332 		 */
333 		if (aflt->flt_panic) {
334 			fm_panic("Unrecoverable hardware error");
335 		}
336 
337 		/*
338 		 * Call page_retire() to handle memory errors.
339 		 */
340 		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
341 			errh_page_retire(&errh_flt, PR_UE);
342 
343 		/*
344 		 * If we queued an error and the it was in user mode, or
345 		 * protected by t_lofault, or user_spill_fill is set, we
346 		 * set AST flag so the queue will be drained before
347 		 * returning to user mode.
348 		 */
349 		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY ||
350 		    u_spill_fill) {
351 			int pcb_flag = 0;
352 
353 			if (aflt->flt_class == CPU_FAULT)
354 				pcb_flag |= ASYNC_HWERR;
355 			else if (aflt->flt_class == BUS_FAULT)
356 				pcb_flag |= ASYNC_BERR;
357 
358 			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
359 			aston(curthread);
360 		}
361 	}
362 }
363 
364 /*
365  * For PIO errors, this routine calls nexus driver's error
366  * callback routines. If the callback routine returns fatal, and
367  * we are in kernel or unknow mode without any error protection,
368  * we need to turn on the panic flag.
369  */
370 void
371 errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
372 {
373 	int status;
374 	ddi_fm_error_t de;
375 
376 	bzero(&de, sizeof (ddi_fm_error_t));
377 
378 	de.fme_version = DDI_FME_VERSION;
379 	de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
380 	de.fme_flag = expected;
381 	de.fme_bus_specific = (void *)aflt->flt_addr;
382 	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);
383 
384 	/*
385 	 * If error is protected, it will jump to proper routine
386 	 * to handle the handle; if it is in user level, we just
387 	 * kill the user process; if the driver thinks the error is
388 	 * not fatal, we can drive on. If none of above are true,
389 	 * we panic
390 	 */
391 	if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
392 	    (status == DDI_FM_FATAL))
393 		aflt->flt_panic = 1;
394 }
395 
396 /*
397  * This routine checks to see if we are under any error protection when
398  * the error happens. If we are under error protection, we unwind to
399  * the protection and indicate fault.
400  */
401 static int
402 errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
403 {
404 	int trampolined = 0;
405 	ddi_acc_hdl_t *hp;
406 
407 	if (curthread->t_ontrap != NULL) {
408 		on_trap_data_t *otp = curthread->t_ontrap;
409 
410 		if (otp->ot_prot & OT_DATA_EC) {
411 			aflt->flt_prot = AFLT_PROT_EC;
412 			otp->ot_trap |= OT_DATA_EC;
413 			rp->r_pc = otp->ot_trampoline;
414 			rp->r_npc = rp->r_pc +4;
415 			trampolined = 1;
416 		}
417 
418 		if (otp->ot_prot & OT_DATA_ACCESS) {
419 			aflt->flt_prot = AFLT_PROT_ACCESS;
420 			otp->ot_trap |= OT_DATA_ACCESS;
421 			rp->r_pc = otp->ot_trampoline;
422 			rp->r_npc = rp->r_pc + 4;
423 			trampolined = 1;
424 			/*
425 			 * for peek and caut_gets
426 			 * errors are expected
427 			 */
428 			hp = (ddi_acc_hdl_t *)otp->ot_handle;
429 			if (!hp)
430 				*expected = DDI_FM_ERR_PEEK;
431 			else if (hp->ah_acc.devacc_attr_access ==
432 			    DDI_CAUTIOUS_ACC)
433 				*expected = DDI_FM_ERR_EXPECTED;
434 		}
435 	} else if (curthread->t_lofault) {
436 		aflt->flt_prot = AFLT_PROT_COPY;
437 		rp->r_g1 = EFAULT;
438 		rp->r_pc = curthread->t_lofault;
439 		rp->r_npc = rp->r_pc + 4;
440 		trampolined = 1;
441 	}
442 
443 	return (trampolined);
444 }
445 
446 /*
447  * Queue one event.
448  */
449 static void
450 cpu_queue_one_event(errh_async_flt_t *errh_fltp)
451 {
452 	struct async_flt *aflt = (struct async_flt *)errh_fltp;
453 	errorq_t *eqp;
454 
455 	if (aflt->flt_panic)
456 		eqp = ue_queue;
457 	else
458 		eqp = ce_queue;
459 
460 	errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
461 	    aflt->flt_panic);
462 }
463 
464 /*
465  * The cpu_async_log_err() function is called by the ce/ue_drain() function to
466  * handle logging for CPU events that are dequeued.  As such, it can be invoked
467  * from softint context, from AST processing in the trap() flow, or from the
468  * panic flow.  We decode the CPU-specific data, and log appropriate messages.
469  */
470 void
471 cpu_async_log_err(void *flt)
472 {
473 	errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
474 	errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;
475 
476 	switch (errh_erp->desc) {
477 	case ERRH_DESC_UCOR_RE:
478 		if (errh_erp->attr & ERRH_ATTR_MEM) {
479 			/*
480 			 * Turn on the PR_UE flag. The page will be
481 			 * scrubbed when it is freed.
482 			 */
483 			errh_page_retire(errh_fltp, PR_UE);
484 		}
485 
486 		break;
487 
488 	case ERRH_DESC_PR_NRE:
489 	case ERRH_DESC_DEF_NRE:
490 		if (errh_erp->attr & ERRH_ATTR_MEM) {
491 			/*
492 			 * For non-resumable memory error, retire
493 			 * the page here.
494 			 */
495 			errh_page_retire(errh_fltp, PR_UE);
496 
497 			/*
498 			 * If we are going to panic, scrub the page first
499 			 */
500 			if (errh_fltp->cmn_asyncflt.flt_panic)
501 				mem_scrub(errh_fltp->errh_er.ra,
502 				    errh_fltp->errh_er.sz);
503 		}
504 		break;
505 
506 	default:
507 		break;
508 	}
509 }
510 
511 /*
512  * Called from ce_drain().
513  */
514 void
515 cpu_ce_log_err(struct async_flt *aflt)
516 {
517 	switch (aflt->flt_class) {
518 	case CPU_FAULT:
519 		cpu_async_log_err(aflt);
520 		break;
521 
522 	case BUS_FAULT:
523 		cpu_async_log_err(aflt);
524 		break;
525 
526 	default:
527 		break;
528 	}
529 }
530 
531 /*
532  * Called from ue_drain().
533  */
534 void
535 cpu_ue_log_err(struct async_flt *aflt)
536 {
537 	switch (aflt->flt_class) {
538 	case CPU_FAULT:
539 		cpu_async_log_err(aflt);
540 		break;
541 
542 	case BUS_FAULT:
543 		cpu_async_log_err(aflt);
544 		break;
545 
546 	default:
547 		break;
548 	}
549 }
550 
551 /*
552  * Turn on flag on the error memory region.
553  */
554 static void
555 errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
556 {
557 	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
558 	uint64_t flt_real_addr_end = flt_real_addr_start +
559 	    errh_fltp->errh_er.sz - 1;
560 	int64_t current_addr;
561 
562 	if (errh_fltp->errh_er.sz == 0)
563 		return;
564 
565 	for (current_addr = flt_real_addr_start;
566 	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
567 		(void) page_retire(current_addr, flag);
568 	}
569 }
570 
571 void
572 mem_scrub(uint64_t paddr, uint64_t len)
573 {
574 	uint64_t pa, length, scrubbed_len;
575 
576 	pa = paddr;
577 	length = len;
578 	scrubbed_len = 0;
579 
580 	while (length > 0) {
581 		if (hv_mem_scrub(pa, length, &scrubbed_len) != H_EOK)
582 			break;
583 
584 		pa += scrubbed_len;
585 		length -= scrubbed_len;
586 	}
587 }
588 
589 /*
590  * Call hypervisor to flush the memory region.
591  * Both va and len must be MMU_PAGESIZE aligned.
592  * Returns the total number of bytes flushed.
593  */
594 uint64_t
595 mem_sync(caddr_t va, size_t len)
596 {
597 	uint64_t pa, length, flushed;
598 	uint64_t chunk_len = MMU_PAGESIZE;
599 	uint64_t total_flushed = 0;
600 
601 	if (((uint64_t)va | (uint64_t)len) & MMU_PAGEOFFSET)
602 		return (total_flushed);
603 
604 	while (len > 0) {
605 		pa = va_to_pa((caddr_t)va);
606 		if (pa == (uint64_t)-1)
607 			return (total_flushed);
608 
609 		length = chunk_len;
610 		flushed = 0;
611 
612 		while (length > 0) {
613 			if (hv_mem_sync(pa, length, &flushed) != H_EOK)
614 				return (total_flushed);
615 
616 			pa += flushed;
617 			length -= flushed;
618 			total_flushed += flushed;
619 		}
620 
621 		va += chunk_len;
622 		len -= chunk_len;
623 	}
624 
625 	return (total_flushed);
626 }
627 
628 /*
629  * If resumable queue is full, we need to check if any cpu is in
630  * error state. If not, we drive on. If yes, we need to panic. The
631  * hypervisor call hv_cpu_state() is being used for checking the
632  * cpu state.
633  */
634 static void
635 errh_rq_full(struct async_flt *afltp)
636 {
637 	processorid_t who;
638 	uint64_t cpu_state;
639 	uint64_t retval;
640 
641 	for (who = 0; who < NCPU; who++)
642 		if (CPU_IN_SET(cpu_ready_set, who)) {
643 			retval = hv_cpu_state(who, &cpu_state);
644 			if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
645 				afltp->flt_panic = 1;
646 				break;
647 			}
648 		}
649 }
650 
651 /*
652  * Return processor specific async error structure
653  * size used.
654  */
655 int
656 cpu_aflt_size(void)
657 {
658 	return (sizeof (errh_async_flt_t));
659 }
660 
661 #define	SZ_TO_ETRS_SHIFT	6
662 
663 /*
664  * Message print out when resumable queue is overflown
665  */
666 /*ARGSUSED*/
667 void
668 rq_overflow(struct regs *rp, uint64_t head_offset,
669     uint64_t tail_offset)
670 {
671 	rq_overflow_count++;
672 }
673 
674 /*
675  * Handler to process a fatal error.  This routine can be called from a
676  * softint, called from trap()'s AST handling, or called from the panic flow.
677  */
678 /*ARGSUSED*/
679 static void
680 ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
681 {
682 	cpu_ue_log_err(aflt);
683 }
684 
685 /*
686  * Handler to process a correctable error.  This routine can be called from a
687  * softint.  We just call the CPU module's logging routine.
688  */
689 /*ARGSUSED*/
690 static void
691 ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
692 {
693 	cpu_ce_log_err(aflt);
694 }
695 
696 /*
697  * Handler to process vbsc hostshutdown (power-off button).
698  */
699 static int
700 err_shutdown_softintr()
701 {
702 	cmn_err(CE_WARN, "Power-off requested, system will now shutdown.");
703 	do_shutdown();
704 
705 	/*
706 	 * just in case do_shutdown() fails
707 	 */
708 	(void) timeout((void(*)(void *))power_down, NULL, 100 * hz);
709 	return (DDI_INTR_CLAIMED);
710 }
711 
712 /*
713  * Allocate error queue sizes based on max_ncpus.  max_ncpus is set just
714  * after ncpunode has been determined.  ncpus is set in start_other_cpus
715  * which is called after error_init() but may change dynamically.
716  */
717 void
718 error_init(void)
719 {
720 	char tmp_name[MAXSYSNAME];
721 	pnode_t node;
722 	size_t size = cpu_aflt_size();
723 
724 	/*
725 	 * Initialize the correctable and uncorrectable error queues.
726 	 */
727 	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
728 	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);
729 
730 	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
731 	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);
732 
733 	if (ue_queue == NULL || ce_queue == NULL)
734 		panic("failed to create required system error queue");
735 
736 	/*
737 	 * Setup interrupt handler for power-off button.
738 	 */
739 	err_shutdown_inum = add_softintr(PIL_9,
740 	    (softintrfunc)err_shutdown_softintr, NULL, SOFTINT_ST);
741 
742 	/*
743 	 * Initialize the busfunc list mutex.  This must be a PIL_15 spin lock
744 	 * because we will need to acquire it from cpu_async_error().
745 	 */
746 	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);
747 
748 	node = prom_rootnode();
749 	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
750 		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
751 		return;
752 	}
753 
754 	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
755 	    (size <= MAXSYSNAME) &&
756 	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
757 		if (reset_debug) {
758 			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
759 		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
760 			cmn_err(CE_CONT,
761 			    "System booting after fatal error %s\n", tmp_name);
762 		}
763 	}
764 }
765 
/*
 * Called when the nonresumable queue is full.  This is always fatal: the
 * system panics through fm_panic().  rp is not used.
 */
/*ARGSUSED*/
void
nrq_overflow(struct regs *rp)
{
	fm_panic("Nonresumable queue full");
}
775 
776 /*
777  * This is the place for special error handling for individual errors.
778  */
779 static void
780 errh_handle_attr(errh_async_flt_t *errh_fltp)
781 {
782 	switch (errh_fltp->errh_er.attr & ~ERRH_MODE_MASK) {
783 	case ERRH_ATTR_CPU:
784 	case ERRH_ATTR_MEM:
785 	case ERRH_ATTR_PIO:
786 	case ERRH_ATTR_IRF:
787 	case ERRH_ATTR_FRF:
788 	case ERRH_ATTR_SHUT:
789 		break;
790 
791 	case ERRH_ATTR_ASR:
792 		errh_handle_asr(errh_fltp);
793 		break;
794 
795 	case ERRH_ATTR_ASI:
796 	case ERRH_ATTR_PREG:
797 	case ERRH_ATTR_RQF:
798 		break;
799 
800 	default:
801 		break;
802 	}
803 }
804 
805 /*
806  * Handle ASR bit set in ATTR
807  */
808 static void
809 errh_handle_asr(errh_async_flt_t *errh_fltp)
810 {
811 	uint64_t current_tick;
812 
813 	switch (errh_fltp->errh_er.reg) {
814 	case ASR_REG_VALID | ASR_REG_TICK:
815 		/*
816 		 * For Tick Compare Register error, it only happens when
817 		 * the register is being read or compared with the %tick
818 		 * register. Since we lost the contents of the register,
819 		 * we set the %tick_compr in the future. An interrupt will
820 		 * happen when %tick matches the value field of %tick_compr.
821 		 */
822 		current_tick = (uint64_t)gettick();
823 		tickcmpr_set(current_tick);
824 		/* Do not panic */
825 		errh_fltp->cmn_asyncflt.flt_panic = 0;
826 		break;
827 
828 	default:
829 		break;
830 	}
831 }
832