xref: /titanic_51/usr/src/uts/sun4v/os/error.c (revision 4496171313bed39e96f21bc2f9faf2868e267ae3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/machsystm.h>
30 #include <sys/cpuvar.h>
31 #include <sys/async.h>
32 #include <sys/ontrap.h>
33 #include <sys/ddifm.h>
34 #include <sys/hypervisor_api.h>
35 #include <sys/errorq.h>
36 #include <sys/promif.h>
37 #include <sys/prom_plat.h>
38 #include <sys/x_call.h>
39 #include <sys/error.h>
40 #include <sys/fm/util.h>
41 #include <sys/ivintr.h>
42 #include <sys/archsystm.h>
43 
44 #define	MAX_CE_FLTS		10
45 #define	MAX_ASYNC_FLTS		6
46 
47 errorq_t *ue_queue;			/* queue of uncorrectable errors */
48 errorq_t *ce_queue;			/* queue of correctable errors */
49 
50 /*
51  * Being used by memory test driver.
52  * ce_verbose_memory - covers CEs in DIMMs
53  * ce_verbose_other - covers "others" (ecache, IO, etc.)
54  *
55  * If the value is 0, nothing is logged.
56  * If the value is 1, the error is logged to the log file, but not console.
57  * If the value is 2, the error is logged to the log file and console.
58  */
59 int	ce_verbose_memory = 1;
60 int	ce_verbose_other = 1;
61 
62 int	ce_show_data = 0;
63 int	ce_debug = 0;
64 int	ue_debug = 0;
65 int	reset_debug = 0;
66 
67 /*
68  * Tunables for controlling the handling of asynchronous faults (AFTs). Setting
69  * these to non-default values on a non-DEBUG kernel is NOT supported.
70  */
71 int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
72 int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
73 int	aft_testfatal = 0;	/* force all AFTs to panic immediately */
74 
75 /*
76  * Used for vbsc hostshutdown (power-off buton)
77  */
78 int	err_shutdown_triggered = 0;	/* only once */
79 uint64_t err_shutdown_inum = 0;	/* used to pull the trigger */
80 
81 /*
82  * Defined in bus_func.c but initialised in error_init
83  */
84 extern kmutex_t bfd_lock;
85 
86 static uint32_t rq_overflow_count = 0;		/* counter for rq overflow */
87 
88 static void cpu_queue_one_event(errh_async_flt_t *);
89 static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
90 static void errh_page_retire(errh_async_flt_t *, uchar_t);
91 static int errh_error_protected(struct regs *, struct async_flt *, int *);
92 static void errh_rq_full(struct async_flt *);
93 static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
94 static void ce_drain(void *, struct async_flt *, errorq_elem_t *);
95 static void errh_handle_attr(errh_async_flt_t *);
96 static void errh_handle_asr(errh_async_flt_t *);
97 
98 /*ARGSUSED*/
99 void
100 process_resumable_error(struct regs *rp, uint32_t head_offset,
101     uint32_t tail_offset)
102 {
103 	struct machcpu *mcpup;
104 	struct async_flt *aflt;
105 	errh_async_flt_t errh_flt;
106 	errh_er_t *head_va;
107 
108 	mcpup = &(CPU->cpu_m);
109 
110 	while (head_offset != tail_offset) {
111 		/* kernel buffer starts right after the resumable queue */
112 		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
113 		    CPU_RQ_SIZE);
114 		/* Copy the error report to local buffer */
115 		bzero(&errh_flt, sizeof (errh_async_flt_t));
116 		bcopy((char *)head_va, &(errh_flt.errh_er),
117 		    sizeof (errh_er_t));
118 
119 		/* Increment the queue head */
120 		head_offset += Q_ENTRY_SIZE;
121 		/* Wrap around */
122 		head_offset &= (CPU_RQ_SIZE - 1);
123 
124 		/* set error handle to zero so it can hold new error report */
125 		head_va->ehdl = 0;
126 
127 		switch (errh_flt.errh_er.desc) {
128 		case ERRH_DESC_UCOR_RE:
129 			/*
130 			 * Check error attribute, handle individual error
131 			 * if it is needed.
132 			 */
133 			errh_handle_attr(&errh_flt);
134 			break;
135 
136 		case ERRH_DESC_WARN_RE:
137 			/*
138 			 * Power-off requested, but handle it one time only.
139 			 */
140 			if (!err_shutdown_triggered) {
141 				setsoftint(err_shutdown_inum);
142 				++err_shutdown_triggered;
143 			}
144 			continue;
145 
146 		default:
147 			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
148 			    " invalid in resumable error handler",
149 			    (long long) errh_flt.errh_er.desc);
150 			continue;
151 		}
152 
153 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
154 		aflt->flt_id = gethrtime();
155 		aflt->flt_bus_id = getprocessorid();
156 		aflt->flt_class = CPU_FAULT;
157 		aflt->flt_prot = AFLT_PROT_NONE;
158 		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
159 		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);
160 
161 		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
162 			/* If it is an error on other cpu */
163 			aflt->flt_panic = 1;
164 		else
165 			aflt->flt_panic = 0;
166 
167 		/*
168 		 * Handle resumable queue full case.
169 		 */
170 		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
171 			(void) errh_rq_full(aflt);
172 		}
173 
174 		/*
175 		 * Queue the error on ce or ue queue depend on flt_panic.
176 		 * Even if flt_panic is set, the code still keep processing
177 		 * the rest element on rq until the panic starts.
178 		 */
179 		(void) cpu_queue_one_event(&errh_flt);
180 
181 		/*
182 		 * Panic here if aflt->flt_panic has been set.
183 		 * Enqueued errors will be logged as part of the panic flow.
184 		 */
185 		if (aflt->flt_panic) {
186 			fm_panic("Unrecoverable error on another CPU");
187 		}
188 	}
189 }
190 
191 void
192 process_nonresumable_error(struct regs *rp, uint64_t flags,
193     uint32_t head_offset, uint32_t tail_offset)
194 {
195 	struct machcpu *mcpup;
196 	struct async_flt *aflt;
197 	errh_async_flt_t errh_flt;
198 	errh_er_t *head_va;
199 	int trampolined = 0;
200 	int expected = DDI_FM_ERR_UNEXPECTED;
201 	uint64_t exec_mode;
202 	uint8_t u_spill_fill;
203 
204 	mcpup = &(CPU->cpu_m);
205 
206 	while (head_offset != tail_offset) {
207 		/* kernel buffer starts right after the nonresumable queue */
208 		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
209 		    CPU_NRQ_SIZE);
210 
211 		/* Copy the error report to local buffer */
212 		bzero(&errh_flt, sizeof (errh_async_flt_t));
213 
214 		bcopy((char *)head_va, &(errh_flt.errh_er),
215 		    sizeof (errh_er_t));
216 
217 		/* Increment the queue head */
218 		head_offset += Q_ENTRY_SIZE;
219 		/* Wrap around */
220 		head_offset &= (CPU_NRQ_SIZE - 1);
221 
222 		/* set error handle to zero so it can hold new error report */
223 		head_va->ehdl = 0;
224 
225 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
226 
227 		trampolined = 0;
228 
229 		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
230 			aflt->flt_class = BUS_FAULT;
231 		else
232 			aflt->flt_class = CPU_FAULT;
233 
234 		aflt->flt_id = gethrtime();
235 		aflt->flt_bus_id = getprocessorid();
236 		aflt->flt_pc = (caddr_t)rp->r_pc;
237 		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
238 		    >> ERRH_MODE_SHIFT;
239 		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
240 		    exec_mode == ERRH_MODE_UNKNOWN);
241 		aflt->flt_prot = AFLT_PROT_NONE;
242 		aflt->flt_tl = (uchar_t)(flags & ERRH_TL_MASK);
243 		aflt->flt_panic = ((aflt->flt_tl != 0) ||
244 		    (aft_testfatal != 0));
245 
246 		/*
247 		 * For the first error packet on the queue, check if it
248 		 * happened in user fill/spill trap.
249 		 */
250 		if (flags & ERRH_U_SPILL_FILL) {
251 			u_spill_fill = 1;
252 			/* clear the user fill/spill flag in flags */
253 			flags = (uint64_t)aflt->flt_tl;
254 		} else
255 			u_spill_fill = 0;
256 
257 		switch (errh_flt.errh_er.desc) {
258 		case ERRH_DESC_PR_NRE:
259 			if (u_spill_fill) {
260 				aflt->flt_panic = 0;
261 				break;
262 			}
263 			/*
264 			 * Fall through, precise fault also need to check
265 			 * to see if it was protected.
266 			 */
267 			/*FALLTHRU*/
268 
269 		case ERRH_DESC_DEF_NRE:
270 			/*
271 			 * If the trap occurred in privileged mode at TL=0,
272 			 * we need to check to see if we were executing
273 			 * in kernel under on_trap() or t_lofault
274 			 * protection. If so, and if it was a PIO or MEM
275 			 * error, then modify the saved registers so that
276 			 * we return from the trap to the appropriate
277 			 * trampoline routine.
278 			 */
279 			if (aflt->flt_priv == 1 && aflt->flt_tl == 0 &&
280 			    ((errh_flt.errh_er.attr & ERRH_ATTR_PIO) ||
281 			    (errh_flt.errh_er.attr & ERRH_ATTR_MEM))) {
282 				trampolined =
283 				    errh_error_protected(rp, aflt, &expected);
284 			}
285 
286 			if (!aflt->flt_priv || aflt->flt_prot ==
287 			    AFLT_PROT_COPY) {
288 				aflt->flt_panic |= aft_panic;
289 			} else if (!trampolined &&
290 			    (aflt->flt_class != BUS_FAULT)) {
291 				aflt->flt_panic = 1;
292 			}
293 
294 			/*
295 			 * Check error attribute, handle individual error
296 			 * if it is needed.
297 			 */
298 			errh_handle_attr(&errh_flt);
299 
300 			/*
301 			 * If PIO error, we need to query the bus nexus
302 			 * for fatal errors.
303 			 */
304 			if (aflt->flt_class == BUS_FAULT) {
305 				aflt->flt_addr = errh_flt.errh_er.ra;
306 				errh_cpu_run_bus_error_handlers(aflt,
307 				    expected);
308 			}
309 
310 			break;
311 
312 		default:
313 			cmn_err(CE_WARN, "Panic - Error Descriptor 0x%llx "
314 			    " invalid in non-resumable error handler",
315 			    (long long) errh_flt.errh_er.desc);
316 			aflt->flt_panic = 1;
317 			break;
318 		}
319 
320 		/*
321 		 * Queue the error report for further processing. If
322 		 * flt_panic is set, code still process other errors
323 		 * in the queue until the panic routine stops the
324 		 * kernel.
325 		 */
326 		(void) cpu_queue_one_event(&errh_flt);
327 
328 		/*
329 		 * Panic here if aflt->flt_panic has been set.
330 		 * Enqueued errors will be logged as part of the panic flow.
331 		 */
332 		if (aflt->flt_panic) {
333 			fm_panic("Unrecoverable hardware error");
334 		}
335 
336 		/*
337 		 * Call page_retire() to handle memory errors.
338 		 */
339 		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
340 			errh_page_retire(&errh_flt, PR_UE);
341 
342 		/*
343 		 * If we queued an error and the it was in user mode, or
344 		 * protected by t_lofault, or user_spill_fill is set, we
345 		 * set AST flag so the queue will be drained before
346 		 * returning to user mode.
347 		 */
348 		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY ||
349 		    u_spill_fill) {
350 			int pcb_flag = 0;
351 
352 			if (aflt->flt_class == CPU_FAULT)
353 				pcb_flag |= ASYNC_HWERR;
354 			else if (aflt->flt_class == BUS_FAULT)
355 				pcb_flag |= ASYNC_BERR;
356 
357 			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
358 			aston(curthread);
359 		}
360 	}
361 }
362 
363 /*
364  * For PIO errors, this routine calls nexus driver's error
365  * callback routines. If the callback routine returns fatal, and
366  * we are in kernel or unknow mode without any error protection,
367  * we need to turn on the panic flag.
368  */
369 void
370 errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
371 {
372 	int status;
373 	ddi_fm_error_t de;
374 
375 	bzero(&de, sizeof (ddi_fm_error_t));
376 
377 	de.fme_version = DDI_FME_VERSION;
378 	de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
379 	de.fme_flag = expected;
380 	de.fme_bus_specific = (void *)aflt->flt_addr;
381 	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);
382 
383 	/*
384 	 * If error is protected, it will jump to proper routine
385 	 * to handle the handle; if it is in user level, we just
386 	 * kill the user process; if the driver thinks the error is
387 	 * not fatal, we can drive on. If none of above are true,
388 	 * we panic
389 	 */
390 	if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
391 	    (status == DDI_FM_FATAL))
392 		aflt->flt_panic = 1;
393 }
394 
395 /*
396  * This routine checks to see if we are under any error protection when
397  * the error happens. If we are under error protection, we unwind to
398  * the protection and indicate fault.
399  */
400 static int
401 errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
402 {
403 	int trampolined = 0;
404 	ddi_acc_hdl_t *hp;
405 
406 	if (curthread->t_ontrap != NULL) {
407 		on_trap_data_t *otp = curthread->t_ontrap;
408 
409 		if (otp->ot_prot & OT_DATA_EC) {
410 			aflt->flt_prot = AFLT_PROT_EC;
411 			otp->ot_trap |= OT_DATA_EC;
412 			rp->r_pc = otp->ot_trampoline;
413 			rp->r_npc = rp->r_pc +4;
414 			trampolined = 1;
415 		}
416 
417 		if (otp->ot_prot & OT_DATA_ACCESS) {
418 			aflt->flt_prot = AFLT_PROT_ACCESS;
419 			otp->ot_trap |= OT_DATA_ACCESS;
420 			rp->r_pc = otp->ot_trampoline;
421 			rp->r_npc = rp->r_pc + 4;
422 			trampolined = 1;
423 			/*
424 			 * for peek and caut_gets
425 			 * errors are expected
426 			 */
427 			hp = (ddi_acc_hdl_t *)otp->ot_handle;
428 			if (!hp)
429 				*expected = DDI_FM_ERR_PEEK;
430 			else if (hp->ah_acc.devacc_attr_access ==
431 			    DDI_CAUTIOUS_ACC)
432 				*expected = DDI_FM_ERR_EXPECTED;
433 		}
434 	} else if (curthread->t_lofault) {
435 		aflt->flt_prot = AFLT_PROT_COPY;
436 		rp->r_g1 = EFAULT;
437 		rp->r_pc = curthread->t_lofault;
438 		rp->r_npc = rp->r_pc + 4;
439 		trampolined = 1;
440 	}
441 
442 	return (trampolined);
443 }
444 
445 /*
446  * Queue one event.
447  */
448 static void
449 cpu_queue_one_event(errh_async_flt_t *errh_fltp)
450 {
451 	struct async_flt *aflt = (struct async_flt *)errh_fltp;
452 	errorq_t *eqp;
453 
454 	if (aflt->flt_panic)
455 		eqp = ue_queue;
456 	else
457 		eqp = ce_queue;
458 
459 	errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
460 	    aflt->flt_panic);
461 }
462 
463 /*
464  * The cpu_async_log_err() function is called by the ce/ue_drain() function to
465  * handle logging for CPU events that are dequeued.  As such, it can be invoked
466  * from softint context, from AST processing in the trap() flow, or from the
467  * panic flow.  We decode the CPU-specific data, and log appropriate messages.
468  */
469 void
470 cpu_async_log_err(void *flt)
471 {
472 	errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
473 	errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;
474 
475 	switch (errh_erp->desc) {
476 	case ERRH_DESC_UCOR_RE:
477 		if (errh_erp->attr & ERRH_ATTR_MEM) {
478 			/*
479 			 * Turn on the PR_UE flag. The page will be
480 			 * scrubbed when it is freed.
481 			 */
482 			errh_page_retire(errh_fltp, PR_UE);
483 		}
484 
485 		break;
486 
487 	case ERRH_DESC_PR_NRE:
488 	case ERRH_DESC_DEF_NRE:
489 		if (errh_erp->attr & ERRH_ATTR_MEM) {
490 			/*
491 			 * For non-resumable memory error, retire
492 			 * the page here.
493 			 */
494 			errh_page_retire(errh_fltp, PR_UE);
495 
496 			/*
497 			 * If we are going to panic, scrub the page first
498 			 */
499 			if (errh_fltp->cmn_asyncflt.flt_panic)
500 				mem_scrub(errh_fltp->errh_er.ra,
501 				    errh_fltp->errh_er.sz);
502 		}
503 		break;
504 
505 	default:
506 		break;
507 	}
508 }
509 
510 /*
511  * Called from ce_drain().
512  */
513 void
514 cpu_ce_log_err(struct async_flt *aflt)
515 {
516 	switch (aflt->flt_class) {
517 	case CPU_FAULT:
518 		cpu_async_log_err(aflt);
519 		break;
520 
521 	case BUS_FAULT:
522 		cpu_async_log_err(aflt);
523 		break;
524 
525 	default:
526 		break;
527 	}
528 }
529 
530 /*
531  * Called from ue_drain().
532  */
533 void
534 cpu_ue_log_err(struct async_flt *aflt)
535 {
536 	switch (aflt->flt_class) {
537 	case CPU_FAULT:
538 		cpu_async_log_err(aflt);
539 		break;
540 
541 	case BUS_FAULT:
542 		cpu_async_log_err(aflt);
543 		break;
544 
545 	default:
546 		break;
547 	}
548 }
549 
550 /*
551  * Turn on flag on the error memory region.
552  */
553 static void
554 errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
555 {
556 	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
557 	uint64_t flt_real_addr_end = flt_real_addr_start +
558 	    errh_fltp->errh_er.sz - 1;
559 	int64_t current_addr;
560 
561 	if (errh_fltp->errh_er.sz == 0)
562 		return;
563 
564 	for (current_addr = flt_real_addr_start;
565 	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
566 		(void) page_retire(current_addr, flag);
567 	}
568 }
569 
570 void
571 mem_scrub(uint64_t paddr, uint64_t len)
572 {
573 	uint64_t pa, length, scrubbed_len;
574 
575 	pa = paddr;
576 	length = len;
577 	scrubbed_len = 0;
578 
579 	while (length > 0) {
580 		if (hv_mem_scrub(pa, length, &scrubbed_len) != H_EOK)
581 			break;
582 
583 		pa += scrubbed_len;
584 		length -= scrubbed_len;
585 	}
586 }
587 
588 /*
589  * Call hypervisor to flush the memory region. The memory region
590  * must be within the same page frame.
591  */
592 void
593 mem_sync(caddr_t va, size_t len)
594 {
595 	uint64_t pa, length, flushed;
596 
597 	pa = va_to_pa((caddr_t)va);
598 
599 	if (pa == (uint64_t)-1)
600 		return;
601 
602 	ASSERT((pa >> MMU_PAGESHIFT) == ((pa + len - 1) >> MMU_PAGESHIFT));
603 
604 	length = len;
605 	flushed = 0;
606 
607 	while (length > 0) {
608 		if (hv_mem_sync(pa, length, &flushed) != H_EOK)
609 			break;
610 
611 		pa += flushed;
612 		length -= flushed;
613 	}
614 }
615 
616 /*
617  * If resumable queue is full, we need to check if any cpu is in
618  * error state. If not, we drive on. If yes, we need to panic. The
619  * hypervisor call hv_cpu_state() is being used for checking the
620  * cpu state.
621  */
622 static void
623 errh_rq_full(struct async_flt *afltp)
624 {
625 	processorid_t who;
626 	uint64_t cpu_state;
627 	uint64_t retval;
628 
629 	for (who = 0; who < NCPU; who++)
630 		if (CPU_IN_SET(cpu_ready_set, who)) {
631 			retval = hv_cpu_state(who, &cpu_state);
632 			if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
633 				afltp->flt_panic = 1;
634 				break;
635 			}
636 		}
637 }
638 
639 /*
640  * Return processor specific async error structure
641  * size used.
642  */
643 int
644 cpu_aflt_size(void)
645 {
646 	return (sizeof (errh_async_flt_t));
647 }
648 
649 #define	SZ_TO_ETRS_SHIFT	6
650 
651 /*
652  * Message print out when resumable queue is overflown
653  */
654 /*ARGSUSED*/
655 void
656 rq_overflow(struct regs *rp, uint64_t head_offset,
657     uint64_t tail_offset)
658 {
659 	rq_overflow_count++;
660 }
661 
662 /*
663  * Handler to process a fatal error.  This routine can be called from a
664  * softint, called from trap()'s AST handling, or called from the panic flow.
665  */
666 /*ARGSUSED*/
667 static void
668 ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
669 {
670 	cpu_ue_log_err(aflt);
671 }
672 
673 /*
674  * Handler to process a correctable error.  This routine can be called from a
675  * softint.  We just call the CPU module's logging routine.
676  */
677 /*ARGSUSED*/
678 static void
679 ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
680 {
681 	cpu_ce_log_err(aflt);
682 }
683 
684 /*
685  * Handler to process vbsc hostshutdown (power-off button).
686  */
687 static int
688 err_shutdown_softintr()
689 {
690 	cmn_err(CE_WARN, "Power-off requested, system will now shutdown.");
691 	do_shutdown();
692 
693 	/*
694 	 * just in case do_shutdown() fails
695 	 */
696 	(void) timeout((void(*)(void *))power_down, NULL, 100 * hz);
697 	return (DDI_INTR_CLAIMED);
698 }
699 
700 /*
701  * Allocate error queue sizes based on max_ncpus.  max_ncpus is set just
702  * after ncpunode has been determined.  ncpus is set in start_other_cpus
703  * which is called after error_init() but may change dynamically.
704  */
705 void
706 error_init(void)
707 {
708 	char tmp_name[MAXSYSNAME];
709 	pnode_t node;
710 	size_t size = cpu_aflt_size();
711 
712 	/*
713 	 * Initialize the correctable and uncorrectable error queues.
714 	 */
715 	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
716 	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);
717 
718 	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
719 	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);
720 
721 	if (ue_queue == NULL || ce_queue == NULL)
722 		panic("failed to create required system error queue");
723 
724 	/*
725 	 * Setup interrupt handler for power-off button.
726 	 */
727 	err_shutdown_inum = add_softintr(PIL_9,
728 	    (softintrfunc)err_shutdown_softintr, NULL, SOFTINT_ST);
729 
730 	/*
731 	 * Initialize the busfunc list mutex.  This must be a PIL_15 spin lock
732 	 * because we will need to acquire it from cpu_async_error().
733 	 */
734 	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);
735 
736 	node = prom_rootnode();
737 	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
738 		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
739 		return;
740 	}
741 
742 	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
743 	    (size <= MAXSYSNAME) &&
744 	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
745 		if (reset_debug) {
746 			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
747 		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
748 			cmn_err(CE_CONT,
749 			    "System booting after fatal error %s\n", tmp_name);
750 		}
751 	}
752 }
753 
/*
 * Called when the nonresumable queue is full.  This is not survivable, so
 * we panic right here; rp is not looked at.
 */
/*ARGSUSED*/
void
nrq_overflow(struct regs *rp)
{
	fm_panic("Nonresumable queue full");
}
763 
764 /*
765  * This is the place for special error handling for individual errors.
766  */
767 static void
768 errh_handle_attr(errh_async_flt_t *errh_fltp)
769 {
770 	switch (errh_fltp->errh_er.attr & ~ERRH_MODE_MASK) {
771 	case ERRH_ATTR_CPU:
772 	case ERRH_ATTR_MEM:
773 	case ERRH_ATTR_PIO:
774 	case ERRH_ATTR_IRF:
775 	case ERRH_ATTR_FRF:
776 	case ERRH_ATTR_SHUT:
777 		break;
778 
779 	case ERRH_ATTR_ASR:
780 		errh_handle_asr(errh_fltp);
781 		break;
782 
783 	case ERRH_ATTR_ASI:
784 	case ERRH_ATTR_PREG:
785 	case ERRH_ATTR_RQF:
786 		break;
787 
788 	default:
789 		break;
790 	}
791 }
792 
793 /*
794  * Handle ASR bit set in ATTR
795  */
796 static void
797 errh_handle_asr(errh_async_flt_t *errh_fltp)
798 {
799 	uint64_t current_tick;
800 
801 	switch (errh_fltp->errh_er.reg) {
802 	case ASR_REG_VALID | ASR_REG_TICK:
803 		/*
804 		 * For Tick Compare Register error, it only happens when
805 		 * the register is being read or compared with the %tick
806 		 * register. Since we lost the contents of the register,
807 		 * we set the %tick_compr in the future. An interrupt will
808 		 * happen when %tick matches the value field of %tick_compr.
809 		 */
810 		current_tick = (uint64_t)gettick();
811 		tickcmpr_set(current_tick);
812 		/* Do not panic */
813 		errh_fltp->cmn_asyncflt.flt_panic = 0;
814 		break;
815 
816 	default:
817 		break;
818 	}
819 }
820