xref: /titanic_44/usr/src/uts/common/os/errorq.c (revision 69bb4bb45c98da60d21839c4dc3c01ea1be60585)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Kernel Error Queues
31  *
32  * A common problem when handling hardware error traps and interrupts is that
33  * these errors frequently must be handled at high interrupt level, where
34  * reliably producing error messages and safely examining and manipulating
35  * other kernel state may not be possible.  The kernel error queue primitive is
36  * a common set of routines that allow a subsystem to maintain a queue of
37  * errors that can be processed by an explicit call from a safe context or by a
38  * soft interrupt that fires at a specific lower interrupt level.  The queue
39  * management code also ensures that if the system panics, all in-transit
40  * errors are logged prior to reset.  Each queue has an associated kstat for
41  * observing the number of errors dispatched and logged, and mdb(1) debugging
42  * support is provided for live and post-mortem observability.
43  *
44  * Memory Allocation
45  *
46  * 	All of the queue data structures are allocated in advance as part of
47  * 	the errorq_create() call.  No additional memory allocations are
48  * 	performed as part of errorq_dispatch(), errorq_reserve(),
49  *	errorq_commit() or errorq_drain().  This design
50  * 	facilitates reliable error queue processing even when the system is low
51  * 	on memory, and ensures that errorq_dispatch() can be called from any
52  * 	context.  When the queue is created, the maximum queue length is
53  * 	specified as a parameter to errorq_create() or errorq_nvcreate().  This
54  *	length should represent a reasonable upper bound on the number of
55  *	simultaneous errors.  If errorq_dispatch() or errorq_reserve() is
56  *	invoked and no free queue elements are available, the error is
57  *	dropped and will not be logged.  Typically, the queue will only be
58  *	exhausted by an error storm, and in this case
59  * 	the earlier errors provide the most important data for analysis.
60  * 	When a new error is dispatched, the error data is copied into the
61  * 	preallocated queue element so that the caller's buffer can be reused.
62  *
63  *	When a new error is reserved, an element is moved from the free list
64  *	and returned to the caller.  The element buffer data, eqe_data, may be
65  *	managed by the caller and dispatched to the errorq by calling
66  *	errorq_commit().  This is useful for additions to errorq's
67  *	created with errorq_nvcreate() to handle name-value pair (nvpair) data.
68  *	See below for a discussion on nvlist errorq's.
69  *
70  * Queue Drain Callback
71  *
72  *      When the error queue is drained, the caller's queue drain callback is
73  *      invoked with a pointer to the saved error data.  This function may be
74  *      called from passive kernel context or soft interrupt context at or
75  *      below LOCK_LEVEL, or as part of panic().  As such, the callback should
76  *      basically only be calling cmn_err (but NOT with the CE_PANIC flag).
77  *      The callback must not call panic(), attempt to allocate memory, or wait
78  *      on a condition variable.  The callback may not call errorq_destroy()
79  *      or errorq_drain() on the same error queue that called it.
80  *
81  *      The queue drain callback will always be called for each pending error
82  *      in the order in which errors were enqueued (oldest to newest).  The
83  *      queue drain callback is guaranteed to provide at *least* once semantics
84  *      for all errors that are successfully dispatched (i.e. for which
85  *      errorq_dispatch() has successfully completed).  If an unrelated panic
86  *      occurs while the queue drain callback is running on a vital queue, the
87  *      panic subsystem will continue the queue drain and the callback may be
88  *      invoked again for the same error.  Therefore, the callback should
89  *      restrict itself to logging messages and taking other actions that are
90  *      not destructive if repeated.
91  *
92  * Name-Value Pair Error Queues
93  *
94  *	During error handling, it may be more convenient to store error
95  *	queue element data as a fixed buffer of name-value pairs.  The
96  *	nvpair library allows construction and destruction of nvlists in
97  *	pre-allocated memory buffers.
98  *
99  *	Error queues created via errorq_nvcreate() store queue element
100  *	data as fixed buffer nvlists (ereports).  errorq_reserve()
101  *	allocates an errorq element from eqp->eq_free and returns a valid
102  *	pointer to an errorq_elem_t (queue element) and a pre-allocated
103  *	fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
104  *	to the nvlist to add name-value ereport members prior to
105  *	dispatching the error queue element in errorq_commit().
106  *
107  *	Once dispatched, the drain function will return the element to
108  *	eqp->eq_free and reset the associated nv_alloc structure.
109  *	errorq_cancel() may be called to cancel the reservation of an
110  *	element that was never dispatched (committed).  This is useful in
111  *	cases where a programming error prevents a queue element from being
112  *	dispatched.
113  *
114  * Queue Management
115  *
116  *      The queue element structures and error data buffers are allocated in
117  *      two contiguous chunks as part of errorq_create() or errorq_nvcreate().
118  *	Each queue element structure contains a next pointer,
119  *	a previous pointer, and a pointer to the corresponding error data
120  *	buffer.  The data buffer for a nvlist errorq is a shared buffer
121  *	for the allocation of name-value pair lists. The elements are kept on
122  *      one of three lists:
123  *
124  *      Unused elements are kept on the free list, a singly-linked list pointed
125  *      to by eqp->eq_free, and linked together using eqe_prev.  The eqe_next
126  *      pointer is not used by the free list and will be set to NULL.
127  *
128  *      Pending errors are kept on the pending list, a singly-linked list
129  *      pointed to by eqp->eq_pend, and linked together using eqe_prev.  This
130  *      list is maintained in order from newest error to oldest.  The eqe_next
131  *      pointer is not used by the pending list and will be set to NULL.
132  *
133  *      The processing list is a doubly-linked list pointed to by eqp->eq_phead
134  *      (the oldest element) and eqp->eq_ptail (the newest element).  The
135  *      eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the
136  *      eqe_prev pointer is used to traverse from eq_ptail to eq_phead.  Once a
137  *      queue drain operation begins, the current pending list is moved to the
138  *      processing list in a two-phase commit fashion, allowing the panic code
139  *      to always locate and process all pending errors in the event that a
140  *      panic occurs in the middle of queue processing.
141  *
142  *	A fourth list is maintained for nvlist errorqs.  The dump list,
143  *	eq_dump is used to link all errorq elements that should be stored
144  *	in a crash dump file in the event of a system panic.  During
145  *	errorq_panic(), the list is created and subsequently traversed
146  *	in errorq_dump() during the final phases of a crash dump.
147  *
148  * Platform Considerations
149  *
150  *      In order to simplify their implementation, error queues make use of the
151  *      C wrappers for compare-and-swap.  If the platform itself does not
152  *      support compare-and-swap in hardware and the kernel emulation routines
153  *      are used instead, then the context in which errorq_dispatch() can be
154  *      safely invoked is further constrained by the implementation of the
155  *      compare-and-swap emulation.  Specifically, if errorq_dispatch() is
156  *      called from a code path that can be executed above ATOMIC_LEVEL on such
157  *      a platform, the dispatch code could potentially deadlock unless the
158  *      corresponding error interrupt is blocked or disabled prior to calling
159  *      errorq_dispatch().  Error queues should therefore be deployed with
160  *      caution on these platforms.
161  *
162  * Interfaces
163  *
164  * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags);
165  * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags);
166  *
167  *      Create a new error queue with the specified name, callback, and
168  *      properties.  A pointer to the new error queue is returned upon success,
169  *      or NULL is returned to indicate that the queue could not be created.
170  *      This function must be called from passive kernel context with no locks
171  *      held that can prevent a sleeping memory allocation from occurring.
172  *      errorq_create() will return failure if the queue kstats cannot be
173  *      created, or if a soft interrupt handler cannot be registered.
174  *
175  *      The queue 'name' is a string that is recorded for live and post-mortem
176  *      examination by a debugger.  The queue callback 'func' will be invoked
177  *      for each error drained from the queue, and will receive the 'private'
178  *      pointer as its first argument.  The callback must obey the rules for
179  *      callbacks described above.  The queue will have maximum length 'qlen'
180  *      and each element will be able to record up to 'eltsize' bytes of data.
181  *      The queue's soft interrupt (see errorq_dispatch(), below) will fire
182  *      at 'ipl', which should not exceed LOCK_LEVEL.  The queue 'flags' may
183  *      include the following flag:
184  *
185  *      ERRORQ_VITAL    - This queue contains information that is considered
186  *         vital to problem diagnosis.  Error queues that are marked vital will
187  *         be automatically drained by the panic subsystem prior to printing
188  *         the panic messages to the console.
189  *
190  * void errorq_destroy(errorq);
191  *
192  *      Destroy the specified error queue.  The queue is drained of any
193  *      pending elements and these are logged before errorq_destroy returns.
194  *      Once errorq_destroy() begins draining the queue, any simultaneous
195  *      calls to dispatch errors will result in the errors being dropped.
196  *      The caller must invoke a higher-level abstraction (e.g. disabling
197  *      an error interrupt) to ensure that error handling code does not
198  *      attempt to dispatch errors to the queue while it is being freed.
199  *
200  * void errorq_dispatch(errorq, data, len, flag);
201  *
202  *      Attempt to enqueue the specified error data.  If a free queue element
203  *      is available, the data is copied into a free element and placed on a
204  *      pending list.  If no free queue element is available, the error is
205  *      dropped.  The data length (len) is specified in bytes and should not
206  *      exceed the queue's maximum element size.  If the data length is less
207  *      than the maximum element size, the remainder of the queue element is
208  *      filled with zeroes.  The flag parameter should be one of:
209  *
210  *      ERRORQ_ASYNC    - Schedule a soft interrupt at the previously specified
211  *         IPL to asynchronously drain the queue on behalf of the caller.
212  *
213  *      ERRORQ_SYNC     - Do not schedule a soft interrupt to drain the queue.
214  *         The caller is presumed to be calling errorq_drain() or panic() in
215  *         the near future in order to drain the queue and log the error.
216  *
217  *      The errorq_dispatch() function may be called from any context, subject
218  *      to the Platform Considerations described above.
219  *
220  * void errorq_drain(errorq);
221  *
222  *      Drain the error queue of all pending errors.  The queue's callback
223  *      function is invoked for each error in order from oldest to newest.
224  *      This function may be used at or below LOCK_LEVEL or from panic context.
225  *
226  * errorq_elem_t *errorq_reserve(errorq);
227  *
228  *	Reserve an error queue element for later processing and dispatching.
229  *	The element is returned to the caller who may add error-specific data
230  *	to the element.  The element is returned to the free list when either
231  *	errorq_commit() is called and the element asynchronously processed
232  *	or immediately when errorq_cancel() is called.
233  *
234  * void errorq_commit(errorq, errorq_elem, flag);
235  *
236  *	Commit an errorq element (eqep) for dispatching, see
237  *	errorq_dispatch().
238  *
239  * void errorq_cancel(errorq, errorq_elem);
240  *
241  *	Cancel a pending errorq element reservation.  The errorq element is
242  *	returned to the free list upon cancelation.
243  */
244 
245 #include <sys/errorq_impl.h>
246 #include <sys/sysmacros.h>
247 #include <sys/machlock.h>
248 #include <sys/cmn_err.h>
249 #include <sys/atomic.h>
250 #include <sys/systm.h>
251 #include <sys/kmem.h>
252 #include <sys/conf.h>
253 #include <sys/ddi.h>
254 #include <sys/sunddi.h>
255 #include <sys/bootconf.h>
256 #include <sys/spl.h>
257 #include <sys/dumphdr.h>
258 #include <sys/compress.h>
259 #include <sys/time.h>
260 #include <sys/panic.h>
261 #include <sys/fm/protocol.h>
262 #include <sys/fm/util.h>
263 
264 static struct errorq_kstat errorq_kstat_template = {
265 	{ "dispatched", KSTAT_DATA_UINT64 },
266 	{ "dropped", KSTAT_DATA_UINT64 },
267 	{ "logged", KSTAT_DATA_UINT64 },
268 	{ "reserved", KSTAT_DATA_UINT64 },
269 	{ "reserve_fail", KSTAT_DATA_UINT64 },
270 	{ "committed", KSTAT_DATA_UINT64 },
271 	{ "commit_fail", KSTAT_DATA_UINT64 },
272 	{ "cancelled", KSTAT_DATA_UINT64 }
273 };
274 
275 static uint64_t errorq_lost = 0;
276 static errorq_t *errorq_list = NULL;
277 static kmutex_t errorq_lock;
278 static uint64_t errorq_vitalmin = 5;
279 
280 static uint_t
281 errorq_intr(caddr_t eqp)
282 {
283 	errorq_drain((errorq_t *)eqp);
284 	return (DDI_INTR_CLAIMED);
285 }
286 
287 /*
288  * Create a new error queue with the specified properties and add a software
289  * interrupt handler and kstat for it.  This function must be called from
290  * passive kernel context with no locks held that can prevent a sleeping
291  * memory allocation from occurring.  This function will return NULL if the
292  * softint or kstat for this queue cannot be created.
293  */
294 errorq_t *
295 errorq_create(const char *name, errorq_func_t func, void *private,
296     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
297 {
298 	errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP);
299 	ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl);
300 	dev_info_t *dip = ddi_root_node();
301 
302 	errorq_elem_t *eep;
303 	ddi_softintr_t id = NULL;
304 	caddr_t data;
305 
306 	ASSERT(qlen != 0 && size != 0);
307 	ASSERT(ipl > 0 && ipl <= LOCK_LEVEL);
308 
309 	/*
310 	 * If a queue is created very early in boot before device tree services
311 	 * are available, the queue softint handler cannot be created.  We
312 	 * manually drain these queues and create their softint handlers when
313 	 * it is safe to do so as part of errorq_init(), below.
314 	 */
315 	if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id,
316 	    &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
317 		cmn_err(CE_WARN, "errorq_create: failed to register "
318 		    "IPL %u softint for queue %s", ipl, name);
319 		kmem_free(eqp, sizeof (errorq_t));
320 		return (NULL);
321 	}
322 
323 	if ((eqp->eq_ksp = kstat_create("unix", 0, (char *)name, "errorq",
324 	    KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) /
325 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) {
326 		cmn_err(CE_WARN, "errorq_create: failed to create kstat "
327 		    "for queue %s", name);
328 		if (id != NULL)
329 			ddi_remove_softintr(id);
330 		kmem_free(eqp, sizeof (errorq_t));
331 		return (NULL);
332 	}
333 
334 	bcopy(&errorq_kstat_template, &eqp->eq_kstat,
335 	    sizeof (struct errorq_kstat));
336 	eqp->eq_ksp->ks_data = &eqp->eq_kstat;
337 	eqp->eq_ksp->ks_private = eqp;
338 	kstat_install(eqp->eq_ksp);
339 
340 	(void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN);
341 	eqp->eq_name[ERRORQ_NAMELEN] = '\0';
342 	eqp->eq_func = func;
343 	eqp->eq_private = private;
344 	eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP);
345 	eqp->eq_qlen = qlen;
346 	eqp->eq_size = size;
347 	eqp->eq_ipl = ipl;
348 	eqp->eq_flags = flags | ERRORQ_ACTIVE;
349 	eqp->eq_id = id;
350 	mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL);
351 	eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP);
352 	eqp->eq_phead = NULL;
353 	eqp->eq_ptail = NULL;
354 	eqp->eq_pend = NULL;
355 	eqp->eq_dump = NULL;
356 	eqp->eq_free = eqp->eq_elems;
357 
358 	/*
359 	 * Iterate over the array of errorq_elem_t structures and place each
360 	 * one on the free list and set its data pointer.
361 	 */
362 	for (eep = eqp->eq_free, data = eqp->eq_data; qlen > 1; qlen--) {
363 		eep->eqe_next = NULL;
364 		eep->eqe_dump = NULL;
365 		eep->eqe_prev = eep + 1;
366 		eep->eqe_data = data;
367 		data += size;
368 		eep++;
369 	}
370 
371 	eep->eqe_next = NULL;
372 	eep->eqe_prev = NULL;
373 	eep->eqe_data = data;
374 	eep->eqe_dump = NULL;
375 
376 	/*
377 	 * Once the errorq is initialized, add it to the global list of queues,
378 	 * and then return a pointer to the new queue to the caller.
379 	 */
380 	mutex_enter(&errorq_lock);
381 	eqp->eq_next = errorq_list;
382 	errorq_list = eqp;
383 	mutex_exit(&errorq_lock);
384 
385 	return (eqp);
386 }
387 
388 /*
389  * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST
390  * flag and initialize each element to have the start of its data region used
391  * as an errorq_nvelem_t with a nvlist allocator that consumes the data region.
392  */
393 errorq_t *
394 errorq_nvcreate(const char *name, errorq_func_t func, void *private,
395     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
396 {
397 	errorq_t *eqp;
398 	errorq_elem_t *eep;
399 
400 	eqp = errorq_create(name, func, private, qlen,
401 	    size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST);
402 
403 	if (eqp == NULL)
404 		return (NULL);
405 
406 	mutex_enter(&eqp->eq_lock);
407 
408 	for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) {
409 		errorq_nvelem_t *eqnp = eep->eqe_data;
410 		eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t);
411 		eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size);
412 	}
413 
414 	mutex_exit(&eqp->eq_lock);
415 	return (eqp);
416 }
417 
418 /*
419  * To destroy an error queue, we mark it as disabled and then explicitly drain
420  * all pending errors.  Once the drain is complete, we can remove the queue
421  * from the global list of queues examined by errorq_panic(), and then free
422  * the various queue data structures.  The caller must use some higher-level
423  * abstraction (e.g. disabling an error interrupt) to ensure that no one will
424  * attempt to enqueue new errors while we are freeing this queue.
425  */
426 void
427 errorq_destroy(errorq_t *eqp)
428 {
429 	errorq_t *p, **pp;
430 	errorq_elem_t *eep;
431 	ulong_t i;
432 
433 	ASSERT(eqp != NULL);
434 	eqp->eq_flags &= ~ERRORQ_ACTIVE;
435 	errorq_drain(eqp);
436 
437 	mutex_enter(&errorq_lock);
438 	pp = &errorq_list;
439 
440 	for (p = errorq_list; p != NULL; p = p->eq_next) {
441 		if (p == eqp) {
442 			*pp = p->eq_next;
443 			break;
444 		}
445 		pp = &p->eq_next;
446 	}
447 
448 	mutex_exit(&errorq_lock);
449 	ASSERT(p != NULL);
450 
451 	if (eqp->eq_flags & ERRORQ_NVLIST) {
452 		for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) {
453 			errorq_nvelem_t *eqnp = eep->eqe_data;
454 			fm_nva_xdestroy(eqnp->eqn_nva);
455 		}
456 	}
457 
458 	mutex_destroy(&eqp->eq_lock);
459 	kstat_delete(eqp->eq_ksp);
460 
461 	if (eqp->eq_id != NULL)
462 		ddi_remove_softintr(eqp->eq_id);
463 
464 	kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t));
465 	kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size);
466 
467 	kmem_free(eqp, sizeof (errorq_t));
468 }
469 
470 /*
471  * Dispatch a new error into the queue for later processing.  The specified
472  * data buffer is copied into a preallocated queue element.  If 'len' is
473  * smaller than the queue element size, the remainder of the queue element is
474  * filled with zeroes.  This function may be called from any context subject
475  * to the Platform Considerations described above.
476  */
477 void
478 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag)
479 {
480 	errorq_elem_t *eep, *old;
481 
482 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
483 		atomic_add_64(&errorq_lost, 1);
484 		return; /* drop error if queue is uninitialized or disabled */
485 	}
486 
487 	while ((eep = eqp->eq_free) != NULL) {
488 		if (casptr(&eqp->eq_free, eep, eep->eqe_prev) == eep)
489 			break;
490 	}
491 
492 	if (eep == NULL) {
493 		atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
494 		return;
495 	}
496 
497 	ASSERT(len <= eqp->eq_size);
498 	bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len));
499 
500 	if (len < eqp->eq_size)
501 		bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len);
502 
503 	for (;;) {
504 		old = eqp->eq_pend;
505 		eep->eqe_prev = old;
506 		membar_producer();
507 
508 		if (casptr(&eqp->eq_pend, old, eep) == old)
509 			break;
510 	}
511 
512 	atomic_add_64(&eqp->eq_kstat.eqk_dispatched.value.ui64, 1);
513 
514 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
515 		ddi_trigger_softintr(eqp->eq_id);
516 }
517 
518 /*
519  * Drain the specified error queue by calling eq_func() for each pending error.
520  * This function must be called at or below LOCK_LEVEL or from panic context.
521  * In order to synchronize with other attempts to drain the queue, we acquire
522  * the adaptive eq_lock, blocking other consumers.  Once this lock is held,
523  * we must use compare-and-swap to move the pending list to the processing
524  * list and to return elements to the free list in order to synchronize
525  * with producers, who do not acquire any locks and only use compare-and-swap.
526  *
527  * An additional constraint on this function is that if the system panics
528  * while this function is running, the panic code must be able to detect and
529  * handle all intermediate states and correctly dequeue all errors.  The
530  * errorq_panic() function below will be used for detecting and handling
531  * these intermediate states.  The comments in errorq_drain() below explain
532  * how we make sure each intermediate state is distinct and consistent.
533  */
534 void
535 errorq_drain(errorq_t *eqp)
536 {
537 	errorq_elem_t *eep, *fep, *dep;
538 
539 	ASSERT(eqp != NULL);
540 	mutex_enter(&eqp->eq_lock);
541 
542 	/*
543 	 * If there are one or more pending errors, set eq_ptail to point to
544 	 * the first element on the pending list and then attempt to compare-
545 	 * and-swap NULL to the pending list.  We use membar_producer() to
546 	 * make sure that eq_ptail will be visible to errorq_panic() below
547 	 * before the pending list is NULLed out.  This section is labeled
548 	 * case (1) for errorq_panic, below.  If eq_ptail is not yet set (1A)
549 	 * eq_pend has all the pending errors.  If casptr fails or has not
550 	 * been called yet (1B), eq_pend still has all the pending errors.
551 	 * If casptr succeeds (1C), eq_ptail has all the pending errors.
552 	 */
553 	while ((eep = eqp->eq_pend) != NULL) {
554 		eqp->eq_ptail = eep;
555 		membar_producer();
556 
557 		if (casptr(&eqp->eq_pend, eep, NULL) == eep)
558 			break;
559 	}
560 
561 	/*
562 	 * If no errors were pending, assert that eq_ptail is set to NULL,
563 	 * drop the consumer lock, and return without doing anything.
564 	 */
565 	if (eep == NULL) {
566 		ASSERT(eqp->eq_ptail == NULL);
567 		mutex_exit(&eqp->eq_lock);
568 		return;
569 	}
570 
571 	/*
572 	 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the
573 	 * oldest error, setting the eqe_next pointer so that we can iterate
574 	 * over the errors from oldest to newest.  We use membar_producer()
575 	 * to make sure that these stores are visible before we set eq_phead.
576 	 * If we panic before, during, or just after this loop (case 2),
577 	 * errorq_panic() will simply redo this work, as described below.
578 	 */
579 	for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev)
580 		eep->eqe_prev->eqe_next = eep;
581 	membar_producer();
582 
583 	/*
584 	 * Now set eq_phead to the head of the processing list (the oldest
585 	 * error) and issue another membar_producer() to make sure that
586 	 * eq_phead is seen as non-NULL before we clear eq_ptail.  If we panic
587 	 * after eq_phead is set (case 3), we will detect and log these errors
588 	 * in errorq_panic(), as described below.
589 	 */
590 	eqp->eq_phead = eep;
591 	membar_producer();
592 
593 	eqp->eq_ptail = NULL;
594 	membar_producer();
595 
596 	/*
597 	 * If we enter from errorq_panic_drain(), we may already have
598 	 * errorq elements on the dump list.  Find the tail of
599 	 * the list ready for append.
600 	 */
601 	if (panicstr && (dep = eqp->eq_dump) != NULL) {
602 		while (dep->eqe_dump != NULL)
603 			dep = dep->eqe_dump;
604 	}
605 
606 	/*
607 	 * Now iterate over the processing list from oldest (eq_phead) to
608 	 * newest and log each error.  Once an error is logged, we use
609 	 * compare-and-swap to return it to the free list.  If we panic before,
610 	 * during, or after calling eq_func() (case 4), the error will still be
611 	 * found on eq_phead and will be logged in errorq_panic below.
612 	 */
613 
614 	while ((eep = eqp->eq_phead) != NULL) {
615 		eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
616 		eqp->eq_kstat.eqk_logged.value.ui64++;
617 
618 		eqp->eq_phead = eep->eqe_next;
619 		membar_producer();
620 
621 		eep->eqe_next = NULL;
622 
623 		for (;;) {
624 			fep = eqp->eq_free;
625 			eep->eqe_prev = fep;
626 			membar_producer();
627 
628 			if (casptr(&eqp->eq_free, fep, eep) == fep)
629 				break;
630 		}
631 
632 		/*
633 		 * On panic, we add the element to the dump list for each
634 		 * nvlist errorq.  Elements are stored oldest to newest.
635 		 */
636 		if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) {
637 			if (eqp->eq_dump == NULL)
638 				dep = eqp->eq_dump = eep;
639 			else
640 				dep = dep->eqe_dump = eep;
641 			membar_producer();
642 		}
643 	}
644 
645 	mutex_exit(&eqp->eq_lock);
646 }
647 
648 /*
649  * Now that device tree services are available, set up the soft interrupt
650  * handlers for any queues that were created early in boot.  We then
651  * manually drain these queues to report any pending early errors.
652  */
653 void
654 errorq_init(void)
655 {
656 	dev_info_t *dip = ddi_root_node();
657 	ddi_softintr_t id;
658 	errorq_t *eqp;
659 
660 	ASSERT(modrootloaded != 0);
661 	ASSERT(dip != NULL);
662 
663 	mutex_enter(&errorq_lock);
664 
665 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
666 		ddi_iblock_cookie_t ibc =
667 		    (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl);
668 
669 		if (eqp->eq_id != NULL)
670 			continue; /* softint already initialized */
671 
672 		if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL,
673 		    errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
674 			panic("errorq_init: failed to register IPL %u softint "
675 			    "for queue %s", eqp->eq_ipl, eqp->eq_name);
676 		}
677 
678 		eqp->eq_id = id;
679 		errorq_drain(eqp);
680 	}
681 
682 	mutex_exit(&errorq_lock);
683 }
684 
685 /*
686  * This function is designed to be called from panic context only, and
687  * therefore does not need to acquire errorq_lock when iterating over
688  * errorq_list.  This function must be called no more than once for each
689  * 'what' value (if you change this then review the manipulation of 'dep'.
690  */
691 static uint64_t
692 errorq_panic_drain(uint_t what)
693 {
694 	errorq_elem_t *eep, *nep, *fep, *dep;
695 	errorq_t *eqp;
696 	uint64_t loggedtmp;
697 	uint64_t logged = 0;
698 
699 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
700 		if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what)
701 			continue; /* do not drain this queue on this pass */
702 
703 		loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64;
704 
705 		/*
706 		 * In case (1B) above, eq_ptail may be set but the casptr may
707 		 * not have been executed yet or may have failed.  Either way,
708 		 * we must log errors in chronological order.  So we search
709 		 * the pending list for the error pointed to by eq_ptail.  If
710 		 * it is found, we know that all subsequent errors are also
711 		 * still on the pending list, so just NULL out eq_ptail and let
712 		 * errorq_drain(), below, take care of the logging.
713 		 */
714 		for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) {
715 			if (eep == eqp->eq_ptail) {
716 				ASSERT(eqp->eq_phead == NULL);
717 				eqp->eq_ptail = NULL;
718 				break;
719 			}
720 		}
721 
722 		/*
723 		 * In cases (1C) and (2) above, eq_ptail will be set to the
724 		 * newest error on the processing list but eq_phead will still
725 		 * be NULL.  We set the eqe_next pointers so we can iterate
726 		 * over the processing list in order from oldest error to the
727 		 * newest error.  We then set eq_phead to point to the oldest
728 		 * error and fall into the for-loop below.
729 		 */
730 		if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) {
731 			for (eep->eqe_next = NULL; eep->eqe_prev != NULL;
732 			    eep = eep->eqe_prev)
733 				eep->eqe_prev->eqe_next = eep;
734 
735 			eqp->eq_phead = eep;
736 			eqp->eq_ptail = NULL;
737 		}
738 
739 		/*
740 		 * In cases (3) and (4) above (or after case (1C/2) handling),
741 		 * eq_phead will be set to the oldest error on the processing
742 		 * list.  We log each error and return it to the free list.
743 		 *
744 		 * Unlike errorq_drain(), we don't need to worry about updating
745 		 * eq_phead because errorq_panic() will be called at most once.
746 		 * However, we must use casptr to update the freelist in case
747 		 * errors are still being enqueued during panic.
748 		 */
749 		for (eep = eqp->eq_phead; eep != NULL; eep = nep) {
750 			eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
751 			eqp->eq_kstat.eqk_logged.value.ui64++;
752 
753 			nep = eep->eqe_next;
754 			eep->eqe_next = NULL;
755 
756 			for (;;) {
757 				fep = eqp->eq_free;
758 				eep->eqe_prev = fep;
759 				membar_producer();
760 
761 				if (casptr(&eqp->eq_free, fep, eep) == fep)
762 					break;
763 			}
764 
765 			/*
766 			 * On panic, we add the element to the dump list for
767 			 * each nvlist errorq, stored oldest to newest.
768 			 */
769 			if (eqp->eq_flags & ERRORQ_NVLIST) {
770 				if (eqp->eq_dump == NULL)
771 					dep = eqp->eq_dump = eep;
772 				else
773 					dep = dep->eqe_dump = eep;
774 				membar_producer();
775 			}
776 		}
777 
778 		/*
779 		 * Now go ahead and drain any other errors on the pending list.
780 		 * This call transparently handles case (1A) above, as well as
781 		 * any other errors that were dispatched after errorq_drain()
782 		 * completed its first compare-and-swap.
783 		 */
784 		errorq_drain(eqp);
785 
786 		logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp;
787 	}
788 	return (logged);
789 }
790 
791 /*
792  * Drain all error queues - called only from panic context.  Some drain
793  * functions may enqueue errors to ERRORQ_NVLIST error queues so that
794  * they may be written out in the panic dump - so ERRORQ_NVLIST queues
795  * must be drained last.  Drain ERRORQ_VITAL queues before nonvital queues
796  * so that vital errors get to fill the ERRORQ_NVLIST queues first, and
797  * do not drain the nonvital queues if there are many vital errors.
798  */
799 void
800 errorq_panic(void)
801 {
802 	ASSERT(panicstr != NULL);
803 
804 	if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
805 		(void) errorq_panic_drain(0);
806 	(void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
807 	(void) errorq_panic_drain(ERRORQ_NVLIST);
808 }
809 
810 /*
811  * Reserve an error queue element for later processing and dispatching.  The
812  * element is returned to the caller who may add error-specific data to
813  * element.  The element is retured to the free list when either
814  * errorq_commit() is called and the element asynchronously processed
815  * or immediately when errorq_cancel() is called.
816  */
817 errorq_elem_t *
818 errorq_reserve(errorq_t *eqp)
819 {
820 	errorq_elem_t *eqep;
821 
822 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
823 		atomic_add_64(&errorq_lost, 1);
824 		return (NULL);
825 	}
826 
827 	while ((eqep = eqp->eq_free) != NULL) {
828 		if (casptr(&eqp->eq_free, eqep, eqep->eqe_prev) == eqep)
829 			break;
830 	}
831 
832 	if (eqep == NULL) {
833 		atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
834 		return (NULL);
835 	}
836 
837 	if (eqp->eq_flags & ERRORQ_NVLIST) {
838 		errorq_nvelem_t *eqnp = eqep->eqe_data;
839 		nv_alloc_reset(eqnp->eqn_nva);
840 		eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
841 	}
842 
843 	atomic_add_64(&eqp->eq_kstat.eqk_reserved.value.ui64, 1);
844 	return (eqep);
845 }
846 
847 /*
848  * Commit an errorq element (eqep) for dispatching.
849  * This function may be called from any context subject
850  * to the Platform Considerations described above.
851  */
852 void
853 errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
854 {
855 	errorq_elem_t *old;
856 
857 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
858 		atomic_add_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64, 1);
859 		return;
860 	}
861 
862 	for (;;) {
863 		old = eqp->eq_pend;
864 		eqep->eqe_prev = old;
865 		membar_producer();
866 
867 		if (casptr(&eqp->eq_pend, old, eqep) == old)
868 			break;
869 	}
870 
871 	atomic_add_64(&eqp->eq_kstat.eqk_committed.value.ui64, 1);
872 
873 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
874 		ddi_trigger_softintr(eqp->eq_id);
875 }
876 
877 /*
878  * Cancel an errorq element reservation by returning the specified element
879  * to the free list.  Duplicate or invalid frees are not supported.
880  */
881 void
882 errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
883 {
884 	errorq_elem_t *fep;
885 
886 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
887 		return;
888 
889 	for (;;) {
890 		fep = eqp->eq_free;
891 		eqep->eqe_prev = fep;
892 		membar_producer();
893 
894 		if (casptr(&eqp->eq_free, fep, eqep) == fep)
895 			break;
896 	}
897 
898 	atomic_add_64(&eqp->eq_kstat.eqk_cancelled.value.ui64, 1);
899 }
900 
901 /*
902  * Write elements on the dump list of each nvlist errorq to the dump device.
903  * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
904  */
905 void
906 errorq_dump(void)
907 {
908 	errorq_elem_t *eep;
909 	errorq_t *eqp;
910 
911 	if (ereport_dumpbuf == NULL)
912 		return; /* reboot or panic before errorq is even set up */
913 
914 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
915 		if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
916 		    !(eqp->eq_flags & ERRORQ_ACTIVE))
917 			continue; /* do not dump this queue on panic */
918 
919 		for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
920 			errorq_nvelem_t *eqnp = eep->eqe_data;
921 			size_t len = 0;
922 			erpt_dump_t ed;
923 			int err;
924 
925 			(void) nvlist_size(eqnp->eqn_nvl,
926 			    &len, NV_ENCODE_NATIVE);
927 
928 			if (len > ereport_dumplen || len == 0) {
929 				cmn_err(CE_WARN, "%s: unable to save error "
930 				    "report %p due to size %lu\n",
931 				    eqp->eq_name, (void *)eep, len);
932 				continue;
933 			}
934 
935 			if ((err = nvlist_pack(eqnp->eqn_nvl,
936 			    (char **)&ereport_dumpbuf, &ereport_dumplen,
937 			    NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
938 				cmn_err(CE_WARN, "%s: unable to save error "
939 				    "report %p due to pack error %d\n",
940 				    eqp->eq_name, (void *)eep, err);
941 				continue;
942 			}
943 
944 			ed.ed_magic = ERPT_MAGIC;
945 			ed.ed_chksum = checksum32(ereport_dumpbuf, len);
946 			ed.ed_size = (uint32_t)len;
947 			ed.ed_pad = 0;
948 			ed.ed_hrt_nsec = 0;
949 			ed.ed_hrt_base = panic_hrtime;
950 			ed.ed_tod_base.sec = panic_hrestime.tv_sec;
951 			ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;
952 
953 			dumpvp_write(&ed, sizeof (ed));
954 			dumpvp_write(ereport_dumpbuf, len);
955 		}
956 	}
957 }
958 
959 nvlist_t *
960 errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
961 {
962 	errorq_nvelem_t *eqnp = eqep->eqe_data;
963 
964 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
965 
966 	return (eqnp->eqn_nvl);
967 }
968 
969 nv_alloc_t *
970 errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
971 {
972 	errorq_nvelem_t *eqnp = eqep->eqe_data;
973 
974 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
975 
976 	return (eqnp->eqn_nva);
977 }
978 
979 /*
980  * Reserve a new element and duplicate the data of the original into it.
981  */
982 void *
983 errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
984 {
985 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
986 	ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));
987 
988 	if ((*neqep = errorq_reserve(eqp)) == NULL)
989 		return (NULL);
990 
991 	bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
992 	return ((*neqep)->eqe_data);
993 }
994