xref: /illumos-gate/usr/src/uts/common/os/errorq.c (revision 24f5a37652e188ebdcdd6da454511686935025df)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Kernel Error Queues
28  *
29  * A common problem when handling hardware error traps and interrupts is that
30  * these errors frequently must be handled at high interrupt level, where
31  * reliably producing error messages and safely examining and manipulating
32  * other kernel state may not be possible.  The kernel error queue primitive is
33  * a common set of routines that allow a subsystem to maintain a queue of
34  * errors that can be processed by an explicit call from a safe context or by a
35  * soft interrupt that fires at a specific lower interrupt level.  The queue
36  * management code also ensures that if the system panics, all in-transit
37  * errors are logged prior to reset.  Each queue has an associated kstat for
38  * observing the number of errors dispatched and logged, and mdb(1) debugging
39  * support is provided for live and post-mortem observability.
40  *
41  * Memory Allocation
42  *
43  * 	All of the queue data structures are allocated in advance as part of
44  * 	the errorq_create() call.  No additional memory allocations are
45  * 	performed as part of errorq_dispatch(), errorq_reserve(),
46  *	errorq_commit() or errorq_drain().  This design
47  * 	facilitates reliable error queue processing even when the system is low
48  * 	on memory, and ensures that errorq_dispatch() can be called from any
49  * 	context.  When the queue is created, the maximum queue length is
50  * 	specified as a parameter to errorq_create() and errorq_nvcreate().  This
51  *	length should represent a reasonable upper bound on the number of
52  *	simultaneous errors.  If errorq_dispatch() or errorq_reserve() is
53  *	invoked and no free queue elements are available, the error is
54  *	dropped and will not be logged.  Typically, the queue will only be
55  *	exhausted by an error storm, and in this case
56  * 	the earlier errors provide the most important data for analysis.
57  * 	When a new error is dispatched, the error data is copied into the
58  * 	preallocated queue element so that the caller's buffer can be reused.
59  *
60  *	When a new error is reserved, an element is moved from the free pool
61  *	and returned to the caller.  The element buffer data, eqe_data, may be
62  *	managed by the caller and dispatched to the errorq by calling
63  *	errorq_commit().  This is useful for additions to errorq's
64  *	created with errorq_nvcreate() to handle name-value pair (nvpair) data.
65  *	See below for a discussion on nvlist errorq's.
66  *
67  * Queue Drain Callback
68  *
69  *      When the error queue is drained, the caller's queue drain callback is
70  *      invoked with a pointer to the saved error data.  This function may be
71  *      called from passive kernel context or soft interrupt context at or
72  *      below LOCK_LEVEL, or as part of panic().  As such, the callback should
73  *      basically only be calling cmn_err (but NOT with the CE_PANIC flag).
74  *      The callback must not call panic(), attempt to allocate memory, or wait
75  *      on a condition variable.  The callback may not call errorq_destroy()
76  *      or errorq_drain() on the same error queue that called it.
77  *
78  *      The queue drain callback will always be called for each pending error
79  *      in the order in which errors were enqueued (oldest to newest).  The
80  *      queue drain callback is guaranteed to provide at *least* once semantics
81  *      for all errors that are successfully dispatched (i.e. for which
82  *      errorq_dispatch() has successfully completed).  If an unrelated panic
83  *      occurs while the queue drain callback is running on a vital queue, the
84  *      panic subsystem will continue the queue drain and the callback may be
85  *      invoked again for the same error.  Therefore, the callback should
86  *      restrict itself to logging messages and taking other actions that are
87  *      not destructive if repeated.
88  *
89  * Name-Value Pair Error Queues
90  *
91  *	During error handling, it may be more convenient to store error
92  *	queue element data as a fixed buffer of name-value pairs.  The
93  *	nvpair library allows construction and destruction of nvlists
94  *	in pre-allocated memory buffers.
95  *
96  *	Error queues created via errorq_nvcreate() store queue element
97  *	data as fixed buffer nvlists (ereports).  errorq_reserve()
98  *	allocates an errorq element from eqp->eq_bitmap and returns a valid
99  *	pointer to an errorq_elem_t (queue element) and a pre-allocated
100  *	fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
101  *	to the nvlist to add name-value ereport members prior to
102  *	dispatching the error queue element in errorq_commit().
103  *
104  *	Once dispatched, the drain function will return the element to
105  *	eqp->eq_bitmap and reset the associated nv_alloc structure.
106  *	errorq_cancel() may be called to cancel a reservation for an
107  *	element that was never dispatched (committed).  This is useful in
108  *	cases where a programming error prevents a queue element from being
109  *	dispatched.
110  *
111  * Queue Management
112  *
113  *      The queue element structures and error data buffers are allocated in
114  *      two contiguous chunks as part of errorq_create() or errorq_nvcreate().
115  *	Each queue element structure contains a next pointer,
116  *	a previous pointer, and a pointer to the corresponding error data
117  *	buffer.  The data buffer for a nvlist errorq is a shared buffer
118  *	for the allocation of name-value pair lists. The elements are kept on
119  *      one of four lists:
120  *
121  *	Unused elements are kept in the free pool, managed by eqp->eq_bitmap.
122  *	The eqe_prev and eqe_next pointers are not used while in the free pool
123  *	and will be set to NULL.
124  *
125  *      Pending errors are kept on the pending list, a singly-linked list
126  *      pointed to by eqp->eq_pend, and linked together using eqe_prev.  This
127  *      list is maintained in order from newest error to oldest.  The eqe_next
128  *      pointer is not used by the pending list and will be set to NULL.
129  *
130  *      The processing list is a doubly-linked list pointed to by eqp->eq_phead
131  *      (the oldest element) and eqp->eq_ptail (the newest element).  The
132  *      eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the
133  *      eqe_prev pointer is used to traverse from eq_ptail to eq_phead.  Once a
134  *      queue drain operation begins, the current pending list is moved to the
135  *      processing list in a two-phase commit fashion (eq_ptail being cleared
136  *	at the beginning but eq_phead only at the end), allowing the panic code
137  *      to always locate and process all pending errors in the event that a
138  *      panic occurs in the middle of queue processing.
139  *
140  *	A fourth list is maintained for nvlist errorqs.  The dump list,
141  *	eq_dump is used to link all errorq elements that should be stored
142  *	in a crash dump file in the event of a system panic.  During
143  *	errorq_panic(), the list is created and subsequently traversed
144  *	in errorq_dump() during the final phases of a crash dump.
145  *
146  * Platform Considerations
147  *
148  *      In order to simplify their implementation, error queues make use of the
149  *      C wrappers for compare-and-swap.  If the platform itself does not
150  *      support compare-and-swap in hardware and the kernel emulation routines
151  *      are used instead, then the context in which errorq_dispatch() can be
152  *      safely invoked is further constrained by the implementation of the
153  *      compare-and-swap emulation.  Specifically, if errorq_dispatch() is
154  *      called from a code path that can be executed above ATOMIC_LEVEL on such
155  *      a platform, the dispatch code could potentially deadlock unless the
156  *      corresponding error interrupt is blocked or disabled prior to calling
157  *      errorq_dispatch().  Error queues should therefore be deployed with
158  *      caution on these platforms.
159  *
160  * Interfaces
161  *
162  * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags);
163  * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags);
164  *
165  *      Create a new error queue with the specified name, callback, and
166  *      properties.  A pointer to the new error queue is returned upon success,
167  *      or NULL is returned to indicate that the queue could not be created.
168  *      This function must be called from passive kernel context with no locks
169  *      held that can prevent a sleeping memory allocation from occurring.
170  *      errorq_create() will return failure if the queue kstats cannot be
171  *      created, or if a soft interrupt handler cannot be registered.
172  *
173  *      The queue 'name' is a string that is recorded for live and post-mortem
174  *      examination by a debugger.  The queue callback 'func' will be invoked
175  *      for each error drained from the queue, and will receive the 'private'
176  *      pointer as its first argument.  The callback must obey the rules for
177  *      callbacks described above.  The queue will have maximum length 'qlen'
178  *      and each element will be able to record up to 'eltsize' bytes of data.
179  *      The queue's soft interrupt (see errorq_dispatch(), below) will fire
180  *      at 'ipl', which should not exceed LOCK_LEVEL.  The queue 'flags' may
181  *      include the following flag:
182  *
183  *      ERRORQ_VITAL    - This queue contains information that is considered
184  *         vital to problem diagnosis.  Error queues that are marked vital will
185  *         be automatically drained by the panic subsystem prior to printing
186  *         the panic messages to the console.
187  *
188  * void errorq_destroy(errorq);
189  *
190  *      Destroy the specified error queue.  The queue is drained of any
191  *      pending elements and these are logged before errorq_destroy returns.
192  *      Once errorq_destroy() begins draining the queue, any simultaneous
193  *      calls to dispatch errors will result in the errors being dropped.
194  *      The caller must invoke a higher-level abstraction (e.g. disabling
195  *      an error interrupt) to ensure that error handling code does not
196  *      attempt to dispatch errors to the queue while it is being freed.
197  *
198  * void errorq_dispatch(errorq, data, len, flag);
199  *
200  *      Attempt to enqueue the specified error data.  If a free queue element
201  *      is available, the data is copied into a free element and placed on a
202  *      pending list.  If no free queue element is available, the error is
203  *      dropped.  The data length (len) is specified in bytes and should not
204  *      exceed the queue's maximum element size.  If the data length is less
205  *      than the maximum element size, the remainder of the queue element is
206  *      filled with zeroes.  The flag parameter should be one of:
207  *
208  *      ERRORQ_ASYNC    - Schedule a soft interrupt at the previously specified
209  *         IPL to asynchronously drain the queue on behalf of the caller.
210  *
211  *      ERRORQ_SYNC     - Do not schedule a soft interrupt to drain the queue.
212  *         The caller is presumed to be calling errorq_drain() or panic() in
213  *         the near future in order to drain the queue and log the error.
214  *
215  *      The errorq_dispatch() function may be called from any context, subject
216  *      to the Platform Considerations described above.
217  *
218  * void errorq_drain(errorq);
219  *
220  *      Drain the error queue of all pending errors.  The queue's callback
221  *      function is invoked for each error in order from oldest to newest.
222  *      This function may be used at or below LOCK_LEVEL or from panic context.
223  *
224  * errorq_elem_t *errorq_reserve(errorq);
225  *
226  *	Reserve an error queue element for later processing and dispatching.
227  *	The element is returned to the caller who may add error-specific data
228  *	to the element.  The element is returned to the free pool when either
229  *	errorq_commit() is called and the element asynchronously processed
230  *	or immediately when errorq_cancel() is called.
231  *
232  * void errorq_commit(errorq, errorq_elem, flag);
233  *
234  *	Commit an errorq element (eqep) for dispatching, see
235  *	errorq_dispatch().
236  *
237  * void errorq_cancel(errorq, errorq_elem);
238  *
239  *	Cancel a pending errorq element reservation.  The errorq element is
240  *	returned to the free pool upon cancelation.
241  */
242 
243 #include <sys/errorq_impl.h>
244 #include <sys/sysmacros.h>
245 #include <sys/machlock.h>
246 #include <sys/cmn_err.h>
247 #include <sys/atomic.h>
248 #include <sys/systm.h>
249 #include <sys/kmem.h>
250 #include <sys/conf.h>
251 #include <sys/ddi.h>
252 #include <sys/sunddi.h>
253 #include <sys/bootconf.h>
254 #include <sys/spl.h>
255 #include <sys/dumphdr.h>
256 #include <sys/compress.h>
257 #include <sys/time.h>
258 #include <sys/panic.h>
259 #include <sys/bitmap.h>
260 #include <sys/fm/protocol.h>
261 #include <sys/fm/util.h>
262 
263 static struct errorq_kstat errorq_kstat_template = {
264 	{ "dispatched", KSTAT_DATA_UINT64 },
265 	{ "dropped", KSTAT_DATA_UINT64 },
266 	{ "logged", KSTAT_DATA_UINT64 },
267 	{ "reserved", KSTAT_DATA_UINT64 },
268 	{ "reserve_fail", KSTAT_DATA_UINT64 },
269 	{ "committed", KSTAT_DATA_UINT64 },
270 	{ "commit_fail", KSTAT_DATA_UINT64 },
271 	{ "cancelled", KSTAT_DATA_UINT64 }
272 };
273 
274 static uint64_t errorq_lost = 0;
275 static errorq_t *errorq_list = NULL;
276 static kmutex_t errorq_lock;
277 static uint64_t errorq_vitalmin = 5;
278 
279 static uint_t
280 errorq_intr(caddr_t eqp)
281 {
282 	errorq_drain((errorq_t *)eqp);
283 	return (DDI_INTR_CLAIMED);
284 }
285 
286 /*
287  * Create a new error queue with the specified properties and add a software
288  * interrupt handler and kstat for it.  This function must be called from
289  * passive kernel context with no locks held that can prevent a sleeping
290  * memory allocation from occurring.  This function will return NULL if the
291  * softint or kstat for this queue cannot be created.
292  */
293 errorq_t *
294 errorq_create(const char *name, errorq_func_t func, void *private,
295     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
296 {
297 	errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP);
298 	ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl);
299 	dev_info_t *dip = ddi_root_node();
300 
301 	errorq_elem_t *eep;
302 	ddi_softintr_t id = NULL;
303 	caddr_t data;
304 
305 	ASSERT(qlen != 0 && size != 0);
306 	ASSERT(ipl > 0 && ipl <= LOCK_LEVEL);
307 
308 	/*
309 	 * If a queue is created very early in boot before device tree services
310 	 * are available, the queue softint handler cannot be created.  We
311 	 * manually drain these queues and create their softint handlers when
312 	 * it is safe to do so as part of errorq_init(), below.
313 	 */
314 	if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id,
315 	    &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
316 		cmn_err(CE_WARN, "errorq_create: failed to register "
317 		    "IPL %u softint for queue %s", ipl, name);
318 		kmem_free(eqp, sizeof (errorq_t));
319 		return (NULL);
320 	}
321 
322 	if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq",
323 	    KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) /
324 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) {
325 		cmn_err(CE_WARN, "errorq_create: failed to create kstat "
326 		    "for queue %s", name);
327 		if (id != NULL)
328 			ddi_remove_softintr(id);
329 		kmem_free(eqp, sizeof (errorq_t));
330 		return (NULL);
331 	}
332 
333 	bcopy(&errorq_kstat_template, &eqp->eq_kstat,
334 	    sizeof (struct errorq_kstat));
335 	eqp->eq_ksp->ks_data = &eqp->eq_kstat;
336 	eqp->eq_ksp->ks_private = eqp;
337 	kstat_install(eqp->eq_ksp);
338 
339 	(void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN);
340 	eqp->eq_name[ERRORQ_NAMELEN] = '\0';
341 	eqp->eq_func = func;
342 	eqp->eq_private = private;
343 	eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP);
344 	eqp->eq_qlen = qlen;
345 	eqp->eq_size = size;
346 	eqp->eq_ipl = ipl;
347 	eqp->eq_flags = flags | ERRORQ_ACTIVE;
348 	eqp->eq_id = id;
349 	mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL);
350 	eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP);
351 	eqp->eq_phead = NULL;
352 	eqp->eq_ptail = NULL;
353 	eqp->eq_pend = NULL;
354 	eqp->eq_dump = NULL;
355 	eqp->eq_bitmap = kmem_zalloc(BT_SIZEOFMAP(qlen), KM_SLEEP);
356 	eqp->eq_rotor = 0;
357 
358 	/*
359 	 * Iterate over the array of errorq_elem_t structures and set its
360 	 * data pointer.
361 	 */
362 	for (eep = eqp->eq_elems, data = eqp->eq_data; qlen > 1; qlen--) {
363 		eep->eqe_next = NULL;
364 		eep->eqe_dump = NULL;
365 		eep->eqe_prev = NULL;
366 		eep->eqe_data = data;
367 		data += size;
368 		eep++;
369 	}
370 	eep->eqe_next = NULL;
371 	eep->eqe_prev = NULL;
372 	eep->eqe_data = data;
373 	eep->eqe_dump = NULL;
374 
375 	/*
376 	 * Once the errorq is initialized, add it to the global list of queues,
377 	 * and then return a pointer to the new queue to the caller.
378 	 */
379 	mutex_enter(&errorq_lock);
380 	eqp->eq_next = errorq_list;
381 	errorq_list = eqp;
382 	mutex_exit(&errorq_lock);
383 
384 	return (eqp);
385 }
386 
387 /*
388  * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST
389  * flag and initialize each element to have the start of its data region used
390  * as an errorq_nvelem_t with a nvlist allocator that consumes the data region.
391  */
392 errorq_t *
393 errorq_nvcreate(const char *name, errorq_func_t func, void *private,
394     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
395 {
396 	errorq_t *eqp;
397 	errorq_elem_t *eep;
398 
399 	eqp = errorq_create(name, func, private, qlen,
400 	    size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST);
401 
402 	if (eqp == NULL)
403 		return (NULL);
404 
405 	mutex_enter(&eqp->eq_lock);
406 
407 	for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) {
408 		errorq_nvelem_t *eqnp = eep->eqe_data;
409 		eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t);
410 		eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size);
411 	}
412 
413 	mutex_exit(&eqp->eq_lock);
414 	return (eqp);
415 }
416 
417 /*
418  * To destroy an error queue, we mark it as disabled and then explicitly drain
419  * all pending errors.  Once the drain is complete, we can remove the queue
420  * from the global list of queues examined by errorq_panic(), and then free
421  * the various queue data structures.  The caller must use some higher-level
422  * abstraction (e.g. disabling an error interrupt) to ensure that no one will
423  * attempt to enqueue new errors while we are freeing this queue.
424  */
425 void
426 errorq_destroy(errorq_t *eqp)
427 {
428 	errorq_t *p, **pp;
429 	errorq_elem_t *eep;
430 	ulong_t i;
431 
432 	ASSERT(eqp != NULL);
433 	eqp->eq_flags &= ~ERRORQ_ACTIVE;
434 	errorq_drain(eqp);
435 
436 	mutex_enter(&errorq_lock);
437 	pp = &errorq_list;
438 
439 	for (p = errorq_list; p != NULL; p = p->eq_next) {
440 		if (p == eqp) {
441 			*pp = p->eq_next;
442 			break;
443 		}
444 		pp = &p->eq_next;
445 	}
446 
447 	mutex_exit(&errorq_lock);
448 	ASSERT(p != NULL);
449 
450 	if (eqp->eq_flags & ERRORQ_NVLIST) {
451 		for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) {
452 			errorq_nvelem_t *eqnp = eep->eqe_data;
453 			fm_nva_xdestroy(eqnp->eqn_nva);
454 		}
455 	}
456 
457 	mutex_destroy(&eqp->eq_lock);
458 	kstat_delete(eqp->eq_ksp);
459 
460 	if (eqp->eq_id != NULL)
461 		ddi_remove_softintr(eqp->eq_id);
462 
463 	kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t));
464 	kmem_free(eqp->eq_bitmap, BT_SIZEOFMAP(eqp->eq_qlen));
465 	kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size);
466 
467 	kmem_free(eqp, sizeof (errorq_t));
468 }
469 
470 /*
471  * private version of bt_availbit which makes a best-efforts attempt
472  * at allocating in a round-robin fashion in order to facilitate post-mortem
473  * diagnosis.
474  */
475 static index_t
476 errorq_availbit(ulong_t *bitmap, size_t nbits, index_t curindex)
477 {
478 	ulong_t bit, maxbit, bx;
479 	index_t rval, nextindex = curindex + 1;
480 	index_t nextword = nextindex >> BT_ULSHIFT;
481 	ulong_t nextbitindex = nextindex & BT_ULMASK;
482 	index_t maxindex = nbits - 1;
483 	index_t maxword = maxindex >> BT_ULSHIFT;
484 	ulong_t maxbitindex = maxindex & BT_ULMASK;
485 
486 	/*
487 	 * First check if there are still some bits remaining in the current
488 	 * word, and see if any of those are available. We need to do this by
489 	 * hand as the bt_availbit() function always starts at the beginning
490 	 * of a word.
491 	 */
492 	if (nextindex <= maxindex && nextbitindex != 0) {
493 		maxbit = (nextword == maxword) ? maxbitindex : BT_ULMASK;
494 		for (bx = 0, bit = 1; bx <= maxbit; bx++, bit <<= 1)
495 			if (bx >= nextbitindex && !(bitmap[nextword] & bit))
496 				return ((nextword << BT_ULSHIFT) + bx);
497 		nextword++;
498 	}
499 	/*
500 	 * Now check if there are any words remaining before the end of the
501 	 * bitmap. Use bt_availbit() to find any free bits.
502 	 */
503 	if (nextword <= maxword)
504 		if ((rval = bt_availbit(&bitmap[nextword],
505 		    nbits - (nextword << BT_ULSHIFT))) != -1)
506 			return ((nextword << BT_ULSHIFT) + rval);
507 	/*
508 	 * Finally loop back to the start and look for any free bits starting
509 	 * from the beginning of the bitmap to the current rotor position.
510 	 */
511 	return (bt_availbit(bitmap, nextindex));
512 }
513 
514 /*
515  * Dispatch a new error into the queue for later processing.  The specified
516  * data buffer is copied into a preallocated queue element.  If 'len' is
517  * smaller than the queue element size, the remainder of the queue element is
518  * filled with zeroes.  This function may be called from any context subject
519  * to the Platform Considerations described above.
520  */
521 void
522 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag)
523 {
524 	errorq_elem_t *eep, *old;
525 
526 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
527 		atomic_inc_64(&errorq_lost);
528 		return; /* drop error if queue is uninitialized or disabled */
529 	}
530 
531 	for (;;) {
532 		int i, rval;
533 
534 		if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
535 		    eqp->eq_rotor)) == -1) {
536 			atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64);
537 			return;
538 		}
539 		BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
540 		if (rval == 0) {
541 			eqp->eq_rotor = i;
542 			eep = &eqp->eq_elems[i];
543 			break;
544 		}
545 	}
546 
547 	ASSERT(len <= eqp->eq_size);
548 	bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len));
549 
550 	if (len < eqp->eq_size)
551 		bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len);
552 
553 	for (;;) {
554 		old = eqp->eq_pend;
555 		eep->eqe_prev = old;
556 		membar_producer();
557 
558 		if (atomic_cas_ptr(&eqp->eq_pend, old, eep) == old)
559 			break;
560 	}
561 
562 	atomic_inc_64(&eqp->eq_kstat.eqk_dispatched.value.ui64);
563 
564 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
565 		ddi_trigger_softintr(eqp->eq_id);
566 }
567 
568 /*
569  * Drain the specified error queue by calling eq_func() for each pending error.
570  * This function must be called at or below LOCK_LEVEL or from panic context.
571  * In order to synchronize with other attempts to drain the queue, we acquire
572  * the adaptive eq_lock, blocking other consumers.  Once this lock is held,
573  * we must use compare-and-swap to move the pending list to the processing
574  * list and to return elements to the free pool in order to synchronize
575  * with producers, who do not acquire any locks and only use atomic set/clear.
576  *
577  * An additional constraint on this function is that if the system panics
578  * while this function is running, the panic code must be able to detect and
579  * handle all intermediate states and correctly dequeue all errors.  The
580  * errorq_panic() function below will be used for detecting and handling
581  * these intermediate states.  The comments in errorq_drain() below explain
582  * how we make sure each intermediate state is distinct and consistent.
583  */
584 void
585 errorq_drain(errorq_t *eqp)
586 {
587 	errorq_elem_t *eep, *dep;
588 
589 	ASSERT(eqp != NULL);
590 	mutex_enter(&eqp->eq_lock);
591 
592 	/*
593 	 * If there are one or more pending errors, set eq_ptail to point to
594 	 * the first element on the pending list and then attempt to compare-
595 	 * and-swap NULL to the pending list.  We use membar_producer() to
596 	 * make sure that eq_ptail will be visible to errorq_panic() below
597 	 * before the pending list is NULLed out.  This section is labeled
598 	 * case (1) for errorq_panic, below.  If eq_ptail is not yet set (1A)
599 	 * eq_pend has all the pending errors.  If atomic_cas_ptr fails or
600 	 * has not been called yet (1B), eq_pend still has all the pending
601 	 * errors.  If atomic_cas_ptr succeeds (1C), eq_ptail has all the
602 	 * pending errors.
603 	 */
604 	while ((eep = eqp->eq_pend) != NULL) {
605 		eqp->eq_ptail = eep;
606 		membar_producer();
607 
608 		if (atomic_cas_ptr(&eqp->eq_pend, eep, NULL) == eep)
609 			break;
610 	}
611 
612 	/*
613 	 * If no errors were pending, assert that eq_ptail is set to NULL,
614 	 * drop the consumer lock, and return without doing anything.
615 	 */
616 	if (eep == NULL) {
617 		ASSERT(eqp->eq_ptail == NULL);
618 		mutex_exit(&eqp->eq_lock);
619 		return;
620 	}
621 
622 	/*
623 	 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the
624 	 * oldest error, setting the eqe_next pointer so that we can iterate
625 	 * over the errors from oldest to newest.  We use membar_producer()
626 	 * to make sure that these stores are visible before we set eq_phead.
627 	 * If we panic before, during, or just after this loop (case 2),
628 	 * errorq_panic() will simply redo this work, as described below.
629 	 */
630 	for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev)
631 		eep->eqe_prev->eqe_next = eep;
632 	membar_producer();
633 
634 	/*
635 	 * Now set eq_phead to the head of the processing list (the oldest
636 	 * error) and issue another membar_producer() to make sure that
637 	 * eq_phead is seen as non-NULL before we clear eq_ptail.  If we panic
638 	 * after eq_phead is set (case 3), we will detect and log these errors
639 	 * in errorq_panic(), as described below.
640 	 */
641 	eqp->eq_phead = eep;
642 	membar_producer();
643 
644 	eqp->eq_ptail = NULL;
645 	membar_producer();
646 
647 	/*
648 	 * If we enter from errorq_panic_drain(), we may already have
649 	 * errorq elements on the dump list.  Find the tail of
650 	 * the list ready for append.
651 	 */
652 	if (panicstr && (dep = eqp->eq_dump) != NULL) {
653 		while (dep->eqe_dump != NULL)
654 			dep = dep->eqe_dump;
655 	}
656 
657 	/*
658 	 * Now iterate over the processing list from oldest (eq_phead) to
659 	 * newest and log each error.  Once an error is logged, we use
660 	 * atomic clear to return it to the free pool.  If we panic before,
661 	 * during, or after calling eq_func() (case 4), the error will still be
662 	 * found on eq_phead and will be logged in errorq_panic below.
663 	 */
664 
665 	while ((eep = eqp->eq_phead) != NULL) {
666 		eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
667 		eqp->eq_kstat.eqk_logged.value.ui64++;
668 
669 		eqp->eq_phead = eep->eqe_next;
670 		membar_producer();
671 
672 		eep->eqe_next = NULL;
673 
674 		/*
675 		 * On panic, we add the element to the dump list for each
676 		 * nvlist errorq.  Elements are stored oldest to newest.
677 		 * Then continue, so we don't free and subsequently overwrite
678 		 * any elements which we've put on the dump queue.
679 		 */
680 		if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) {
681 			if (eqp->eq_dump == NULL)
682 				dep = eqp->eq_dump = eep;
683 			else
684 				dep = dep->eqe_dump = eep;
685 			membar_producer();
686 			continue;
687 		}
688 
689 		eep->eqe_prev = NULL;
690 		BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems);
691 	}
692 
693 	mutex_exit(&eqp->eq_lock);
694 }
695 
696 /*
697  * Now that device tree services are available, set up the soft interrupt
698  * handlers for any queues that were created early in boot.  We then
699  * manually drain these queues to report any pending early errors.
700  */
701 void
702 errorq_init(void)
703 {
704 	dev_info_t *dip = ddi_root_node();
705 	ddi_softintr_t id;
706 	errorq_t *eqp;
707 
708 	ASSERT(modrootloaded != 0);
709 	ASSERT(dip != NULL);
710 
711 	mutex_enter(&errorq_lock);
712 
713 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
714 		ddi_iblock_cookie_t ibc =
715 		    (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl);
716 
717 		if (eqp->eq_id != NULL)
718 			continue; /* softint already initialized */
719 
720 		if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL,
721 		    errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
722 			panic("errorq_init: failed to register IPL %u softint "
723 			    "for queue %s", eqp->eq_ipl, eqp->eq_name);
724 		}
725 
726 		eqp->eq_id = id;
727 		errorq_drain(eqp);
728 	}
729 
730 	mutex_exit(&errorq_lock);
731 }
732 
733 /*
734  * This function is designed to be called from panic context only, and
735  * therefore does not need to acquire errorq_lock when iterating over
736  * errorq_list.  This function must be called no more than once for each
737  * 'what' value (if you change this then review the manipulation of 'dep').
738  */
739 static uint64_t
740 errorq_panic_drain(uint_t what)
741 {
742 	errorq_elem_t *eep, *nep, *dep;
743 	errorq_t *eqp;
744 	uint64_t loggedtmp;
745 	uint64_t logged = 0;
746 
747 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
748 		if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what)
749 			continue; /* do not drain this queue on this pass */
750 
751 		loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64;
752 
753 		/*
754 		 * In case (1B) above, eq_ptail may be set but the
755 		 * atomic_cas_ptr may not have been executed yet or may have
756 		 * failed.  Either way, we must log errors in chronological
757 		 * order.  So we search the pending list for the error
758 		 * pointed to by eq_ptail.  If it is found, we know that all
759 		 * subsequent errors are also still on the pending list, so
760 		 * just NULL out eq_ptail and let errorq_drain(), below,
761 		 * take care of the logging.
762 		 */
763 		for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) {
764 			if (eep == eqp->eq_ptail) {
765 				ASSERT(eqp->eq_phead == NULL);
766 				eqp->eq_ptail = NULL;
767 				break;
768 			}
769 		}
770 
771 		/*
772 		 * In cases (1C) and (2) above, eq_ptail will be set to the
773 		 * newest error on the processing list but eq_phead will still
774 		 * be NULL.  We set the eqe_next pointers so we can iterate
775 		 * over the processing list in order from oldest error to the
776 		 * newest error.  We then set eq_phead to point to the oldest
777 		 * error and fall into the for-loop below.
778 		 */
779 		if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) {
780 			for (eep->eqe_next = NULL; eep->eqe_prev != NULL;
781 			    eep = eep->eqe_prev)
782 				eep->eqe_prev->eqe_next = eep;
783 
784 			eqp->eq_phead = eep;
785 			eqp->eq_ptail = NULL;
786 		}
787 
788 		/*
789 		 * In cases (3) and (4) above (or after case (1C/2) handling),
790 		 * eq_phead will be set to the oldest error on the processing
791 		 * list.  We log each error and return it to the free pool.
792 		 *
793 		 * Unlike errorq_drain(), we don't need to worry about updating
794 		 * eq_phead because errorq_panic() will be called at most once.
795 		 * However, we must use atomic_cas_ptr to update the
796 		 * freelist in case errors are still being enqueued during
797 		 * panic.
798 		 */
799 		for (eep = eqp->eq_phead; eep != NULL; eep = nep) {
800 			eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
801 			eqp->eq_kstat.eqk_logged.value.ui64++;
802 
803 			nep = eep->eqe_next;
804 			eep->eqe_next = NULL;
805 
806 			/*
807 			 * On panic, we add the element to the dump list for
808 			 * each nvlist errorq, stored oldest to newest. Then
809 			 * continue, so we don't free and subsequently overwrite
810 			 * any elements which we've put on the dump queue.
811 			 */
812 			if (eqp->eq_flags & ERRORQ_NVLIST) {
813 				if (eqp->eq_dump == NULL)
814 					dep = eqp->eq_dump = eep;
815 				else
816 					dep = dep->eqe_dump = eep;
817 				membar_producer();
818 				continue;
819 			}
820 
821 			eep->eqe_prev = NULL;
822 			BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems);
823 		}
824 
825 		/*
826 		 * Now go ahead and drain any other errors on the pending list.
827 		 * This call transparently handles case (1A) above, as well as
828 		 * any other errors that were dispatched after errorq_drain()
829 		 * completed its first compare-and-swap.
830 		 */
831 		errorq_drain(eqp);
832 
833 		logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp;
834 	}
835 	return (logged);
836 }
837 
838 /*
839  * Drain all error queues - called only from panic context.  Some drain
840  * functions may enqueue errors to ERRORQ_NVLIST error queues so that
841  * they may be written out in the panic dump - so ERRORQ_NVLIST queues
842  * must be drained last.  Drain ERRORQ_VITAL queues before nonvital queues
843  * so that vital errors get to fill the ERRORQ_NVLIST queues first, and
844  * do not drain the nonvital queues if there are many vital errors.
845  */
846 void
847 errorq_panic(void)
848 {
849 	ASSERT(panicstr != NULL);
850 
851 	if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
852 		(void) errorq_panic_drain(0);
853 	(void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
854 	(void) errorq_panic_drain(ERRORQ_NVLIST);
855 }
856 
857 /*
858  * Reserve an error queue element for later processing and dispatching.  The
859  * element is returned to the caller who may add error-specific data to
860  * element.  The element is retured to the free pool when either
861  * errorq_commit() is called and the element asynchronously processed
862  * or immediately when errorq_cancel() is called.
863  */
864 errorq_elem_t *
865 errorq_reserve(errorq_t *eqp)
866 {
867 	errorq_elem_t *eqep;
868 
869 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
870 		atomic_inc_64(&errorq_lost);
871 		return (NULL);
872 	}
873 
874 	for (;;) {
875 		int i, rval;
876 
877 		if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
878 		    eqp->eq_rotor)) == -1) {
879 			atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64);
880 			return (NULL);
881 		}
882 		BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
883 		if (rval == 0) {
884 			eqp->eq_rotor = i;
885 			eqep = &eqp->eq_elems[i];
886 			break;
887 		}
888 	}
889 
890 	if (eqp->eq_flags & ERRORQ_NVLIST) {
891 		errorq_nvelem_t *eqnp = eqep->eqe_data;
892 		nv_alloc_reset(eqnp->eqn_nva);
893 		eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
894 	}
895 
896 	atomic_inc_64(&eqp->eq_kstat.eqk_reserved.value.ui64);
897 	return (eqep);
898 }
899 
900 /*
901  * Commit an errorq element (eqep) for dispatching.
902  * This function may be called from any context subject
903  * to the Platform Considerations described above.
904  */
905 void
906 errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
907 {
908 	errorq_elem_t *old;
909 
910 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
911 		atomic_inc_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64);
912 		return;
913 	}
914 
915 	for (;;) {
916 		old = eqp->eq_pend;
917 		eqep->eqe_prev = old;
918 		membar_producer();
919 
920 		if (atomic_cas_ptr(&eqp->eq_pend, old, eqep) == old)
921 			break;
922 	}
923 
924 	atomic_inc_64(&eqp->eq_kstat.eqk_committed.value.ui64);
925 
926 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
927 		ddi_trigger_softintr(eqp->eq_id);
928 }
929 
930 /*
931  * Cancel an errorq element reservation by returning the specified element
932  * to the free pool.  Duplicate or invalid frees are not supported.
933  */
934 void
935 errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
936 {
937 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
938 		return;
939 
940 	BT_ATOMIC_CLEAR(eqp->eq_bitmap, eqep - eqp->eq_elems);
941 
942 	atomic_inc_64(&eqp->eq_kstat.eqk_cancelled.value.ui64);
943 }
944 
945 /*
946  * Write elements on the dump list of each nvlist errorq to the dump device.
947  * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
948  */
949 void
950 errorq_dump(void)
951 {
952 	errorq_elem_t *eep;
953 	errorq_t *eqp;
954 
955 	if (ereport_dumpbuf == NULL)
956 		return; /* reboot or panic before errorq is even set up */
957 
958 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
959 		if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
960 		    !(eqp->eq_flags & ERRORQ_ACTIVE))
961 			continue; /* do not dump this queue on panic */
962 
963 		for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
964 			errorq_nvelem_t *eqnp = eep->eqe_data;
965 			size_t len = 0;
966 			erpt_dump_t ed;
967 			int err;
968 
969 			(void) nvlist_size(eqnp->eqn_nvl,
970 			    &len, NV_ENCODE_NATIVE);
971 
972 			if (len > ereport_dumplen || len == 0) {
973 				cmn_err(CE_WARN, "%s: unable to save error "
974 				    "report %p due to size %lu\n",
975 				    eqp->eq_name, (void *)eep, len);
976 				continue;
977 			}
978 
979 			if ((err = nvlist_pack(eqnp->eqn_nvl,
980 			    (char **)&ereport_dumpbuf, &ereport_dumplen,
981 			    NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
982 				cmn_err(CE_WARN, "%s: unable to save error "
983 				    "report %p due to pack error %d\n",
984 				    eqp->eq_name, (void *)eep, err);
985 				continue;
986 			}
987 
988 			ed.ed_magic = ERPT_MAGIC;
989 			ed.ed_chksum = checksum32(ereport_dumpbuf, len);
990 			ed.ed_size = (uint32_t)len;
991 			ed.ed_pad = 0;
992 			ed.ed_hrt_nsec = 0;
993 			ed.ed_hrt_base = panic_hrtime;
994 			ed.ed_tod_base.sec = panic_hrestime.tv_sec;
995 			ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;
996 
997 			dumpvp_write(&ed, sizeof (ed));
998 			dumpvp_write(ereport_dumpbuf, len);
999 		}
1000 	}
1001 }
1002 
1003 nvlist_t *
1004 errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
1005 {
1006 	errorq_nvelem_t *eqnp = eqep->eqe_data;
1007 
1008 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
1009 
1010 	return (eqnp->eqn_nvl);
1011 }
1012 
1013 nv_alloc_t *
1014 errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
1015 {
1016 	errorq_nvelem_t *eqnp = eqep->eqe_data;
1017 
1018 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
1019 
1020 	return (eqnp->eqn_nva);
1021 }
1022 
1023 /*
1024  * Reserve a new element and duplicate the data of the original into it.
1025  */
1026 void *
1027 errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
1028 {
1029 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
1030 	ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));
1031 
1032 	if ((*neqep = errorq_reserve(eqp)) == NULL)
1033 		return (NULL);
1034 
1035 	bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
1036 	return ((*neqep)->eqe_data);
1037 }
1038