1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Kernel Error Queues 28 * 29 * A common problem when handling hardware error traps and interrupts is that 30 * these errors frequently must be handled at high interrupt level, where 31 * reliably producing error messages and safely examining and manipulating 32 * other kernel state may not be possible. The kernel error queue primitive is 33 * a common set of routines that allow a subsystem to maintain a queue of 34 * errors that can be processed by an explicit call from a safe context or by a 35 * soft interrupt that fires at a specific lower interrupt level. The queue 36 * management code also ensures that if the system panics, all in-transit 37 * errors are logged prior to reset. Each queue has an associated kstat for 38 * observing the number of errors dispatched and logged, and mdb(1) debugging 39 * support is provided for live and post-mortem observability. 40 * 41 * Memory Allocation 42 * 43 * All of the queue data structures are allocated in advance as part of 44 * the errorq_create() call. 
No additional memory allocations are 45 * performed as part of errorq_dispatch(), errorq_reserve(), 46 * errorq_commit() or errorq_drain(). This design 47 * facilitates reliable error queue processing even when the system is low 48 * on memory, and ensures that errorq_dispatch() can be called from any 49 * context. When the queue is created, the maximum queue length is 50 * specified as a parameter to errorq_create() and errorq_nvcreate(). This 51 * length should represent a reasonable upper bound on the number of 52 * simultaneous errors. If errorq_dispatch() or errorq_reserve() is 53 * invoked and no free queue elements are available, the error is 54 * dropped and will not be logged. Typically, the queue will only be 55 * exhausted by an error storm, and in this case 56 * the earlier errors provide the most important data for analysis. 57 * When a new error is dispatched, the error data is copied into the 58 * preallocated queue element so that the caller's buffer can be reused. 59 * 60 * When a new error is reserved, an element is moved from the free pool 61 * and returned to the caller. The element buffer data, eqe_data, may be 62 * managed by the caller and dispatched to the errorq by calling 63 * errorq_commit(). This is useful for additions to errorq's 64 * created with errorq_nvcreate() to handle name-value pair (nvpair) data. 65 * See below for a discussion on nvlist errorq's. 66 * 67 * Queue Drain Callback 68 * 69 * When the error queue is drained, the caller's queue drain callback is 70 * invoked with a pointer to the saved error data. This function may be 71 * called from passive kernel context or soft interrupt context at or 72 * below LOCK_LEVEL, or as part of panic(). As such, the callback should 73 * basically only be calling cmn_err (but NOT with the CE_PANIC flag). 74 * The callback must not call panic(), attempt to allocate memory, or wait 75 * on a condition variable. 
The callback may not call errorq_destroy()
 * or errorq_drain() on the same error queue that called it.
 *
 * The queue drain callback will always be called for each pending error
 * in the order in which errors were enqueued (oldest to newest).  The
 * queue drain callback is guaranteed to provide at *least* once semantics
 * for all errors that are successfully dispatched (i.e. for which
 * errorq_dispatch() has successfully completed).  If an unrelated panic
 * occurs while the queue drain callback is running on a vital queue, the
 * panic subsystem will continue the queue drain and the callback may be
 * invoked again for the same error.  Therefore, the callback should
 * restrict itself to logging messages and taking other actions that are
 * not destructive if repeated.
 *
 * Name-Value Pair Error Queues
 *
 * During error handling, it may be more convenient to store error
 * queue element data as a fixed buffer of name-value pairs.  The
 * nvpair library allows construction and destruction of nvlists
 * in pre-allocated memory buffers.
 *
 * Error queues created via errorq_nvcreate() store queue element
 * data as fixed buffer nvlists (ereports).  errorq_reserve()
 * allocates an errorq element from eqp->eq_bitmap and returns a valid
 * pointer to an errorq_elem_t (queue element) and a pre-allocated
 * fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
 * to the nvlist to add name-value ereport members prior to
 * dispatching the error queue element in errorq_commit().
 *
 * Once dispatched, the drain function will return the element to
 * eqp->eq_bitmap and reset the associated nv_alloc structure.
 * errorq_cancel() may be called to cancel the reservation of an
 * element that was never dispatched (committed).  This is useful in
 * cases where a programming error prevents a queue element from being
 * dispatched.
110 * 111 * Queue Management 112 * 113 * The queue element structures and error data buffers are allocated in 114 * two contiguous chunks as part of errorq_create() or errorq_nvcreate(). 115 * Each queue element structure contains a next pointer, 116 * a previous pointer, and a pointer to the corresponding error data 117 * buffer. The data buffer for a nvlist errorq is a shared buffer 118 * for the allocation of name-value pair lists. The elements are kept on 119 * one of four lists: 120 * 121 * Unused elements are kept in the free pool, managed by eqp->eq_bitmap. 122 * The eqe_prev and eqe_next pointers are not used while in the free pool 123 * and will be set to NULL. 124 * 125 * Pending errors are kept on the pending list, a singly-linked list 126 * pointed to by eqp->eq_pend, and linked together using eqe_prev. This 127 * list is maintained in order from newest error to oldest. The eqe_next 128 * pointer is not used by the pending list and will be set to NULL. 129 * 130 * The processing list is a doubly-linked list pointed to by eqp->eq_phead 131 * (the oldest element) and eqp->eq_ptail (the newest element). The 132 * eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the 133 * eqe_prev pointer is used to traverse from eq_ptail to eq_phead. Once a 134 * queue drain operation begins, the current pending list is moved to the 135 * processing list in a two-phase commit fashion (eq_ptail being cleared 136 * at the beginning but eq_phead only at the end), allowing the panic code 137 * to always locate and process all pending errors in the event that a 138 * panic occurs in the middle of queue processing. 139 * 140 * A fourth list is maintained for nvlist errorqs. The dump list, 141 * eq_dump is used to link all errorq elements that should be stored 142 * in a crash dump file in the event of a system panic. During 143 * errorq_panic(), the list is created and subsequently traversed 144 * in errorq_dump() during the final phases of a crash dump. 
145 * 146 * Platform Considerations 147 * 148 * In order to simplify their implementation, error queues make use of the 149 * C wrappers for compare-and-swap. If the platform itself does not 150 * support compare-and-swap in hardware and the kernel emulation routines 151 * are used instead, then the context in which errorq_dispatch() can be 152 * safely invoked is further constrained by the implementation of the 153 * compare-and-swap emulation. Specifically, if errorq_dispatch() is 154 * called from a code path that can be executed above ATOMIC_LEVEL on such 155 * a platform, the dispatch code could potentially deadlock unless the 156 * corresponding error interrupt is blocked or disabled prior to calling 157 * errorq_dispatch(). Error queues should therefore be deployed with 158 * caution on these platforms. 159 * 160 * Interfaces 161 * 162 * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags); 163 * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags); 164 * 165 * Create a new error queue with the specified name, callback, and 166 * properties. A pointer to the new error queue is returned upon success, 167 * or NULL is returned to indicate that the queue could not be created. 168 * This function must be called from passive kernel context with no locks 169 * held that can prevent a sleeping memory allocation from occurring. 170 * errorq_create() will return failure if the queue kstats cannot be 171 * created, or if a soft interrupt handler cannot be registered. 172 * 173 * The queue 'name' is a string that is recorded for live and post-mortem 174 * examination by a debugger. The queue callback 'func' will be invoked 175 * for each error drained from the queue, and will receive the 'private' 176 * pointer as its first argument. The callback must obey the rules for 177 * callbacks described above. The queue will have maximum length 'qlen' 178 * and each element will be able to record up to 'eltsize' bytes of data. 
179 * The queue's soft interrupt (see errorq_dispatch(), below) will fire 180 * at 'ipl', which should not exceed LOCK_LEVEL. The queue 'flags' may 181 * include the following flag: 182 * 183 * ERRORQ_VITAL - This queue contains information that is considered 184 * vital to problem diagnosis. Error queues that are marked vital will 185 * be automatically drained by the panic subsystem prior to printing 186 * the panic messages to the console. 187 * 188 * void errorq_destroy(errorq); 189 * 190 * Destroy the specified error queue. The queue is drained of any 191 * pending elements and these are logged before errorq_destroy returns. 192 * Once errorq_destroy() begins draining the queue, any simultaneous 193 * calls to dispatch errors will result in the errors being dropped. 194 * The caller must invoke a higher-level abstraction (e.g. disabling 195 * an error interrupt) to ensure that error handling code does not 196 * attempt to dispatch errors to the queue while it is being freed. 197 * 198 * void errorq_dispatch(errorq, data, len, flag); 199 * 200 * Attempt to enqueue the specified error data. If a free queue element 201 * is available, the data is copied into a free element and placed on a 202 * pending list. If no free queue element is available, the error is 203 * dropped. The data length (len) is specified in bytes and should not 204 * exceed the queue's maximum element size. If the data length is less 205 * than the maximum element size, the remainder of the queue element is 206 * filled with zeroes. The flag parameter should be one of: 207 * 208 * ERRORQ_ASYNC - Schedule a soft interrupt at the previously specified 209 * IPL to asynchronously drain the queue on behalf of the caller. 210 * 211 * ERRORQ_SYNC - Do not schedule a soft interrupt to drain the queue. 212 * The caller is presumed to be calling errorq_drain() or panic() in 213 * the near future in order to drain the queue and log the error. 
 *
 * The errorq_dispatch() function may be called from any context, subject
 * to the Platform Considerations described above.
 *
 * void errorq_drain(errorq);
 *
 * Drain the error queue of all pending errors.  The queue's callback
 * function is invoked for each error in order from oldest to newest.
 * This function may be used at or below LOCK_LEVEL or from panic context.
 *
 * errorq_elem_t *errorq_reserve(errorq);
 *
 * Reserve an error queue element for later processing and dispatching.
 * The element is returned to the caller who may add error-specific data
 * to the element.  The element is returned to the free pool when either
 * errorq_commit() is called and the element is asynchronously processed
 * or immediately when errorq_cancel() is called.
 *
 * void errorq_commit(errorq, errorq_elem, flag);
 *
 * Commit an errorq element (eqep) for dispatching, see
 * errorq_dispatch().
 *
 * void errorq_cancel(errorq, errorq_elem);
 *
 * Cancel a pending errorq element reservation.  The errorq element is
 * returned to the free pool upon cancellation.
241 */ 242 243 #include <sys/errorq_impl.h> 244 #include <sys/sysmacros.h> 245 #include <sys/machlock.h> 246 #include <sys/cmn_err.h> 247 #include <sys/atomic.h> 248 #include <sys/systm.h> 249 #include <sys/kmem.h> 250 #include <sys/conf.h> 251 #include <sys/ddi.h> 252 #include <sys/sunddi.h> 253 #include <sys/bootconf.h> 254 #include <sys/spl.h> 255 #include <sys/dumphdr.h> 256 #include <sys/compress.h> 257 #include <sys/time.h> 258 #include <sys/panic.h> 259 #include <sys/bitmap.h> 260 #include <sys/fm/protocol.h> 261 #include <sys/fm/util.h> 262 263 static struct errorq_kstat errorq_kstat_template = { 264 { "dispatched", KSTAT_DATA_UINT64 }, 265 { "dropped", KSTAT_DATA_UINT64 }, 266 { "logged", KSTAT_DATA_UINT64 }, 267 { "reserved", KSTAT_DATA_UINT64 }, 268 { "reserve_fail", KSTAT_DATA_UINT64 }, 269 { "committed", KSTAT_DATA_UINT64 }, 270 { "commit_fail", KSTAT_DATA_UINT64 }, 271 { "cancelled", KSTAT_DATA_UINT64 } 272 }; 273 274 static uint64_t errorq_lost = 0; 275 static errorq_t *errorq_list = NULL; 276 static kmutex_t errorq_lock; 277 static uint64_t errorq_vitalmin = 5; 278 279 static uint_t 280 errorq_intr(caddr_t eqp) 281 { 282 errorq_drain((errorq_t *)eqp); 283 return (DDI_INTR_CLAIMED); 284 } 285 286 /* 287 * Create a new error queue with the specified properties and add a software 288 * interrupt handler and kstat for it. This function must be called from 289 * passive kernel context with no locks held that can prevent a sleeping 290 * memory allocation from occurring. This function will return NULL if the 291 * softint or kstat for this queue cannot be created. 
292 */ 293 errorq_t * 294 errorq_create(const char *name, errorq_func_t func, void *private, 295 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 296 { 297 errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP); 298 ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl); 299 dev_info_t *dip = ddi_root_node(); 300 301 errorq_elem_t *eep; 302 ddi_softintr_t id = NULL; 303 caddr_t data; 304 305 ASSERT(qlen != 0 && size != 0); 306 ASSERT(ipl > 0 && ipl <= LOCK_LEVEL); 307 308 /* 309 * If a queue is created very early in boot before device tree services 310 * are available, the queue softint handler cannot be created. We 311 * manually drain these queues and create their softint handlers when 312 * it is safe to do so as part of errorq_init(), below. 313 */ 314 if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, 315 &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 316 cmn_err(CE_WARN, "errorq_create: failed to register " 317 "IPL %u softint for queue %s", ipl, name); 318 kmem_free(eqp, sizeof (errorq_t)); 319 return (NULL); 320 } 321 322 if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq", 323 KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) / 324 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) { 325 cmn_err(CE_WARN, "errorq_create: failed to create kstat " 326 "for queue %s", name); 327 if (id != NULL) 328 ddi_remove_softintr(id); 329 kmem_free(eqp, sizeof (errorq_t)); 330 return (NULL); 331 } 332 333 bcopy(&errorq_kstat_template, &eqp->eq_kstat, 334 sizeof (struct errorq_kstat)); 335 eqp->eq_ksp->ks_data = &eqp->eq_kstat; 336 eqp->eq_ksp->ks_private = eqp; 337 kstat_install(eqp->eq_ksp); 338 339 (void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN); 340 eqp->eq_name[ERRORQ_NAMELEN] = '\0'; 341 eqp->eq_func = func; 342 eqp->eq_private = private; 343 eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP); 344 eqp->eq_qlen = qlen; 345 eqp->eq_size = size; 346 eqp->eq_ipl = ipl; 347 eqp->eq_flags = flags | ERRORQ_ACTIVE; 348 
eqp->eq_id = id; 349 mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL); 350 eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP); 351 eqp->eq_phead = NULL; 352 eqp->eq_ptail = NULL; 353 eqp->eq_pend = NULL; 354 eqp->eq_dump = NULL; 355 eqp->eq_bitmap = kmem_zalloc(BT_SIZEOFMAP(qlen), KM_SLEEP); 356 eqp->eq_rotor = 0; 357 358 /* 359 * Iterate over the array of errorq_elem_t structures and set its 360 * data pointer. 361 */ 362 for (eep = eqp->eq_elems, data = eqp->eq_data; qlen > 1; qlen--) { 363 eep->eqe_next = NULL; 364 eep->eqe_dump = NULL; 365 eep->eqe_prev = NULL; 366 eep->eqe_data = data; 367 data += size; 368 eep++; 369 } 370 eep->eqe_next = NULL; 371 eep->eqe_prev = NULL; 372 eep->eqe_data = data; 373 eep->eqe_dump = NULL; 374 375 /* 376 * Once the errorq is initialized, add it to the global list of queues, 377 * and then return a pointer to the new queue to the caller. 378 */ 379 mutex_enter(&errorq_lock); 380 eqp->eq_next = errorq_list; 381 errorq_list = eqp; 382 mutex_exit(&errorq_lock); 383 384 return (eqp); 385 } 386 387 /* 388 * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST 389 * flag and initialize each element to have the start of its data region used 390 * as an errorq_nvelem_t with a nvlist allocator that consumes the data region. 
391 */ 392 errorq_t * 393 errorq_nvcreate(const char *name, errorq_func_t func, void *private, 394 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 395 { 396 errorq_t *eqp; 397 errorq_elem_t *eep; 398 399 eqp = errorq_create(name, func, private, qlen, 400 size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST); 401 402 if (eqp == NULL) 403 return (NULL); 404 405 mutex_enter(&eqp->eq_lock); 406 407 for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) { 408 errorq_nvelem_t *eqnp = eep->eqe_data; 409 eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t); 410 eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size); 411 } 412 413 mutex_exit(&eqp->eq_lock); 414 return (eqp); 415 } 416 417 /* 418 * To destroy an error queue, we mark it as disabled and then explicitly drain 419 * all pending errors. Once the drain is complete, we can remove the queue 420 * from the global list of queues examined by errorq_panic(), and then free 421 * the various queue data structures. The caller must use some higher-level 422 * abstraction (e.g. disabling an error interrupt) to ensure that no one will 423 * attempt to enqueue new errors while we are freeing this queue. 
424 */ 425 void 426 errorq_destroy(errorq_t *eqp) 427 { 428 errorq_t *p, **pp; 429 errorq_elem_t *eep; 430 ulong_t i; 431 432 ASSERT(eqp != NULL); 433 eqp->eq_flags &= ~ERRORQ_ACTIVE; 434 errorq_drain(eqp); 435 436 mutex_enter(&errorq_lock); 437 pp = &errorq_list; 438 439 for (p = errorq_list; p != NULL; p = p->eq_next) { 440 if (p == eqp) { 441 *pp = p->eq_next; 442 break; 443 } 444 pp = &p->eq_next; 445 } 446 447 mutex_exit(&errorq_lock); 448 ASSERT(p != NULL); 449 450 if (eqp->eq_flags & ERRORQ_NVLIST) { 451 for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) { 452 errorq_nvelem_t *eqnp = eep->eqe_data; 453 fm_nva_xdestroy(eqnp->eqn_nva); 454 } 455 } 456 457 mutex_destroy(&eqp->eq_lock); 458 kstat_delete(eqp->eq_ksp); 459 460 if (eqp->eq_id != NULL) 461 ddi_remove_softintr(eqp->eq_id); 462 463 kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t)); 464 kmem_free(eqp->eq_bitmap, BT_SIZEOFMAP(eqp->eq_qlen)); 465 kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size); 466 467 kmem_free(eqp, sizeof (errorq_t)); 468 } 469 470 /* 471 * private version of bt_availbit which makes a best-efforts attempt 472 * at allocating in a round-robin fashion in order to facilitate post-mortem 473 * diagnosis. 474 */ 475 static index_t 476 errorq_availbit(ulong_t *bitmap, size_t nbits, index_t curindex) 477 { 478 ulong_t bit, maxbit, bx; 479 index_t rval, nextindex = curindex + 1; 480 index_t nextword = nextindex >> BT_ULSHIFT; 481 ulong_t nextbitindex = nextindex & BT_ULMASK; 482 index_t maxindex = nbits - 1; 483 index_t maxword = maxindex >> BT_ULSHIFT; 484 ulong_t maxbitindex = maxindex & BT_ULMASK; 485 486 /* 487 * First check if there are still some bits remaining in the current 488 * word, and see if any of those are available. We need to do this by 489 * hand as the bt_availbit() function always starts at the beginning 490 * of a word. 491 */ 492 if (nextindex <= maxindex && nextbitindex != 0) { 493 maxbit = (nextword == maxword) ? 
maxbitindex : BT_ULMASK; 494 for (bx = 0, bit = 1; bx <= maxbit; bx++, bit <<= 1) 495 if (bx >= nextbitindex && !(bitmap[nextword] & bit)) 496 return ((nextword << BT_ULSHIFT) + bx); 497 nextword++; 498 } 499 /* 500 * Now check if there are any words remaining before the end of the 501 * bitmap. Use bt_availbit() to find any free bits. 502 */ 503 if (nextword <= maxword) 504 if ((rval = bt_availbit(&bitmap[nextword], 505 nbits - (nextword << BT_ULSHIFT))) != -1) 506 return ((nextword << BT_ULSHIFT) + rval); 507 /* 508 * Finally loop back to the start and look for any free bits starting 509 * from the beginning of the bitmap to the current rotor position. 510 */ 511 return (bt_availbit(bitmap, nextindex)); 512 } 513 514 /* 515 * Dispatch a new error into the queue for later processing. The specified 516 * data buffer is copied into a preallocated queue element. If 'len' is 517 * smaller than the queue element size, the remainder of the queue element is 518 * filled with zeroes. This function may be called from any context subject 519 * to the Platform Considerations described above. 
520 */ 521 void 522 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag) 523 { 524 errorq_elem_t *eep, *old; 525 526 if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) { 527 atomic_add_64(&errorq_lost, 1); 528 return; /* drop error if queue is uninitialized or disabled */ 529 } 530 531 for (;;) { 532 int i, rval; 533 534 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen, 535 eqp->eq_rotor)) == -1) { 536 atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1); 537 return; 538 } 539 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval); 540 if (rval == 0) { 541 eqp->eq_rotor = i; 542 eep = &eqp->eq_elems[i]; 543 break; 544 } 545 } 546 547 ASSERT(len <= eqp->eq_size); 548 bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len)); 549 550 if (len < eqp->eq_size) 551 bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len); 552 553 for (;;) { 554 old = eqp->eq_pend; 555 eep->eqe_prev = old; 556 membar_producer(); 557 558 if (casptr(&eqp->eq_pend, old, eep) == old) 559 break; 560 } 561 562 atomic_add_64(&eqp->eq_kstat.eqk_dispatched.value.ui64, 1); 563 564 if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL) 565 ddi_trigger_softintr(eqp->eq_id); 566 } 567 568 /* 569 * Drain the specified error queue by calling eq_func() for each pending error. 570 * This function must be called at or below LOCK_LEVEL or from panic context. 571 * In order to synchronize with other attempts to drain the queue, we acquire 572 * the adaptive eq_lock, blocking other consumers. Once this lock is held, 573 * we must use compare-and-swap to move the pending list to the processing 574 * list and to return elements to the free pool in order to synchronize 575 * with producers, who do not acquire any locks and only use atomic set/clear. 576 * 577 * An additional constraint on this function is that if the system panics 578 * while this function is running, the panic code must be able to detect and 579 * handle all intermediate states and correctly dequeue all errors. 
The 580 * errorq_panic() function below will be used for detecting and handling 581 * these intermediate states. The comments in errorq_drain() below explain 582 * how we make sure each intermediate state is distinct and consistent. 583 */ 584 void 585 errorq_drain(errorq_t *eqp) 586 { 587 errorq_elem_t *eep, *dep; 588 589 ASSERT(eqp != NULL); 590 mutex_enter(&eqp->eq_lock); 591 592 /* 593 * If there are one or more pending errors, set eq_ptail to point to 594 * the first element on the pending list and then attempt to compare- 595 * and-swap NULL to the pending list. We use membar_producer() to 596 * make sure that eq_ptail will be visible to errorq_panic() below 597 * before the pending list is NULLed out. This section is labeled 598 * case (1) for errorq_panic, below. If eq_ptail is not yet set (1A) 599 * eq_pend has all the pending errors. If casptr fails or has not 600 * been called yet (1B), eq_pend still has all the pending errors. 601 * If casptr succeeds (1C), eq_ptail has all the pending errors. 602 */ 603 while ((eep = eqp->eq_pend) != NULL) { 604 eqp->eq_ptail = eep; 605 membar_producer(); 606 607 if (casptr(&eqp->eq_pend, eep, NULL) == eep) 608 break; 609 } 610 611 /* 612 * If no errors were pending, assert that eq_ptail is set to NULL, 613 * drop the consumer lock, and return without doing anything. 614 */ 615 if (eep == NULL) { 616 ASSERT(eqp->eq_ptail == NULL); 617 mutex_exit(&eqp->eq_lock); 618 return; 619 } 620 621 /* 622 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the 623 * oldest error, setting the eqe_next pointer so that we can iterate 624 * over the errors from oldest to newest. We use membar_producer() 625 * to make sure that these stores are visible before we set eq_phead. 626 * If we panic before, during, or just after this loop (case 2), 627 * errorq_panic() will simply redo this work, as described below. 
628 */ 629 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev) 630 eep->eqe_prev->eqe_next = eep; 631 membar_producer(); 632 633 /* 634 * Now set eq_phead to the head of the processing list (the oldest 635 * error) and issue another membar_producer() to make sure that 636 * eq_phead is seen as non-NULL before we clear eq_ptail. If we panic 637 * after eq_phead is set (case 3), we will detect and log these errors 638 * in errorq_panic(), as described below. 639 */ 640 eqp->eq_phead = eep; 641 membar_producer(); 642 643 eqp->eq_ptail = NULL; 644 membar_producer(); 645 646 /* 647 * If we enter from errorq_panic_drain(), we may already have 648 * errorq elements on the dump list. Find the tail of 649 * the list ready for append. 650 */ 651 if (panicstr && (dep = eqp->eq_dump) != NULL) { 652 while (dep->eqe_dump != NULL) 653 dep = dep->eqe_dump; 654 } 655 656 /* 657 * Now iterate over the processing list from oldest (eq_phead) to 658 * newest and log each error. Once an error is logged, we use 659 * atomic clear to return it to the free pool. If we panic before, 660 * during, or after calling eq_func() (case 4), the error will still be 661 * found on eq_phead and will be logged in errorq_panic below. 662 */ 663 664 while ((eep = eqp->eq_phead) != NULL) { 665 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 666 eqp->eq_kstat.eqk_logged.value.ui64++; 667 668 eqp->eq_phead = eep->eqe_next; 669 membar_producer(); 670 671 eep->eqe_next = NULL; 672 673 /* 674 * On panic, we add the element to the dump list for each 675 * nvlist errorq. Elements are stored oldest to newest. 676 * Then continue, so we don't free and subsequently overwrite 677 * any elements which we've put on the dump queue. 
678 */ 679 if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) { 680 if (eqp->eq_dump == NULL) 681 dep = eqp->eq_dump = eep; 682 else 683 dep = dep->eqe_dump = eep; 684 membar_producer(); 685 continue; 686 } 687 688 eep->eqe_prev = NULL; 689 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 690 } 691 692 mutex_exit(&eqp->eq_lock); 693 } 694 695 /* 696 * Now that device tree services are available, set up the soft interrupt 697 * handlers for any queues that were created early in boot. We then 698 * manually drain these queues to report any pending early errors. 699 */ 700 void 701 errorq_init(void) 702 { 703 dev_info_t *dip = ddi_root_node(); 704 ddi_softintr_t id; 705 errorq_t *eqp; 706 707 ASSERT(modrootloaded != 0); 708 ASSERT(dip != NULL); 709 710 mutex_enter(&errorq_lock); 711 712 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 713 ddi_iblock_cookie_t ibc = 714 (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl); 715 716 if (eqp->eq_id != NULL) 717 continue; /* softint already initialized */ 718 719 if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL, 720 errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 721 panic("errorq_init: failed to register IPL %u softint " 722 "for queue %s", eqp->eq_ipl, eqp->eq_name); 723 } 724 725 eqp->eq_id = id; 726 errorq_drain(eqp); 727 } 728 729 mutex_exit(&errorq_lock); 730 } 731 732 /* 733 * This function is designed to be called from panic context only, and 734 * therefore does not need to acquire errorq_lock when iterating over 735 * errorq_list. This function must be called no more than once for each 736 * 'what' value (if you change this then review the manipulation of 'dep'. 
737 */ 738 static uint64_t 739 errorq_panic_drain(uint_t what) 740 { 741 errorq_elem_t *eep, *nep, *dep; 742 errorq_t *eqp; 743 uint64_t loggedtmp; 744 uint64_t logged = 0; 745 746 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 747 if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what) 748 continue; /* do not drain this queue on this pass */ 749 750 loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64; 751 752 /* 753 * In case (1B) above, eq_ptail may be set but the casptr may 754 * not have been executed yet or may have failed. Either way, 755 * we must log errors in chronological order. So we search 756 * the pending list for the error pointed to by eq_ptail. If 757 * it is found, we know that all subsequent errors are also 758 * still on the pending list, so just NULL out eq_ptail and let 759 * errorq_drain(), below, take care of the logging. 760 */ 761 for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) { 762 if (eep == eqp->eq_ptail) { 763 ASSERT(eqp->eq_phead == NULL); 764 eqp->eq_ptail = NULL; 765 break; 766 } 767 } 768 769 /* 770 * In cases (1C) and (2) above, eq_ptail will be set to the 771 * newest error on the processing list but eq_phead will still 772 * be NULL. We set the eqe_next pointers so we can iterate 773 * over the processing list in order from oldest error to the 774 * newest error. We then set eq_phead to point to the oldest 775 * error and fall into the for-loop below. 776 */ 777 if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) { 778 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; 779 eep = eep->eqe_prev) 780 eep->eqe_prev->eqe_next = eep; 781 782 eqp->eq_phead = eep; 783 eqp->eq_ptail = NULL; 784 } 785 786 /* 787 * In cases (3) and (4) above (or after case (1C/2) handling), 788 * eq_phead will be set to the oldest error on the processing 789 * list. We log each error and return it to the free pool. 
790 * 791 * Unlike errorq_drain(), we don't need to worry about updating 792 * eq_phead because errorq_panic() will be called at most once. 793 * However, we must use casptr to update the freelist in case 794 * errors are still being enqueued during panic. 795 */ 796 for (eep = eqp->eq_phead; eep != NULL; eep = nep) { 797 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 798 eqp->eq_kstat.eqk_logged.value.ui64++; 799 800 nep = eep->eqe_next; 801 eep->eqe_next = NULL; 802 803 /* 804 * On panic, we add the element to the dump list for 805 * each nvlist errorq, stored oldest to newest. Then 806 * continue, so we don't free and subsequently overwrite 807 * any elements which we've put on the dump queue. 808 */ 809 if (eqp->eq_flags & ERRORQ_NVLIST) { 810 if (eqp->eq_dump == NULL) 811 dep = eqp->eq_dump = eep; 812 else 813 dep = dep->eqe_dump = eep; 814 membar_producer(); 815 continue; 816 } 817 818 eep->eqe_prev = NULL; 819 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 820 } 821 822 /* 823 * Now go ahead and drain any other errors on the pending list. 824 * This call transparently handles case (1A) above, as well as 825 * any other errors that were dispatched after errorq_drain() 826 * completed its first compare-and-swap. 827 */ 828 errorq_drain(eqp); 829 830 logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp; 831 } 832 return (logged); 833 } 834 835 /* 836 * Drain all error queues - called only from panic context. Some drain 837 * functions may enqueue errors to ERRORQ_NVLIST error queues so that 838 * they may be written out in the panic dump - so ERRORQ_NVLIST queues 839 * must be drained last. Drain ERRORQ_VITAL queues before nonvital queues 840 * so that vital errors get to fill the ERRORQ_NVLIST queues first, and 841 * do not drain the nonvital queues if there are many vital errors. 
 */
void
errorq_panic(void)
{
	ASSERT(panicstr != NULL);

	/*
	 * Drain vital non-nvlist queues first; skip the nonvital pass if
	 * the vital pass already logged more than errorq_vitalmin errors.
	 * The nvlist queues are drained last (vital before nonvital) so
	 * that the preceding drain callbacks can fill them for the dump.
	 */
	if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
		(void) errorq_panic_drain(0);
	(void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
	(void) errorq_panic_drain(ERRORQ_NVLIST);
}

/*
 * Reserve an error queue element for later processing and dispatching.  The
 * element is returned to the caller who may add error-specific data to the
 * element.  The element is returned to the free pool when either
 * errorq_commit() is called and the element asynchronously processed, or
 * immediately when errorq_cancel() is called.
 */
errorq_elem_t *
errorq_reserve(errorq_t *eqp)
{
	errorq_elem_t *eqep;

	/* no queue, or queue not active: account the error as lost */
	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
		atomic_add_64(&errorq_lost, 1);
		return (NULL);
	}

	/*
	 * Lock-free claim loop: find a clear bit in the allocation bitmap
	 * (starting the search at eq_rotor as a hint) and try to set it
	 * atomically.  If another CPU wins the race for that bit, retry.
	 */
	for (;;) {
		int i, rval;

		if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
		    eqp->eq_rotor)) == -1) {
			/* no free elements: drop the error */
			atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
			return (NULL);
		}
		BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
		if (rval == 0) {
			/* we own bit i; remember it as the next search hint */
			eqp->eq_rotor = i;
			eqep = &eqp->eq_elems[i];
			break;
		}
	}

	/*
	 * For nvlist queues, reset the element's nv allocator and hand the
	 * caller a fresh empty nvlist to fill in.
	 */
	if (eqp->eq_flags & ERRORQ_NVLIST) {
		errorq_nvelem_t *eqnp = eqep->eqe_data;
		nv_alloc_reset(eqnp->eqn_nva);
		eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
	}

	atomic_add_64(&eqp->eq_kstat.eqk_reserved.value.ui64, 1);
	return (eqep);
}

/*
 * Commit an errorq element (eqep) for dispatching.
 * This function may be called from any context subject
 * to the Platform Considerations described above.
 */
void
errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
{
	errorq_elem_t *old;

	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
		atomic_add_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64, 1);
		return;
	}

	/*
	 * Lock-free push onto the pending list: link the element to the
	 * current head via eqe_prev, make that store visible with
	 * membar_producer(), then publish it with casptr.  Retry if another
	 * CPU updated eq_pend in the meantime.
	 */
	for (;;) {
		old = eqp->eq_pend;
		eqep->eqe_prev = old;
		membar_producer();

		if (casptr(&eqp->eq_pend, old, eqep) == old)
			break;
	}

	atomic_add_64(&eqp->eq_kstat.eqk_committed.value.ui64, 1);

	/* for async dispatch, kick the queue's soft interrupt (if any) */
	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
		ddi_trigger_softintr(eqp->eq_id);
}

/*
 * Cancel an errorq element reservation by returning the specified element
 * to the free pool.  Duplicate or invalid frees are not supported.
 */
void
errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
{
	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
		return;

	/* clearing the element's bitmap bit returns it to the free pool */
	BT_ATOMIC_CLEAR(eqp->eq_bitmap, eqep - eqp->eq_elems);

	atomic_add_64(&eqp->eq_kstat.eqk_cancelled.value.ui64, 1);
}

/*
 * Write elements on the dump list of each nvlist errorq to the dump device.
 * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
 */
void
errorq_dump(void)
{
	errorq_elem_t *eep;
	errorq_t *eqp;

	if (ereport_dumpbuf == NULL)
		return; /* reboot or panic before errorq is even set up */

	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
		if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
		    !(eqp->eq_flags & ERRORQ_ACTIVE))
			continue; /* do not dump this queue on panic */

		/* walk the dump list built by errorq_panic_drain() */
		for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
			errorq_nvelem_t *eqnp = eep->eqe_data;
			size_t len = 0;
			erpt_dump_t ed;
			int err;

			/*
			 * If nvlist_size() fails, len remains 0 and the
			 * size check below skips this element.
			 */
			(void) nvlist_size(eqnp->eqn_nvl,
			    &len, NV_ENCODE_NATIVE);

			if (len > ereport_dumplen || len == 0) {
				cmn_err(CE_WARN, "%s: unable to save error "
				    "report %p due to size %lu\n",
				    eqp->eq_name, (void *)eep, len);
				continue;
			}

			if ((err = nvlist_pack(eqnp->eqn_nvl,
			    (char **)&ereport_dumpbuf, &ereport_dumplen,
			    NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
				cmn_err(CE_WARN, "%s: unable to save error "
				    "report %p due to pack error %d\n",
				    eqp->eq_name, (void *)eep, err);
				continue;
			}

			/*
			 * Build the ereport dump header; timestamps are
			 * based on the time captured at panic.
			 */
			ed.ed_magic = ERPT_MAGIC;
			ed.ed_chksum = checksum32(ereport_dumpbuf, len);
			ed.ed_size = (uint32_t)len;
			ed.ed_pad = 0;
			ed.ed_hrt_nsec = 0;
			ed.ed_hrt_base = panic_hrtime;
			ed.ed_tod_base.sec = panic_hrestime.tv_sec;
			ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;

			/* header first, then the packed nvlist payload */
			dumpvp_write(&ed, sizeof (ed));
			dumpvp_write(ereport_dumpbuf, len);
		}
	}
}

/*
 * Return the nvlist of an element reserved from an ERRORQ_NVLIST errorq.
 */
nvlist_t *
errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
{
	errorq_nvelem_t *eqnp = eqep->eqe_data;

	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);

	return (eqnp->eqn_nvl);
}

/*
 * Return the nv allocator of an element reserved from an ERRORQ_NVLIST
 * errorq.
 */
nv_alloc_t *
errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
{
	errorq_nvelem_t *eqnp = eqep->eqe_data;

	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);

	return (eqnp->eqn_nva);
}

/*
 * Reserve a new element and duplicate the data of the original into it.
 * Returns a pointer to the new element's data buffer (and the new element
 * itself through *neqep), or NULL if no free element was available.
 */
void *
errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
{
	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
	ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));

	if ((*neqep = errorq_reserve(eqp)) == NULL)
		return (NULL);

	bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
	return ((*neqep)->eqe_data);
}