1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Kernel Error Queues 28 * 29 * A common problem when handling hardware error traps and interrupts is that 30 * these errors frequently must be handled at high interrupt level, where 31 * reliably producing error messages and safely examining and manipulating 32 * other kernel state may not be possible. The kernel error queue primitive is 33 * a common set of routines that allow a subsystem to maintain a queue of 34 * errors that can be processed by an explicit call from a safe context or by a 35 * soft interrupt that fires at a specific lower interrupt level. The queue 36 * management code also ensures that if the system panics, all in-transit 37 * errors are logged prior to reset. Each queue has an associated kstat for 38 * observing the number of errors dispatched and logged, and mdb(1) debugging 39 * support is provided for live and post-mortem observability. 40 * 41 * Memory Allocation 42 * 43 * All of the queue data structures are allocated in advance as part of 44 * the errorq_create() call. 
No additional memory allocations are 45 * performed as part of errorq_dispatch(), errorq_reserve(), 46 * errorq_commit() or errorq_drain(). This design 47 * facilitates reliable error queue processing even when the system is low 48 * on memory, and ensures that errorq_dispatch() can be called from any 49 * context. When the queue is created, the maximum queue length is 50 * specified as a parameter to errorq_create() and errorq_nvcreate(). This 51 * length should represent a reasonable upper bound on the number of 52 * simultaneous errors. If errorq_dispatch() or errorq_reserve() is 53 * invoked and no free queue elements are available, the error is 54 * dropped and will not be logged. Typically, the queue will only be 55 * exhausted by an error storm, and in this case 56 * the earlier errors provide the most important data for analysis. 57 * When a new error is dispatched, the error data is copied into the 58 * preallocated queue element so that the caller's buffer can be reused. 59 * 60 * When a new error is reserved, an element is moved from the free pool 61 * and returned to the caller. The element buffer data, eqe_data, may be 62 * managed by the caller and dispatched to the errorq by calling 63 * errorq_commit(). This is useful for additions to errorq's 64 * created with errorq_nvcreate() to handle name-value pair (nvpair) data. 65 * See below for a discussion on nvlist errorq's. 66 * 67 * Queue Drain Callback 68 * 69 * When the error queue is drained, the caller's queue drain callback is 70 * invoked with a pointer to the saved error data. This function may be 71 * called from passive kernel context or soft interrupt context at or 72 * below LOCK_LEVEL, or as part of panic(). As such, the callback should 73 * basically only be calling cmn_err (but NOT with the CE_PANIC flag). 74 * The callback must not call panic(), attempt to allocate memory, or wait 75 * on a condition variable. 
The callback may not call errorq_destroy() 76 * or errorq_drain() on the same error queue that called it. 77 * 78 * The queue drain callback will always be called for each pending error 79 * in the order in which errors were enqueued (oldest to newest). The 80 * queue drain callback is guaranteed to provide at *least* once semantics 81 * for all errors that are successfully dispatched (i.e. for which 82 * errorq_dispatch() has successfully completed). If an unrelated panic 83 * occurs while the queue drain callback is running on a vital queue, the 84 * panic subsystem will continue the queue drain and the callback may be 85 * invoked again for the same error. Therefore, the callback should 86 * restrict itself to logging messages and taking other actions that are 87 * not destructive if repeated. 88 * 89 * Name-Value Pair Error Queues 90 * 91 * During error handling, it may be more convenient to store error 92 * queue element data as a fixed buffer of name-value pairs. The 93 * nvpair library allows construction and destruction of nvlists 94 * in pre-allocated memory buffers. 95 * 96 * Error queues created via errorq_nvcreate() store queue element 97 * data as fixed buffer nvlists (ereports). errorq_reserve() 98 * allocates an errorq element from eqp->eq_bitmap and returns a valid 99 * pointer to an errorq_elem_t (queue element) and a pre-allocated 100 * fixed buffer nvlist. errorq_elem_nvl() is used to gain access 101 * to the nvlist to add name-value ereport members prior to 102 * dispatching the error queue element in errorq_commit(). 103 * 104 * Once dispatched, the drain function will return the element to 105 * eqp->eq_bitmap and reset the associated nv_alloc structure. 106 * errorq_cancel() may be called to cancel the reservation of an 107 * element that was never dispatched (committed). This is useful in 108 * cases where a programming error prevents a queue element from being 109 * dispatched. 
110 * 111 * Queue Management 112 * 113 * The queue element structures and error data buffers are allocated in 114 * two contiguous chunks as part of errorq_create() or errorq_nvcreate(). 115 * Each queue element structure contains a next pointer, 116 * a previous pointer, and a pointer to the corresponding error data 117 * buffer. The data buffer for a nvlist errorq is a shared buffer 118 * for the allocation of name-value pair lists. The elements are kept on 119 * one of four lists: 120 * 121 * Unused elements are kept in the free pool, managed by eqp->eq_bitmap. 122 * The eqe_prev and eqe_next pointers are not used while in the free pool 123 * and will be set to NULL. 124 * 125 * Pending errors are kept on the pending list, a singly-linked list 126 * pointed to by eqp->eq_pend, and linked together using eqe_prev. This 127 * list is maintained in order from newest error to oldest. The eqe_next 128 * pointer is not used by the pending list and will be set to NULL. 129 * 130 * The processing list is a doubly-linked list pointed to by eqp->eq_phead 131 * (the oldest element) and eqp->eq_ptail (the newest element). The 132 * eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the 133 * eqe_prev pointer is used to traverse from eq_ptail to eq_phead. Once a 134 * queue drain operation begins, the current pending list is moved to the 135 * processing list in a two-phase commit fashion (eq_ptail being cleared 136 * at the beginning but eq_phead only at the end), allowing the panic code 137 * to always locate and process all pending errors in the event that a 138 * panic occurs in the middle of queue processing. 139 * 140 * A fourth list is maintained for nvlist errorqs. The dump list, 141 * eq_dump is used to link all errorq elements that should be stored 142 * in a crash dump file in the event of a system panic. During 143 * errorq_panic(), the list is created and subsequently traversed 144 * in errorq_dump() during the final phases of a crash dump. 
145 * 146 * Platform Considerations 147 * 148 * In order to simplify their implementation, error queues make use of the 149 * C wrappers for compare-and-swap. If the platform itself does not 150 * support compare-and-swap in hardware and the kernel emulation routines 151 * are used instead, then the context in which errorq_dispatch() can be 152 * safely invoked is further constrained by the implementation of the 153 * compare-and-swap emulation. Specifically, if errorq_dispatch() is 154 * called from a code path that can be executed above ATOMIC_LEVEL on such 155 * a platform, the dispatch code could potentially deadlock unless the 156 * corresponding error interrupt is blocked or disabled prior to calling 157 * errorq_dispatch(). Error queues should therefore be deployed with 158 * caution on these platforms. 159 * 160 * Interfaces 161 * 162 * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags); 163 * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags); 164 * 165 * Create a new error queue with the specified name, callback, and 166 * properties. A pointer to the new error queue is returned upon success, 167 * or NULL is returned to indicate that the queue could not be created. 168 * This function must be called from passive kernel context with no locks 169 * held that can prevent a sleeping memory allocation from occurring. 170 * errorq_create() will return failure if the queue kstats cannot be 171 * created, or if a soft interrupt handler cannot be registered. 172 * 173 * The queue 'name' is a string that is recorded for live and post-mortem 174 * examination by a debugger. The queue callback 'func' will be invoked 175 * for each error drained from the queue, and will receive the 'private' 176 * pointer as its first argument. The callback must obey the rules for 177 * callbacks described above. The queue will have maximum length 'qlen' 178 * and each element will be able to record up to 'eltsize' bytes of data. 
179 * The queue's soft interrupt (see errorq_dispatch(), below) will fire 180 * at 'ipl', which should not exceed LOCK_LEVEL. The queue 'flags' may 181 * include the following flag: 182 * 183 * ERRORQ_VITAL - This queue contains information that is considered 184 * vital to problem diagnosis. Error queues that are marked vital will 185 * be automatically drained by the panic subsystem prior to printing 186 * the panic messages to the console. 187 * 188 * void errorq_destroy(errorq); 189 * 190 * Destroy the specified error queue. The queue is drained of any 191 * pending elements and these are logged before errorq_destroy returns. 192 * Once errorq_destroy() begins draining the queue, any simultaneous 193 * calls to dispatch errors will result in the errors being dropped. 194 * The caller must invoke a higher-level abstraction (e.g. disabling 195 * an error interrupt) to ensure that error handling code does not 196 * attempt to dispatch errors to the queue while it is being freed. 197 * 198 * void errorq_dispatch(errorq, data, len, flag); 199 * 200 * Attempt to enqueue the specified error data. If a free queue element 201 * is available, the data is copied into a free element and placed on a 202 * pending list. If no free queue element is available, the error is 203 * dropped. The data length (len) is specified in bytes and should not 204 * exceed the queue's maximum element size. If the data length is less 205 * than the maximum element size, the remainder of the queue element is 206 * filled with zeroes. The flag parameter should be one of: 207 * 208 * ERRORQ_ASYNC - Schedule a soft interrupt at the previously specified 209 * IPL to asynchronously drain the queue on behalf of the caller. 210 * 211 * ERRORQ_SYNC - Do not schedule a soft interrupt to drain the queue. 212 * The caller is presumed to be calling errorq_drain() or panic() in 213 * the near future in order to drain the queue and log the error. 
214 * 215 * The errorq_dispatch() function may be called from any context, subject 216 * to the Platform Considerations described above. 217 * 218 * void errorq_drain(errorq); 219 * 220 * Drain the error queue of all pending errors. The queue's callback 221 * function is invoked for each error in order from oldest to newest. 222 * This function may be used at or below LOCK_LEVEL or from panic context. 223 * 224 * errorq_elem_t *errorq_reserve(errorq); 225 * 226 * Reserve an error queue element for later processing and dispatching. 227 * The element is returned to the caller who may add error-specific data 228 * to the element. The element is returned to the free pool when either 229 * errorq_commit() is called and the element asynchronously processed 230 * or immediately when errorq_cancel() is called. 231 * 232 * void errorq_commit(errorq, errorq_elem, flag); 233 * 234 * Commit an errorq element (eqep) for dispatching, see 235 * errorq_dispatch(). 236 * 237 * void errorq_cancel(errorq, errorq_elem); 238 * 239 * Cancel a pending errorq element reservation. The errorq element is 240 * returned to the free pool upon cancelation. 
241 */ 242 243 #include <sys/errorq_impl.h> 244 #include <sys/sysmacros.h> 245 #include <sys/machlock.h> 246 #include <sys/cmn_err.h> 247 #include <sys/atomic.h> 248 #include <sys/systm.h> 249 #include <sys/kmem.h> 250 #include <sys/conf.h> 251 #include <sys/ddi.h> 252 #include <sys/sunddi.h> 253 #include <sys/bootconf.h> 254 #include <sys/spl.h> 255 #include <sys/dumphdr.h> 256 #include <sys/compress.h> 257 #include <sys/time.h> 258 #include <sys/panic.h> 259 #include <sys/bitmap.h> 260 #include <sys/fm/protocol.h> 261 #include <sys/fm/util.h> 262 263 static struct errorq_kstat errorq_kstat_template = { 264 { "dispatched", KSTAT_DATA_UINT64 }, 265 { "dropped", KSTAT_DATA_UINT64 }, 266 { "logged", KSTAT_DATA_UINT64 }, 267 { "reserved", KSTAT_DATA_UINT64 }, 268 { "reserve_fail", KSTAT_DATA_UINT64 }, 269 { "committed", KSTAT_DATA_UINT64 }, 270 { "commit_fail", KSTAT_DATA_UINT64 }, 271 { "cancelled", KSTAT_DATA_UINT64 } 272 }; 273 274 static uint64_t errorq_lost = 0; 275 static errorq_t *errorq_list = NULL; 276 static kmutex_t errorq_lock; 277 static uint64_t errorq_vitalmin = 5; 278 279 static uint_t 280 errorq_intr(caddr_t eqp) 281 { 282 errorq_drain((errorq_t *)eqp); 283 return (DDI_INTR_CLAIMED); 284 } 285 286 /* 287 * Create a new error queue with the specified properties and add a software 288 * interrupt handler and kstat for it. This function must be called from 289 * passive kernel context with no locks held that can prevent a sleeping 290 * memory allocation from occurring. This function will return NULL if the 291 * softint or kstat for this queue cannot be created. 
292 */ 293 errorq_t * 294 errorq_create(const char *name, errorq_func_t func, void *private, 295 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 296 { 297 errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP); 298 ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl); 299 dev_info_t *dip = ddi_root_node(); 300 301 errorq_elem_t *eep; 302 ddi_softintr_t id = NULL; 303 caddr_t data; 304 305 ASSERT(qlen != 0 && size != 0); 306 ASSERT(ipl > 0 && ipl <= LOCK_LEVEL); 307 308 /* 309 * If a queue is created very early in boot before device tree services 310 * are available, the queue softint handler cannot be created. We 311 * manually drain these queues and create their softint handlers when 312 * it is safe to do so as part of errorq_init(), below. 313 */ 314 if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, 315 &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 316 cmn_err(CE_WARN, "errorq_create: failed to register " 317 "IPL %u softint for queue %s", ipl, name); 318 kmem_free(eqp, sizeof (errorq_t)); 319 return (NULL); 320 } 321 322 if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq", 323 KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) / 324 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) { 325 cmn_err(CE_WARN, "errorq_create: failed to create kstat " 326 "for queue %s", name); 327 if (id != NULL) 328 ddi_remove_softintr(id); 329 kmem_free(eqp, sizeof (errorq_t)); 330 return (NULL); 331 } 332 333 bcopy(&errorq_kstat_template, &eqp->eq_kstat, 334 sizeof (struct errorq_kstat)); 335 eqp->eq_ksp->ks_data = &eqp->eq_kstat; 336 eqp->eq_ksp->ks_private = eqp; 337 kstat_install(eqp->eq_ksp); 338 339 (void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN); 340 eqp->eq_name[ERRORQ_NAMELEN] = '\0'; 341 eqp->eq_func = func; 342 eqp->eq_private = private; 343 eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP); 344 eqp->eq_qlen = qlen; 345 eqp->eq_size = size; 346 eqp->eq_ipl = ipl; 347 eqp->eq_flags = flags | ERRORQ_ACTIVE; 348 
eqp->eq_id = id; 349 mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL); 350 eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP); 351 eqp->eq_phead = NULL; 352 eqp->eq_ptail = NULL; 353 eqp->eq_pend = NULL; 354 eqp->eq_dump = NULL; 355 eqp->eq_bitmap = kmem_zalloc(BT_SIZEOFMAP(qlen), KM_SLEEP); 356 eqp->eq_rotor = 0; 357 358 /* 359 * Iterate over the array of errorq_elem_t structures and set its 360 * data pointer. 361 */ 362 for (eep = eqp->eq_elems, data = eqp->eq_data; qlen > 1; qlen--) { 363 eep->eqe_next = NULL; 364 eep->eqe_dump = NULL; 365 eep->eqe_prev = NULL; 366 eep->eqe_data = data; 367 data += size; 368 eep++; 369 } 370 eep->eqe_next = NULL; 371 eep->eqe_prev = NULL; 372 eep->eqe_data = data; 373 eep->eqe_dump = NULL; 374 375 /* 376 * Once the errorq is initialized, add it to the global list of queues, 377 * and then return a pointer to the new queue to the caller. 378 */ 379 mutex_enter(&errorq_lock); 380 eqp->eq_next = errorq_list; 381 errorq_list = eqp; 382 mutex_exit(&errorq_lock); 383 384 return (eqp); 385 } 386 387 /* 388 * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST 389 * flag and initialize each element to have the start of its data region used 390 * as an errorq_nvelem_t with a nvlist allocator that consumes the data region. 
391 */ 392 errorq_t * 393 errorq_nvcreate(const char *name, errorq_func_t func, void *private, 394 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 395 { 396 errorq_t *eqp; 397 errorq_elem_t *eep; 398 399 eqp = errorq_create(name, func, private, qlen, 400 size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST); 401 402 if (eqp == NULL) 403 return (NULL); 404 405 mutex_enter(&eqp->eq_lock); 406 407 for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) { 408 errorq_nvelem_t *eqnp = eep->eqe_data; 409 eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t); 410 eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size); 411 } 412 413 mutex_exit(&eqp->eq_lock); 414 return (eqp); 415 } 416 417 /* 418 * To destroy an error queue, we mark it as disabled and then explicitly drain 419 * all pending errors. Once the drain is complete, we can remove the queue 420 * from the global list of queues examined by errorq_panic(), and then free 421 * the various queue data structures. The caller must use some higher-level 422 * abstraction (e.g. disabling an error interrupt) to ensure that no one will 423 * attempt to enqueue new errors while we are freeing this queue. 
424 */ 425 void 426 errorq_destroy(errorq_t *eqp) 427 { 428 errorq_t *p, **pp; 429 errorq_elem_t *eep; 430 ulong_t i; 431 432 ASSERT(eqp != NULL); 433 eqp->eq_flags &= ~ERRORQ_ACTIVE; 434 errorq_drain(eqp); 435 436 mutex_enter(&errorq_lock); 437 pp = &errorq_list; 438 439 for (p = errorq_list; p != NULL; p = p->eq_next) { 440 if (p == eqp) { 441 *pp = p->eq_next; 442 break; 443 } 444 pp = &p->eq_next; 445 } 446 447 mutex_exit(&errorq_lock); 448 ASSERT(p != NULL); 449 450 if (eqp->eq_flags & ERRORQ_NVLIST) { 451 for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) { 452 errorq_nvelem_t *eqnp = eep->eqe_data; 453 fm_nva_xdestroy(eqnp->eqn_nva); 454 } 455 } 456 457 mutex_destroy(&eqp->eq_lock); 458 kstat_delete(eqp->eq_ksp); 459 460 if (eqp->eq_id != NULL) 461 ddi_remove_softintr(eqp->eq_id); 462 463 kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t)); 464 kmem_free(eqp->eq_bitmap, BT_SIZEOFMAP(eqp->eq_qlen)); 465 kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size); 466 467 kmem_free(eqp, sizeof (errorq_t)); 468 } 469 470 /* 471 * private version of bt_availbit which makes a best-efforts attempt 472 * at allocating in a round-robin fashion in order to facilitate post-mortem 473 * diagnosis. 474 */ 475 static index_t 476 errorq_availbit(ulong_t *bitmap, size_t nbits, index_t curindex) 477 { 478 ulong_t bit, maxbit, bx; 479 index_t rval, nextindex = curindex + 1; 480 index_t nextword = nextindex >> BT_ULSHIFT; 481 ulong_t nextbitindex = nextindex & BT_ULMASK; 482 index_t maxindex = nbits - 1; 483 index_t maxword = maxindex >> BT_ULSHIFT; 484 ulong_t maxbitindex = maxindex & BT_ULMASK; 485 486 /* 487 * First check if there are still some bits remaining in the current 488 * word, and see if any of those are available. We need to do this by 489 * hand as the bt_availbit() function always starts at the beginning 490 * of a word. 491 */ 492 if (nextindex <= maxindex && nextbitindex != 0) { 493 maxbit = (nextword == maxword) ? 
maxbitindex : BT_ULMASK; 494 for (bx = 0, bit = 1; bx <= maxbit; bx++, bit <<= 1) 495 if (bx >= nextbitindex && !(bitmap[nextword] & bit)) 496 return ((nextword << BT_ULSHIFT) + bx); 497 nextword++; 498 } 499 /* 500 * Now check if there are any words remaining before the end of the 501 * bitmap. Use bt_availbit() to find any free bits. 502 */ 503 if (nextword <= maxword) 504 if ((rval = bt_availbit(&bitmap[nextword], 505 nbits - (nextword << BT_ULSHIFT))) != -1) 506 return ((nextword << BT_ULSHIFT) + rval); 507 /* 508 * Finally loop back to the start and look for any free bits starting 509 * from the beginning of the bitmap to the current rotor position. 510 */ 511 return (bt_availbit(bitmap, nextindex)); 512 } 513 514 /* 515 * Dispatch a new error into the queue for later processing. The specified 516 * data buffer is copied into a preallocated queue element. If 'len' is 517 * smaller than the queue element size, the remainder of the queue element is 518 * filled with zeroes. This function may be called from any context subject 519 * to the Platform Considerations described above. 
520 */ 521 void 522 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag) 523 { 524 errorq_elem_t *eep, *old; 525 526 if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) { 527 atomic_inc_64(&errorq_lost); 528 return; /* drop error if queue is uninitialized or disabled */ 529 } 530 531 for (;;) { 532 int i, rval; 533 534 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen, 535 eqp->eq_rotor)) == -1) { 536 atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64); 537 return; 538 } 539 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval); 540 if (rval == 0) { 541 eqp->eq_rotor = i; 542 eep = &eqp->eq_elems[i]; 543 break; 544 } 545 } 546 547 ASSERT(len <= eqp->eq_size); 548 bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len)); 549 550 if (len < eqp->eq_size) 551 bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len); 552 553 for (;;) { 554 old = eqp->eq_pend; 555 eep->eqe_prev = old; 556 membar_producer(); 557 558 if (atomic_cas_ptr(&eqp->eq_pend, old, eep) == old) 559 break; 560 } 561 562 atomic_inc_64(&eqp->eq_kstat.eqk_dispatched.value.ui64); 563 564 if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL) 565 ddi_trigger_softintr(eqp->eq_id); 566 } 567 568 /* 569 * Drain the specified error queue by calling eq_func() for each pending error. 570 * This function must be called at or below LOCK_LEVEL or from panic context. 571 * In order to synchronize with other attempts to drain the queue, we acquire 572 * the adaptive eq_lock, blocking other consumers. Once this lock is held, 573 * we must use compare-and-swap to move the pending list to the processing 574 * list and to return elements to the free pool in order to synchronize 575 * with producers, who do not acquire any locks and only use atomic set/clear. 576 * 577 * An additional constraint on this function is that if the system panics 578 * while this function is running, the panic code must be able to detect and 579 * handle all intermediate states and correctly dequeue all errors. 
The 580 * errorq_panic() function below will be used for detecting and handling 581 * these intermediate states. The comments in errorq_drain() below explain 582 * how we make sure each intermediate state is distinct and consistent. 583 */ 584 void 585 errorq_drain(errorq_t *eqp) 586 { 587 errorq_elem_t *eep, *dep; 588 589 ASSERT(eqp != NULL); 590 mutex_enter(&eqp->eq_lock); 591 592 /* 593 * If there are one or more pending errors, set eq_ptail to point to 594 * the first element on the pending list and then attempt to compare- 595 * and-swap NULL to the pending list. We use membar_producer() to 596 * make sure that eq_ptail will be visible to errorq_panic() below 597 * before the pending list is NULLed out. This section is labeled 598 * case (1) for errorq_panic, below. If eq_ptail is not yet set (1A) 599 * eq_pend has all the pending errors. If atomic_cas_ptr fails or 600 * has not been called yet (1B), eq_pend still has all the pending 601 * errors. If atomic_cas_ptr succeeds (1C), eq_ptail has all the 602 * pending errors. 603 */ 604 while ((eep = eqp->eq_pend) != NULL) { 605 eqp->eq_ptail = eep; 606 membar_producer(); 607 608 if (atomic_cas_ptr(&eqp->eq_pend, eep, NULL) == eep) 609 break; 610 } 611 612 /* 613 * If no errors were pending, assert that eq_ptail is set to NULL, 614 * drop the consumer lock, and return without doing anything. 615 */ 616 if (eep == NULL) { 617 ASSERT(eqp->eq_ptail == NULL); 618 mutex_exit(&eqp->eq_lock); 619 return; 620 } 621 622 /* 623 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the 624 * oldest error, setting the eqe_next pointer so that we can iterate 625 * over the errors from oldest to newest. We use membar_producer() 626 * to make sure that these stores are visible before we set eq_phead. 627 * If we panic before, during, or just after this loop (case 2), 628 * errorq_panic() will simply redo this work, as described below. 
629 */ 630 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev) 631 eep->eqe_prev->eqe_next = eep; 632 membar_producer(); 633 634 /* 635 * Now set eq_phead to the head of the processing list (the oldest 636 * error) and issue another membar_producer() to make sure that 637 * eq_phead is seen as non-NULL before we clear eq_ptail. If we panic 638 * after eq_phead is set (case 3), we will detect and log these errors 639 * in errorq_panic(), as described below. 640 */ 641 eqp->eq_phead = eep; 642 membar_producer(); 643 644 eqp->eq_ptail = NULL; 645 membar_producer(); 646 647 /* 648 * If we enter from errorq_panic_drain(), we may already have 649 * errorq elements on the dump list. Find the tail of 650 * the list ready for append. 651 */ 652 dep = eqp->eq_dump; 653 if (panicstr && dep != NULL) { 654 while (dep->eqe_dump != NULL) 655 dep = dep->eqe_dump; 656 } 657 658 /* 659 * Now iterate over the processing list from oldest (eq_phead) to 660 * newest and log each error. Once an error is logged, we use 661 * atomic clear to return it to the free pool. If we panic before, 662 * during, or after calling eq_func() (case 4), the error will still be 663 * found on eq_phead and will be logged in errorq_panic below. 664 */ 665 666 while ((eep = eqp->eq_phead) != NULL) { 667 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 668 eqp->eq_kstat.eqk_logged.value.ui64++; 669 670 eqp->eq_phead = eep->eqe_next; 671 membar_producer(); 672 673 eep->eqe_next = NULL; 674 675 /* 676 * On panic, we add the element to the dump list for each 677 * nvlist errorq. Elements are stored oldest to newest. 678 * Then continue, so we don't free and subsequently overwrite 679 * any elements which we've put on the dump queue. 
680 */ 681 if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) { 682 if (eqp->eq_dump == NULL) 683 dep = eqp->eq_dump = eep; 684 else 685 dep = dep->eqe_dump = eep; 686 membar_producer(); 687 continue; 688 } 689 690 eep->eqe_prev = NULL; 691 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 692 } 693 694 mutex_exit(&eqp->eq_lock); 695 } 696 697 /* 698 * Now that device tree services are available, set up the soft interrupt 699 * handlers for any queues that were created early in boot. We then 700 * manually drain these queues to report any pending early errors. 701 */ 702 void 703 errorq_init(void) 704 { 705 dev_info_t *dip = ddi_root_node(); 706 ddi_softintr_t id; 707 errorq_t *eqp; 708 709 ASSERT(modrootloaded != 0); 710 ASSERT(dip != NULL); 711 712 mutex_enter(&errorq_lock); 713 714 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 715 ddi_iblock_cookie_t ibc = 716 (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl); 717 718 if (eqp->eq_id != NULL) 719 continue; /* softint already initialized */ 720 721 if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL, 722 errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 723 panic("errorq_init: failed to register IPL %u softint " 724 "for queue %s", eqp->eq_ipl, eqp->eq_name); 725 } 726 727 eqp->eq_id = id; 728 errorq_drain(eqp); 729 } 730 731 mutex_exit(&errorq_lock); 732 } 733 734 /* 735 * This function is designed to be called from panic context only, and 736 * therefore does not need to acquire errorq_lock when iterating over 737 * errorq_list. This function must be called no more than once for each 738 * 'what' value (if you change this then review the manipulation of 'dep'. 
739 */ 740 static uint64_t 741 errorq_panic_drain(uint_t what) 742 { 743 errorq_elem_t *eep, *nep, *dep; 744 errorq_t *eqp; 745 uint64_t loggedtmp; 746 uint64_t logged = 0; 747 748 dep = NULL; 749 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 750 if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what) 751 continue; /* do not drain this queue on this pass */ 752 753 loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64; 754 755 /* 756 * In case (1B) above, eq_ptail may be set but the 757 * atomic_cas_ptr may not have been executed yet or may have 758 * failed. Either way, we must log errors in chronological 759 * order. So we search the pending list for the error 760 * pointed to by eq_ptail. If it is found, we know that all 761 * subsequent errors are also still on the pending list, so 762 * just NULL out eq_ptail and let errorq_drain(), below, 763 * take care of the logging. 764 */ 765 for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) { 766 if (eep == eqp->eq_ptail) { 767 ASSERT(eqp->eq_phead == NULL); 768 eqp->eq_ptail = NULL; 769 break; 770 } 771 } 772 773 /* 774 * In cases (1C) and (2) above, eq_ptail will be set to the 775 * newest error on the processing list but eq_phead will still 776 * be NULL. We set the eqe_next pointers so we can iterate 777 * over the processing list in order from oldest error to the 778 * newest error. We then set eq_phead to point to the oldest 779 * error and fall into the for-loop below. 780 */ 781 if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) { 782 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; 783 eep = eep->eqe_prev) 784 eep->eqe_prev->eqe_next = eep; 785 786 eqp->eq_phead = eep; 787 eqp->eq_ptail = NULL; 788 } 789 790 /* 791 * In cases (3) and (4) above (or after case (1C/2) handling), 792 * eq_phead will be set to the oldest error on the processing 793 * list. We log each error and return it to the free pool. 
794 * 795 * Unlike errorq_drain(), we don't need to worry about updating 796 * eq_phead because errorq_panic() will be called at most once. 797 * However, we must use atomic_cas_ptr to update the 798 * freelist in case errors are still being enqueued during 799 * panic. 800 */ 801 for (eep = eqp->eq_phead; eep != NULL; eep = nep) { 802 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 803 eqp->eq_kstat.eqk_logged.value.ui64++; 804 805 nep = eep->eqe_next; 806 eep->eqe_next = NULL; 807 808 /* 809 * On panic, we add the element to the dump list for 810 * each nvlist errorq, stored oldest to newest. Then 811 * continue, so we don't free and subsequently overwrite 812 * any elements which we've put on the dump queue. 813 */ 814 if (eqp->eq_flags & ERRORQ_NVLIST) { 815 if (eqp->eq_dump == NULL) 816 dep = eqp->eq_dump = eep; 817 else 818 dep = dep->eqe_dump = eep; 819 membar_producer(); 820 continue; 821 } 822 823 eep->eqe_prev = NULL; 824 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 825 } 826 827 /* 828 * Now go ahead and drain any other errors on the pending list. 829 * This call transparently handles case (1A) above, as well as 830 * any other errors that were dispatched after errorq_drain() 831 * completed its first compare-and-swap. 832 */ 833 errorq_drain(eqp); 834 835 logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp; 836 } 837 return (logged); 838 } 839 840 /* 841 * Drain all error queues - called only from panic context. Some drain 842 * functions may enqueue errors to ERRORQ_NVLIST error queues so that 843 * they may be written out in the panic dump - so ERRORQ_NVLIST queues 844 * must be drained last. Drain ERRORQ_VITAL queues before nonvital queues 845 * so that vital errors get to fill the ERRORQ_NVLIST queues first, and 846 * do not drain the nonvital queues if there are many vital errors. 
847 */ 848 void 849 errorq_panic(void) 850 { 851 ASSERT(panicstr != NULL); 852 853 if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin) 854 (void) errorq_panic_drain(0); 855 (void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST); 856 (void) errorq_panic_drain(ERRORQ_NVLIST); 857 } 858 859 /* 860 * Reserve an error queue element for later processing and dispatching. The 861 * element is returned to the caller who may add error-specific data to 862 * element. The element is retured to the free pool when either 863 * errorq_commit() is called and the element asynchronously processed 864 * or immediately when errorq_cancel() is called. 865 */ 866 errorq_elem_t * 867 errorq_reserve(errorq_t *eqp) 868 { 869 errorq_elem_t *eqep; 870 871 if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) { 872 atomic_inc_64(&errorq_lost); 873 return (NULL); 874 } 875 876 for (;;) { 877 int i, rval; 878 879 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen, 880 eqp->eq_rotor)) == -1) { 881 atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64); 882 return (NULL); 883 } 884 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval); 885 if (rval == 0) { 886 eqp->eq_rotor = i; 887 eqep = &eqp->eq_elems[i]; 888 break; 889 } 890 } 891 892 if (eqp->eq_flags & ERRORQ_NVLIST) { 893 errorq_nvelem_t *eqnp = eqep->eqe_data; 894 nv_alloc_reset(eqnp->eqn_nva); 895 eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva); 896 } 897 898 atomic_inc_64(&eqp->eq_kstat.eqk_reserved.value.ui64); 899 return (eqep); 900 } 901 902 /* 903 * Commit an errorq element (eqep) for dispatching. 904 * This function may be called from any context subject 905 * to the Platform Considerations described above. 
906 */ 907 void 908 errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag) 909 { 910 errorq_elem_t *old; 911 912 if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) { 913 atomic_inc_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64); 914 return; 915 } 916 917 for (;;) { 918 old = eqp->eq_pend; 919 eqep->eqe_prev = old; 920 membar_producer(); 921 922 if (atomic_cas_ptr(&eqp->eq_pend, old, eqep) == old) 923 break; 924 } 925 926 atomic_inc_64(&eqp->eq_kstat.eqk_committed.value.ui64); 927 928 if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL) 929 ddi_trigger_softintr(eqp->eq_id); 930 } 931 932 /* 933 * Cancel an errorq element reservation by returning the specified element 934 * to the free pool. Duplicate or invalid frees are not supported. 935 */ 936 void 937 errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep) 938 { 939 if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) 940 return; 941 942 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eqep - eqp->eq_elems); 943 944 atomic_inc_64(&eqp->eq_kstat.eqk_cancelled.value.ui64); 945 } 946 947 /* 948 * Write elements on the dump list of each nvlist errorq to the dump device. 949 * Upon reboot, fmd(8) will extract and replay them for diagnosis. 
950 */ 951 void 952 errorq_dump(void) 953 { 954 errorq_elem_t *eep; 955 errorq_t *eqp; 956 957 if (ereport_dumpbuf == NULL) 958 return; /* reboot or panic before errorq is even set up */ 959 960 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 961 if (!(eqp->eq_flags & ERRORQ_NVLIST) || 962 !(eqp->eq_flags & ERRORQ_ACTIVE)) 963 continue; /* do not dump this queue on panic */ 964 965 for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) { 966 errorq_nvelem_t *eqnp = eep->eqe_data; 967 size_t len = 0; 968 erpt_dump_t ed; 969 int err; 970 971 (void) nvlist_size(eqnp->eqn_nvl, 972 &len, NV_ENCODE_NATIVE); 973 974 if (len > ereport_dumplen || len == 0) { 975 cmn_err(CE_WARN, "%s: unable to save error " 976 "report %p due to size %lu\n", 977 eqp->eq_name, (void *)eep, len); 978 continue; 979 } 980 981 if ((err = nvlist_pack(eqnp->eqn_nvl, 982 (char **)&ereport_dumpbuf, &ereport_dumplen, 983 NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) { 984 cmn_err(CE_WARN, "%s: unable to save error " 985 "report %p due to pack error %d\n", 986 eqp->eq_name, (void *)eep, err); 987 continue; 988 } 989 990 ed.ed_magic = ERPT_MAGIC; 991 ed.ed_chksum = checksum32(ereport_dumpbuf, len); 992 ed.ed_size = (uint32_t)len; 993 ed.ed_pad = 0; 994 ed.ed_hrt_nsec = 0; 995 ed.ed_hrt_base = panic_hrtime; 996 ed.ed_tod_base.sec = panic_hrestime.tv_sec; 997 ed.ed_tod_base.nsec = panic_hrestime.tv_nsec; 998 999 dumpvp_write(&ed, sizeof (ed)); 1000 dumpvp_write(ereport_dumpbuf, len); 1001 } 1002 } 1003 } 1004 1005 nvlist_t * 1006 errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep) 1007 { 1008 errorq_nvelem_t *eqnp = eqep->eqe_data; 1009 1010 ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST); 1011 1012 return (eqnp->eqn_nvl); 1013 } 1014 1015 nv_alloc_t * 1016 errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep) 1017 { 1018 errorq_nvelem_t *eqnp = eqep->eqe_data; 1019 1020 ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST); 1021 1022 return 
(eqnp->eqn_nva); 1023 } 1024 1025 /* 1026 * Reserve a new element and duplicate the data of the original into it. 1027 */ 1028 void * 1029 errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep) 1030 { 1031 ASSERT(eqp->eq_flags & ERRORQ_ACTIVE); 1032 ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST)); 1033 1034 if ((*neqep = errorq_reserve(eqp)) == NULL) 1035 return (NULL); 1036 1037 bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size); 1038 return ((*neqep)->eqe_data); 1039 } 1040