1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Kernel Error Queues 28 * 29 * A common problem when handling hardware error traps and interrupts is that 30 * these errors frequently must be handled at high interrupt level, where 31 * reliably producing error messages and safely examining and manipulating 32 * other kernel state may not be possible. The kernel error queue primitive is 33 * a common set of routines that allow a subsystem to maintain a queue of 34 * errors that can be processed by an explicit call from a safe context or by a 35 * soft interrupt that fires at a specific lower interrupt level. The queue 36 * management code also ensures that if the system panics, all in-transit 37 * errors are logged prior to reset. Each queue has an associated kstat for 38 * observing the number of errors dispatched and logged, and mdb(1) debugging 39 * support is provided for live and post-mortem observability. 40 * 41 * Memory Allocation 42 * 43 * All of the queue data structures are allocated in advance as part of 44 * the errorq_create() call. 
No additional memory allocations are 45 * performed as part of errorq_dispatch(), errorq_reserve(), 46 * errorq_commit() or errorq_drain(). This design 47 * facilitates reliable error queue processing even when the system is low 48 * on memory, and ensures that errorq_dispatch() can be called from any 49 * context. When the queue is created, the maximum queue length is 50 * specified as a parameter to errorq_create() and errorq_nvcreate(). This 51 * length should represent a reasonable upper bound on the number of 52 * simultaneous errors. If errorq_dispatch() or errorq_reserve() is 53 * invoked and no free queue elements are available, the error is 54 * dropped and will not be logged. Typically, the queue will only be 55 * exhausted by an error storm, and in this case 56 * the earlier errors provide the most important data for analysis. 57 * When a new error is dispatched, the error data is copied into the 58 * preallocated queue element so that the caller's buffer can be reused. 59 * 60 * When a new error is reserved, an element is moved from the free pool 61 * and returned to the caller. The element buffer data, eqe_data, may be 62 * managed by the caller and dispatched to the errorq by calling 63 * errorq_commit(). This is useful for additions to errorq's 64 * created with errorq_nvcreate() to handle name-value pair (nvpair) data. 65 * See below for a discussion on nvlist errorq's. 66 * 67 * Queue Drain Callback 68 * 69 * When the error queue is drained, the caller's queue drain callback is 70 * invoked with a pointer to the saved error data. This function may be 71 * called from passive kernel context or soft interrupt context at or 72 * below LOCK_LEVEL, or as part of panic(). As such, the callback should 73 * basically only be calling cmn_err (but NOT with the CE_PANIC flag). 74 * The callback must not call panic(), attempt to allocate memory, or wait 75 * on a condition variable. 
The callback may not call errorq_destroy()
 * or errorq_drain() on the same error queue that called it.
 *
 * The queue drain callback will always be called for each pending error
 * in the order in which errors were enqueued (oldest to newest).  The
 * queue drain callback is guaranteed to provide at *least* once semantics
 * for all errors that are successfully dispatched (i.e. for which
 * errorq_dispatch() has successfully completed).  If an unrelated panic
 * occurs while the queue drain callback is running on a vital queue, the
 * panic subsystem will continue the queue drain and the callback may be
 * invoked again for the same error.  Therefore, the callback should
 * restrict itself to logging messages and taking other actions that are
 * not destructive if repeated.
 *
 * Name-Value Pair Error Queues
 *
 * During error handling, it may be more convenient to store error
 * queue element data as a fixed buffer of name-value pairs.  The
 * nvpair library allows construction and destruction of nvlists
 * in pre-allocated memory buffers.
 *
 * Error queues created via errorq_nvcreate() store queue element
 * data as fixed buffer nvlists (ereports).  errorq_reserve()
 * allocates an errorq element from eqp->eq_bitmap and returns a valid
 * pointer to an errorq_elem_t (queue element) and a pre-allocated
 * fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
 * to the nvlist to add name-value ereport members prior to
 * dispatching the error queue element in errorq_commit().
 *
 * Once dispatched, the drain function will return the element to
 * eqp->eq_bitmap and reset the associated nv_alloc structure.
 * errorq_cancel() may be called to cancel the reservation of an
 * element that was never dispatched (committed).  This is useful in
 * cases where a programming error prevents a queue element from being
 * dispatched.
110 * 111 * Queue Management 112 * 113 * The queue element structures and error data buffers are allocated in 114 * two contiguous chunks as part of errorq_create() or errorq_nvcreate(). 115 * Each queue element structure contains a next pointer, 116 * a previous pointer, and a pointer to the corresponding error data 117 * buffer. The data buffer for a nvlist errorq is a shared buffer 118 * for the allocation of name-value pair lists. The elements are kept on 119 * one of four lists: 120 * 121 * Unused elements are kept in the free pool, managed by eqp->eq_bitmap. 122 * The eqe_prev and eqe_next pointers are not used while in the free pool 123 * and will be set to NULL. 124 * 125 * Pending errors are kept on the pending list, a singly-linked list 126 * pointed to by eqp->eq_pend, and linked together using eqe_prev. This 127 * list is maintained in order from newest error to oldest. The eqe_next 128 * pointer is not used by the pending list and will be set to NULL. 129 * 130 * The processing list is a doubly-linked list pointed to by eqp->eq_phead 131 * (the oldest element) and eqp->eq_ptail (the newest element). The 132 * eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the 133 * eqe_prev pointer is used to traverse from eq_ptail to eq_phead. Once a 134 * queue drain operation begins, the current pending list is moved to the 135 * processing list in a two-phase commit fashion (eq_ptail being cleared 136 * at the beginning but eq_phead only at the end), allowing the panic code 137 * to always locate and process all pending errors in the event that a 138 * panic occurs in the middle of queue processing. 139 * 140 * A fourth list is maintained for nvlist errorqs. The dump list, 141 * eq_dump is used to link all errorq elements that should be stored 142 * in a crash dump file in the event of a system panic. During 143 * errorq_panic(), the list is created and subsequently traversed 144 * in errorq_dump() during the final phases of a crash dump. 
145 * 146 * Platform Considerations 147 * 148 * In order to simplify their implementation, error queues make use of the 149 * C wrappers for compare-and-swap. If the platform itself does not 150 * support compare-and-swap in hardware and the kernel emulation routines 151 * are used instead, then the context in which errorq_dispatch() can be 152 * safely invoked is further constrained by the implementation of the 153 * compare-and-swap emulation. Specifically, if errorq_dispatch() is 154 * called from a code path that can be executed above ATOMIC_LEVEL on such 155 * a platform, the dispatch code could potentially deadlock unless the 156 * corresponding error interrupt is blocked or disabled prior to calling 157 * errorq_dispatch(). Error queues should therefore be deployed with 158 * caution on these platforms. 159 * 160 * Interfaces 161 * 162 * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags); 163 * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags); 164 * 165 * Create a new error queue with the specified name, callback, and 166 * properties. A pointer to the new error queue is returned upon success, 167 * or NULL is returned to indicate that the queue could not be created. 168 * This function must be called from passive kernel context with no locks 169 * held that can prevent a sleeping memory allocation from occurring. 170 * errorq_create() will return failure if the queue kstats cannot be 171 * created, or if a soft interrupt handler cannot be registered. 172 * 173 * The queue 'name' is a string that is recorded for live and post-mortem 174 * examination by a debugger. The queue callback 'func' will be invoked 175 * for each error drained from the queue, and will receive the 'private' 176 * pointer as its first argument. The callback must obey the rules for 177 * callbacks described above. The queue will have maximum length 'qlen' 178 * and each element will be able to record up to 'eltsize' bytes of data. 
179 * The queue's soft interrupt (see errorq_dispatch(), below) will fire 180 * at 'ipl', which should not exceed LOCK_LEVEL. The queue 'flags' may 181 * include the following flag: 182 * 183 * ERRORQ_VITAL - This queue contains information that is considered 184 * vital to problem diagnosis. Error queues that are marked vital will 185 * be automatically drained by the panic subsystem prior to printing 186 * the panic messages to the console. 187 * 188 * void errorq_destroy(errorq); 189 * 190 * Destroy the specified error queue. The queue is drained of any 191 * pending elements and these are logged before errorq_destroy returns. 192 * Once errorq_destroy() begins draining the queue, any simultaneous 193 * calls to dispatch errors will result in the errors being dropped. 194 * The caller must invoke a higher-level abstraction (e.g. disabling 195 * an error interrupt) to ensure that error handling code does not 196 * attempt to dispatch errors to the queue while it is being freed. 197 * 198 * void errorq_dispatch(errorq, data, len, flag); 199 * 200 * Attempt to enqueue the specified error data. If a free queue element 201 * is available, the data is copied into a free element and placed on a 202 * pending list. If no free queue element is available, the error is 203 * dropped. The data length (len) is specified in bytes and should not 204 * exceed the queue's maximum element size. If the data length is less 205 * than the maximum element size, the remainder of the queue element is 206 * filled with zeroes. The flag parameter should be one of: 207 * 208 * ERRORQ_ASYNC - Schedule a soft interrupt at the previously specified 209 * IPL to asynchronously drain the queue on behalf of the caller. 210 * 211 * ERRORQ_SYNC - Do not schedule a soft interrupt to drain the queue. 212 * The caller is presumed to be calling errorq_drain() or panic() in 213 * the near future in order to drain the queue and log the error. 
 *
 * The errorq_dispatch() function may be called from any context, subject
 * to the Platform Considerations described above.
 *
 * void errorq_drain(errorq);
 *
 * Drain the error queue of all pending errors.  The queue's callback
 * function is invoked for each error in order from oldest to newest.
 * This function may be used at or below LOCK_LEVEL or from panic context.
 *
 * errorq_elem_t *errorq_reserve(errorq);
 *
 * Reserve an error queue element for later processing and dispatching.
 * The element is returned to the caller who may add error-specific data
 * to the element.  The element is returned to the free pool when either
 * errorq_commit() is called and the element asynchronously processed
 * or immediately when errorq_cancel() is called.
 *
 * void errorq_commit(errorq, errorq_elem, flag);
 *
 * Commit an errorq element (eqep) for dispatching, see
 * errorq_dispatch().
 *
 * void errorq_cancel(errorq, errorq_elem);
 *
 * Cancel a pending errorq element reservation.  The errorq element is
 * returned to the free pool upon cancellation.
241 */ 242 243 #include <sys/errorq_impl.h> 244 #include <sys/sysmacros.h> 245 #include <sys/machlock.h> 246 #include <sys/cmn_err.h> 247 #include <sys/atomic.h> 248 #include <sys/systm.h> 249 #include <sys/kmem.h> 250 #include <sys/conf.h> 251 #include <sys/ddi.h> 252 #include <sys/sunddi.h> 253 #include <sys/bootconf.h> 254 #include <sys/spl.h> 255 #include <sys/dumphdr.h> 256 #include <sys/compress.h> 257 #include <sys/time.h> 258 #include <sys/panic.h> 259 #include <sys/bitmap.h> 260 #include <sys/fm/protocol.h> 261 #include <sys/fm/util.h> 262 263 static struct errorq_kstat errorq_kstat_template = { 264 { "dispatched", KSTAT_DATA_UINT64 }, 265 { "dropped", KSTAT_DATA_UINT64 }, 266 { "logged", KSTAT_DATA_UINT64 }, 267 { "reserved", KSTAT_DATA_UINT64 }, 268 { "reserve_fail", KSTAT_DATA_UINT64 }, 269 { "committed", KSTAT_DATA_UINT64 }, 270 { "commit_fail", KSTAT_DATA_UINT64 }, 271 { "cancelled", KSTAT_DATA_UINT64 } 272 }; 273 274 static uint64_t errorq_lost = 0; 275 static errorq_t *errorq_list = NULL; 276 static kmutex_t errorq_lock; 277 static uint64_t errorq_vitalmin = 5; 278 279 static uint_t 280 errorq_intr(caddr_t eqp) 281 { 282 errorq_drain((errorq_t *)eqp); 283 return (DDI_INTR_CLAIMED); 284 } 285 286 /* 287 * Create a new error queue with the specified properties and add a software 288 * interrupt handler and kstat for it. This function must be called from 289 * passive kernel context with no locks held that can prevent a sleeping 290 * memory allocation from occurring. This function will return NULL if the 291 * softint or kstat for this queue cannot be created. 
292 */ 293 errorq_t * 294 errorq_create(const char *name, errorq_func_t func, void *private, 295 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 296 { 297 errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP); 298 ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl); 299 dev_info_t *dip = ddi_root_node(); 300 301 errorq_elem_t *eep; 302 ddi_softintr_t id = NULL; 303 caddr_t data; 304 305 ASSERT(qlen != 0 && size != 0); 306 ASSERT(ipl > 0 && ipl <= LOCK_LEVEL); 307 308 /* 309 * If a queue is created very early in boot before device tree services 310 * are available, the queue softint handler cannot be created. We 311 * manually drain these queues and create their softint handlers when 312 * it is safe to do so as part of errorq_init(), below. 313 */ 314 if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, 315 &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 316 cmn_err(CE_WARN, "errorq_create: failed to register " 317 "IPL %u softint for queue %s", ipl, name); 318 kmem_free(eqp, sizeof (errorq_t)); 319 return (NULL); 320 } 321 322 if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq", 323 KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) / 324 sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) { 325 cmn_err(CE_WARN, "errorq_create: failed to create kstat " 326 "for queue %s", name); 327 if (id != NULL) 328 ddi_remove_softintr(id); 329 kmem_free(eqp, sizeof (errorq_t)); 330 return (NULL); 331 } 332 333 bcopy(&errorq_kstat_template, &eqp->eq_kstat, 334 sizeof (struct errorq_kstat)); 335 eqp->eq_ksp->ks_data = &eqp->eq_kstat; 336 eqp->eq_ksp->ks_private = eqp; 337 kstat_install(eqp->eq_ksp); 338 339 (void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN); 340 eqp->eq_name[ERRORQ_NAMELEN] = '\0'; 341 eqp->eq_func = func; 342 eqp->eq_private = private; 343 eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP); 344 eqp->eq_qlen = qlen; 345 eqp->eq_size = size; 346 eqp->eq_ipl = ipl; 347 eqp->eq_flags = flags | ERRORQ_ACTIVE; 348 
eqp->eq_id = id; 349 mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL); 350 eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP); 351 eqp->eq_phead = NULL; 352 eqp->eq_ptail = NULL; 353 eqp->eq_pend = NULL; 354 eqp->eq_dump = NULL; 355 eqp->eq_bitmap = kmem_zalloc(BT_SIZEOFMAP(qlen), KM_SLEEP); 356 eqp->eq_rotor = 0; 357 358 /* 359 * Iterate over the array of errorq_elem_t structures and set its 360 * data pointer. 361 */ 362 for (eep = eqp->eq_elems, data = eqp->eq_data; qlen > 1; qlen--) { 363 eep->eqe_next = NULL; 364 eep->eqe_dump = NULL; 365 eep->eqe_prev = NULL; 366 eep->eqe_data = data; 367 data += size; 368 eep++; 369 } 370 eep->eqe_next = NULL; 371 eep->eqe_prev = NULL; 372 eep->eqe_data = data; 373 eep->eqe_dump = NULL; 374 375 /* 376 * Once the errorq is initialized, add it to the global list of queues, 377 * and then return a pointer to the new queue to the caller. 378 */ 379 mutex_enter(&errorq_lock); 380 eqp->eq_next = errorq_list; 381 errorq_list = eqp; 382 mutex_exit(&errorq_lock); 383 384 return (eqp); 385 } 386 387 /* 388 * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST 389 * flag and initialize each element to have the start of its data region used 390 * as an errorq_nvelem_t with a nvlist allocator that consumes the data region. 
391 */ 392 errorq_t * 393 errorq_nvcreate(const char *name, errorq_func_t func, void *private, 394 ulong_t qlen, size_t size, uint_t ipl, uint_t flags) 395 { 396 errorq_t *eqp; 397 errorq_elem_t *eep; 398 399 eqp = errorq_create(name, func, private, qlen, 400 size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST); 401 402 if (eqp == NULL) 403 return (NULL); 404 405 mutex_enter(&eqp->eq_lock); 406 407 for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) { 408 errorq_nvelem_t *eqnp = eep->eqe_data; 409 eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t); 410 eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size); 411 } 412 413 mutex_exit(&eqp->eq_lock); 414 return (eqp); 415 } 416 417 /* 418 * To destroy an error queue, we mark it as disabled and then explicitly drain 419 * all pending errors. Once the drain is complete, we can remove the queue 420 * from the global list of queues examined by errorq_panic(), and then free 421 * the various queue data structures. The caller must use some higher-level 422 * abstraction (e.g. disabling an error interrupt) to ensure that no one will 423 * attempt to enqueue new errors while we are freeing this queue. 
424 */ 425 void 426 errorq_destroy(errorq_t *eqp) 427 { 428 errorq_t *p, **pp; 429 errorq_elem_t *eep; 430 ulong_t i; 431 432 ASSERT(eqp != NULL); 433 eqp->eq_flags &= ~ERRORQ_ACTIVE; 434 errorq_drain(eqp); 435 436 mutex_enter(&errorq_lock); 437 pp = &errorq_list; 438 439 for (p = errorq_list; p != NULL; p = p->eq_next) { 440 if (p == eqp) { 441 *pp = p->eq_next; 442 break; 443 } 444 pp = &p->eq_next; 445 } 446 447 mutex_exit(&errorq_lock); 448 ASSERT(p != NULL); 449 450 if (eqp->eq_flags & ERRORQ_NVLIST) { 451 for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) { 452 errorq_nvelem_t *eqnp = eep->eqe_data; 453 fm_nva_xdestroy(eqnp->eqn_nva); 454 } 455 } 456 457 mutex_destroy(&eqp->eq_lock); 458 kstat_delete(eqp->eq_ksp); 459 460 if (eqp->eq_id != NULL) 461 ddi_remove_softintr(eqp->eq_id); 462 463 kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t)); 464 kmem_free(eqp->eq_bitmap, BT_SIZEOFMAP(eqp->eq_qlen)); 465 kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size); 466 467 kmem_free(eqp, sizeof (errorq_t)); 468 } 469 470 /* 471 * private version of bt_availbit which makes a best-efforts attempt 472 * at allocating in a round-robin fashion in order to facilitate post-mortem 473 * diagnosis. 474 */ 475 static index_t 476 errorq_availbit(ulong_t *bitmap, size_t nbits, index_t curindex) 477 { 478 ulong_t bit, maxbit, bx; 479 index_t rval, nextindex = curindex + 1; 480 index_t nextword = nextindex >> BT_ULSHIFT; 481 ulong_t nextbitindex = nextindex & BT_ULMASK; 482 index_t maxindex = nbits - 1; 483 index_t maxword = maxindex >> BT_ULSHIFT; 484 ulong_t maxbitindex = maxindex & BT_ULMASK; 485 486 /* 487 * First check if there are still some bits remaining in the current 488 * word, and see if any of those are available. We need to do this by 489 * hand as the bt_availbit() function always starts at the beginning 490 * of a word. 491 */ 492 if (nextindex <= maxindex && nextbitindex != 0) { 493 maxbit = (nextword == maxword) ? 
maxbitindex : BT_ULMASK; 494 for (bx = 0, bit = 1; bx <= maxbit; bx++, bit <<= 1) 495 if (bx >= nextbitindex && !(bitmap[nextword] & bit)) 496 return ((nextword << BT_ULSHIFT) + bx); 497 nextword++; 498 } 499 /* 500 * Now check if there are any words remaining before the end of the 501 * bitmap. Use bt_availbit() to find any free bits. 502 */ 503 if (nextword <= maxword) 504 if ((rval = bt_availbit(&bitmap[nextword], 505 nbits - (nextword << BT_ULSHIFT))) != -1) 506 return ((nextword << BT_ULSHIFT) + rval); 507 /* 508 * Finally loop back to the start and look for any free bits starting 509 * from the beginning of the bitmap to the current rotor position. 510 */ 511 return (bt_availbit(bitmap, nextindex)); 512 } 513 514 /* 515 * Dispatch a new error into the queue for later processing. The specified 516 * data buffer is copied into a preallocated queue element. If 'len' is 517 * smaller than the queue element size, the remainder of the queue element is 518 * filled with zeroes. This function may be called from any context subject 519 * to the Platform Considerations described above. 
520 */ 521 void 522 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag) 523 { 524 errorq_elem_t *eep, *old; 525 526 if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) { 527 atomic_inc_64(&errorq_lost); 528 return; /* drop error if queue is uninitialized or disabled */ 529 } 530 531 for (;;) { 532 int i, rval; 533 534 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen, 535 eqp->eq_rotor)) == -1) { 536 atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64); 537 return; 538 } 539 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval); 540 if (rval == 0) { 541 eqp->eq_rotor = i; 542 eep = &eqp->eq_elems[i]; 543 break; 544 } 545 } 546 547 ASSERT(len <= eqp->eq_size); 548 bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len)); 549 550 if (len < eqp->eq_size) 551 bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len); 552 553 for (;;) { 554 old = eqp->eq_pend; 555 eep->eqe_prev = old; 556 membar_producer(); 557 558 if (atomic_cas_ptr(&eqp->eq_pend, old, eep) == old) 559 break; 560 } 561 562 atomic_inc_64(&eqp->eq_kstat.eqk_dispatched.value.ui64); 563 564 if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL) 565 ddi_trigger_softintr(eqp->eq_id); 566 } 567 568 /* 569 * Drain the specified error queue by calling eq_func() for each pending error. 570 * This function must be called at or below LOCK_LEVEL or from panic context. 571 * In order to synchronize with other attempts to drain the queue, we acquire 572 * the adaptive eq_lock, blocking other consumers. Once this lock is held, 573 * we must use compare-and-swap to move the pending list to the processing 574 * list and to return elements to the free pool in order to synchronize 575 * with producers, who do not acquire any locks and only use atomic set/clear. 576 * 577 * An additional constraint on this function is that if the system panics 578 * while this function is running, the panic code must be able to detect and 579 * handle all intermediate states and correctly dequeue all errors. 
The 580 * errorq_panic() function below will be used for detecting and handling 581 * these intermediate states. The comments in errorq_drain() below explain 582 * how we make sure each intermediate state is distinct and consistent. 583 */ 584 void 585 errorq_drain(errorq_t *eqp) 586 { 587 errorq_elem_t *eep, *dep; 588 589 ASSERT(eqp != NULL); 590 mutex_enter(&eqp->eq_lock); 591 592 /* 593 * If there are one or more pending errors, set eq_ptail to point to 594 * the first element on the pending list and then attempt to compare- 595 * and-swap NULL to the pending list. We use membar_producer() to 596 * make sure that eq_ptail will be visible to errorq_panic() below 597 * before the pending list is NULLed out. This section is labeled 598 * case (1) for errorq_panic, below. If eq_ptail is not yet set (1A) 599 * eq_pend has all the pending errors. If atomic_cas_ptr fails or 600 * has not been called yet (1B), eq_pend still has all the pending 601 * errors. If atomic_cas_ptr succeeds (1C), eq_ptail has all the 602 * pending errors. 603 */ 604 while ((eep = eqp->eq_pend) != NULL) { 605 eqp->eq_ptail = eep; 606 membar_producer(); 607 608 if (atomic_cas_ptr(&eqp->eq_pend, eep, NULL) == eep) 609 break; 610 } 611 612 /* 613 * If no errors were pending, assert that eq_ptail is set to NULL, 614 * drop the consumer lock, and return without doing anything. 615 */ 616 if (eep == NULL) { 617 ASSERT(eqp->eq_ptail == NULL); 618 mutex_exit(&eqp->eq_lock); 619 return; 620 } 621 622 /* 623 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the 624 * oldest error, setting the eqe_next pointer so that we can iterate 625 * over the errors from oldest to newest. We use membar_producer() 626 * to make sure that these stores are visible before we set eq_phead. 627 * If we panic before, during, or just after this loop (case 2), 628 * errorq_panic() will simply redo this work, as described below. 
629 */ 630 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev) 631 eep->eqe_prev->eqe_next = eep; 632 membar_producer(); 633 634 /* 635 * Now set eq_phead to the head of the processing list (the oldest 636 * error) and issue another membar_producer() to make sure that 637 * eq_phead is seen as non-NULL before we clear eq_ptail. If we panic 638 * after eq_phead is set (case 3), we will detect and log these errors 639 * in errorq_panic(), as described below. 640 */ 641 eqp->eq_phead = eep; 642 membar_producer(); 643 644 eqp->eq_ptail = NULL; 645 membar_producer(); 646 647 /* 648 * If we enter from errorq_panic_drain(), we may already have 649 * errorq elements on the dump list. Find the tail of 650 * the list ready for append. 651 */ 652 if (panicstr && (dep = eqp->eq_dump) != NULL) { 653 while (dep->eqe_dump != NULL) 654 dep = dep->eqe_dump; 655 } 656 657 /* 658 * Now iterate over the processing list from oldest (eq_phead) to 659 * newest and log each error. Once an error is logged, we use 660 * atomic clear to return it to the free pool. If we panic before, 661 * during, or after calling eq_func() (case 4), the error will still be 662 * found on eq_phead and will be logged in errorq_panic below. 663 */ 664 665 while ((eep = eqp->eq_phead) != NULL) { 666 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 667 eqp->eq_kstat.eqk_logged.value.ui64++; 668 669 eqp->eq_phead = eep->eqe_next; 670 membar_producer(); 671 672 eep->eqe_next = NULL; 673 674 /* 675 * On panic, we add the element to the dump list for each 676 * nvlist errorq. Elements are stored oldest to newest. 677 * Then continue, so we don't free and subsequently overwrite 678 * any elements which we've put on the dump queue. 
679 */ 680 if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) { 681 if (eqp->eq_dump == NULL) 682 dep = eqp->eq_dump = eep; 683 else 684 dep = dep->eqe_dump = eep; 685 membar_producer(); 686 continue; 687 } 688 689 eep->eqe_prev = NULL; 690 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 691 } 692 693 mutex_exit(&eqp->eq_lock); 694 } 695 696 /* 697 * Now that device tree services are available, set up the soft interrupt 698 * handlers for any queues that were created early in boot. We then 699 * manually drain these queues to report any pending early errors. 700 */ 701 void 702 errorq_init(void) 703 { 704 dev_info_t *dip = ddi_root_node(); 705 ddi_softintr_t id; 706 errorq_t *eqp; 707 708 ASSERT(modrootloaded != 0); 709 ASSERT(dip != NULL); 710 711 mutex_enter(&errorq_lock); 712 713 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 714 ddi_iblock_cookie_t ibc = 715 (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl); 716 717 if (eqp->eq_id != NULL) 718 continue; /* softint already initialized */ 719 720 if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL, 721 errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) { 722 panic("errorq_init: failed to register IPL %u softint " 723 "for queue %s", eqp->eq_ipl, eqp->eq_name); 724 } 725 726 eqp->eq_id = id; 727 errorq_drain(eqp); 728 } 729 730 mutex_exit(&errorq_lock); 731 } 732 733 /* 734 * This function is designed to be called from panic context only, and 735 * therefore does not need to acquire errorq_lock when iterating over 736 * errorq_list. This function must be called no more than once for each 737 * 'what' value (if you change this then review the manipulation of 'dep'. 
738 */ 739 static uint64_t 740 errorq_panic_drain(uint_t what) 741 { 742 errorq_elem_t *eep, *nep, *dep; 743 errorq_t *eqp; 744 uint64_t loggedtmp; 745 uint64_t logged = 0; 746 747 for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) { 748 if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what) 749 continue; /* do not drain this queue on this pass */ 750 751 loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64; 752 753 /* 754 * In case (1B) above, eq_ptail may be set but the 755 * atomic_cas_ptr may not have been executed yet or may have 756 * failed. Either way, we must log errors in chronological 757 * order. So we search the pending list for the error 758 * pointed to by eq_ptail. If it is found, we know that all 759 * subsequent errors are also still on the pending list, so 760 * just NULL out eq_ptail and let errorq_drain(), below, 761 * take care of the logging. 762 */ 763 for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) { 764 if (eep == eqp->eq_ptail) { 765 ASSERT(eqp->eq_phead == NULL); 766 eqp->eq_ptail = NULL; 767 break; 768 } 769 } 770 771 /* 772 * In cases (1C) and (2) above, eq_ptail will be set to the 773 * newest error on the processing list but eq_phead will still 774 * be NULL. We set the eqe_next pointers so we can iterate 775 * over the processing list in order from oldest error to the 776 * newest error. We then set eq_phead to point to the oldest 777 * error and fall into the for-loop below. 778 */ 779 if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) { 780 for (eep->eqe_next = NULL; eep->eqe_prev != NULL; 781 eep = eep->eqe_prev) 782 eep->eqe_prev->eqe_next = eep; 783 784 eqp->eq_phead = eep; 785 eqp->eq_ptail = NULL; 786 } 787 788 /* 789 * In cases (3) and (4) above (or after case (1C/2) handling), 790 * eq_phead will be set to the oldest error on the processing 791 * list. We log each error and return it to the free pool. 
792 * 793 * Unlike errorq_drain(), we don't need to worry about updating 794 * eq_phead because errorq_panic() will be called at most once. 795 * However, we must use atomic_cas_ptr to update the 796 * freelist in case errors are still being enqueued during 797 * panic. 798 */ 799 for (eep = eqp->eq_phead; eep != NULL; eep = nep) { 800 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep); 801 eqp->eq_kstat.eqk_logged.value.ui64++; 802 803 nep = eep->eqe_next; 804 eep->eqe_next = NULL; 805 806 /* 807 * On panic, we add the element to the dump list for 808 * each nvlist errorq, stored oldest to newest. Then 809 * continue, so we don't free and subsequently overwrite 810 * any elements which we've put on the dump queue. 811 */ 812 if (eqp->eq_flags & ERRORQ_NVLIST) { 813 if (eqp->eq_dump == NULL) 814 dep = eqp->eq_dump = eep; 815 else 816 dep = dep->eqe_dump = eep; 817 membar_producer(); 818 continue; 819 } 820 821 eep->eqe_prev = NULL; 822 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems); 823 } 824 825 /* 826 * Now go ahead and drain any other errors on the pending list. 827 * This call transparently handles case (1A) above, as well as 828 * any other errors that were dispatched after errorq_drain() 829 * completed its first compare-and-swap. 830 */ 831 errorq_drain(eqp); 832 833 logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp; 834 } 835 return (logged); 836 } 837 838 /* 839 * Drain all error queues - called only from panic context. Some drain 840 * functions may enqueue errors to ERRORQ_NVLIST error queues so that 841 * they may be written out in the panic dump - so ERRORQ_NVLIST queues 842 * must be drained last. Drain ERRORQ_VITAL queues before nonvital queues 843 * so that vital errors get to fill the ERRORQ_NVLIST queues first, and 844 * do not drain the nonvital queues if there are many vital errors. 
 */
void
errorq_panic(void)
{
	ASSERT(panicstr != NULL);

	/*
	 * Pass 1: vital queues.  If they logged only a few errors
	 * (<= errorq_vitalmin), pass 2 drains the nonvital queues too.
	 * Passes 3 and 4 drain the nvlist queues last (vital first),
	 * after the earlier drain callbacks have had a chance to fill
	 * them for the panic dump.
	 */
	if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
		(void) errorq_panic_drain(0);
	(void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
	(void) errorq_panic_drain(ERRORQ_NVLIST);
}

/*
 * Reserve an error queue element for later processing and dispatching.  The
 * element is returned to the caller who may add error-specific data to
 * element.  The element is returned to the free pool when either
 * errorq_commit() is called and the element asynchronously processed
 * or immediately when errorq_cancel() is called.
 */
errorq_elem_t *
errorq_reserve(errorq_t *eqp)
{
	errorq_elem_t *eqep;

	/* a missing or inactive queue cannot accept reservations */
	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
		atomic_inc_64(&errorq_lost);
		return (NULL);
	}

	/*
	 * Claim a free element by atomically setting its bit in eq_bitmap.
	 * errorq_availbit() proposes a candidate starting at eq_rotor; if
	 * another CPU wins the race for that bit (rval != 0), retry.  If
	 * no bit is available, the error is counted as dropped.
	 */
	for (;;) {
		int i, rval;

		if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
		    eqp->eq_rotor)) == -1) {
			atomic_inc_64(&eqp->eq_kstat.eqk_dropped.value.ui64);
			return (NULL);
		}
		BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
		if (rval == 0) {
			/* remember where to begin the next search */
			eqp->eq_rotor = i;
			eqep = &eqp->eq_elems[i];
			break;
		}
	}

	/*
	 * For nvlist queues, reset the element's nv_alloc handle and hand
	 * the caller a freshly created nvlist to fill in.
	 */
	if (eqp->eq_flags & ERRORQ_NVLIST) {
		errorq_nvelem_t *eqnp = eqep->eqe_data;
		nv_alloc_reset(eqnp->eqn_nva);
		eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
	}

	atomic_inc_64(&eqp->eq_kstat.eqk_reserved.value.ui64);
	return (eqep);
}

/*
 * Commit an errorq element (eqep) for dispatching.
 * This function may be called from any context subject
 * to the Platform Considerations described above.
 */
void
errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
{
	errorq_elem_t *old;

	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
		atomic_inc_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64);
		return;
	}

	/*
	 * Lock-free push onto the head of the pending list: link the new
	 * element to the current head, make that link visible to other
	 * CPUs (membar_producer) before the element is published, then
	 * swing eq_pend with a compare-and-swap, retrying if another
	 * committer got there first.
	 */
	for (;;) {
		old = eqp->eq_pend;
		eqep->eqe_prev = old;
		membar_producer();

		if (atomic_cas_ptr(&eqp->eq_pend, old, eqep) == old)
			break;
	}

	atomic_inc_64(&eqp->eq_kstat.eqk_committed.value.ui64);

	/* for asynchronous dispatch, trigger the queue's soft interrupt */
	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
		ddi_trigger_softintr(eqp->eq_id);
}

/*
 * Cancel an errorq element reservation by returning the specified element
 * to the free pool.  Duplicate or invalid frees are not supported.
 */
void
errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
{
	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
		return;

	/* atomically clear the element's bit, returning it to the pool */
	BT_ATOMIC_CLEAR(eqp->eq_bitmap, eqep - eqp->eq_elems);

	atomic_inc_64(&eqp->eq_kstat.eqk_cancelled.value.ui64);
}

/*
 * Write elements on the dump list of each nvlist errorq to the dump device.
 * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
 */
void
errorq_dump(void)
{
	errorq_elem_t *eep;
	errorq_t *eqp;

	if (ereport_dumpbuf == NULL)
		return; /* reboot or panic before errorq is even set up */

	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
		if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
		    !(eqp->eq_flags & ERRORQ_ACTIVE))
			continue; /* do not dump this queue on panic */

		/* dump list is ordered oldest to newest; see panic drain */
		for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
			errorq_nvelem_t *eqnp = eep->eqe_data;
			size_t len = 0;
			erpt_dump_t ed;
			int err;

			(void) nvlist_size(eqnp->eqn_nvl,
			    &len, NV_ENCODE_NATIVE);

			/* skip reports that are empty or exceed the buffer */
			if (len > ereport_dumplen || len == 0) {
				cmn_err(CE_WARN, "%s: unable to save error "
				    "report %p due to size %lu\n",
				    eqp->eq_name, (void *)eep, len);
				continue;
			}

			if ((err = nvlist_pack(eqnp->eqn_nvl,
			    (char **)&ereport_dumpbuf, &ereport_dumplen,
			    NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
				cmn_err(CE_WARN, "%s: unable to save error "
				    "report %p due to pack error %d\n",
				    eqp->eq_name, (void *)eep, err);
				continue;
			}

			/*
			 * Build the dump record header: the magic number and
			 * checksum let the record be validated on extraction;
			 * the time bases are taken from the panic snapshot.
			 */
			ed.ed_magic = ERPT_MAGIC;
			ed.ed_chksum = checksum32(ereport_dumpbuf, len);
			ed.ed_size = (uint32_t)len;
			ed.ed_pad = 0;
			ed.ed_hrt_nsec = 0;
			ed.ed_hrt_base = panic_hrtime;
			ed.ed_tod_base.sec = panic_hrestime.tv_sec;
			ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;

			/* write the header, then the packed nvlist payload */
			dumpvp_write(&ed, sizeof (ed));
			dumpvp_write(ereport_dumpbuf, len);
		}
	}
}

/*
 * Return the nvlist associated with a reserved element; valid only for
 * active queues created with errorq_nvcreate().
 */
nvlist_t *
errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
{
	errorq_nvelem_t *eqnp = eqep->eqe_data;

	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);

	return (eqnp->eqn_nvl);
}

/*
 * Return the nv_alloc handle associated with a reserved element; valid
 * only for active queues created with errorq_nvcreate().
 */
nv_alloc_t *
errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
{
	errorq_nvelem_t *eqnp = eqep->eqe_data;

	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);

	return (eqnp->eqn_nva);
}

/*
 * Reserve a new element and duplicate the data of the original into it.
 * Returns a pointer to the new element's data buffer (the new element
 * itself is returned through neqep), or NULL if no free elements are
 * available.  Not supported for nvlist queues.
 */
void *
errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
{
	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
	ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));

	if ((*neqep = errorq_reserve(eqp)) == NULL)
		return (NULL);

	/* copy the fixed-size (eq_size) data buffer into the new element */
	bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
	return ((*neqep)->eqe_data);
}