/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/callo.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/kmem_impl.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/sysmacros.h>
#include <sys/sdt.h>

/*
 * Callout tables. See timeout(9F) for details.
 */
static int callout_threads;			/* callout normal threads */
static hrtime_t callout_debug_hrtime;		/* debugger entry time */
static int callout_min_reap;			/* callout minimum reap count */
static int callout_tolerance;			/* callout hires tolerance */
static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
static clock_t callout_max_ticks;		/* max interval */
static hrtime_t callout_longterm;		/* longterm nanoseconds */
static ulong_t callout_counter_low;		/* callout ID increment */
static ulong_t callout_table_bits;		/* number of table bits in ID */
static ulong_t callout_table_mask;		/* mask for the table bits */
static callout_cache_t *callout_caches;		/* linked list of caches */
#pragma align 64(callout_table)
static callout_table_t *callout_table;		/* global callout table array */

/*
 * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
 * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
 * via taskq, to a thread that executes at PIL 0 - so we end up running
 * 'normal' callouts at PIL 0.
 */
static volatile int callout_realtime_level = CY_LOW_LEVEL;
static volatile int callout_normal_level = CY_LOCK_LEVEL;

static char *callout_kstat_names[] = {
	"callout_timeouts",
	"callout_timeouts_pending",
	"callout_untimeouts_unexpired",
	"callout_untimeouts_executing",
	"callout_untimeouts_expired",
	"callout_expirations",
	"callout_allocations",
	"callout_cleanups",
};

static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);

#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cprev = NULL;				\
	cp->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = cp;			\
	else						\
		cp->cnext->cprev = cp;			\
	hashp->ch_head = cp;				\
}

#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cnext = NULL;				\
	cp->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = cp;			\
	else						\
		cp->cprev->cnext = cp;			\
	hashp->ch_tail = cp;				\
}

#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if (cp->cnext == NULL)				\
		hashp->ch_tail = cp->cprev;		\
	else						\
		cp->cnext->cprev = cp->cprev;		\
	if (cp->cprev == NULL)				\
		hashp->ch_head = cp->cnext;		\
	else						\
		cp->cprev->cnext = cp->cnext;		\
}

/*
 * These definitions help us queue callouts and callout lists. Here is
 * the queueing rationale:
 *
 *	- callouts are queued in a FIFO manner in the ID hash table.
 *	  TCP timers are typically cancelled in the same order that they
 *	  were issued. The FIFO queueing shortens the search for a callout
 *	  during untimeout().
 *
 *	- callouts are queued in a FIFO manner in their callout lists.
 *	  This ensures that the callouts are executed in the same order that
 *	  they were queued. This is fair. Plus, it helps to make each
 *	  callout expiration timely. It also favors cancellations.
 *
 *	- callout lists are queued in the following manner in the callout
 *	  hash table buckets:
 *
 *		- appended, if the callout list is a 1-nanosecond resolution
 *		  callout list. When a callout is created, we first look for
 *		  a callout list that has the same expiration so we can avoid
 *		  allocating a callout list and inserting the expiration into
 *		  the heap. However, we do not want to look at 1-nanosecond
 *		  resolution callout lists as we will seldom find a match in
 *		  them. Keeping these callout lists in the rear of the hash
 *		  buckets allows us to skip these during the lookup.
 *
 *		- inserted at the beginning, if the callout list is not a
 *		  1-nanosecond resolution callout list. This also has the
 *		  side-effect of keeping the long term timers away from the
 *		  front of the buckets.
 *
 *	- callout lists are queued in a FIFO manner in the expired callouts
 *	  list. This ensures that callout lists are executed in the order
 *	  of expiration.
 */
#define	CALLOUT_APPEND(ct, cp)						\
	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
		cp, c_idnext, c_idprev);				\
	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)

#define	CALLOUT_DELETE(ct, cp)						\
	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
		cp, c_idnext, c_idprev);				\
	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)

#define	CALLOUT_LIST_INSERT(hash, cl)				\
	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_APPEND(hash, cl)				\
	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_DELETE(hash, cl)				\
	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)

/*
 * For normal callouts, there is a deadlock scenario if two callouts that
 * have an inter-dependency end up on the same callout list. To break the
 * deadlock, you need two taskq threads running in parallel. We compute
 * the number of taskq threads here using a bunch of conditions to make
 * it optimal for the common case. This is an ugly hack, but one that is
 * necessary (sigh).
 */
#define	CALLOUT_THRESHOLD	100000000
#define	CALLOUT_EXEC_COMPUTE(ct, exec)					\
{									\
	callout_list_t *cl;						\
									\
	cl = ct->ct_expired.ch_head;					\
	if (cl == NULL) {						\
		/*							\
		 * If the expired list is NULL, there is nothing to	\
		 * process.						\
		 */							\
		exec = 0;						\
	} else if ((cl->cl_next == NULL) &&				\
	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
		/*							\
		 * If there is only one callout list and it contains	\
		 * only one callout, there is no need for two threads.	\
		 */							\
		exec = 1;						\
	} else if ((ct->ct_heap_num == 0) ||				\
	    (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
		/*							\
		 * If the heap has become empty, we need two threads as	\
		 * there is no one to kick off the second thread in the	\
		 * future. If the heap is not empty and the top of the	\
		 * heap does not expire in the near future, we need two	\
		 * threads.						\
		 */							\
		exec = 2;						\
	} else {							\
		/*							\
		 * We have multiple callouts to process. But the cyclic	\
		 * will fire in the near future. So, we only need one	\
		 * thread for now.					\
		 */							\
		exec = 1;						\
	}								\
}

/*
 * Macro to swap two heap items.
 */
#define	CALLOUT_SWAP(h1, h2)		\
{					\
	callout_heap_t tmp;		\
					\
	tmp = *h1;			\
	*h1 = *h2;			\
	*h2 = tmp;			\
}

/*
 * Macro to free a callout list.
 */
#define	CALLOUT_LIST_FREE(ct, cl)			\
{							\
	cl->cl_next = ct->ct_lfree;			\
	ct->ct_lfree = cl;				\
	cl->cl_flags |= CALLOUT_LIST_FLAG_FREE;		\
}

/*
 * Allocate a callout structure. We try quite hard because we
 * can't sleep, and if we can't do the allocation, we're toast.
 * Failing all, we try a KM_PANIC allocation. Note that we never
 * deallocate a callout. See untimeout() for the reasoning.
 */
static callout_t *
callout_alloc(callout_table_t *ct)
{
	size_t size;
	callout_t *cp;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	mutex_exit(&ct->ct_mutex);

	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
	if (cp == NULL) {
		size = sizeof (callout_t);
		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
	}
	cp->c_xid = 0;
	cp->c_executor = NULL;
	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
	cp->c_waiting = 0;

	mutex_enter(&ct->ct_mutex);
	ct->ct_allocations++;
	return (cp);
}

/*
 * Allocate a callout list structure. We try quite hard because we
 * can't sleep, and if we can't do the allocation, we're toast.
 * Failing all, we try a KM_PANIC allocation. Note that we never
 * deallocate a callout list.
 */
static void
callout_list_alloc(callout_table_t *ct)
{
	size_t size;
	callout_list_t *cl;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	mutex_exit(&ct->ct_mutex);

	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
	if (cl == NULL) {
		size = sizeof (callout_list_t);
		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
	}
	bzero(cl, sizeof (callout_list_t));

	mutex_enter(&ct->ct_mutex);
	CALLOUT_LIST_FREE(ct, cl);
}

/*
 * Find a callout list that corresponds to an expiration and matching flags.
 */
static callout_list_t *
callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
{
	callout_list_t *cl;
	int clflags;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	if (flags & CALLOUT_LIST_FLAG_NANO) {
		/*
		 * This is a 1-nanosecond resolution callout. We will rarely
		 * find a match for this. So, bail out.
		 */
		return (NULL);
	}

	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
		/*
		 * If we have reached a 1-nanosecond resolution callout list,
		 * we don't have much hope of finding a match in this hash
		 * bucket. So, just bail out.
		 */
		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
			return (NULL);

		if ((cl->cl_expiration == expiration) &&
		    ((cl->cl_flags & clflags) == (flags & clflags)))
			return (cl);
	}

	return (NULL);
}

/*
 * Initialize a callout table's heap, if necessary. Preallocate some free
 * entries so we don't have to check for NULL elsewhere.
 */
static void
callout_heap_init(callout_table_t *ct)
{
	size_t size;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_heap == NULL);

	ct->ct_heap_num = 0;
	ct->ct_heap_max = CALLOUT_CHUNK;
	size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
}

/*
 * Reallocate the heap. We try quite hard because we can't sleep, and if
 * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
 * allocation. Note that the heap only expands, it never contracts.
 */
static void
callout_heap_expand(callout_table_t *ct)
{
	size_t max, size, osize;
	callout_heap_t *heap;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);

	while (ct->ct_heap_num == ct->ct_heap_max) {
		max = ct->ct_heap_max;
		mutex_exit(&ct->ct_mutex);

		osize = sizeof (callout_heap_t) * max;
		size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);

		mutex_enter(&ct->ct_mutex);
		if (max < ct->ct_heap_max) {
			/*
			 * Someone beat us to the allocation. Free what we
			 * just allocated and proceed.
			 */
			kmem_free(heap, size);
			continue;
		}

		bcopy(ct->ct_heap, heap, osize);
		kmem_free(ct->ct_heap, osize);
		ct->ct_heap = heap;
		ct->ct_heap_max = size / sizeof (callout_heap_t);
	}
}
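
/*
 * Editor's note (assumption, not part of the original source): the
 * expiration heap manipulated below is a standard 0-based binary min-heap
 * keyed on ch_expiration, with the earliest expiration at index 0. The
 * index macros CALLOUT_HEAP_PARENT(), CALLOUT_HEAP_LEFT() and
 * CALLOUT_HEAP_RIGHT() (from <sys/callo.h>) are assumed to implement the
 * conventional arithmetic:
 *
 *	parent(i) = (i - 1) / 2
 *	left(i) = 2 * i + 1
 *	right(i) = 2 * i + 2
 *
 * callout_upheap() and callout_downheap() rely only on these parent/child
 * relationships.
 */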

/*
 * Move an expiration from the bottom of the heap to its correct place
 * in the heap. If we reached the root doing this, return 1. Else,
 * return 0.
 */
static int
callout_upheap(callout_table_t *ct)
{
	int current, parent;
	callout_heap_t *heap, *hcurrent, *hparent;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_heap_num >= 1);

	if (ct->ct_heap_num == 1) {
		return (1);
	}

	heap = ct->ct_heap;
	current = ct->ct_heap_num - 1;

	for (;;) {
		parent = CALLOUT_HEAP_PARENT(current);
		hparent = &heap[parent];
		hcurrent = &heap[current];

		/*
		 * We have an expiration later than our parent; we're done.
		 */
		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
			return (0);
		}

		/*
		 * We need to swap with our parent, and continue up the heap.
		 */
		CALLOUT_SWAP(hparent, hcurrent);

		/*
		 * If we just reached the root, we're done.
		 */
		if (parent == 0) {
			return (1);
		}

		current = parent;
	}
	/*NOTREACHED*/
}

/*
 * Insert a new heap item into a callout table's heap.
 */
static void
callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
{
	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_heap_num < ct->ct_heap_max);

	/*
	 * First, copy the expiration and callout list pointer to the bottom
	 * of the heap.
	 */
	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
	ct->ct_heap_num++;

	/*
	 * Now, perform an upheap operation. If we reached the root, then
	 * the cyclic needs to be reprogrammed as we have an earlier
	 * expiration.
	 *
	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
	 * We don't want any callout activity. When the CPR resume phase is
	 * entered, the cyclic will be programmed for the earliest expiration
	 * in the heap.
	 */
	if (callout_upheap(ct) && (ct->ct_suspend == 0))
		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
}

/*
 * Move an expiration from the top of the heap to its correct place
 * in the heap.
 */
static void
callout_downheap(callout_table_t *ct)
{
	int current, left, right, nelems;
	callout_heap_t *heap, *hleft, *hright, *hcurrent;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_heap_num >= 1);

	heap = ct->ct_heap;
	current = 0;
	nelems = ct->ct_heap_num;

	for (;;) {
		/*
		 * If we don't have a left child (i.e., we're a leaf), we're
		 * done.
		 */
		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
			return;

		hleft = &heap[left];
		hcurrent = &heap[current];

		right = CALLOUT_HEAP_RIGHT(current);

		/*
		 * Even if we don't have a right child, we still need to compare
		 * our expiration against that of our left child.
		 */
		if (right >= nelems)
			goto comp_left;

		hright = &heap[right];

		/*
		 * We have both a left and a right child. We need to compare
		 * the expiration of the children to determine which
		 * expires earlier.
		 */
		if (hright->ch_expiration < hleft->ch_expiration) {
			/*
			 * Our right child is the earlier of our children.
			 * We'll now compare our expiration to its expiration.
			 * If ours is the earlier one, we're done.
			 */
			if (hcurrent->ch_expiration <= hright->ch_expiration)
				return;

			/*
			 * Our right child expires earlier than we do; swap
			 * with our right child, and descend right.
			 */
			CALLOUT_SWAP(hright, hcurrent);
			current = right;
			continue;
		}

comp_left:
		/*
		 * Our left child is the earlier of our children (or we have
		 * no right child). We'll now compare our expiration
		 * to its expiration. If ours is the earlier one, we're done.
		 */
		if (hcurrent->ch_expiration <= hleft->ch_expiration)
			return;

		/*
		 * Our left child expires earlier than we do; swap with our
		 * left child, and descend left.
		 */
		CALLOUT_SWAP(hleft, hcurrent);
		current = left;
	}
}

/*
 * Delete and handle all past expirations in a callout table's heap.
 */
static void
callout_heap_delete(callout_table_t *ct)
{
	hrtime_t now, expiration, next;
	callout_list_t *cl;
	callout_heap_t *heap;
	int hash;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	if (CALLOUT_CLEANUP(ct)) {
		/*
		 * There are too many heap elements pointing to empty callout
		 * lists. Clean them out.
		 */
		(void) callout_heap_process(ct, 0, 0);
	}

	now = gethrtime();
	heap = ct->ct_heap;

	while (ct->ct_heap_num > 0) {
		expiration = heap->ch_expiration;
		hash = CALLOUT_CLHASH(expiration);
		cl = heap->ch_list;
		ASSERT(expiration == cl->cl_expiration);

		if (cl->cl_callouts.ch_head == NULL) {
			/*
			 * If the callout list is empty, reap it.
			 * Decrement the reap count.
			 */
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_FREE(ct, cl);
			ct->ct_nreap--;
		} else {
			/*
			 * If the root of the heap expires in the future,
			 * bail out.
			 */
			if (expiration > now)
				break;

			/*
			 * Move the callout list for this expiration to the
			 * list of expired callout lists. It will be processed
			 * by the callout executor.
			 */
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
		}

		/*
		 * Now delete the root. This is done by swapping the root with
		 * the last item in the heap and downheaping the item.
		 */
		ct->ct_heap_num--;
		if (ct->ct_heap_num > 0) {
			heap[0] = heap[ct->ct_heap_num];
			callout_downheap(ct);
		}
	}

	/*
	 * If this callout table is empty or callouts have been suspended,
	 * just return. The cyclic has already been programmed to
	 * infinity by the cyclic subsystem.
	 */
	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
		return;

	/*
	 * If the top expirations are within callout_tolerance of each other,
	 * delay the cyclic expire so that they can be processed together.
	 * This is to prevent high resolution timers from swamping the system
	 * with cyclic activity.
	 */
	if (ct->ct_heap_num > 2) {
		next = expiration + callout_tolerance;
		if ((heap[1].ch_expiration < next) ||
		    (heap[2].ch_expiration < next))
			expiration = next;
	}

	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
}

/*
 * There are some situations when the entire heap is walked and processed.
 * This function is called to do the processing. These are the situations:
 *
 * 1. When the reap count reaches its threshold, the heap has to be cleared
 *    of all empty callout lists.
 *
 * 2. When the system enters and exits KMDB/OBP, all entries in the heap
 *    need to be adjusted by the interval spent in KMDB/OBP.
 *
 * 3. When system time is changed, the heap has to be scanned for
 *    absolute hrestime timers. These need to be removed from the heap
 *    and expired immediately.
 *
 * In cases 2 and 3, it is a good idea to do 1 as well since we are
 * scanning the heap anyway.
 *
 * If the root gets changed and/or callout lists are expired, return the
 * new expiration to the caller so he can reprogram the cyclic accordingly.
 */
static hrtime_t
callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
{
	callout_heap_t *heap;
	callout_list_t *cl, *rootcl;
	hrtime_t expiration, now;
	int i, hash, clflags, expired;
	ulong_t num;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	if (ct->ct_heap_num == 0)
		return (0);

	if (ct->ct_nreap > 0)
		ct->ct_cleanups++;

	heap = ct->ct_heap;
	rootcl = heap->ch_list;

	/*
	 * We walk the heap from the top to the bottom. If we encounter
	 * a heap item that points to an empty callout list, we clean
	 * it out. If we encounter a hrestime entry that must be removed,
	 * again we clean it out. Otherwise, we apply any adjustments needed
	 * to an element.
	 *
	 * During the walk, we also compact the heap from the bottom and
	 * reconstruct the heap using upheap operations. This is very
	 * efficient if the number of elements to be cleaned is greater than
	 * or equal to half the heap. This is the common case.
	 *
	 * Even in the non-common case, the upheap operations should be short
	 * as the entries below generally tend to be bigger than the entries
	 * above.
	 */
	num = ct->ct_heap_num;
	ct->ct_heap_num = 0;
	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
	now = gethrtime();
	expired = 0;
	for (i = 0; i < num; i++) {
		cl = heap[i].ch_list;
		/*
		 * If the callout list is empty, delete the heap element and
		 * free the callout list.
		 */
		if (cl->cl_callouts.ch_head == NULL) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_FREE(ct, cl);
			continue;
		}

		/*
		 * Delete the heap element and expire the callout list, if
		 * one of the following is true:
		 *	- the callout list has expired
		 *	- the callout list is an absolute hrestime one and
		 *	  there has been a system time change
		 */
		if ((cl->cl_expiration <= now) ||
		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
			expired = 1;
			continue;
		}

		/*
		 * Apply adjustments, if any. Adjustments are applied after
		 * the system returns from KMDB or OBP. They are only applied
		 * to relative callout lists.
		 */
		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			expiration = cl->cl_expiration + delta;
			if (expiration <= 0)
				expiration = CY_INFINITY;
			heap[i].ch_expiration = expiration;
			cl->cl_expiration = expiration;
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
			} else {
				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
			}
		}

		heap[ct->ct_heap_num] = heap[i];
		ct->ct_heap_num++;
		(void) callout_upheap(ct);
	}

	ct->ct_nreap = 0;

	if (expired)
		expiration = gethrtime();
	else if (ct->ct_heap_num == 0)
		expiration = CY_INFINITY;
	else if (rootcl != heap->ch_list)
		expiration = heap->ch_expiration;
	else
		expiration = 0;

	return (expiration);
}

/*
 * Common function used to create normal and realtime callouts.
 *
 * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
 * there is one restriction on a realtime callout handler - it should not
 * directly or indirectly acquire cpu_lock. CPU offline waits for pending
 * cyclic handlers to complete while holding cpu_lock. So, if a realtime
 * callout handler were to try to get cpu_lock, there would be a deadlock
 * during CPU offline.
 */
callout_id_t
timeout_generic(int type, void (*func)(void *), void *arg,
	hrtime_t expiration, hrtime_t resolution, int flags)
{
	callout_table_t *ct;
	callout_t *cp;
	callout_id_t id;
	callout_list_t *cl;
	hrtime_t now, interval, rexpiration;
	int hash, clflags;

	ASSERT(resolution > 0);
	ASSERT(func != NULL);

	/*
	 * We get the current hrtime right upfront so that latencies in
	 * this function do not affect the accuracy of the callout.
	 */
	now = gethrtime();

	/*
	 * We disable kernel preemption so that we remain on the same CPU
	 * throughout. If we needed to reprogram the callout table's cyclic,
	 * we can avoid X-calls if we are on the same CPU.
	 *
	 * Note that callout_alloc() releases and reacquires the callout
	 * table mutex. While reacquiring the mutex, it is possible for us
	 * to go to sleep and later migrate to another CPU. This should be
	 * pretty rare, though.
	 */
	kpreempt_disable();

	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
	mutex_enter(&ct->ct_mutex);

	if (ct->ct_cyclic == CYCLIC_NONE) {
		mutex_exit(&ct->ct_mutex);
		/*
		 * The callout table has not yet been initialized fully.
		 * So, put this one on the boot callout table which is
		 * always initialized.
		 */
		ct = &callout_boot_ct[type];
		mutex_enter(&ct->ct_mutex);
	}

	if (CALLOUT_CLEANUP(ct)) {
		/*
		 * There are too many heap elements pointing to empty callout
		 * lists. Clean them out.
		 */
		rexpiration = callout_heap_process(ct, 0, 0);
		if ((rexpiration != 0) && (ct->ct_suspend == 0))
			(void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
	}

	if ((cp = ct->ct_free) == NULL)
		cp = callout_alloc(ct);
	else
		ct->ct_free = cp->c_idnext;

	cp->c_func = func;
	cp->c_arg = arg;

	/*
	 * Compute the expiration hrtime.
	 */
	if (flags & CALLOUT_FLAG_ABSOLUTE) {
		interval = expiration - now;
	} else {
		interval = expiration;
		expiration += now;
	}

	if (resolution > 1) {
		/*
		 * Align expiration to the specified resolution.
		 */
		if (flags & CALLOUT_FLAG_ROUNDUP)
			expiration += resolution - 1;
		expiration = (expiration / resolution) * resolution;
	}

	if (expiration <= 0) {
		/*
		 * expiration hrtime overflow has occurred. Just set the
		 * expiration to infinity.
		 */
		expiration = CY_INFINITY;
	}

	/*
	 * Assign an ID to this callout
	 */
	if (flags & CALLOUT_FLAG_32BIT) {
		if (interval > callout_longterm) {
			id = (ct->ct_long_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_long_id = id;
		} else {
			id = (ct->ct_short_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_short_id = id;
		}
	} else {
		id = (ct->ct_gen_id - callout_counter_low);
		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
			id |= CALLOUT_COUNTER_HIGH;
			id += CALLOUT_GENERATION_LOW;
		}
		ct->ct_gen_id = id;
	}

	cp->c_xid = id;

	clflags = 0;
	if (flags & CALLOUT_FLAG_ABSOLUTE)
		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
	if (flags & CALLOUT_FLAG_HRESTIME)
		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
	if (resolution == 1)
		clflags |= CALLOUT_LIST_FLAG_NANO;
	hash = CALLOUT_CLHASH(expiration);

again:
	/*
	 * Try to see if a callout list already exists for this expiration.
	 */
	cl = callout_list_get(ct, expiration, clflags, hash);
	if (cl == NULL) {
		/*
		 * Check if we have enough space in the heap to insert one
		 * expiration. If not, expand the heap.
		 */
		if (ct->ct_heap_num == ct->ct_heap_max) {
			callout_heap_expand(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * So, the best course is to repeat the steps. This
			 * should be an infrequent event.
			 */
			goto again;
		}

		/*
		 * Check the free list. If we don't find one, we have to
		 * take the slow path and allocate from kmem.
		 */
		if ((cl = ct->ct_lfree) == NULL) {
			callout_list_alloc(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * Plus, the heap could have become full. So, the best
			 * course is to repeat the steps. This should be an
			 * infrequent event.
			 */
			goto again;
		}
		ct->ct_lfree = cl->cl_next;
		cl->cl_expiration = expiration;
		cl->cl_flags = clflags;

		if (clflags & CALLOUT_LIST_FLAG_NANO) {
			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
		} else {
			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
		}

		/*
		 * This is a new expiration. So, insert it into the heap.
		 * This will also reprogram the cyclic, if the expiration
		 * propagated to the root of the heap.
		 */
		callout_heap_insert(ct, cl);
	} else {
		/*
		 * If the callout list was empty, untimeout_generic() would
		 * have incremented a reap count. Decrement the reap count
		 * as we are going to insert a callout into this list.
		 */
		if (cl->cl_callouts.ch_head == NULL)
			ct->ct_nreap--;
	}
	cp->c_list = cl;
	CALLOUT_APPEND(ct, cp);

	ct->ct_timeouts++;
	ct->ct_timeouts_pending++;

	mutex_exit(&ct->ct_mutex);

	kpreempt_enable();

	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
	    cp);

	return (id);
}

timeout_id_t
timeout(void (*func)(void *), void *arg, clock_t delta)
{
	ulong_t id;

	/*
	 * Make sure the callout runs at least 1 tick in the future.
	 */
	if (delta <= 0)
		delta = 1;
	else if (delta > callout_max_ticks)
		delta = callout_max_ticks;

	id = (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);

	return ((timeout_id_t)id);
}
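
/*
 * Editor's note: an illustrative (non-normative) sketch of typical legacy
 * driver usage of the interfaces above, per timeout(9F) and untimeout(9F).
 * my_handler, my_arg and the one-second interval are placeholders:
 *
 *	timeout_id_t tid;
 *
 *	tid = timeout(my_handler, my_arg, drv_usectohz(1000000));
 *	...
 *	(void) untimeout(tid);
 */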

/*
 * Convenience function that creates a normal callout with default parameters
 * and returns a full ID.
 */
callout_id_t
timeout_default(void (*func)(void *), void *arg, clock_t delta)
{
	callout_id_t id;

	/*
	 * Make sure the callout runs at least 1 tick in the future.
	 */
	if (delta <= 0)
		delta = 1;
	else if (delta > callout_max_ticks)
		delta = callout_max_ticks;

	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
	    nsec_per_tick, 0);

	return (id);
}

timeout_id_t
realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
{
	ulong_t id;

	/*
	 * Make sure the callout runs at least 1 tick in the future.
	 */
	if (delta <= 0)
		delta = 1;
	else if (delta > callout_max_ticks)
		delta = callout_max_ticks;

	id = (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);

	return ((timeout_id_t)id);
}

/*
 * Convenience function that creates a realtime callout with default parameters
 * and returns a full ID.
 */
callout_id_t
realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
{
	callout_id_t id;

	/*
	 * Make sure the callout runs at least 1 tick in the future.
	 */
	if (delta <= 0)
		delta = 1;
	else if (delta > callout_max_ticks)
		delta = callout_max_ticks;

	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
	    nsec_per_tick, 0);

	return (id);
}

hrtime_t
untimeout_generic(callout_id_t id, int nowait)
{
	callout_table_t *ct;
	callout_t *cp;
	callout_id_t xid;
	callout_list_t *cl;
	int hash;
	callout_id_t bogus;

	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
	hash = CALLOUT_IDHASH(id);

	mutex_enter(&ct->ct_mutex);

	/*
	 * Search the ID hash table for the callout.
	 */
	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {

		xid = cp->c_xid;

		/*
		 * Match the ID and generation number.
		 */
		if ((xid & CALLOUT_ID_MASK) != id)
			continue;

		if ((xid & CALLOUT_EXECUTING) == 0) {
			hrtime_t expiration;

			/*
			 * Delete the callout. If the callout list becomes
			 * NULL, we don't remove it from the table. This is
			 * so it can be reused. If the empty callout list
			 * corresponds to the top of the callout heap, we
			 * don't reprogram the table cyclic here. This is in
			 * order to avoid lots of X-calls to the CPU associated
			 * with the callout table.
			 */
			cl = cp->c_list;
			expiration = cl->cl_expiration;
			CALLOUT_DELETE(ct, cp);
			cp->c_idnext = ct->ct_free;
			ct->ct_free = cp;
			cp->c_xid |= CALLOUT_FREE;
			ct->ct_untimeouts_unexpired++;
			ct->ct_timeouts_pending--;

			/*
			 * If the callout list has become empty, it needs
			 * to be cleaned along with its heap entry. Increment
			 * a reap count.
			 */
			if (cl->cl_callouts.ch_head == NULL)
				ct->ct_nreap++;
			mutex_exit(&ct->ct_mutex);

			expiration -= gethrtime();
			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
			    "untimeout:ID %lx hrtime left %llx", id,
			    expiration);
			return (expiration < 0 ? 0 : expiration);
		}

		ct->ct_untimeouts_executing++;
		/*
		 * The callout we want to delete is currently executing.
		 * The DDI states that we must wait until the callout
		 * completes before returning, so we block on c_done until the
		 * callout ID changes (to the old ID if it's on the freelist,
		 * or to a new callout ID if it's in use). This implicitly
		 * assumes that callout structures are persistent (they are).
		 */
		if (cp->c_executor == curthread) {
			/*
			 * The timeout handler called untimeout() on itself.
			 * Stupid, but legal. We can't wait for the timeout
			 * to complete without deadlocking, so we just return.
			 */
			mutex_exit(&ct->ct_mutex);
			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
			    "untimeout_self:ID %x", id);
			return (-1);
		}
		if (nowait == 0) {
			/*
			 * We need to wait. Indicate that we are waiting by
			 * incrementing c_waiting. This prevents the executor
			 * from doing a wakeup on c_done if there are no
			 * waiters.
			 */
			while (cp->c_xid == xid) {
				cp->c_waiting = 1;
				cv_wait(&cp->c_done, &ct->ct_mutex);
			}
		}
		mutex_exit(&ct->ct_mutex);
		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
		    "untimeout_executing:ID %lx", id);
		return (-1);
	}
	ct->ct_untimeouts_expired++;

	mutex_exit(&ct->ct_mutex);
	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
	    "untimeout_bogus_id:ID %lx", id);

	/*
	 * We didn't find the specified callout ID. This means either
	 * (1) the callout already fired, or (2) the caller passed us
	 * a bogus value. Perform a sanity check to detect case (2).
	 */
	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
		panic("untimeout: impossible timeout id %llx",
		    (unsigned long long)id);

	return (-1);
}

clock_t
untimeout(timeout_id_t id_arg)
{
	hrtime_t hleft;
	clock_t tleft;
	callout_id_t id;

	id = (ulong_t)id_arg;
	hleft = untimeout_generic(id, 0);
	if (hleft < 0)
		tleft = -1;
	else if (hleft == 0)
		tleft = 0;
	else
		tleft = NSEC_TO_TICK(hleft);

	return (tleft);
}

/*
 * Convenience function to untimeout a timeout with a full ID with default
 * parameters.
 */
clock_t
untimeout_default(callout_id_t id, int nowait)
{
	hrtime_t hleft;
	clock_t tleft;

	hleft = untimeout_generic(id, nowait);
	if (hleft < 0)
		tleft = -1;
	else if (hleft == 0)
		tleft = 0;
	else
		tleft = NSEC_TO_TICK(hleft);

	return (tleft);
}

/*
 * Expire all the callouts queued in the specified callout list.
 */
static void
callout_list_expire(callout_table_t *ct, callout_list_t *cl)
{
	callout_t *cp, *cnext;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(cl != NULL);

	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
		/*
		 * Multiple executor threads could be running at the same
		 * time. If this callout is already being executed,
		 * go on to the next one.
		 */
		if (cp->c_xid & CALLOUT_EXECUTING) {
			cnext = cp->c_clnext;
			continue;
		}

		/*
		 * Indicate to untimeout() that a callout is
		 * being expired by the executor.
		 */
		cp->c_xid |= CALLOUT_EXECUTING;
		cp->c_executor = curthread;
		mutex_exit(&ct->ct_mutex);

		DTRACE_PROBE1(callout__start, callout_t *, cp);
		(*cp->c_func)(cp->c_arg);
		DTRACE_PROBE1(callout__end, callout_t *, cp);

		mutex_enter(&ct->ct_mutex);

		ct->ct_expirations++;
		ct->ct_timeouts_pending--;
		/*
		 * Indicate completion for c_done.
		 */
		cp->c_xid &= ~CALLOUT_EXECUTING;
		cp->c_executor = NULL;
		cnext = cp->c_clnext;

		/*
		 * Delete callout from ID hash table and the callout
		 * list, return to freelist, and tell any untimeout() that
		 * cares that we're done.
		 */
		CALLOUT_DELETE(ct, cp);
		cp->c_idnext = ct->ct_free;
		ct->ct_free = cp;
		cp->c_xid |= CALLOUT_FREE;

		if (cp->c_waiting) {
			cp->c_waiting = 0;
			cv_broadcast(&cp->c_done);
		}
	}
}

/*
 * Execute all expired callout lists for a callout table.
 */
static void
callout_expire(callout_table_t *ct)
{
	callout_list_t *cl, *clnext;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
		/*
		 * Expire all the callouts in this callout list.
		 */
		callout_list_expire(ct, cl);

		clnext = cl->cl_next;
		if (cl->cl_callouts.ch_head == NULL) {
			/*
			 * Free the callout list.
			 */
			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
			CALLOUT_LIST_FREE(ct, cl);
		}
	}
}

/*
 * The cyclic handlers below process callouts in two steps:
 *
 *	1. Find all expired callout lists and queue them in a separate
 *	   list of expired callouts.
 *	2. Execute the expired callout lists.
 *
 * This is done for two reasons:
 *
 *	1. We want to quickly find the next earliest expiration to program
 *	   the cyclic to and reprogram it. We can do this right at the end
 *	   of step 1.
 *	2. The realtime cyclic handler expires callouts in place. However,
 *	   for normal callouts, callouts are expired by a taskq thread.
 *	   So, it is simpler and more robust to have the taskq thread just
 *	   do step 2.
 */

/*
 * Realtime callout cyclic handler.
 */
void
callout_realtime(callout_table_t *ct)
{
	mutex_enter(&ct->ct_mutex);
	callout_heap_delete(ct);
	callout_expire(ct);
	mutex_exit(&ct->ct_mutex);
}

void
callout_execute(callout_table_t *ct)
{
	mutex_enter(&ct->ct_mutex);
	callout_expire(ct);
	mutex_exit(&ct->ct_mutex);
}

/*
 * Normal callout cyclic handler.
 */
void
callout_normal(callout_table_t *ct)
{
	int i, exec;

	mutex_enter(&ct->ct_mutex);
	callout_heap_delete(ct);
	CALLOUT_EXEC_COMPUTE(ct, exec);
	mutex_exit(&ct->ct_mutex);

	for (i = 0; i < exec; i++) {
		ASSERT(ct->ct_taskq != NULL);
		(void) taskq_dispatch(ct->ct_taskq,
		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
	}
}

/*
 * Suspend callout processing.
 */
static void
callout_suspend(void)
{
	int t, f;
	callout_table_t *ct;

	/*
	 * Traverse every callout table in the system and suspend callout
	 * processing.
	 *
	 * We need to suspend all the tables (including the inactive ones)
	 * so that if a table is made active while the suspend is still on,
	 * the table remains suspended.
	 */
	for (f = 0; f < max_ncpus; f++) {
		for (t = 0; t < CALLOUT_NTYPES; t++) {
			ct = &callout_table[CALLOUT_TABLE(t, f)];

			mutex_enter(&ct->ct_mutex);
			ct->ct_suspend++;
			if (ct->ct_cyclic == CYCLIC_NONE) {
				mutex_exit(&ct->ct_mutex);
				continue;
			}
			if (ct->ct_suspend == 1)
				(void) cyclic_reprogram(ct->ct_cyclic,
				    CY_INFINITY);
			mutex_exit(&ct->ct_mutex);
		}
	}
}

/*
 * Resume callout processing.
 */
static void
callout_resume(hrtime_t delta, int timechange)
{
	hrtime_t exp;
	int t, f;
	callout_table_t *ct;

	/*
	 * Traverse every callout table in the system and resume callout
	 * processing. For active tables, perform any hrtime adjustments
	 * necessary.
	 */
	for (f = 0; f < max_ncpus; f++) {
		for (t = 0; t < CALLOUT_NTYPES; t++) {
			ct = &callout_table[CALLOUT_TABLE(t, f)];

			mutex_enter(&ct->ct_mutex);
			if (ct->ct_cyclic == CYCLIC_NONE) {
				ct->ct_suspend--;
				mutex_exit(&ct->ct_mutex);
				continue;
			}

			/*
			 * If a delta is specified, adjust the expirations in
			 * the heap by delta. Also, if the caller indicates
			 * a timechange, process that. This step also cleans
			 * out any empty callout lists that might happen to
			 * be there.
			 */
			(void) callout_heap_process(ct, delta, timechange);

			ct->ct_suspend--;
			if (ct->ct_suspend == 0) {
				/*
				 * If the expired list is non-empty, then have
				 * the cyclic expire immediately. Else, program
				 * the cyclic based on the heap.
				 */
				if (ct->ct_expired.ch_head != NULL)
					exp = gethrtime();
				else if (ct->ct_heap_num > 0)
					exp = ct->ct_heap[0].ch_expiration;
				else
					exp = 0;
				if (exp != 0)
					(void) cyclic_reprogram(ct->ct_cyclic,
					    exp);
			}

			mutex_exit(&ct->ct_mutex);
		}
	}
}

/*
 * Callback handler used by CPR to stop and resume callouts.
 * The cyclic subsystem saves and restores hrtime during CPR.
 * That is why callout_resume() is called with a 0 delta.
 * Although hrtime is the same, hrestime (system time) has
 * progressed during CPR. So, we have to indicate a time change
 * to expire the absolute hrestime timers.
 */
/*ARGSUSED*/
static boolean_t
callout_cpr_callb(void *arg, int code)
{
	if (code == CB_CODE_CPR_CHKPT)
		callout_suspend();
	else
		callout_resume(0, 1);

	return (B_TRUE);
}

/*
 * Callback handler invoked when the debugger is entered or exited.
 */
/*ARGSUSED*/
static boolean_t
callout_debug_callb(void *arg, int code)
{
	hrtime_t delta;

	/*
	 * When the system enters the debugger, make a note of the hrtime.
	 * When it is resumed, compute how long the system was in the
	 * debugger. This interval should not be counted for callouts.
	 */
	if (code == 0) {
		callout_suspend();
		callout_debug_hrtime = gethrtime();
	} else {
		delta = gethrtime() - callout_debug_hrtime;
		callout_resume(delta, 0);
	}

	return (B_TRUE);
}

/*
 * Move the absolute hrestime callouts to the expired list. Then program the
 * table's cyclic to expire immediately so that the callouts can be executed
 * immediately.
 */
static void
callout_hrestime_one(callout_table_t *ct)
{
	hrtime_t expiration;

	mutex_enter(&ct->ct_mutex);
	if (ct->ct_heap_num == 0) {
		mutex_exit(&ct->ct_mutex);
		return;
	}

	/*
	 * Walk the heap and process all the absolute hrestime entries.
	 */
	expiration = callout_heap_process(ct, 0, 1);

	if ((expiration != 0) && (ct->ct_suspend == 0))
		(void) cyclic_reprogram(ct->ct_cyclic, expiration);

	mutex_exit(&ct->ct_mutex);
}

/*
 * This function is called whenever system time (hrestime) is changed
 * explicitly. All the HRESTIME callouts must be expired at once.
 */
/*ARGSUSED*/
void
callout_hrestime(void)
{
	int t, f;
	callout_table_t *ct;

	/*
	 * Traverse every callout table in the system and process the hrestime
	 * callouts therein.
	 *
	 * We look at all the tables because we don't know which ones were
	 * onlined and offlined in the past. The offlined tables may still
	 * have active cyclics processing timers somewhere.
	 */
	for (f = 0; f < max_ncpus; f++) {
		for (t = 0; t < CALLOUT_NTYPES; t++) {
			ct = &callout_table[CALLOUT_TABLE(t, f)];
			callout_hrestime_one(ct);
		}
	}
}

/*
 * Create the hash tables for this callout table.
 */
static void
callout_hash_init(callout_table_t *ct)
{
	size_t size;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));

	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
}

/*
 * Create per-callout table kstats.
 */
static void
callout_kstat_init(callout_table_t *ct)
{
	callout_stat_type_t stat;
	kstat_t *ct_kstats;
	int ndx;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(ct->ct_kstats == NULL);

	ndx = ct - callout_table;
	ct_kstats = kstat_create("unix", ndx, "callout",
	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);

	if (ct_kstats == NULL) {
		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
		    (void *)ct);
	} else {
		ct_kstats->ks_data = ct->ct_kstat_data;
		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
			kstat_named_init(&ct->ct_kstat_data[stat],
			    callout_kstat_names[stat], KSTAT_DATA_INT64);
		ct->ct_kstats = ct_kstats;
		kstat_install(ct_kstats);
	}
}

static void
callout_cyclic_init(callout_table_t *ct)
{
	cyc_handler_t hdlr;
	cyc_time_t when;
	processorid_t seqid;
	int t;
	cyclic_id_t cyclic;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	t = CALLOUT_TABLE_TYPE(ct);
	seqid = CALLOUT_TABLE_SEQID(ct);

	/*
	 * Create the taskq thread if the table type is normal.
	 * Realtime tables are handled at PIL1 by a softint
	 * handler.
	 */
	if (t == CALLOUT_NORMAL) {
		ASSERT(ct->ct_taskq == NULL);
		/*
		 * Each callout thread consumes exactly one
		 * task structure while active. Therefore,
		 * prepopulating with 2 * callout_threads tasks
		 * ensures that there's at least one task per
		 * thread that's either scheduled or on the
		 * freelist. In turn, this guarantees that
		 * taskq_dispatch() will always either succeed
		 * (because there's a free task structure) or
		 * be unnecessary (because "callout_execute(ct)"
		 * has already scheduled).
		 */
		ct->ct_taskq =
		    taskq_create_instance("callout_taskq", seqid,
		    callout_threads, maxclsyspri,
		    2 * callout_threads, 2 * callout_threads,
		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
	}

	/*
	 * callouts can only be created in a table whose
	 * cyclic has been initialized.
	 */
	ASSERT(ct->ct_heap_num == 0);

	/*
	 * Drop the mutex before creating the callout cyclics. cyclic_add()
	 * could potentially expand the cyclic heap. We don't want to be
	 * holding the callout table mutex in that case. Note that this
	 * function is called during CPU online. cpu_lock is held at this
	 * point. So, only one thread can be executing the cyclic add logic
	 * below at any time.
	 */
	mutex_exit(&ct->ct_mutex);

	/*
	 * Create the callout table cyclics.
	 *
	 * The realtime cyclic handler executes at low PIL. The normal cyclic
	 * handler executes at lock PIL. This is because there are cases
	 * where code can block at PIL > 1 waiting for a normal callout handler
	 * to unblock it directly or indirectly. If the normal cyclic were to
	 * be executed at low PIL, it could get blocked out by the waiter
	 * and cause a deadlock.
	 */
	ASSERT(ct->ct_cyclic == CYCLIC_NONE);

	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
	if (ct->ct_type == CALLOUT_REALTIME)
		hdlr.cyh_level = callout_realtime_level;
	else
		hdlr.cyh_level = callout_normal_level;
	hdlr.cyh_arg = ct;
	when.cyt_when = CY_INFINITY;
	when.cyt_interval = CY_INFINITY;

	cyclic = cyclic_add(&hdlr, &when);

	mutex_enter(&ct->ct_mutex);
	ct->ct_cyclic = cyclic;
}

void
callout_cpu_online(cpu_t *cp)
{
	lgrp_handle_t hand;
	callout_cache_t *cache;
	char s[KMEM_CACHE_NAMELEN];
	callout_table_t *ct;
	processorid_t seqid;
	int t;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Locate the cache corresponding to the onlined CPU's lgroup.
	 * Note that access to callout_caches is protected by cpu_lock.
	 */
	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
		if (cache->cc_hand == hand)
			break;
	}

	/*
	 * If not found, create one. The caches are never destroyed.
	 */
	if (cache == NULL) {
		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
		cache->cc_hand = hand;
		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
		    (long)hand);
		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
		    (long)hand);
		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
		cache->cc_next = callout_caches;
		callout_caches = cache;
	}

	seqid = cp->cpu_seqid;

	for (t = 0; t < CALLOUT_NTYPES; t++) {
		ct = &callout_table[CALLOUT_TABLE(t, seqid)];

		mutex_enter(&ct->ct_mutex);
		/*
		 * Store convenience pointers to the kmem caches
		 * in the callout table. These assignments should always be
		 * done as callout tables can map to different physical
		 * CPUs each time.
		 */
		ct->ct_cache = cache->cc_cache;
		ct->ct_lcache = cache->cc_lcache;

		/*
		 * We use the heap pointer to check if stuff has been
		 * initialized for this callout table.
		 */
		if (ct->ct_heap == NULL) {
			callout_heap_init(ct);
			callout_hash_init(ct);
			callout_kstat_init(ct);
			callout_cyclic_init(ct);
		}

		mutex_exit(&ct->ct_mutex);

		/*
		 * Move the cyclic to this CPU by doing a bind.
		 */
		cyclic_bind(ct->ct_cyclic, cp, NULL);
	}
}

void
callout_cpu_offline(cpu_t *cp)
{
	callout_table_t *ct;
	processorid_t seqid;
	int t;

	ASSERT(MUTEX_HELD(&cpu_lock));

	seqid = cp->cpu_seqid;

	for (t = 0; t < CALLOUT_NTYPES; t++) {
		ct = &callout_table[CALLOUT_TABLE(t, seqid)];

		/*
		 * Unbind the cyclic. This will allow the cyclic subsystem
		 * to juggle the cyclic during CPU offline.
		 */
		cyclic_bind(ct->ct_cyclic, NULL, NULL);
	}
}

/*
 * This is called to perform per-CPU initialization for slave CPUs at
 * boot time.
 */
void
callout_mp_init(void)
{
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		callout_cpu_online(cp);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	mutex_exit(&cpu_lock);
}

/*
 * Initialize all callout tables. Called at boot time just before clkstart().
 */
void
callout_init(void)
{
	int f, t;
	size_t size;
	int table_id;
	callout_table_t *ct;
	long bits, fanout;
	uintptr_t buf;

	/*
	 * Initialize callout globals.
	 */
	bits = 0;
	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
		bits++;
	callout_table_bits = CALLOUT_TYPE_BITS + bits;
	callout_table_mask = (1 << callout_table_bits) - 1;
	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
	callout_max_ticks = CALLOUT_MAX_TICKS;
	if (callout_min_reap == 0)
		callout_min_reap = CALLOUT_MIN_REAP;

	if (callout_tolerance <= 0)
		callout_tolerance = CALLOUT_TOLERANCE;
	if (callout_threads <= 0)
		callout_threads = CALLOUT_THREADS;

	/*
	 * Allocate all the callout tables based on max_ncpus. We have chosen
	 * to do boot-time allocation instead of dynamic allocation because:
	 *
	 *	- the size of the callout tables is not too large.
	 *	- there are race conditions involved in making this dynamic.
	 *	- the hash tables that go with the callout tables consume
	 *	  most of the memory and they are only allocated in
	 *	  callout_cpu_online().
	 *
	 * Each CPU has two tables that are consecutive in the array. The first
	 * one is for realtime callouts and the second one is for normal ones.
	 *
	 * We do this alignment dance to make sure that callout table
	 * structures will always be on a cache line boundary.
	 */
	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
	size += CALLOUT_ALIGN;
	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);

	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
	/*
	 * Now, initialize the tables for all the CPUs.
	 */
	for (f = 0; f < max_ncpus; f++) {
		for (t = 0; t < CALLOUT_NTYPES; t++) {
			table_id = CALLOUT_TABLE(t, f);
			ct = &callout_table[table_id];
			ct->ct_type = t;
			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
			/*
			 * Precompute the base IDs for long and short-term
			 * legacy IDs. This makes ID generation during
			 * timeout() fast.
			 */
			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
			/*
			 * Precompute the base ID for generation-based IDs.
			 * Note that when the first ID gets allocated, the
			 * ID will wrap. This will cause the generation
			 * number to be incremented to 1.
			 */
			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
			/*
			 * Initialize the cyclic as NONE. This will get set
			 * during CPU online. This is so that partially
			 * populated systems will only have the required
			 * number of cyclics, not more.
			 */
			ct->ct_cyclic = CYCLIC_NONE;
			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
		}
	}

	/*
	 * Add the callback for CPR. This is called during checkpoint
	 * resume to suspend and resume callouts.
	 */
	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
	    "callout_cpr");
	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
	    "callout_debug");

	/*
	 * Call the per-CPU initialization function for the boot CPU. This
	 * is done here because the function is not called automatically for
	 * the boot CPU from the CPU online/offline hooks. Note that the
	 * CPU lock is taken here because of convention.
	 */
	mutex_enter(&cpu_lock);
	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
	callout_cpu_online(CPU);
	mutex_exit(&cpu_lock);
}