1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 */
25
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39
40 int callout_init_done; /* useful during boot */
41
42 /*
43 * Callout tables. See timeout(9F) for details.
44 */
45 static int callout_threads; /* callout normal threads */
46 static hrtime_t callout_debug_hrtime; /* debugger entry time */
47 static int callout_chunk; /* callout heap chunk size */
48 static int callout_min_reap; /* callout minimum reap count */
49 static int callout_tolerance; /* callout hires tolerance */
50 static callout_table_t *callout_boot_ct; /* Boot CPU's callout tables */
51 static clock_t callout_max_ticks; /* max interval */
52 static hrtime_t callout_longterm; /* longterm nanoseconds */
53 static ulong_t callout_counter_low; /* callout ID increment */
54 static ulong_t callout_table_bits; /* number of table bits in ID */
55 static ulong_t callout_table_mask; /* mask for the table bits */
56 static callout_cache_t *callout_caches; /* linked list of caches */
57 #pragma align 64(callout_table)
58 static callout_table_t *callout_table; /* global callout table array */
59
60 /*
61 * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
62 * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
63 * via taskq, to a thread that executes at PIL 0 - so we end up running
64 * 'normal' callouts at PIL 0.
65 */
66 static volatile int callout_realtime_level = CY_LOW_LEVEL;
67 static volatile int callout_normal_level = CY_LOCK_LEVEL;
68
69 static char *callout_kstat_names[] = {
70 "callout_timeouts",
71 "callout_timeouts_pending",
72 "callout_untimeouts_unexpired",
73 "callout_untimeouts_executing",
74 "callout_untimeouts_expired",
75 "callout_expirations",
76 "callout_allocations",
77 "callout_cleanups",
78 };
79
80 static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
81
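/*
 * Generic doubly-linked list manipulation macros. The 'cnext' and 'cprev'
 * arguments name the link fields to operate on, so the same macros serve
 * both the callout ID hash chains and the callout list chains below.
 */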
82 #define CALLOUT_HASH_INSERT(hash, cp, cnext, cprev) \
83 { \
84 callout_hash_t *hashp = &(hash); \
85 \
86 cp->cprev = NULL; \
87 cp->cnext = hashp->ch_head; \
88 if (hashp->ch_head == NULL) \
89 hashp->ch_tail = cp; \
90 else \
91 cp->cnext->cprev = cp; \
92 hashp->ch_head = cp; \
93 }
94
95 #define CALLOUT_HASH_APPEND(hash, cp, cnext, cprev) \
96 { \
97 callout_hash_t *hashp = &(hash); \
98 \
99 cp->cnext = NULL; \
100 cp->cprev = hashp->ch_tail; \
101 if (hashp->ch_tail == NULL) \
102 hashp->ch_head = cp; \
103 else \
104 cp->cprev->cnext = cp; \
105 hashp->ch_tail = cp; \
106 }
107
108 #define CALLOUT_HASH_DELETE(hash, cp, cnext, cprev) \
109 { \
110 callout_hash_t *hashp = &(hash); \
111 \
112 if (cp->cnext == NULL) \
113 hashp->ch_tail = cp->cprev; \
114 else \
115 cp->cnext->cprev = cp->cprev; \
116 if (cp->cprev == NULL) \
117 hashp->ch_head = cp->cnext; \
118 else \
119 cp->cprev->cnext = cp->cnext; \
120 }
121
122 /*
123 * These definitions help us queue callouts and callout lists. Here is
124 * the queueing rationale:
125 *
126 * - callouts are queued in a FIFO manner in the ID hash table.
127 * TCP timers are typically cancelled in the same order that they
128 * were issued. The FIFO queueing shortens the search for a callout
129 * during untimeout().
130 *
131 * - callouts are queued in a FIFO manner in their callout lists.
132 * This ensures that the callouts are executed in the same order that
133 * they were queued. This is fair. Plus, it helps to make each
134 * callout expiration timely. It also favors cancellations.
135 *
136 * - callout lists are queued in the following manner in the callout
137 * hash table buckets:
138 *
139 * - appended, if the callout list is a 1-nanosecond resolution
140 * callout list. When a callout is created, we first look for
141 * a callout list that has the same expiration so we can avoid
142 * allocating a callout list and inserting the expiration into
143 * the heap. However, we do not want to look at 1-nanosecond
144 * resolution callout lists as we will seldom find a match in
145 * them. Keeping these callout lists in the rear of the hash
146 * buckets allows us to skip these during the lookup.
147 *
148 * - inserted at the beginning, if the callout list is not a
149 * 1-nanosecond resolution callout list. This also has the
150 * side-effect of keeping the long term timers away from the
151 * front of the buckets.
152 *
153 * - callout lists are queued in a FIFO manner in the expired callouts
154 * list. This ensures that callout lists are executed in the order
155 * of expiration.
156 */
157 #define CALLOUT_APPEND(ct, cp) \
158 CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \
159 cp, c_idnext, c_idprev); \
160 CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
161
162 #define CALLOUT_DELETE(ct, cp) \
163 CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \
164 cp, c_idnext, c_idprev); \
165 CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
166
167 #define CALLOUT_LIST_INSERT(hash, cl) \
168 CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
169
170 #define CALLOUT_LIST_APPEND(hash, cl) \
171 CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
172
173 #define CALLOUT_LIST_DELETE(hash, cl) \
174 CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
175
176 #define CALLOUT_LIST_BEFORE(cl, nextcl) \
177 { \
178 (cl)->cl_prev = (nextcl)->cl_prev; \
179 (cl)->cl_next = (nextcl); \
180 (nextcl)->cl_prev = (cl); \
181 if (cl->cl_prev != NULL) \
182 cl->cl_prev->cl_next = cl; \
183 }
184
185 /*
186 * For normal callouts, there is a deadlock scenario if two callouts that
187 * have an inter-dependency end up on the same callout list. To break the
188 * deadlock, you need two taskq threads running in parallel. We compute
189 * the number of taskq threads here using a bunch of conditions to make
190 * it optimal for the common case. This is an ugly hack, but one that is
191 * necessary (sigh).
192 */
193 #define CALLOUT_THRESHOLD 100000000
194 #define CALLOUT_EXEC_COMPUTE(ct, nextexp, exec) \
195 { \
196 callout_list_t *cl; \
197 \
198 cl = ct->ct_expired.ch_head; \
199 if (cl == NULL) { \
200 /* \
201 * If the expired list is NULL, there is nothing to \
202 * process. \
203 */ \
204 exec = 0; \
205 } else if ((cl->cl_next == NULL) && \
206 (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) { \
207 /* \
208 * If there is only one callout list and it contains \
209 * only one callout, there is no need for two threads. \
210 */ \
211 exec = 1; \
212 } else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) { \
213 /* \
214 * If the next expiration of the cyclic is way out into \
215 * the future, we need two threads. \
216 */ \
217 exec = 2; \
218 } else { \
219 /* \
220 * We have multiple callouts to process. But the cyclic \
221 * will fire in the near future. So, we only need one \
222 * thread for now. \
223 */ \
224 exec = 1; \
225 } \
226 }
227
228 /*
229 * Macro to swap two heap items.
230 */
231 #define CALLOUT_SWAP(h1, h2) \
232 { \
233 callout_heap_t tmp; \
234 \
235 tmp = *h1; \
236 *h1 = *h2; \
237 *h2 = tmp; \
238 }
239
240 /*
241 * Macro to free a callout list.
242 */
243 #define CALLOUT_LIST_FREE(ct, cl) \
244 { \
245 cl->cl_next = ct->ct_lfree; \
246 ct->ct_lfree = cl; \
247 cl->cl_flags |= CALLOUT_LIST_FLAG_FREE; \
248 }
249
250 /*
251 * Macro to free a callout.
252 */
253 #define CALLOUT_FREE(ct, cp)		\
254 { \
255 cp->c_idnext = ct->ct_free; \
256 ct->ct_free = cp; \
257 cp->c_xid |= CALLOUT_ID_FREE; \
258 }
259
260 /*
261 * Allocate a callout structure. We try quite hard because we
262 * can't sleep, and if we can't do the allocation, we're toast.
263 * Failing that, we try a KM_PANIC allocation. Note that we never
264 * deallocate a callout. See untimeout() for the reasoning.
265 */
266 static callout_t *
267 callout_alloc(callout_table_t *ct)
268 {
269 size_t size;
270 callout_t *cp;
271
272 ASSERT(MUTEX_HELD(&ct->ct_mutex));
273 mutex_exit(&ct->ct_mutex);
274
275 cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
276 if (cp == NULL) {
277 size = sizeof (callout_t);
278 cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
279 }
280 cp->c_xid = 0;
281 cp->c_executor = NULL;
282 cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
283 cp->c_waiting = 0;
284
285 mutex_enter(&ct->ct_mutex);
286 ct->ct_allocations++;
287 return (cp);
288 }
289
290 /*
291 * Allocate a callout list structure. We try quite hard because we
292 * can't sleep, and if we can't do the allocation, we're toast.
293 * Failing that, we try a KM_PANIC allocation. Note that we never
294 * deallocate a callout list.
295 */
296 static void
297 callout_list_alloc(callout_table_t *ct)
298 {
299 size_t size;
300 callout_list_t *cl;
301
302 ASSERT(MUTEX_HELD(&ct->ct_mutex));
303 mutex_exit(&ct->ct_mutex);
304
305 cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
306 if (cl == NULL) {
307 size = sizeof (callout_list_t);
308 cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
309 }
310 bzero(cl, sizeof (callout_list_t));
311
312 mutex_enter(&ct->ct_mutex);
313 CALLOUT_LIST_FREE(ct, cl);
314 }
315
316 /*
317 * Find a callout list that corresponds to an expiration and matching flags.
318 */
319 static callout_list_t *
320 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
321 {
322 callout_list_t *cl;
323 int clflags;
324
325 ASSERT(MUTEX_HELD(&ct->ct_mutex));
326
327 if (flags & CALLOUT_LIST_FLAG_NANO) {
328 /*
329 * This is a 1-nanosecond resolution callout. We will rarely
330 * find a match for this. So, bail out.
331 */
332 return (NULL);
333 }
334
335 clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
336 for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
337 /*
338 * If we have reached a 1-nanosecond resolution callout list,
339 * we don't have much hope of finding a match in this hash
340 * bucket. So, just bail out.
341 */
342 if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
343 return (NULL);
344
345 if ((cl->cl_expiration == expiration) &&
346 ((cl->cl_flags & clflags) == (flags & clflags)))
347 return (cl);
348 }
349
350 return (NULL);
351 }
352
353 /*
354 * Add a new callout list into a callout table's queue in sorted order by
355 * expiration.
356 */
357 static int
358 callout_queue_add(callout_table_t *ct, callout_list_t *cl)
359 {
360 callout_list_t *nextcl;
361 hrtime_t expiration;
362
363 expiration = cl->cl_expiration;
364 nextcl = ct->ct_queue.ch_head;
365 if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
366 CALLOUT_LIST_INSERT(ct->ct_queue, cl);
367 return (1);
368 }
369
370 while (nextcl != NULL) {
371 if (expiration < nextcl->cl_expiration) {
372 CALLOUT_LIST_BEFORE(cl, nextcl);
373 return (0);
374 }
375 nextcl = nextcl->cl_next;
376 }
377 CALLOUT_LIST_APPEND(ct->ct_queue, cl);
378
379 return (0);
380 }
381
382 /*
383 * Insert a callout list into a callout table's queue and reprogram the queue
384 * cyclic if needed.
385 */
386 static void
387 callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
388 {
389 cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
390
391 /*
392 * Add the callout to the callout queue. If it ends up at the head,
393 * the cyclic needs to be reprogrammed as we have an earlier
394 * expiration.
395 *
396 * Also, during the CPR suspend phase, do not reprogram the cyclic.
397 * We don't want any callout activity. When the CPR resume phase is
398 * entered, the cyclic will be programmed for the earliest expiration
399 * in the queue.
400 */
401 if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
402 (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
403 }
404
405 /*
406 * Delete and handle all past expirations in a callout table's queue.
407 */
408 static hrtime_t
409 callout_queue_delete(callout_table_t *ct)
410 {
411 callout_list_t *cl;
412 hrtime_t now;
413
414 ASSERT(MUTEX_HELD(&ct->ct_mutex));
415
416 now = gethrtime();
417 while ((cl = ct->ct_queue.ch_head) != NULL) {
418 if (cl->cl_expiration > now)
419 break;
420 cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
421 CALLOUT_LIST_DELETE(ct->ct_queue, cl);
422 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
423 }
424
425 /*
426 * If this callout queue is empty or callouts have been suspended,
427 * just return.
428 */
429 if ((cl == NULL) || (ct->ct_suspend > 0))
430 return (CY_INFINITY);
431
432 (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
433
434 return (cl->cl_expiration);
435 }
436
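/*
 * Walk a callout table's queue, expiring or adjusting its entries. This is
 * the queue analogue of callout_heap_process() below: 'delta' adjusts
 * relative expirations after a KMDB/OBP stint, and 'timechange' forces
 * absolute hrestime entries to expire.
 */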
437 static hrtime_t
438 callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
439 {
440 callout_list_t *firstcl, *cl;
441 hrtime_t expiration, now;
442 int clflags;
443 callout_hash_t temp;
444
445 ASSERT(MUTEX_HELD(&ct->ct_mutex));
446
447 firstcl = ct->ct_queue.ch_head;
448 if (firstcl == NULL)
449 return (CY_INFINITY);
450
451 /*
452 * We walk the callout queue. If we encounter a hrestime entry that
453 * must be removed, we clean it out. Otherwise, we apply any
454 * adjustments needed to it. Because of the latter, we need to
455 * recreate the list as we go along.
456 */
457 temp = ct->ct_queue;
458 ct->ct_queue.ch_head = NULL;
459 ct->ct_queue.ch_tail = NULL;
460
461 clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
462 now = gethrtime();
463 while ((cl = temp.ch_head) != NULL) {
464 CALLOUT_LIST_DELETE(temp, cl);
465
466 /*
467 * Delete the callout and expire it, if one of the following
468 * is true:
469 * - the callout has expired
470 * - the callout is an absolute hrestime one and
471 * there has been a system time change
472 */
473 if ((cl->cl_expiration <= now) ||
474 (timechange && ((cl->cl_flags & clflags) == clflags))) {
475 cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
476 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
477 continue;
478 }
479
480 /*
481 * Apply adjustments, if any. Adjustments are applied after
482 * the system returns from KMDB or OBP. They are only applied
483 * to relative callout lists.
484 */
485 if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
486 expiration = cl->cl_expiration + delta;
487 if (expiration <= 0)
488 expiration = CY_INFINITY;
489 cl->cl_expiration = expiration;
490 }
491
492 (void) callout_queue_add(ct, cl);
493 }
494
495 /*
496 * We need to return the expiration to help program the cyclic.
497 * If there are expired callouts, the cyclic needs to go off
498 * immediately. If the queue has become empty, then we return infinity.
499 * Else, we return the expiration of the earliest callout in the queue.
500 */
501 if (ct->ct_expired.ch_head != NULL)
502 return (gethrtime());
503
504 cl = ct->ct_queue.ch_head;
505 if (cl == NULL)
506 return (CY_INFINITY);
507
508 return (cl->cl_expiration);
509 }
510
511 /*
512 * Initialize a callout table's heap, if necessary. Preallocate some free
513 * entries so we don't have to check for NULL elsewhere.
514 */
515 static void
516 callout_heap_init(callout_table_t *ct)
517 {
518 size_t size;
519
520 ASSERT(MUTEX_HELD(&ct->ct_mutex));
521 ASSERT(ct->ct_heap == NULL);
522
523 ct->ct_heap_num = 0;
524 ct->ct_heap_max = callout_chunk;
525 size = sizeof (callout_heap_t) * callout_chunk;
526 ct->ct_heap = kmem_alloc(size, KM_SLEEP);
527 }
528
529 /*
530 * Reallocate the heap. Return 0 if the heap is still full after the attempt,
531 * 1 otherwise. Note that the heap only expands; it never contracts.
532 */
533 static int
534 callout_heap_expand(callout_table_t *ct)
535 {
536 size_t max, size, osize;
537 callout_heap_t *heap;
538
539 ASSERT(MUTEX_HELD(&ct->ct_mutex));
540 ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
541
542 while (ct->ct_heap_num == ct->ct_heap_max) {
543 max = ct->ct_heap_max;
544 mutex_exit(&ct->ct_mutex);
545
546 osize = sizeof (callout_heap_t) * max;
547 size = sizeof (callout_heap_t) * (max + callout_chunk);
548 heap = kmem_alloc(size, KM_NOSLEEP);
549
550 mutex_enter(&ct->ct_mutex);
551 if (heap == NULL) {
552 /*
553 * We could not allocate memory. If we can free up
554 * some entries, that would be great.
555 */
556 if (ct->ct_nreap > 0)
557 (void) callout_heap_process(ct, 0, 0);
558 /*
559 * If we still have no space in the heap, inform the
560 * caller.
561 */
562 if (ct->ct_heap_num == ct->ct_heap_max)
563 return (0);
564 return (1);
565 }
566 if (max < ct->ct_heap_max) {
567 /*
568 * Someone beat us to the allocation. Free what we
569 * just allocated and proceed.
570 */
571 kmem_free(heap, size);
572 continue;
573 }
574
575 bcopy(ct->ct_heap, heap, osize);
576 kmem_free(ct->ct_heap, osize);
577 ct->ct_heap = heap;
578 ct->ct_heap_max = size / sizeof (callout_heap_t);
579 }
580
581 return (1);
582 }
583
584 /*
585 * Move an expiration from the bottom of the heap to its correct place
586 * in the heap. If we reached the root doing this, return 1. Else,
587 * return 0.
588 */
589 static int
590 callout_upheap(callout_table_t *ct)
591 {
592 int current, parent;
593 callout_heap_t *heap, *hcurrent, *hparent;
594
595 ASSERT(MUTEX_HELD(&ct->ct_mutex));
596 ASSERT(ct->ct_heap_num >= 1);
597
598 if (ct->ct_heap_num == 1) {
599 return (1);
600 }
601
602 heap = ct->ct_heap;
603 current = ct->ct_heap_num - 1;
604
605 for (;;) {
606 parent = CALLOUT_HEAP_PARENT(current);
607 hparent = &heap[parent];
608 hcurrent = &heap[current];
609
610 /*
611 * We have an expiration later than our parent; we're done.
612 */
613 if (hcurrent->ch_expiration >= hparent->ch_expiration) {
614 return (0);
615 }
616
617 /*
618 * We need to swap with our parent, and continue up the heap.
619 */
620 CALLOUT_SWAP(hparent, hcurrent);
621
622 /*
623 * If we just reached the root, we're done.
624 */
625 if (parent == 0) {
626 return (1);
627 }
628
629 current = parent;
630 }
631 /*NOTREACHED*/
632 }
633
634 /*
635 * Insert a new heap item into a callout table's heap.
636 */
637 static void
638 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
639 {
640 ASSERT(MUTEX_HELD(&ct->ct_mutex));
641 ASSERT(ct->ct_heap_num < ct->ct_heap_max);
642
643 cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
644 /*
645 * First, copy the expiration and callout list pointer to the bottom
646 * of the heap.
647 */
648 ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
649 ct->ct_heap[ct->ct_heap_num].ch_list = cl;
650 ct->ct_heap_num++;
651
652 /*
653 * Now, perform an upheap operation. If we reached the root, then
654 * the cyclic needs to be reprogrammed as we have an earlier
655 * expiration.
656 *
657 * Also, during the CPR suspend phase, do not reprogram the cyclic.
658 * We don't want any callout activity. When the CPR resume phase is
659 * entered, the cyclic will be programmed for the earliest expiration
660 * in the heap.
661 */
662 if (callout_upheap(ct) && (ct->ct_suspend == 0))
663 (void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
664 }
665
666 /*
667 * Move an expiration from the top of the heap to its correct place
668 * in the heap.
669 */
670 static void
671 callout_downheap(callout_table_t *ct)
672 {
673 int current, left, right, nelems;
674 callout_heap_t *heap, *hleft, *hright, *hcurrent;
675
676 ASSERT(MUTEX_HELD(&ct->ct_mutex));
677 ASSERT(ct->ct_heap_num >= 1);
678
679 heap = ct->ct_heap;
680 current = 0;
681 nelems = ct->ct_heap_num;
682
683 for (;;) {
684 /*
685 * If we don't have a left child (i.e., we're a leaf), we're
686 * done.
687 */
688 if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
689 return;
690
691 hleft = &heap[left];
692 hcurrent = &heap[current];
693
694 right = CALLOUT_HEAP_RIGHT(current);
695
696 /*
697 * Even if we don't have a right child, we still need to compare
698 * our expiration against that of our left child.
699 */
700 if (right >= nelems)
701 goto comp_left;
702
703 hright = &heap[right];
704
705 /*
706 * We have both a left and a right child. We need to compare
707 * the expiration of the children to determine which
708 * expires earlier.
709 */
710 if (hright->ch_expiration < hleft->ch_expiration) {
711 /*
712 * Our right child is the earlier of our children.
713 * We'll now compare our expiration to its expiration.
714 * If ours is the earlier one, we're done.
715 */
716 if (hcurrent->ch_expiration <= hright->ch_expiration)
717 return;
718
719 /*
720 * Our right child expires earlier than we do; swap
721 * with our right child, and descend right.
722 */
723 CALLOUT_SWAP(hright, hcurrent);
724 current = right;
725 continue;
726 }
727
728 comp_left:
729 /*
730 * Our left child is the earlier of our children (or we have
731 * no right child). We'll now compare our expiration
732 * to its expiration. If ours is the earlier one, we're done.
733 */
734 if (hcurrent->ch_expiration <= hleft->ch_expiration)
735 return;
736
737 /*
738 * Our left child expires earlier than we do; swap with our
739 * left child, and descend left.
740 */
741 CALLOUT_SWAP(hleft, hcurrent);
742 current = left;
743 }
744 }
745
746 /*
747 * Delete and handle all past expirations in a callout table's heap.
748 */
749 static hrtime_t
750 callout_heap_delete(callout_table_t *ct)
751 {
752 hrtime_t now, expiration, next;
753 callout_list_t *cl;
754 callout_heap_t *heap;
755 int hash;
756
757 ASSERT(MUTEX_HELD(&ct->ct_mutex));
758
759 if (CALLOUT_CLEANUP(ct)) {
760 /*
761 * There are too many heap elements pointing to empty callout
762 * lists. Clean them out.
763 */
764 (void) callout_heap_process(ct, 0, 0);
765 }
766
767 now = gethrtime();
768 heap = ct->ct_heap;
769
770 while (ct->ct_heap_num > 0) {
771 expiration = heap->ch_expiration;
772 hash = CALLOUT_CLHASH(expiration);
773 cl = heap->ch_list;
774 ASSERT(expiration == cl->cl_expiration);
775
776 if (cl->cl_callouts.ch_head == NULL) {
777 /*
778 * If the callout list is empty, reap it.
779 * Decrement the reap count.
780 */
781 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
782 CALLOUT_LIST_FREE(ct, cl);
783 ct->ct_nreap--;
784 } else {
785 /*
786 * If the root of the heap expires in the future,
787 * bail out.
788 */
789 if (expiration > now)
790 break;
791
792 /*
793 * Move the callout list for this expiration to the
794 * list of expired callout lists. It will be processed
795 * by the callout executor.
796 */
797 cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
798 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
799 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
800 }
801
802 /*
803 * Now delete the root. This is done by swapping the root with
804 * the last item in the heap and downheaping the item.
805 */
806 ct->ct_heap_num--;
807 if (ct->ct_heap_num > 0) {
808 heap[0] = heap[ct->ct_heap_num];
809 callout_downheap(ct);
810 }
811 }
812
813 /*
814 * If this callout table is empty or callouts have been suspended,
815 * just return. The cyclic has already been programmed to
816 * infinity by the cyclic subsystem.
817 */
818 if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
819 return (CY_INFINITY);
820
821 /*
822 * If the top expirations are within callout_tolerance of each other,
823 * delay the cyclic expire so that they can be processed together.
824 * This is to prevent high resolution timers from swamping the system
825 * with cyclic activity.
826 */
827 if (ct->ct_heap_num > 2) {
828 next = expiration + callout_tolerance;
829 if ((heap[1].ch_expiration < next) ||
830 (heap[2].ch_expiration < next))
831 expiration = next;
832 }
833
834 (void) cyclic_reprogram(ct->ct_cyclic, expiration);
835
836 return (expiration);
837 }
838
839 /*
840 * There are some situations when the entire heap is walked and processed.
841 * This function is called to do the processing. These are the situations:
842 *
843 * 1. When the reap count reaches its threshold, the heap has to be cleared
844 * of all empty callout lists.
845 *
846 * 2. When the system enters and exits KMDB/OBP, all entries in the heap
847 * need to be adjusted by the interval spent in KMDB/OBP.
848 *
849 * 3. When system time is changed, the heap has to be scanned for
850 * absolute hrestime timers. These need to be removed from the heap
851 * and expired immediately.
852 *
853 * In cases 2 and 3, it is a good idea to do 1 as well since we are
854 * scanning the heap anyway.
855 *
856 * If the root gets changed and/or callout lists are expired, return the
857 * new expiration to the caller so it can reprogram the cyclic accordingly.
858 */
859 static hrtime_t
860 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
861 {
862 callout_heap_t *heap;
863 callout_list_t *cl;
864 hrtime_t expiration, now;
865 int i, hash, clflags;
866 ulong_t num;
867
868 ASSERT(MUTEX_HELD(&ct->ct_mutex));
869
870 if (ct->ct_heap_num == 0)
871 return (CY_INFINITY);
872
873 if (ct->ct_nreap > 0)
874 ct->ct_cleanups++;
875
876 heap = ct->ct_heap;
877
878 /*
879 * We walk the heap from the top to the bottom. If we encounter
880 * a heap item that points to an empty callout list, we clean
881 * it out. If we encounter a hrestime entry that must be removed,
882 * again we clean it out. Otherwise, we apply any adjustments needed
883 * to an element.
884 *
885 * During the walk, we also compact the heap from the bottom and
886 * reconstruct the heap using upheap operations. This is very
887 * efficient if the number of elements to be cleaned is greater than
888 * or equal to half the heap. This is the common case.
889 *
890 * Even in the non-common case, the upheap operations should be short
891 * as the entries below generally tend to be bigger than the entries
892 * above.
893 */
894 num = ct->ct_heap_num;
895 ct->ct_heap_num = 0;
896 clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
897 now = gethrtime();
898 for (i = 0; i < num; i++) {
899 cl = heap[i].ch_list;
900 /*
901 * If the callout list is empty, delete the heap element and
902 * free the callout list.
903 */
904 if (cl->cl_callouts.ch_head == NULL) {
905 hash = CALLOUT_CLHASH(cl->cl_expiration);
906 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
907 CALLOUT_LIST_FREE(ct, cl);
908 continue;
909 }
910
911 /*
912 * Delete the heap element and expire the callout list, if
913 * one of the following is true:
914 * - the callout list has expired
915 * - the callout list is an absolute hrestime one and
916 * there has been a system time change
917 */
918 if ((cl->cl_expiration <= now) ||
919 (timechange && ((cl->cl_flags & clflags) == clflags))) {
920 hash = CALLOUT_CLHASH(cl->cl_expiration);
921 cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
922 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
923 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
924 continue;
925 }
926
927 /*
928 * Apply adjustments, if any. Adjustments are applied after
929 * the system returns from KMDB or OBP. They are only applied
930 * to relative callout lists.
931 */
932 if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
933 hash = CALLOUT_CLHASH(cl->cl_expiration);
934 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
935 expiration = cl->cl_expiration + delta;
936 if (expiration <= 0)
937 expiration = CY_INFINITY;
938 heap[i].ch_expiration = expiration;
939 cl->cl_expiration = expiration;
940 hash = CALLOUT_CLHASH(cl->cl_expiration);
941 if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
942 CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
943 } else {
944 CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
945 }
946 }
947
948 heap[ct->ct_heap_num] = heap[i];
949 ct->ct_heap_num++;
950 (void) callout_upheap(ct);
951 }
952
953 ct->ct_nreap = 0;
954
955 /*
956 * We need to return the expiration to help program the cyclic.
957 * If there are expired callouts, the cyclic needs to go off
958 * immediately. If the heap has become empty, then we return infinity.
959 * Else, return the expiration of the earliest callout in the heap.
960 */
961 if (ct->ct_expired.ch_head != NULL)
962 return (gethrtime());
963
964 if (ct->ct_heap_num == 0)
965 return (CY_INFINITY);
966
967 return (heap->ch_expiration);
968 }
969
970 /*
971 * Common function used to create normal and realtime callouts.
972 *
973 * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
974 * there is one restriction on a realtime callout handler - it should not
975 * directly or indirectly acquire cpu_lock. CPU offline waits for pending
976 * cyclic handlers to complete while holding cpu_lock. So, if a realtime
977 * callout handler were to try to get cpu_lock, there would be a deadlock
978 * during CPU offline.
979 */
980 callout_id_t
981 timeout_generic(int type, void (*func)(void *), void *arg,
982 hrtime_t expiration, hrtime_t resolution, int flags)
983 {
984 callout_table_t *ct;
985 callout_t *cp;
986 callout_id_t id;
987 callout_list_t *cl;
988 hrtime_t now, interval;
989 int hash, clflags;
990
991 ASSERT(resolution > 0);
992 ASSERT(func != NULL);
993
994 /*
995 * We get the current hrtime right upfront so that latencies in
996 * this function do not affect the accuracy of the callout.
997 */
998 now = gethrtime();
999
1000 /*
1001 * We disable kernel preemption so that we remain on the same CPU
1002 * throughout. If we needed to reprogram the callout table's cyclic,
1003 * we can avoid X-calls if we are on the same CPU.
1004 *
1005 * Note that callout_alloc() releases and reacquires the callout
1006 * table mutex. While reacquiring the mutex, it is possible for us
1007 * to go to sleep and later migrate to another CPU. This should be
1008 * pretty rare, though.
1009 */
1010 kpreempt_disable();
1011
1012 ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
1013 mutex_enter(&ct->ct_mutex);
1014
1015 if (ct->ct_cyclic == CYCLIC_NONE) {
1016 mutex_exit(&ct->ct_mutex);
1017 /*
1018 * The callout table has not yet been initialized fully.
1019 * So, put this one on the boot callout table which is
1020 * always initialized.
1021 */
1022 ct = &callout_boot_ct[type];
1023 mutex_enter(&ct->ct_mutex);
1024 }
1025
1026 if (CALLOUT_CLEANUP(ct)) {
1027 /*
1028 * There are too many heap elements pointing to empty callout
1029 * lists. Clean them out. Since cleanup is only done once
1030 * in a while, no need to reprogram the cyclic if the root
1031 * of the heap gets cleaned out.
1032 */
1033 (void) callout_heap_process(ct, 0, 0);
1034 }
1035
1036 if ((cp = ct->ct_free) == NULL)
1037 cp = callout_alloc(ct);
1038 else
1039 ct->ct_free = cp->c_idnext;
1040
1041 cp->c_func = func;
1042 cp->c_arg = arg;
1043
1044 /*
1045 * Compute the expiration hrtime.
1046 */
1047 if (flags & CALLOUT_FLAG_ABSOLUTE) {
1048 interval = expiration - now;
1049 } else {
1050 interval = expiration;
1051 expiration += now;
1052 }
1053
1054 if (resolution > 1) {
1055 /*
1056 * Align expiration to the specified resolution.
1057 */
1058 if (flags & CALLOUT_FLAG_ROUNDUP)
1059 expiration += resolution - 1;
1060 expiration = (expiration / resolution) * resolution;
1061 }
1062
1063 if (expiration <= 0) {
1064 /*
1065 * expiration hrtime overflow has occurred. Just set the
1066 * expiration to infinity.
1067 */
1068 expiration = CY_INFINITY;
1069 }
1070
1071 /*
1072 * Assign an ID to this callout
1073 */
1074 if (flags & CALLOUT_FLAG_32BIT) {
1075 if (interval > callout_longterm) {
1076 id = (ct->ct_long_id - callout_counter_low);
1077 id |= CALLOUT_COUNTER_HIGH;
1078 ct->ct_long_id = id;
1079 } else {
1080 id = (ct->ct_short_id - callout_counter_low);
1081 id |= CALLOUT_COUNTER_HIGH;
1082 ct->ct_short_id = id;
1083 }
1084 } else {
1085 id = (ct->ct_gen_id - callout_counter_low);
1086 if ((id & CALLOUT_COUNTER_HIGH) == 0) {
1087 id |= CALLOUT_COUNTER_HIGH;
1088 id += CALLOUT_GENERATION_LOW;
1089 }
1090 ct->ct_gen_id = id;
1091 }
1092
1093 cp->c_xid = id;
1094
1095 clflags = 0;
1096 if (flags & CALLOUT_FLAG_ABSOLUTE)
1097 clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
1098 if (flags & CALLOUT_FLAG_HRESTIME)
1099 clflags |= CALLOUT_LIST_FLAG_HRESTIME;
1100 if (resolution == 1)
1101 clflags |= CALLOUT_LIST_FLAG_NANO;
1102 hash = CALLOUT_CLHASH(expiration);
1103
1104 again:
1105 /*
1106 * Try to see if a callout list already exists for this expiration.
1107 */
1108 cl = callout_list_get(ct, expiration, clflags, hash);
1109 if (cl == NULL) {
1110 /*
1111 * Check the free list. If we don't find one, we have to
1112 * take the slow path and allocate from kmem.
1113 */
1114 if ((cl = ct->ct_lfree) == NULL) {
1115 callout_list_alloc(ct);
1116 /*
1117 * In the above call, we drop the lock, allocate and
1118 * reacquire the lock. So, we could have been away
1119 * for a while. In the meantime, someone could have
1120 * inserted a callout list with the same expiration.
1121 * Plus, the heap could have become full. So, the best
1122 * course is to repeat the steps. This should be an
1123 * infrequent event.
1124 */
1125 goto again;
1126 }
1127 ct->ct_lfree = cl->cl_next;
1128 cl->cl_expiration = expiration;
1129 cl->cl_flags = clflags;
1130
1131 /*
1132 * Check if we have enough space in the heap to insert one
1133 * expiration. If not, expand the heap.
1134 */
1135 if (ct->ct_heap_num == ct->ct_heap_max) {
1136 if (callout_heap_expand(ct) == 0) {
1137 /*
1138 * Could not expand the heap. Just queue it.
1139 */
1140 callout_queue_insert(ct, cl);
1141 goto out;
1142 }
1143
1144 /*
1145 * In the above call, we drop the lock, allocate and
1146 * reacquire the lock. So, we could have been away
1147 * for a while. In the meantime, someone could have
1148 * inserted a callout list with the same expiration.
1149 * But we will not go back and check for it as this
1150 * should be a really infrequent event. There is no
1151 * point.
1152 */
1153 }
1154
1155 if (clflags & CALLOUT_LIST_FLAG_NANO) {
1156 CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
1157 } else {
1158 CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
1159 }
1160
1161 /*
1162 * This is a new expiration. So, insert it into the heap.
1163 * This will also reprogram the cyclic, if the expiration
1164 * propagated to the root of the heap.
1165 */
1166 callout_heap_insert(ct, cl);
1167 } else {
1168 /*
1169 * If the callout list was empty, untimeout_generic() would
1170 * have incremented a reap count. Decrement the reap count
1171 * as we are going to insert a callout into this list.
1172 */
1173 if (cl->cl_callouts.ch_head == NULL)
1174 ct->ct_nreap--;
1175 }
1176 out:
1177 cp->c_list = cl;
1178 CALLOUT_APPEND(ct, cp);
1179
1180 ct->ct_timeouts++;
1181 ct->ct_timeouts_pending++;
1182
1183 mutex_exit(&ct->ct_mutex);
1184
1185 kpreempt_enable();
1186
1187 TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
1188 "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
1189 cp);
1190
1191 return (id);
1192 }
1193
1194 timeout_id_t
1195 timeout(void (*func)(void *), void *arg, clock_t delta)
1196 {
1197 ulong_t id;
1198
1199 /*
1200 * Make sure the callout runs at least 1 tick in the future.
1201 */
1202 if (delta <= 0)
1203 delta = 1;
1204 else if (delta > callout_max_ticks)
1205 delta = callout_max_ticks;
1206
1207 id = (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1208 TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1209
1210 return ((timeout_id_t)id);
1211 }
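/*
 * Usage sketch (illustrative only; the mydrv_* names are hypothetical and
 * not part of this file): a legacy consumer arms a tick-based timeout and
 * later cancels it with untimeout(9F), which waits for a running handler.
 *
 *	static void
 *	mydrv_watchdog(void *arg)
 *	{
 *		mydrv_state_t *sp = arg;
 *
 *		mydrv_check_hw(sp);
 *		sp->s_tid = timeout(mydrv_watchdog, sp,
 *		    drv_usectohz(MICROSEC));
 *	}
 *
 * and, at detach time:
 *
 *	(void) untimeout(sp->s_tid);
 */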
1212
1213 /*
1214 * Convenience function that creates a normal callout with default parameters
1215 * and returns a full ID.
1216 */
1217 callout_id_t
1218 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1219 {
1220 callout_id_t id;
1221
1222 /*
1223 * Make sure the callout runs at least 1 tick in the future.
1224 */
1225 if (delta <= 0)
1226 delta = 1;
1227 else if (delta > callout_max_ticks)
1228 delta = callout_max_ticks;
1229
1230 id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1231 nsec_per_tick, 0);
1232
1233 return (id);
1234 }
1235
1236 timeout_id_t
1237 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1238 {
1239 ulong_t id;
1240
1241 /*
1242 * Make sure the callout runs at least 1 tick in the future.
1243 */
1244 if (delta <= 0)
1245 delta = 1;
1246 else if (delta > callout_max_ticks)
1247 delta = callout_max_ticks;
1248
1249 id = (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1250 TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1251
1252 return ((timeout_id_t)id);
1253 }
1254
1255 /*
1256 * Convenience function that creates a realtime callout with default parameters
1257 * and returns a full ID.
1258 */
1259 callout_id_t
1260 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1261 {
1262 callout_id_t id;
1263
1264 /*
1265 * Make sure the callout runs at least 1 tick in the future.
1266 */
1267 if (delta <= 0)
1268 delta = 1;
1269 else if (delta > callout_max_ticks)
1270 delta = callout_max_ticks;
1271
1272 id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1273 nsec_per_tick, 0);
1274
1275 return (id);
1276 }
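/*
 * Usage sketch (illustrative only; mydrv_poll and sp are hypothetical): the
 * *_default() variants return the full 64-bit callout_id_t rather than a
 * legacy timeout_id_t, and untimeout_default() takes a 'nowait' flag.
 *
 *	callout_id_t id;
 *
 *	id = timeout_default(mydrv_poll, sp, drv_usectohz(MICROSEC));
 *	...
 *	(void) untimeout_default(id, 0);	(0 == wait for the handler)
 */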
1277
1278 hrtime_t
1279 untimeout_generic(callout_id_t id, int nowait)
1280 {
1281 callout_table_t *ct;
1282 callout_t *cp;
1283 callout_id_t xid;
1284 callout_list_t *cl;
1285 int hash, flags;
1286 callout_id_t bogus;
1287
1288 ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1289 hash = CALLOUT_IDHASH(id);
1290
1291 mutex_enter(&ct->ct_mutex);
1292
1293 /*
1294 * Search the ID hash table for the callout.
1295 */
1296 for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1297
1298 xid = cp->c_xid;
1299
1300 /*
1301 * Match the ID and generation number.
1302 */
1303 if ((xid & CALLOUT_ID_MASK) != id)
1304 continue;
1305
1306 if ((xid & CALLOUT_EXECUTING) == 0) {
1307 hrtime_t expiration;
1308
1309 /*
1310 * Delete the callout. If the callout list becomes
1311 * empty, we don't remove it from the table. This is
1312 * so it can be reused. If the empty callout list
1313 * corresponds to the top of the callout heap, we
1314 * don't reprogram the table cyclic here. This is in
1315 * order to avoid lots of X-calls to the CPU associated
1316 * with the callout table.
1317 */
1318 cl = cp->c_list;
1319 expiration = cl->cl_expiration;
1320 CALLOUT_DELETE(ct, cp);
1321 CALLOUT_FREE(ct, cp);
1322 ct->ct_untimeouts_unexpired++;
1323 ct->ct_timeouts_pending--;
1324
1325 /*
1326 * If the callout list has become empty, there are 3
1327 * possibilities. If it is present:
1328 * - in the heap, it needs to be cleaned along
1329 * with its heap entry. Increment a reap count.
1330 * - in the callout queue, free it.
1331 * - in the expired list, free it.
1332 */
1333 if (cl->cl_callouts.ch_head == NULL) {
1334 flags = cl->cl_flags;
1335 if (flags & CALLOUT_LIST_FLAG_HEAPED) {
1336 ct->ct_nreap++;
1337 } else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
1338 CALLOUT_LIST_DELETE(ct->ct_queue, cl);
1339 CALLOUT_LIST_FREE(ct, cl);
1340 } else {
1341 CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1342 CALLOUT_LIST_FREE(ct, cl);
1343 }
1344 }
1345 mutex_exit(&ct->ct_mutex);
1346
1347 expiration -= gethrtime();
1348 TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1349 "untimeout:ID %lx hrtime left %llx", id,
1350 expiration);
1351 return (expiration < 0 ? 0 : expiration);
1352 }
1353
1354 ct->ct_untimeouts_executing++;
1355 /*
1356 * The callout we want to delete is currently executing.
1357 * The DDI states that we must wait until the callout
1358 * completes before returning, so we block on c_done until the
1359 * callout ID changes (to the old ID if it's on the freelist,
1360 * or to a new callout ID if it's in use). This implicitly
1361 * assumes that callout structures are persistent (they are).
1362 */
1363 if (cp->c_executor == curthread) {
1364 /*
1365 * The timeout handler called untimeout() on itself.
1366 * Stupid, but legal. We can't wait for the timeout
1367 * to complete without deadlocking, so we just return.
1368 */
1369 mutex_exit(&ct->ct_mutex);
1370 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1371 "untimeout_self:ID %x", id);
1372 return (-1);
1373 }
1374 if (nowait == 0) {
1375 /*
1376 * We need to wait. Indicate that we are waiting by
1377 * incrementing c_waiting. This prevents the executor
1378 * from doing a wakeup on c_done if there are no
1379 * waiters.
1380 */
1381 while (cp->c_xid == xid) {
1382 cp->c_waiting = 1;
1383 cv_wait(&cp->c_done, &ct->ct_mutex);
1384 }
1385 }
1386 mutex_exit(&ct->ct_mutex);
1387 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1388 "untimeout_executing:ID %lx", id);
1389 return (-1);
1390 }
1391 ct->ct_untimeouts_expired++;
1392
1393 mutex_exit(&ct->ct_mutex);
1394 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1395 "untimeout_bogus_id:ID %lx", id);
1396
1397 /*
1398 * We didn't find the specified callout ID. This means either
1399 * (1) the callout already fired, or (2) the caller passed us
1400 * a bogus value. Perform a sanity check to detect case (2).
1401 */
1402 bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1403 if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1404 panic("untimeout: impossible timeout id %llx",
1405 (unsigned long long)id);
1406
1407 return (-1);
1408 }
1409
1410 clock_t
1411 untimeout(timeout_id_t id_arg)
1412 {
1413 hrtime_t hleft;
1414 clock_t tleft;
1415 callout_id_t id;
1416
1417 id = (ulong_t)id_arg;
1418 hleft = untimeout_generic(id, 0);
1419 if (hleft < 0)
1420 tleft = -1;
1421 else if (hleft == 0)
1422 tleft = 0;
1423 else
1424 tleft = NSEC_TO_TICK(hleft);
1425
1426 return (tleft);
1427 }
1428
1429 /*
1430 * Convenience function to untimeout a timeout with a full ID with default
1431 * parameters.
1432 */
1433 clock_t
1434 untimeout_default(callout_id_t id, int nowait)
1435 {
1436 hrtime_t hleft;
1437 clock_t tleft;
1438
1439 hleft = untimeout_generic(id, nowait);
1440 if (hleft < 0)
1441 tleft = -1;
1442 else if (hleft == 0)
1443 tleft = 0;
1444 else
1445 tleft = NSEC_TO_TICK(hleft);
1446
1447 return (tleft);
1448 }
1449
1450 /*
1451 * Expire all the callouts queued in the specified callout list.
1452 */
1453 static void
1454 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1455 {
1456 callout_t *cp, *cnext;
1457
1458 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1459 ASSERT(cl != NULL);
1460
1461 for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1462 /*
1463 * Multiple executor threads could be running at the same
1464 * time. If this callout is already being executed,
1465 * go on to the next one.
1466 */
1467 if (cp->c_xid & CALLOUT_EXECUTING) {
1468 cnext = cp->c_clnext;
1469 continue;
1470 }
1471
1472 /*
1473 * Indicate to untimeout() that a callout is
1474 * being expired by the executor.
1475 */
1476 cp->c_xid |= CALLOUT_EXECUTING;
1477 cp->c_executor = curthread;
1478 mutex_exit(&ct->ct_mutex);
1479
1480 DTRACE_PROBE1(callout__start, callout_t *, cp);
1481 (*cp->c_func)(cp->c_arg);
1482 DTRACE_PROBE1(callout__end, callout_t *, cp);
1483
1484 mutex_enter(&ct->ct_mutex);
1485
1486 ct->ct_expirations++;
1487 ct->ct_timeouts_pending--;
1488 /*
1489 * Indicate completion for c_done.
1490 */
1491 cp->c_xid &= ~CALLOUT_EXECUTING;
1492 cp->c_executor = NULL;
1493 cnext = cp->c_clnext;
1494
1495 /*
1496 * Delete callout from ID hash table and the callout
1497 * list, return to freelist, and tell any untimeout() that
1498 * cares that we're done.
1499 */
1500 CALLOUT_DELETE(ct, cp);
1501 CALLOUT_FREE(ct, cp);
1502
1503 if (cp->c_waiting) {
1504 cp->c_waiting = 0;
1505 cv_broadcast(&cp->c_done);
1506 }
1507 }
1508 }
1509
1510 /*
1511 * Execute all expired callout lists for a callout table.
1512 */
1513 static void
1514 callout_expire(callout_table_t *ct)
1515 {
1516 callout_list_t *cl, *clnext;
1517
1518 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1519
1520 for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1521 /*
1522 * Expire all the callouts in this callout list.
1523 */
1524 callout_list_expire(ct, cl);
1525
1526 clnext = cl->cl_next;
1527 if (cl->cl_callouts.ch_head == NULL) {
1528 /*
1529 * Free the callout list.
1530 */
1531 CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1532 CALLOUT_LIST_FREE(ct, cl);
1533 }
1534 }
1535 }
1536
1537 /*
1538 * The cyclic handlers below process callouts in two steps:
1539 *
1540 * 1. Find all expired callout lists and queue them in a separate
1541 * list of expired callouts.
1542 * 2. Execute the expired callout lists.
1543 *
1544 * This is done for two reasons:
1545 *
1546 * 1. We want to quickly find the next earliest expiration to program
1547 * the cyclic to and reprogram it. We can do this right at the end
1548 * of step 1.
1549 * 2. The realtime cyclic handler expires callouts in place. However,
1550 * for normal callouts, callouts are expired by a taskq thread.
1551 * So, it is simpler and more robust to have the taskq thread just
1552 * do step 2.
1553 */
1554
1555 /*
1556 * Realtime callout cyclic handlers.
1557 */
1558 void
1559 callout_realtime(callout_table_t *ct)
1560 {
1561 mutex_enter(&ct->ct_mutex);
1562 (void) callout_heap_delete(ct);
1563 callout_expire(ct);
1564 mutex_exit(&ct->ct_mutex);
1565 }
1566
1567 void
1568 callout_queue_realtime(callout_table_t *ct)
1569 {
1570 mutex_enter(&ct->ct_mutex);
1571 (void) callout_queue_delete(ct);
1572 callout_expire(ct);
1573 mutex_exit(&ct->ct_mutex);
1574 }
1575
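/*
 * Taskq handler that performs step 2 above: it expires the callout lists
 * queued on ct_expired on behalf of callout_normal() and
 * callout_queue_normal().
 */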
1576 void
1577 callout_execute(callout_table_t *ct)
1578 {
1579 mutex_enter(&ct->ct_mutex);
1580 callout_expire(ct);
1581 mutex_exit(&ct->ct_mutex);
1582 }
1583
1584 /*
1585 * Normal callout cyclic handlers.
1586 */
1587 void
1588 callout_normal(callout_table_t *ct)
1589 {
1590 int i, exec;
1591 hrtime_t exp;
1592
1593 mutex_enter(&ct->ct_mutex);
1594 exp = callout_heap_delete(ct);
1595 CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1596 mutex_exit(&ct->ct_mutex);
1597
1598 for (i = 0; i < exec; i++) {
1599 ASSERT(ct->ct_taskq != NULL);
1600 (void) taskq_dispatch(ct->ct_taskq,
1601 (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1602 }
1603 }
1604
1605 void
1606 callout_queue_normal(callout_table_t *ct)
1607 {
1608 int i, exec;
1609 hrtime_t exp;
1610
1611 mutex_enter(&ct->ct_mutex);
1612 exp = callout_queue_delete(ct);
1613 CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1614 mutex_exit(&ct->ct_mutex);
1615
1616 for (i = 0; i < exec; i++) {
1617 ASSERT(ct->ct_taskq != NULL);
1618 (void) taskq_dispatch(ct->ct_taskq,
1619 (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1620 }
1621 }
1622
1623 /*
1624 * Suspend callout processing.
1625 */
1626 static void
1627 callout_suspend(void)
1628 {
1629 int t, f;
1630 callout_table_t *ct;
1631
1632 /*
1633 * Traverse every callout table in the system and suspend callout
1634 * processing.
1635 *
1636 * We need to suspend all the tables (including the inactive ones)
1637 * so that if a table is made active while the suspend is still on,
1638 * the table remains suspended.
1639 */
1640 for (f = 0; f < max_ncpus; f++) {
1641 for (t = 0; t < CALLOUT_NTYPES; t++) {
1642 ct = &callout_table[CALLOUT_TABLE(t, f)];
1643
1644 mutex_enter(&ct->ct_mutex);
1645 ct->ct_suspend++;
1646 if (ct->ct_cyclic == CYCLIC_NONE) {
1647 mutex_exit(&ct->ct_mutex);
1648 continue;
1649 }
1650 if (ct->ct_suspend == 1) {
1651 (void) cyclic_reprogram(ct->ct_cyclic,
1652 CY_INFINITY);
1653 (void) cyclic_reprogram(ct->ct_qcyclic,
1654 CY_INFINITY);
1655 }
1656 mutex_exit(&ct->ct_mutex);
1657 }
1658 }
1659 }
1660
1661 /*
1662 * Resume callout processing.
1663 */
1664 static void
1665 callout_resume(hrtime_t delta, int timechange)
1666 {
1667 hrtime_t hexp, qexp;
1668 int t, f;
1669 callout_table_t *ct;
1670
1671 /*
1672 * Traverse every callout table in the system and resume callout
1673 * processing. For active tables, perform any hrtime adjustments
1674 * necessary.
1675 */
1676 for (f = 0; f < max_ncpus; f++) {
1677 for (t = 0; t < CALLOUT_NTYPES; t++) {
1678 ct = &callout_table[CALLOUT_TABLE(t, f)];
1679
1680 mutex_enter(&ct->ct_mutex);
1681 if (ct->ct_cyclic == CYCLIC_NONE) {
1682 ct->ct_suspend--;
1683 mutex_exit(&ct->ct_mutex);
1684 continue;
1685 }
1686
1687 /*
1688 * If a delta is specified, adjust the expirations in
1689 * the heap by delta. Also, if the caller indicates
1690 * a timechange, process that. This step also cleans
1691 * out any empty callout lists that might happen to
1692 * be there.
1693 */
1694 hexp = callout_heap_process(ct, delta, timechange);
1695 qexp = callout_queue_process(ct, delta, timechange);
1696
1697 ct->ct_suspend--;
1698 if (ct->ct_suspend == 0) {
1699 (void) cyclic_reprogram(ct->ct_cyclic, hexp);
1700 (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1701 }
1702
1703 mutex_exit(&ct->ct_mutex);
1704 }
1705 }
1706 }
1707
1708 /*
1709 * Callback handler used by CPR to stop and resume callouts.
1710 * The cyclic subsystem saves and restores hrtime during CPR.
1711 * That is why callout_resume() is called with a 0 delta.
1712 * Although hrtime is the same, hrestime (system time) has
1713 * progressed during CPR. So, we have to indicate a time change
1714 * to expire the absolute hrestime timers.
1715 */
1716 /*ARGSUSED*/
1717 static boolean_t
1718 callout_cpr_callb(void *arg, int code)
1719 {
1720 if (code == CB_CODE_CPR_CHKPT)
1721 callout_suspend();
1722 else
1723 callout_resume(0, 1);
1724
1725 return (B_TRUE);
1726 }
1727
1728 /*
1729 * Callback handler invoked when the debugger is entered or exited.
1730 */
1731 /*ARGSUSED*/
1732 static boolean_t
1733 callout_debug_callb(void *arg, int code)
1734 {
1735 hrtime_t delta;
1736
1737 /*
1738 * When the system enters the debugger, make a note of the hrtime.
1739 * When it is resumed, compute how long the system was in the
1740 * debugger. This interval should not be counted for callouts.
1741 */
1742 if (code == 0) {
1743 callout_suspend();
1744 callout_debug_hrtime = gethrtime();
1745 } else {
1746 delta = gethrtime() - callout_debug_hrtime;
1747 callout_resume(delta, 0);
1748 }
1749
1750 return (B_TRUE);
1751 }
1752
1753 /*
1754 * Move the absolute hrestime callouts to the expired list. Then program the
1755 * table's cyclic to expire immediately so that the callouts can be executed
1756 * immediately.
1757 */
1758 static void
1759 callout_hrestime_one(callout_table_t *ct)
1760 {
1761 hrtime_t hexp, qexp;
1762
1763 mutex_enter(&ct->ct_mutex);
1764 if (ct->ct_cyclic == CYCLIC_NONE) {
1765 mutex_exit(&ct->ct_mutex);
1766 return;
1767 }
1768
1769 /*
1770 * Walk the heap and process all the absolute hrestime entries.
1771 */
1772 hexp = callout_heap_process(ct, 0, 1);
1773 qexp = callout_queue_process(ct, 0, 1);
1774
1775 if (ct->ct_suspend == 0) {
1776 (void) cyclic_reprogram(ct->ct_cyclic, hexp);
1777 (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1778 }
1779
1780 mutex_exit(&ct->ct_mutex);
1781 }
1782
1783 /*
1784 * This function is called whenever system time (hrestime) is changed
1785 * explicitly. All the HRESTIME callouts must be expired at once.
1786 */
1787 /*ARGSUSED*/
1788 void
1789 callout_hrestime(void)
1790 {
1791 int t, f;
1792 callout_table_t *ct;
1793
1794 /*
1795 * Traverse every callout table in the system and process the hrestime
1796 * callouts therein.
1797 *
1798 * We look at all the tables because we don't know which ones were
1799 * onlined and offlined in the past. The offlined tables may still
1800 * have active cyclics processing timers somewhere.
1801 */
1802 for (f = 0; f < max_ncpus; f++) {
1803 for (t = 0; t < CALLOUT_NTYPES; t++) {
1804 ct = &callout_table[CALLOUT_TABLE(t, f)];
1805 callout_hrestime_one(ct);
1806 }
1807 }
1808 }
1809
1810 /*
1811 * Create the hash tables for this callout table.
1812 */
1813 static void
1814 callout_hash_init(callout_table_t *ct)
1815 {
1816 size_t size;
1817
1818 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1819 ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1820
1821 size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1822 ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1823 ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1824 }
1825
1826 /*
1827 * Create per-callout table kstats.
1828 */
1829 static void
1830 callout_kstat_init(callout_table_t *ct)
1831 {
1832 callout_stat_type_t stat;
1833 kstat_t *ct_kstats;
1834 int ndx;
1835
1836 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1837 ASSERT(ct->ct_kstats == NULL);
1838
1839 ndx = ct - callout_table;
1840 ct_kstats = kstat_create("unix", ndx, "callout",
1841 "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1842
1843 if (ct_kstats == NULL) {
1844 cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1845 (void *)ct);
1846 } else {
1847 ct_kstats->ks_data = ct->ct_kstat_data;
1848 for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1849 kstat_named_init(&ct->ct_kstat_data[stat],
1850 callout_kstat_names[stat], KSTAT_DATA_INT64);
1851 ct->ct_kstats = ct_kstats;
1852 kstat_install(ct_kstats);
1853 }
1854 }
1855
1856 static void
1857 callout_cyclic_init(callout_table_t *ct)
1858 {
1859 cyc_handler_t hdlr;
1860 cyc_time_t when;
1861 processorid_t seqid;
1862 int t;
1863 cyclic_id_t cyclic, qcyclic;
1864
1865 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1866
1867 t = ct->ct_type;
1868 seqid = CALLOUT_TABLE_SEQID(ct);
1869
1870 /*
1871 * Create the taskq thread if the table type is normal.
1872 * Realtime tables are handled at PIL1 by a softint
1873 * handler.
1874 */
1875 if (t == CALLOUT_NORMAL) {
1876 ASSERT(ct->ct_taskq == NULL);
1877 /*
1878 * Each callout thread consumes exactly one
1879 * task structure while active. Therefore,
1880 * prepopulating with 2 * callout_threads tasks
1881 * ensures that there's at least one task per
1882 * thread that's either scheduled or on the
1883 * freelist. In turn, this guarantees that
1884 * taskq_dispatch() will always either succeed
1885 * (because there's a free task structure) or
1886 * be unnecessary (because "callout_execute(ct)"
1887 * has already been scheduled).
1888 */
1889 ct->ct_taskq =
1890 taskq_create_instance("callout_taskq", seqid,
1891 callout_threads, maxclsyspri,
1892 2 * callout_threads, 2 * callout_threads,
1893 TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1894 }
1895
1896 /*
1897 * callouts can only be created in a table whose
1898 * cyclic has been initialized.
1899 */
1900 ASSERT(ct->ct_heap_num == 0);
1901
1902 /*
1903 * Drop the mutex before creating the callout cyclics. cyclic_add()
1904 * could potentially expand the cyclic heap. We don't want to be
1905 * holding the callout table mutex in that case. Note that this
1906 * function is called during CPU online. cpu_lock is held at this
1907 * point. So, only one thread can be executing the cyclic add logic
1908 * below at any time.
1909 */
1910 mutex_exit(&ct->ct_mutex);
1911
1912 /*
1913 * Create the callout table cyclics.
1914 *
1915 * The realtime cyclic handler executes at low PIL. The normal cyclic
1916 * handler executes at lock PIL. This is because there are cases
1917 * where code can block at PIL > 1 waiting for a normal callout handler
1918 * to unblock it directly or indirectly. If the normal cyclic were to
1919 * be executed at low PIL, it could get blocked out by the waiter
1920 * and cause a deadlock.
1921 */
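	/*
	 * A concrete (hypothetical) instance of the deadlock described
	 * above: a thread on this CPU spins at, say, PIL 9 waiting for a
	 * flag that a normal timeout handler is expected to set.  If the
	 * normal cyclic fired at PIL 1, it could never preempt that
	 * spinner, the callout would never be dispatched, and the flag
	 * would never be set.  Firing the normal cyclic at lock PIL keeps
	 * the dispatch from being blocked out by the waiter.
	 */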
1922 ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1923
1924 if (t == CALLOUT_REALTIME) {
1925 hdlr.cyh_level = callout_realtime_level;
1926 hdlr.cyh_func = (cyc_func_t)callout_realtime;
1927 } else {
1928 hdlr.cyh_level = callout_normal_level;
1929 hdlr.cyh_func = (cyc_func_t)callout_normal;
1930 }
1931 hdlr.cyh_arg = ct;
1932 when.cyt_when = CY_INFINITY;
1933 when.cyt_interval = CY_INFINITY;
1934
1935 cyclic = cyclic_add(&hdlr, &when);
1936
1937 if (t == CALLOUT_REALTIME)
1938 hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
1939 else
1940 hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
1941
1942 qcyclic = cyclic_add(&hdlr, &when);
1943
1944 mutex_enter(&ct->ct_mutex);
1945 ct->ct_cyclic = cyclic;
1946 ct->ct_qcyclic = qcyclic;
1947 }
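
/*
 * For reference, a minimal sketch of how a client drives the machinery
 * initialized above (foo_expire, struct foo and foo_tid are hypothetical
 * names, and locking is omitted for brevity; see timeout(9F) and
 * untimeout(9F) for the actual contract):
 *
 *	static void
 *	foo_expire(void *arg)
 *	{
 *		struct foo *fp = arg;
 *
 *		fp->foo_tid = 0;
 *		... react to the expiration ...
 *	}
 *
 *	fp->foo_tid = timeout(foo_expire, fp, drv_usectohz(1000000));
 *	...
 *	if (fp->foo_tid != 0)
 *		(void) untimeout(fp->foo_tid);
 *
 * timeout() creates a normal callout, so foo_expire() ends up running at
 * PIL 0 from a taskq thread created in callout_cyclic_init() above;
 * realtime_timeout(9F) handlers are expired from the realtime cyclic's
 * low-PIL handler instead.
 */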
1948
1949 void
1950 callout_cpu_online(cpu_t *cp)
1951 {
1952 lgrp_handle_t hand;
1953 callout_cache_t *cache;
1954 char s[KMEM_CACHE_NAMELEN];
1955 callout_table_t *ct;
1956 processorid_t seqid;
1957 int t;
1958
1959 ASSERT(MUTEX_HELD(&cpu_lock));
1960
1961 /*
1962 * Locate the cache corresponding to the onlined CPU's lgroup.
1963 * Note that access to callout_caches is protected by cpu_lock.
1964 */
1965 hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1966 for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1967 if (cache->cc_hand == hand)
1968 break;
1969 }
1970
1971 /*
1972 * If not found, create one. The caches are never destroyed.
1973 */
1974 if (cache == NULL) {
1975 cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1976 cache->cc_hand = hand;
1977 (void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1978 (long)hand);
1979 cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1980 CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1981 (void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1982 (long)hand);
1983 cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1984 CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1985 cache->cc_next = callout_caches;
1986 callout_caches = cache;
1987 }
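
	/*
	 * The names generated above are per-lgroup: an lgroup handle of 0
	 * yields "callout_cache0" and "callout_lcache0" (the handle is
	 * formatted in hex, so a handle of 10 would yield
	 * "callout_cachea").  Once created, a cache is found by the lookup
	 * loop above and shared by every CPU subsequently onlined into the
	 * same lgroup.
	 */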
1988
1989 seqid = cp->cpu_seqid;
1990
1991 for (t = 0; t < CALLOUT_NTYPES; t++) {
1992 ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1993
1994 mutex_enter(&ct->ct_mutex);
1995 /*
1996 		 * Store convenience pointers to the kmem caches
1997 		 * in the callout table. These assignments must always be
1998 		 * redone because the same callout table can map to a
1999 		 * different physical CPU, and hence lgroup, across onlines.
2000 */
2001 ct->ct_cache = cache->cc_cache;
2002 ct->ct_lcache = cache->cc_lcache;
2003
2004 /*
2005 		 * We use the heap pointer to check whether this callout
2006 		 * table has already been initialized.
2007 */
2008 if (ct->ct_heap == NULL) {
2009 callout_heap_init(ct);
2010 callout_hash_init(ct);
2011 callout_kstat_init(ct);
2012 callout_cyclic_init(ct);
2013 }
2014
2015 mutex_exit(&ct->ct_mutex);
2016
2017 /*
2018 * Move the cyclics to this CPU by doing a bind.
2019 */
2020 cyclic_bind(ct->ct_cyclic, cp, NULL);
2021 cyclic_bind(ct->ct_qcyclic, cp, NULL);
2022 }
2023 }
2024
2025 void
2026 callout_cpu_offline(cpu_t *cp)
2027 {
2028 callout_table_t *ct;
2029 processorid_t seqid;
2030 int t;
2031
2032 ASSERT(MUTEX_HELD(&cpu_lock));
2033
2034 seqid = cp->cpu_seqid;
2035
2036 for (t = 0; t < CALLOUT_NTYPES; t++) {
2037 ct = &callout_table[CALLOUT_TABLE(t, seqid)];
2038
2039 /*
2040 * Unbind the cyclics. This will allow the cyclic subsystem
2041 * to juggle the cyclics during CPU offline.
2042 */
2043 cyclic_bind(ct->ct_cyclic, NULL, NULL);
2044 cyclic_bind(ct->ct_qcyclic, NULL, NULL);
2045 }
2046 }
2047
2048 /*
2049 * This is called to perform per-CPU initialization for slave CPUs at
2050 * boot time.
2051 */
2052 void
2053 callout_mp_init(void)
2054 {
2055 cpu_t *cp;
2056 size_t min, max;
2057
2058 if (callout_chunk == CALLOUT_CHUNK) {
2059 /*
2060 * No one has specified a chunk in /etc/system. We need to
2061 * compute it here based on the number of online CPUs and
2062 * available physical memory.
2063 */
2064 min = CALLOUT_MIN_HEAP_SIZE;
2065 max = ptob(physmem / CALLOUT_MEM_FRACTION);
2066 if (min > max)
2067 min = max;
2068 callout_chunk = min / sizeof (callout_heap_t);
2069 callout_chunk /= ncpus_online;
2070 callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2071 }
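
	/*
	 * Worked example with illustrative numbers (the real values are
	 * platform dependent): if the smaller of CALLOUT_MIN_HEAP_SIZE and
	 * ptob(physmem / CALLOUT_MEM_FRACTION) comes to 1 MB, a
	 * callout_heap_t is 16 bytes and 8 CPUs are online, then
	 * callout_chunk becomes P2ROUNDUP(1048576 / 16 / 8, CALLOUT_CHUNK),
	 * i.e. 8192 heap entries rounded up to a multiple of CALLOUT_CHUNK.
	 * Note that callout_chunk counts heap entries, not bytes.
	 */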
2072
2073 mutex_enter(&cpu_lock);
2074
2075 cp = cpu_active;
2076 do {
2077 callout_cpu_online(cp);
2078 } while ((cp = cp->cpu_next_onln) != cpu_active);
2079
2080 mutex_exit(&cpu_lock);
2081 }
2082
2083 /*
2084 * Initialize all callout tables. Called at boot time just before clkstart().
2085 */
2086 void
2087 callout_init(void)
2088 {
2089 int f, t;
2090 size_t size;
2091 int table_id;
2092 callout_table_t *ct;
2093 long bits, fanout;
2094 uintptr_t buf;
2095
2096 /*
2097 * Initialize callout globals.
2098 */
2099 bits = 0;
2100 for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
2101 bits++;
2102 callout_table_bits = CALLOUT_TYPE_BITS + bits;
2103 callout_table_mask = (1 << callout_table_bits) - 1;
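	/*
	 * The loop above computes bits = ceiling(log2(max_ncpus)); for
	 * example, max_ncpus == 6 steps fanout through 1, 2 and 4 and
	 * yields bits == 3.  Together with CALLOUT_TYPE_BITS this gives
	 * callout_table_bits, the number of bits of a callout ID that
	 * encode the owning table, and callout_table_mask extracts exactly
	 * those bits.
	 */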
2104 callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
2105 callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
2106 callout_max_ticks = CALLOUT_MAX_TICKS;
2107 if (callout_min_reap == 0)
2108 callout_min_reap = CALLOUT_MIN_REAP;
2109
2110 if (callout_tolerance <= 0)
2111 callout_tolerance = CALLOUT_TOLERANCE;
2112 if (callout_threads <= 0)
2113 callout_threads = CALLOUT_THREADS;
2114 if (callout_chunk <= 0)
2115 callout_chunk = CALLOUT_CHUNK;
2116 else
2117 callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2118
2119 /*
2120 * Allocate all the callout tables based on max_ncpus. We have chosen
2121 * to do boot-time allocation instead of dynamic allocation because:
2122 *
2123 * - the size of the callout tables is not too large.
2124 * - there are race conditions involved in making this dynamic.
2125 * - the hash tables that go with the callout tables consume
2126 * most of the memory and they are only allocated in
2127 * callout_cpu_online().
2128 *
2129 * Each CPU has two tables that are consecutive in the array. The first
2130 * one is for realtime callouts and the second one is for normal ones.
2131 *
2132 * We do this alignment dance to make sure that callout table
2133 * structures will always be on a cache line boundary.
2134 */
2135 size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
2136 size += CALLOUT_ALIGN;
2137 buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
2138 callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
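	/*
	 * The array is over-allocated by CALLOUT_ALIGN bytes so that
	 * rounding buf up to the next CALLOUT_ALIGN boundary still leaves
	 * room for all CALLOUT_NTYPES * max_ncpus tables.  For example,
	 * with a (hypothetical) CALLOUT_ALIGN of 64, a kmem_zalloc()
	 * return of 0x30001a48 is rounded up to 0x30001a80, while an
	 * already aligned 0x30001a40 is left as is.
	 */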
2139
2140 size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
2141 /*
2142 * Now, initialize the tables for all the CPUs.
2143 */
2144 for (f = 0; f < max_ncpus; f++) {
2145 for (t = 0; t < CALLOUT_NTYPES; t++) {
2146 table_id = CALLOUT_TABLE(t, f);
2147 ct = &callout_table[table_id];
2148 ct->ct_type = t;
2149 mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
2150 /*
2151 * Precompute the base IDs for long and short-term
2152 * legacy IDs. This makes ID generation during
2153 * timeout() fast.
2154 */
2155 ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
2156 ct->ct_long_id = CALLOUT_LONG_ID(table_id);
2157 /*
2158 * Precompute the base ID for generation-based IDs.
2159 * Note that when the first ID gets allocated, the
2160 * ID will wrap. This will cause the generation
2161 * number to be incremented to 1.
2162 */
2163 ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
2164 /*
2165 * Initialize the cyclics as NONE. This will get set
2166 * during CPU online. This is so that partially
2167 * populated systems will only have the required
2168 * number of cyclics, not more.
2169 */
2170 ct->ct_cyclic = CYCLIC_NONE;
2171 ct->ct_qcyclic = CYCLIC_NONE;
2172 ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
2173 }
2174 }
2175
2176 /*
2177 * Add the callback for CPR. This is called during checkpoint
2178 * resume to suspend and resume callouts.
2179 */
2180 (void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
2181 "callout_cpr");
2182 (void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
2183 "callout_debug");
2184
2185 /*
2186 * Call the per-CPU initialization function for the boot CPU. This
2187 * is done here because the function is not called automatically for
2188 * the boot CPU from the CPU online/offline hooks. Note that the
2189 	 * CPU lock is taken here purely by convention.
2190 */
2191 mutex_enter(&cpu_lock);
2192 callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
2193 callout_cpu_online(CPU);
2194 mutex_exit(&cpu_lock);
2195
2196 /* heads-up to boot-time clients that timeouts now available */
2197 callout_init_done = 1;
2198 }
2199