1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/callo.h>
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/cpuvar.h>
29 #include <sys/thread.h>
30 #include <sys/kmem.h>
31 #include <sys/kmem_impl.h>
32 #include <sys/cmn_err.h>
33 #include <sys/callb.h>
34 #include <sys/debug.h>
35 #include <sys/vtrace.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38
39 int callout_init_done; /* useful during boot */
40
41 /*
42 * Callout tables. See timeout(9F) for details.
43 */
44 static int callout_threads; /* callout normal threads */
45 static hrtime_t callout_debug_hrtime; /* debugger entry time */
46 static int callout_chunk; /* callout heap chunk size */
47 static int callout_min_reap; /* callout minimum reap count */
48 static int callout_tolerance; /* callout hires tolerance */
49 static callout_table_t *callout_boot_ct; /* Boot CPU's callout tables */
50 static clock_t callout_max_ticks; /* max interval */
51 static hrtime_t callout_longterm; /* longterm nanoseconds */
52 static ulong_t callout_counter_low; /* callout ID increment */
53 static ulong_t callout_table_bits; /* number of table bits in ID */
54 static ulong_t callout_table_mask; /* mask for the table bits */
55 static callout_cache_t *callout_caches; /* linked list of caches */
56 #pragma align 64(callout_table)
57 static callout_table_t *callout_table; /* global callout table array */
58
59 /*
60 * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
61 * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
62 * via taskq, to a thread that executes at PIL 0 - so we end up running
63 * 'normal' callouts at PIL 0.
64 */
65 static volatile int callout_realtime_level = CY_LOW_LEVEL;
66 static volatile int callout_normal_level = CY_LOCK_LEVEL;
67
68 static char *callout_kstat_names[] = {
69 "callout_timeouts",
70 "callout_timeouts_pending",
71 "callout_untimeouts_unexpired",
72 "callout_untimeouts_executing",
73 "callout_untimeouts_expired",
74 "callout_expirations",
75 "callout_allocations",
76 "callout_cleanups",
77 };
78
79 static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
80
81 #define CALLOUT_HASH_INSERT(hash, cp, cnext, cprev) \
82 { \
83 callout_hash_t *hashp = &(hash); \
84 \
85 cp->cprev = NULL; \
86 cp->cnext = hashp->ch_head; \
87 if (hashp->ch_head == NULL) \
88 hashp->ch_tail = cp; \
89 else \
90 cp->cnext->cprev = cp; \
91 hashp->ch_head = cp; \
92 }
93
94 #define CALLOUT_HASH_APPEND(hash, cp, cnext, cprev) \
95 { \
96 callout_hash_t *hashp = &(hash); \
97 \
98 cp->cnext = NULL; \
99 cp->cprev = hashp->ch_tail; \
100 if (hashp->ch_tail == NULL) \
101 hashp->ch_head = cp; \
102 else \
103 cp->cprev->cnext = cp; \
104 hashp->ch_tail = cp; \
105 }
106
107 #define CALLOUT_HASH_DELETE(hash, cp, cnext, cprev) \
108 { \
109 callout_hash_t *hashp = &(hash); \
110 \
111 if (cp->cnext == NULL) \
112 hashp->ch_tail = cp->cprev; \
113 else \
114 cp->cnext->cprev = cp->cprev; \
115 if (cp->cprev == NULL) \
116 hashp->ch_head = cp->cnext; \
117 else \
118 cp->cprev->cnext = cp->cnext; \
119 }
120
121 /*
122 * These definitions help us queue callouts and callout lists. Here is
123 * the queueing rationale:
124 *
125 * - callouts are queued in a FIFO manner in the ID hash table.
126 * TCP timers are typically cancelled in the same order that they
127 * were issued. The FIFO queueing shortens the search for a callout
128 * during untimeout().
129 *
130 * - callouts are queued in a FIFO manner in their callout lists.
131 * This ensures that the callouts are executed in the same order that
132 * they were queued. This is fair. Plus, it helps to make each
133 * callout expiration timely. It also favors cancellations.
134 *
135 * - callout lists are queued in the following manner in the callout
136 * hash table buckets:
137 *
138 * - appended, if the callout list is a 1-nanosecond resolution
139 * callout list. When a callout is created, we first look for
140 * a callout list that has the same expiration so we can avoid
141 * allocating a callout list and inserting the expiration into
142 * the heap. However, we do not want to look at 1-nanosecond
143 * resolution callout lists as we will seldom find a match in
144 * them. Keeping these callout lists in the rear of the hash
145 * buckets allows us to skip these during the lookup.
146 *
147 * - inserted at the beginning, if the callout list is not a
148 * 1-nanosecond resolution callout list. This also has the
149 * side-effect of keeping the long term timers away from the
150 * front of the buckets.
151 *
152 * - callout lists are queued in a FIFO manner in the expired callouts
153 * list. This ensures that callout lists are executed in the order
154 * of expiration.
155 */
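/*
 * As an illustrative sketch of the bucket ordering described above (not
 * a dump of real state), a single ct_clhash bucket might look like this
 * from head to tail, with the coarse-resolution callout lists in front
 * and the 1-nanosecond resolution ones pushed to the rear:
 *
 *	head -> [10ms res] -> [1s res, absolute] -> [1ns res] -> [1ns res] -> tail
 */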
156 #define CALLOUT_APPEND(ct, cp) \
157 CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \
158 cp, c_idnext, c_idprev); \
159 CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
160
161 #define CALLOUT_DELETE(ct, cp) \
162 CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)], \
163 cp, c_idnext, c_idprev); \
164 CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
165
166 #define CALLOUT_LIST_INSERT(hash, cl) \
167 CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
168
169 #define CALLOUT_LIST_APPEND(hash, cl) \
170 CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
171
172 #define CALLOUT_LIST_DELETE(hash, cl) \
173 CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
174
175 #define CALLOUT_LIST_BEFORE(cl, nextcl) \
176 { \
177 (cl)->cl_prev = (nextcl)->cl_prev; \
178 (cl)->cl_next = (nextcl); \
179 (nextcl)->cl_prev = (cl); \
180 if (cl->cl_prev != NULL) \
181 cl->cl_prev->cl_next = cl; \
182 }
183
184 /*
185 * For normal callouts, there is a deadlock scenario if two callouts that
186 * have an inter-dependency end up on the same callout list. To break the
187 * deadlock, you need two taskq threads running in parallel. We compute
188 * the number of taskq threads here using a bunch of conditions to make
189 * it optimal for the common case. This is an ugly hack, but one that is
190 * necessary (sigh).
191 */
192 #define CALLOUT_THRESHOLD 100000000
193 #define CALLOUT_EXEC_COMPUTE(ct, nextexp, exec) \
194 { \
195 callout_list_t *cl; \
196 \
197 cl = ct->ct_expired.ch_head; \
198 if (cl == NULL) { \
199 /* \
200 * If the expired list is NULL, there is nothing to \
201 * process. \
202 */ \
203 exec = 0; \
204 } else if ((cl->cl_next == NULL) && \
205 (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) { \
206 /* \
207 * If there is only one callout list and it contains \
208 * only one callout, there is no need for two threads. \
209 */ \
210 exec = 1; \
211 } else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) { \
212 /* \
213 * If the next expiration of the cyclic is way out into \
214 * the future, we need two threads. \
215 */ \
216 exec = 2; \
217 } else { \
218 /* \
219 * We have multiple callouts to process. But the cyclic \
220 * will fire in the near future. So, we only need one \
221 * thread for now. \
222 */ \
223 exec = 1; \
224 } \
225 }
226
227 /*
228 * Macro to swap two heap items.
229 */
230 #define CALLOUT_SWAP(h1, h2) \
231 { \
232 callout_heap_t tmp; \
233 \
234 tmp = *h1; \
235 *h1 = *h2; \
236 *h2 = tmp; \
237 }
238
239 /*
240 * Macro to free a callout list.
241 */
242 #define CALLOUT_LIST_FREE(ct, cl) \
243 { \
244 cl->cl_next = ct->ct_lfree; \
245 ct->ct_lfree = cl; \
246 cl->cl_flags |= CALLOUT_LIST_FLAG_FREE; \
247 }
248
249 /*
250 * Macro to free a callout.
251 */
252 #define CALLOUT_FREE(ct, cp) \
253 { \
254 cp->c_idnext = ct->ct_free; \
255 ct->ct_free = cp; \
256 cp->c_xid |= CALLOUT_ID_FREE; \
257 }
258
259 /*
260 * Allocate a callout structure. We try quite hard because we
261 * can't sleep, and if we can't do the allocation, we're toast.
262 * Failing all that, we try a KM_PANIC allocation. Note that we never
263 * deallocate a callout. See untimeout() for the reasoning.
264 */
265 static callout_t *
266 callout_alloc(callout_table_t *ct)
267 {
268 size_t size;
269 callout_t *cp;
270
271 ASSERT(MUTEX_HELD(&ct->ct_mutex));
272 mutex_exit(&ct->ct_mutex);
273
274 cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
275 if (cp == NULL) {
276 size = sizeof (callout_t);
277 cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
278 }
279 cp->c_xid = 0;
280 cp->c_executor = NULL;
281 cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
282 cp->c_waiting = 0;
283
284 mutex_enter(&ct->ct_mutex);
285 ct->ct_allocations++;
286 return (cp);
287 }
288
289 /*
290 * Allocate a callout list structure. We try quite hard because we
291 * can't sleep, and if we can't do the allocation, we're toast.
292 * Failing all that, we try a KM_PANIC allocation. Note that we never
293 * deallocate a callout list.
294 */
295 static void
296 callout_list_alloc(callout_table_t *ct)
297 {
298 size_t size;
299 callout_list_t *cl;
300
301 ASSERT(MUTEX_HELD(&ct->ct_mutex));
302 mutex_exit(&ct->ct_mutex);
303
304 cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
305 if (cl == NULL) {
306 size = sizeof (callout_list_t);
307 cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
308 }
309 bzero(cl, sizeof (callout_list_t));
310
311 mutex_enter(&ct->ct_mutex);
312 CALLOUT_LIST_FREE(ct, cl);
313 }
314
315 /*
316 * Find a callout list that corresponds to an expiration and matching flags.
317 */
318 static callout_list_t *
319 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
320 {
321 callout_list_t *cl;
322 int clflags;
323
324 ASSERT(MUTEX_HELD(&ct->ct_mutex));
325
326 if (flags & CALLOUT_LIST_FLAG_NANO) {
327 /*
328 * This is a 1-nanosecond resolution callout. We will rarely
329 * find a match for this. So, bail out.
330 */
331 return (NULL);
332 }
333
334 clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
335 for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
336 /*
337 * If we have reached a 1-nanosecond resolution callout list,
338 * we don't have much hope of finding a match in this hash
339 * bucket. So, just bail out.
340 */
341 if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
342 return (NULL);
343
344 if ((cl->cl_expiration == expiration) &&
345 ((cl->cl_flags & clflags) == (flags & clflags)))
346 return (cl);
347 }
348
349 return (NULL);
350 }
351
352 /*
353 * Add a new callout list into a callout table's queue in sorted order by
354 * expiration.
355 */
356 static int
357 callout_queue_add(callout_table_t *ct, callout_list_t *cl)
358 {
359 callout_list_t *nextcl;
360 hrtime_t expiration;
361
362 expiration = cl->cl_expiration;
363 nextcl = ct->ct_queue.ch_head;
364 if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
365 CALLOUT_LIST_INSERT(ct->ct_queue, cl);
366 return (1);
367 }
368
369 while (nextcl != NULL) {
370 if (expiration < nextcl->cl_expiration) {
371 CALLOUT_LIST_BEFORE(cl, nextcl);
372 return (0);
373 }
374 nextcl = nextcl->cl_next;
375 }
376 CALLOUT_LIST_APPEND(ct->ct_queue, cl);
377
378 return (0);
379 }
380
381 /*
382 * Insert a callout list into a callout table's queue and reprogram the queue
383 * cyclic if needed.
384 */
385 static void
386 callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
387 {
388 cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
389
390 /*
391 * Add the callout to the callout queue. If it ends up at the head,
392 * the cyclic needs to be reprogrammed as we have an earlier
393 * expiration.
394 *
395 * Also, during the CPR suspend phase, do not reprogram the cyclic.
396 * We don't want any callout activity. When the CPR resume phase is
397 * entered, the cyclic will be programmed for the earliest expiration
398 * in the queue.
399 */
400 if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
401 (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
402 }
403
404 /*
405 * Delete and handle all past expirations in a callout table's queue.
406 */
407 static hrtime_t
408 callout_queue_delete(callout_table_t *ct)
409 {
410 callout_list_t *cl;
411 hrtime_t now;
412
413 ASSERT(MUTEX_HELD(&ct->ct_mutex));
414
415 now = gethrtime();
416 while ((cl = ct->ct_queue.ch_head) != NULL) {
417 if (cl->cl_expiration > now)
418 break;
419 cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
420 CALLOUT_LIST_DELETE(ct->ct_queue, cl);
421 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
422 }
423
424 /*
425 * If this callout queue is empty or callouts have been suspended,
426 * just return.
427 */
428 if ((cl == NULL) || (ct->ct_suspend > 0))
429 return (CY_INFINITY);
430
431 (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
432
433 return (cl->cl_expiration);
434 }
435
436 static hrtime_t
437 callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
438 {
439 callout_list_t *firstcl, *cl;
440 hrtime_t expiration, now;
441 int clflags;
442 callout_hash_t temp;
443
444 ASSERT(MUTEX_HELD(&ct->ct_mutex));
445
446 firstcl = ct->ct_queue.ch_head;
447 if (firstcl == NULL)
448 return (CY_INFINITY);
449
450 /*
451 * We walk the callout queue. If we encounter a hrestime entry that
452 * must be removed, we clean it out. Otherwise, we apply any
453 * adjustments needed to it. Because of the latter, we need to
454 * recreate the list as we go along.
455 */
456 temp = ct->ct_queue;
457 ct->ct_queue.ch_head = NULL;
458 ct->ct_queue.ch_tail = NULL;
459
460 clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
461 now = gethrtime();
462 while ((cl = temp.ch_head) != NULL) {
463 CALLOUT_LIST_DELETE(temp, cl);
464
465 /*
466 * Delete the callout and expire it, if one of the following
467 * is true:
468 * - the callout has expired
469 * - the callout is an absolute hrestime one and
470 * there has been a system time change
471 */
472 if ((cl->cl_expiration <= now) ||
473 (timechange && ((cl->cl_flags & clflags) == clflags))) {
474 cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
475 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
476 continue;
477 }
478
479 /*
480 * Apply adjustments, if any. Adjustments are applied after
481 * the system returns from KMDB or OBP. They are only applied
482 * to relative callout lists.
483 */
484 if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
485 expiration = cl->cl_expiration + delta;
486 if (expiration <= 0)
487 expiration = CY_INFINITY;
488 cl->cl_expiration = expiration;
489 }
490
491 (void) callout_queue_add(ct, cl);
492 }
493
494 /*
495 * We need to return the expiration to help program the cyclic.
496 * If there are expired callouts, the cyclic needs to go off
497 * immediately. If the queue has become empty, then we return infinity.
498 * Else, we return the expiration of the earliest callout in the queue.
499 */
500 if (ct->ct_expired.ch_head != NULL)
501 return (gethrtime());
502
503 cl = ct->ct_queue.ch_head;
504 if (cl == NULL)
505 return (CY_INFINITY);
506
507 return (cl->cl_expiration);
508 }
509
510 /*
511 * Initialize a callout table's heap, if necessary. Preallocate some free
512 * entries so we don't have to check for NULL elsewhere.
513 */
514 static void
515 callout_heap_init(callout_table_t *ct)
516 {
517 size_t size;
518
519 ASSERT(MUTEX_HELD(&ct->ct_mutex));
520 ASSERT(ct->ct_heap == NULL);
521
522 ct->ct_heap_num = 0;
523 ct->ct_heap_max = callout_chunk;
524 size = sizeof (callout_heap_t) * callout_chunk;
525 ct->ct_heap = kmem_alloc(size, KM_SLEEP);
526 }
527
528 /*
529 * Reallocate the heap. Return 0 if the heap is still full at the end of it.
530 * Return 1 otherwise. Note that the heap only expands, it never contracts.
531 */
532 static int
533 callout_heap_expand(callout_table_t *ct)
534 {
535 size_t max, size, osize;
536 callout_heap_t *heap;
537
538 ASSERT(MUTEX_HELD(&ct->ct_mutex));
539 ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
540
541 while (ct->ct_heap_num == ct->ct_heap_max) {
542 max = ct->ct_heap_max;
543 mutex_exit(&ct->ct_mutex);
544
545 osize = sizeof (callout_heap_t) * max;
546 size = sizeof (callout_heap_t) * (max + callout_chunk);
547 heap = kmem_alloc(size, KM_NOSLEEP);
548
549 mutex_enter(&ct->ct_mutex);
550 if (heap == NULL) {
551 /*
552 * We could not allocate memory. If we can free up
553 * some entries, that would be great.
554 */
555 if (ct->ct_nreap > 0)
556 (void) callout_heap_process(ct, 0, 0);
557 /*
558 * If we still have no space in the heap, inform the
559 * caller.
560 */
561 if (ct->ct_heap_num == ct->ct_heap_max)
562 return (0);
563 return (1);
564 }
565 if (max < ct->ct_heap_max) {
566 /*
567 * Someone beat us to the allocation. Free what we
568 * just allocated and proceed.
569 */
570 kmem_free(heap, size);
571 continue;
572 }
573
574 bcopy(ct->ct_heap, heap, osize);
575 kmem_free(ct->ct_heap, osize);
576 ct->ct_heap = heap;
577 ct->ct_heap_max = size / sizeof (callout_heap_t);
578 }
579
580 return (1);
581 }
582
583 /*
584 * Move an expiration from the bottom of the heap to its correct place
585 * in the heap. If we reached the root doing this, return 1. Else,
586 * return 0.
587 */
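/*
 * The heap is the usual 0-based array representation of a binary heap.
 * Assuming the standard index macros from <sys/callo.h> (parent of i is
 * (i - 1) / 2, children of i are 2i + 1 and 2i + 2), a worked example:
 * an entry just appended at index 6 is compared against its parent at
 * index (6 - 1) / 2 = 2, and, if swapped, then against index 0, the root.
 */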
588 static int
589 callout_upheap(callout_table_t *ct)
590 {
591 int current, parent;
592 callout_heap_t *heap, *hcurrent, *hparent;
593
594 ASSERT(MUTEX_HELD(&ct->ct_mutex));
595 ASSERT(ct->ct_heap_num >= 1);
596
597 if (ct->ct_heap_num == 1) {
598 return (1);
599 }
600
601 heap = ct->ct_heap;
602 current = ct->ct_heap_num - 1;
603
604 for (;;) {
605 parent = CALLOUT_HEAP_PARENT(current);
606 hparent = &heap[parent];
607 hcurrent = &heap[current];
608
609 /*
610 * We have an expiration later than our parent; we're done.
611 */
612 if (hcurrent->ch_expiration >= hparent->ch_expiration) {
613 return (0);
614 }
615
616 /*
617 * We need to swap with our parent, and continue up the heap.
618 */
619 CALLOUT_SWAP(hparent, hcurrent);
620
621 /*
622 * If we just reached the root, we're done.
623 */
624 if (parent == 0) {
625 return (1);
626 }
627
628 current = parent;
629 }
630 /*NOTREACHED*/
631 }
632
633 /*
634 * Insert a new heap item into a callout table's heap.
635 */
636 static void
637 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
638 {
639 ASSERT(MUTEX_HELD(&ct->ct_mutex));
640 ASSERT(ct->ct_heap_num < ct->ct_heap_max);
641
642 cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
643 /*
644 * First, copy the expiration and callout list pointer to the bottom
645 * of the heap.
646 */
647 ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
648 ct->ct_heap[ct->ct_heap_num].ch_list = cl;
649 ct->ct_heap_num++;
650
651 /*
652 * Now, perform an upheap operation. If we reached the root, then
653 * the cyclic needs to be reprogrammed as we have an earlier
654 * expiration.
655 *
656 * Also, during the CPR suspend phase, do not reprogram the cyclic.
657 * We don't want any callout activity. When the CPR resume phase is
658 * entered, the cyclic will be programmed for the earliest expiration
659 * in the heap.
660 */
661 if (callout_upheap(ct) && (ct->ct_suspend == 0))
662 (void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
663 }
664
665 /*
666 * Move an expiration from the top of the heap to its correct place
667 * in the heap.
668 */
669 static void
670 callout_downheap(callout_table_t *ct)
671 {
672 int current, left, right, nelems;
673 callout_heap_t *heap, *hleft, *hright, *hcurrent;
674
675 ASSERT(MUTEX_HELD(&ct->ct_mutex));
676 ASSERT(ct->ct_heap_num >= 1);
677
678 heap = ct->ct_heap;
679 current = 0;
680 nelems = ct->ct_heap_num;
681
682 for (;;) {
683 /*
684 * If we don't have a left child (i.e., we're a leaf), we're
685 * done.
686 */
687 if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
688 return;
689
690 hleft = &heap[left];
691 hcurrent = &heap[current];
692
693 right = CALLOUT_HEAP_RIGHT(current);
694
695 /*
696 * Even if we don't have a right child, we still need to compare
697 * our expiration against that of our left child.
698 */
699 if (right >= nelems)
700 goto comp_left;
701
702 hright = &heap[right];
703
704 /*
705 * We have both a left and a right child. We need to compare
706 * the expiration of the children to determine which
707 * expires earlier.
708 */
709 if (hright->ch_expiration < hleft->ch_expiration) {
710 /*
711 * Our right child is the earlier of our children.
712 * We'll now compare our expiration to its expiration.
713 * If ours is the earlier one, we're done.
714 */
715 if (hcurrent->ch_expiration <= hright->ch_expiration)
716 return;
717
718 /*
719 * Our right child expires earlier than we do; swap
720 * with our right child, and descend right.
721 */
722 CALLOUT_SWAP(hright, hcurrent);
723 current = right;
724 continue;
725 }
726
727 comp_left:
728 /*
729 * Our left child is the earlier of our children (or we have
730 * no right child). We'll now compare our expiration
731 * to its expiration. If ours is the earlier one, we're done.
732 */
733 if (hcurrent->ch_expiration <= hleft->ch_expiration)
734 return;
735
736 /*
737 * Our left child expires earlier than we do; swap with our
738 * left child, and descend left.
739 */
740 CALLOUT_SWAP(hleft, hcurrent);
741 current = left;
742 }
743 }
744
745 /*
746 * Delete and handle all past expirations in a callout table's heap.
747 */
748 static hrtime_t
749 callout_heap_delete(callout_table_t *ct)
750 {
751 hrtime_t now, expiration, next;
752 callout_list_t *cl;
753 callout_heap_t *heap;
754 int hash;
755
756 ASSERT(MUTEX_HELD(&ct->ct_mutex));
757
758 if (CALLOUT_CLEANUP(ct)) {
759 /*
760 * There are too many heap elements pointing to empty callout
761 * lists. Clean them out.
762 */
763 (void) callout_heap_process(ct, 0, 0);
764 }
765
766 now = gethrtime();
767 heap = ct->ct_heap;
768
769 while (ct->ct_heap_num > 0) {
770 expiration = heap->ch_expiration;
771 hash = CALLOUT_CLHASH(expiration);
772 cl = heap->ch_list;
773 ASSERT(expiration == cl->cl_expiration);
774
775 if (cl->cl_callouts.ch_head == NULL) {
776 /*
777 * If the callout list is empty, reap it.
778 * Decrement the reap count.
779 */
780 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
781 CALLOUT_LIST_FREE(ct, cl);
782 ct->ct_nreap--;
783 } else {
784 /*
785 * If the root of the heap expires in the future,
786 * bail out.
787 */
788 if (expiration > now)
789 break;
790
791 /*
792 * Move the callout list for this expiration to the
793 * list of expired callout lists. It will be processed
794 * by the callout executor.
795 */
796 cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
797 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
798 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
799 }
800
801 /*
802 * Now delete the root. This is done by swapping the root with
803 * the last item in the heap and downheaping the item.
804 */
805 ct->ct_heap_num--;
806 if (ct->ct_heap_num > 0) {
807 heap[0] = heap[ct->ct_heap_num];
808 callout_downheap(ct);
809 }
810 }
811
812 /*
813 * If this callout table is empty or callouts have been suspended,
814 * just return. The cyclic has already been programmed to
815 * infinity by the cyclic subsystem.
816 */
817 if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
818 return (CY_INFINITY);
819
820 /*
821 * If the top expirations are within callout_tolerance of each other,
822 * delay the cyclic expire so that they can be processed together.
823 * This is to prevent high resolution timers from swamping the system
824 * with cyclic activity.
825 */
826 if (ct->ct_heap_num > 2) {
827 next = expiration + callout_tolerance;
828 if ((heap[1].ch_expiration < next) ||
829 (heap[2].ch_expiration < next))
830 expiration = next;
831 }
832
833 (void) cyclic_reprogram(ct->ct_cyclic, expiration);
834
835 return (expiration);
836 }
837
838 /*
839 * There are some situations when the entire heap is walked and processed.
840 * This function is called to do the processing. These are the situations:
841 *
842 * 1. When the reap count reaches its threshold, the heap has to be cleared
843 * of all empty callout lists.
844 *
845 * 2. When the system enters and exits KMDB/OBP, all entries in the heap
846 * need to be adjusted by the interval spent in KMDB/OBP.
847 *
848 * 3. When system time is changed, the heap has to be scanned for
849 * absolute hrestime timers. These need to be removed from the heap
850 * and expired immediately.
851 *
852 * In cases 2 and 3, it is a good idea to do 1 as well since we are
853 * scanning the heap anyway.
854 *
855 * If the root gets changed and/or callout lists are expired, return the
856 * new expiration to the caller so it can reprogram the cyclic accordingly.
857 */
858 static hrtime_t
859 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
860 {
861 callout_heap_t *heap;
862 callout_list_t *cl;
863 hrtime_t expiration, now;
864 int i, hash, clflags;
865 ulong_t num;
866
867 ASSERT(MUTEX_HELD(&ct->ct_mutex));
868
869 if (ct->ct_heap_num == 0)
870 return (CY_INFINITY);
871
872 if (ct->ct_nreap > 0)
873 ct->ct_cleanups++;
874
875 heap = ct->ct_heap;
876
877 /*
878 * We walk the heap from the top to the bottom. If we encounter
879 * a heap item that points to an empty callout list, we clean
880 * it out. If we encounter a hrestime entry that must be removed,
881 * again we clean it out. Otherwise, we apply any adjustments needed
882 * to an element.
883 *
884 * During the walk, we also compact the heap from the bottom and
885 * reconstruct the heap using upheap operations. This is very
886 * efficient if the number of elements to be cleaned is greater than
887 * or equal to half the heap. This is the common case.
888 *
889 * Even in the non-common case, the upheap operations should be short
890 * as the entries below generally tend to be bigger than the entries
891 * above.
892 */
893 num = ct->ct_heap_num;
894 ct->ct_heap_num = 0;
895 clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
896 now = gethrtime();
897 for (i = 0; i < num; i++) {
898 cl = heap[i].ch_list;
899 /*
900 * If the callout list is empty, delete the heap element and
901 * free the callout list.
902 */
903 if (cl->cl_callouts.ch_head == NULL) {
904 hash = CALLOUT_CLHASH(cl->cl_expiration);
905 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
906 CALLOUT_LIST_FREE(ct, cl);
907 continue;
908 }
909
910 /*
911 * Delete the heap element and expire the callout list, if
912 * one of the following is true:
913 * - the callout list has expired
914 * - the callout list is an absolute hrestime one and
915 * there has been a system time change
916 */
917 if ((cl->cl_expiration <= now) ||
918 (timechange && ((cl->cl_flags & clflags) == clflags))) {
919 hash = CALLOUT_CLHASH(cl->cl_expiration);
920 cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
921 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
922 CALLOUT_LIST_APPEND(ct->ct_expired, cl);
923 continue;
924 }
925
926 /*
927 * Apply adjustments, if any. Adjustments are applied after
928 * the system returns from KMDB or OBP. They are only applied
929 * to relative callout lists.
930 */
931 if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
932 hash = CALLOUT_CLHASH(cl->cl_expiration);
933 CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
934 expiration = cl->cl_expiration + delta;
935 if (expiration <= 0)
936 expiration = CY_INFINITY;
937 heap[i].ch_expiration = expiration;
938 cl->cl_expiration = expiration;
939 hash = CALLOUT_CLHASH(cl->cl_expiration);
940 if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
941 CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
942 } else {
943 CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
944 }
945 }
946
947 heap[ct->ct_heap_num] = heap[i];
948 ct->ct_heap_num++;
949 (void) callout_upheap(ct);
950 }
951
952 ct->ct_nreap = 0;
953
954 /*
955 * We need to return the expiration to help program the cyclic.
956 * If there are expired callouts, the cyclic needs to go off
957 * immediately. If the heap has become empty, then we return infinity.
958 * Else, return the expiration of the earliest callout in the heap.
959 */
960 if (ct->ct_expired.ch_head != NULL)
961 return (gethrtime());
962
963 if (ct->ct_heap_num == 0)
964 return (CY_INFINITY);
965
966 return (heap->ch_expiration);
967 }
968
969 /*
970 * Common function used to create normal and realtime callouts.
971 *
972 * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
973 * there is one restriction on a realtime callout handler - it should not
974 * directly or indirectly acquire cpu_lock. CPU offline waits for pending
975 * cyclic handlers to complete while holding cpu_lock. So, if a realtime
976 * callout handler were to try to get cpu_lock, there would be a deadlock
977 * during CPU offline.
978 */
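/*
 * A minimal usage sketch (myfunc and myarg are hypothetical, the values
 * illustrative): schedule a normal callout roughly 10ms from now, rounded
 * up to a 1-microsecond resolution boundary, then cancel it. Expiration
 * and resolution are in nanoseconds; the expiration is relative because
 * CALLOUT_FLAG_ABSOLUTE is not passed.
 *
 *	callout_id_t id;
 *
 *	id = timeout_generic(CALLOUT_NORMAL, myfunc, myarg,
 *	    10000000LL, 1000, CALLOUT_FLAG_ROUNDUP);
 *	...
 *	(void) untimeout_default(id, 0);
 */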
979 callout_id_t
980 timeout_generic(int type, void (*func)(void *), void *arg,
981 hrtime_t expiration, hrtime_t resolution, int flags)
982 {
983 callout_table_t *ct;
984 callout_t *cp;
985 callout_id_t id;
986 callout_list_t *cl;
987 hrtime_t now, interval;
988 int hash, clflags;
989
990 ASSERT(resolution > 0);
991 ASSERT(func != NULL);
992
993 /*
994 * We get the current hrtime right upfront so that latencies in
995 * this function do not affect the accuracy of the callout.
996 */
997 now = gethrtime();
998
999 /*
1000 * We disable kernel preemption so that we remain on the same CPU
1001 * throughout. If we needed to reprogram the callout table's cyclic,
1002 * we can avoid X-calls if we are on the same CPU.
1003 *
1004 * Note that callout_alloc() releases and reacquires the callout
1005 * table mutex. While reacquiring the mutex, it is possible for us
1006 * to go to sleep and later migrate to another CPU. This should be
1007 * pretty rare, though.
1008 */
1009 kpreempt_disable();
1010
1011 ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
1012 mutex_enter(&ct->ct_mutex);
1013
1014 if (ct->ct_cyclic == CYCLIC_NONE) {
1015 mutex_exit(&ct->ct_mutex);
1016 /*
1017 * The callout table has not yet been initialized fully.
1018 * So, put this one on the boot callout table which is
1019 * always initialized.
1020 */
1021 ct = &callout_boot_ct[type];
1022 mutex_enter(&ct->ct_mutex);
1023 }
1024
1025 if (CALLOUT_CLEANUP(ct)) {
1026 /*
1027 * There are too many heap elements pointing to empty callout
1028 * lists. Clean them out. Since cleanup is only done once
1029 * in a while, no need to reprogram the cyclic if the root
1030 * of the heap gets cleaned out.
1031 */
1032 (void) callout_heap_process(ct, 0, 0);
1033 }
1034
1035 if ((cp = ct->ct_free) == NULL)
1036 cp = callout_alloc(ct);
1037 else
1038 ct->ct_free = cp->c_idnext;
1039
1040 cp->c_func = func;
1041 cp->c_arg = arg;
1042
1043 /*
1044 * Compute the expiration hrtime.
1045 */
1046 if (flags & CALLOUT_FLAG_ABSOLUTE) {
1047 interval = expiration - now;
1048 } else {
1049 interval = expiration;
1050 expiration += now;
1051 }
1052
1053 if (resolution > 1) {
1054 /*
1055 * Align expiration to the specified resolution.
1056 */
1057 if (flags & CALLOUT_FLAG_ROUNDUP)
1058 expiration += resolution - 1;
1059 expiration = (expiration / resolution) * resolution;
1060 }
1061
1062 if (expiration <= 0) {
1063 /*
1064 * expiration hrtime overflow has occurred. Just set the
1065 * expiration to infinity.
1066 */
1067 expiration = CY_INFINITY;
1068 }
1069
1070 /*
1071 * Assign an ID to this callout
1072 */
1073 if (flags & CALLOUT_FLAG_32BIT) {
1074 if (interval > callout_longterm) {
1075 id = (ct->ct_long_id - callout_counter_low);
1076 id |= CALLOUT_COUNTER_HIGH;
1077 ct->ct_long_id = id;
1078 } else {
1079 id = (ct->ct_short_id - callout_counter_low);
1080 id |= CALLOUT_COUNTER_HIGH;
1081 ct->ct_short_id = id;
1082 }
1083 } else {
1084 id = (ct->ct_gen_id - callout_counter_low);
1085 if ((id & CALLOUT_COUNTER_HIGH) == 0) {
1086 id |= CALLOUT_COUNTER_HIGH;
1087 id += CALLOUT_GENERATION_LOW;
1088 }
1089 ct->ct_gen_id = id;
1090 }
1091
1092 cp->c_xid = id;
1093
1094 clflags = 0;
1095 if (flags & CALLOUT_FLAG_ABSOLUTE)
1096 clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
1097 if (flags & CALLOUT_FLAG_HRESTIME)
1098 clflags |= CALLOUT_LIST_FLAG_HRESTIME;
1099 if (resolution == 1)
1100 clflags |= CALLOUT_LIST_FLAG_NANO;
1101 hash = CALLOUT_CLHASH(expiration);
1102
1103 again:
1104 /*
1105 * Try to see if a callout list already exists for this expiration.
1106 */
1107 cl = callout_list_get(ct, expiration, clflags, hash);
1108 if (cl == NULL) {
1109 /*
1110 * Check the free list. If we don't find one, we have to
1111 * take the slow path and allocate from kmem.
1112 */
1113 if ((cl = ct->ct_lfree) == NULL) {
1114 callout_list_alloc(ct);
1115 /*
1116 * In the above call, we drop the lock, allocate and
1117 * reacquire the lock. So, we could have been away
1118 * for a while. In the meantime, someone could have
1119 * inserted a callout list with the same expiration.
1120 * Plus, the heap could have become full. So, the best
1121 * course is to repeat the steps. This should be an
1122 * infrequent event.
1123 */
1124 goto again;
1125 }
1126 ct->ct_lfree = cl->cl_next;
1127 cl->cl_expiration = expiration;
1128 cl->cl_flags = clflags;
1129
1130 /*
1131 * Check if we have enough space in the heap to insert one
1132 * expiration. If not, expand the heap.
1133 */
1134 if (ct->ct_heap_num == ct->ct_heap_max) {
1135 if (callout_heap_expand(ct) == 0) {
1136 /*
1137 * Could not expand the heap. Just queue it.
1138 */
1139 callout_queue_insert(ct, cl);
1140 goto out;
1141 }
1142
1143 /*
1144 * In the above call, we drop the lock, allocate and
1145 * reacquire the lock. So, we could have been away
1146 * for a while. In the meantime, someone could have
1147 * inserted a callout list with the same expiration.
1148 * But we will not go back and check for it as this
1149 * should be a really infrequent event. There is no
1150 * point.
1151 */
1152 }
1153
1154 if (clflags & CALLOUT_LIST_FLAG_NANO) {
1155 CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
1156 } else {
1157 CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
1158 }
1159
1160 /*
1161 * This is a new expiration. So, insert it into the heap.
1162 * This will also reprogram the cyclic, if the expiration
1163 * propagated to the root of the heap.
1164 */
1165 callout_heap_insert(ct, cl);
1166 } else {
1167 /*
1168 * If the callout list was empty, untimeout_generic() would
1169 * have incremented a reap count. Decrement the reap count
1170 * as we are going to insert a callout into this list.
1171 */
1172 if (cl->cl_callouts.ch_head == NULL)
1173 ct->ct_nreap--;
1174 }
1175 out:
1176 cp->c_list = cl;
1177 CALLOUT_APPEND(ct, cp);
1178
1179 ct->ct_timeouts++;
1180 ct->ct_timeouts_pending++;
1181
1182 mutex_exit(&ct->ct_mutex);
1183
1184 kpreempt_enable();
1185
1186 TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
1187 "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
1188 cp);
1189
1190 return (id);
1191 }
1192
1193 timeout_id_t
1194 timeout(void (*func)(void *), void *arg, clock_t delta)
1195 {
1196 ulong_t id;
1197
1198 /*
1199 * Make sure the callout runs at least 1 tick in the future.
1200 */
1201 if (delta <= 0)
1202 delta = 1;
1203 else if (delta > callout_max_ticks)
1204 delta = callout_max_ticks;
1205
1206 id = (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1207 TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1208
1209 return ((timeout_id_t)id);
1210 }
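
/*
 * A typical legacy client of the above (see timeout(9F)) looks roughly
 * like this sketch; foofunc, fooarg and foo_id are hypothetical:
 *
 *	foo_id = timeout(foofunc, fooarg, drv_usectohz(500000));
 *	...
 *	(void) untimeout(foo_id);
 */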
1211
1212 /*
1213 * Convenience function that creates a normal callout with default parameters
1214 * and returns a full ID.
1215 */
1216 callout_id_t
1217 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1218 {
1219 callout_id_t id;
1220
1221 /*
1222 * Make sure the callout runs at least 1 tick in the future.
1223 */
1224 if (delta <= 0)
1225 delta = 1;
1226 else if (delta > callout_max_ticks)
1227 delta = callout_max_ticks;
1228
1229 id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1230 nsec_per_tick, 0);
1231
1232 return (id);
1233 }
1234
1235 timeout_id_t
1236 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1237 {
1238 ulong_t id;
1239
1240 /*
1241 * Make sure the callout runs at least 1 tick in the future.
1242 */
1243 if (delta <= 0)
1244 delta = 1;
1245 else if (delta > callout_max_ticks)
1246 delta = callout_max_ticks;
1247
1248 id = (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1249 TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1250
1251 return ((timeout_id_t)id);
1252 }
1253
1254 /*
1255 * Convenience function that creates a realtime callout with default parameters
1256 * and returns a full ID.
1257 */
1258 callout_id_t
1259 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1260 {
1261 callout_id_t id;
1262
1263 /*
1264 * Make sure the callout runs at least 1 tick in the future.
1265 */
1266 if (delta <= 0)
1267 delta = 1;
1268 else if (delta > callout_max_ticks)
1269 delta = callout_max_ticks;
1270
1271 id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1272 nsec_per_tick, 0);
1273
1274 return (id);
1275 }
1276
1277 hrtime_t
1278 untimeout_generic(callout_id_t id, int nowait)
1279 {
1280 callout_table_t *ct;
1281 callout_t *cp;
1282 callout_id_t xid;
1283 callout_list_t *cl;
1284 int hash, flags;
1285 callout_id_t bogus;
1286
1287 ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1288 hash = CALLOUT_IDHASH(id);
1289
1290 mutex_enter(&ct->ct_mutex);
1291
1292 /*
1293 * Search the ID hash table for the callout.
1294 */
1295 for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1296
1297 xid = cp->c_xid;
1298
1299 /*
1300 * Match the ID and generation number.
1301 */
1302 if ((xid & CALLOUT_ID_MASK) != id)
1303 continue;
1304
1305 if ((xid & CALLOUT_EXECUTING) == 0) {
1306 hrtime_t expiration;
1307
1308 /*
1309 * Delete the callout. If the callout list becomes
1310 * NULL, we don't remove it from the table. This is
1311 * so it can be reused. If the empty callout list
1312 * corresponds to the top of the callout heap, we
1313 * don't reprogram the table cyclic here. This is in
1314 * order to avoid lots of X-calls to the CPU associated
1315 * with the callout table.
1316 */
1317 cl = cp->c_list;
1318 expiration = cl->cl_expiration;
1319 CALLOUT_DELETE(ct, cp);
1320 CALLOUT_FREE(ct, cp);
1321 ct->ct_untimeouts_unexpired++;
1322 ct->ct_timeouts_pending--;
1323
1324 /*
1325 * If the callout list has become empty, there are 3
1326 * possibilities. If it is present:
1327 * - in the heap, it needs to be cleaned along
1328 * with its heap entry. Increment a reap count.
1329 * - in the callout queue, free it.
1330 * - in the expired list, free it.
1331 */
1332 if (cl->cl_callouts.ch_head == NULL) {
1333 flags = cl->cl_flags;
1334 if (flags & CALLOUT_LIST_FLAG_HEAPED) {
1335 ct->ct_nreap++;
1336 } else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
1337 CALLOUT_LIST_DELETE(ct->ct_queue, cl);
1338 CALLOUT_LIST_FREE(ct, cl);
1339 } else {
1340 CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1341 CALLOUT_LIST_FREE(ct, cl);
1342 }
1343 }
1344 mutex_exit(&ct->ct_mutex);
1345
1346 expiration -= gethrtime();
1347 TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1348 "untimeout:ID %lx hrtime left %llx", id,
1349 expiration);
1350 return (expiration < 0 ? 0 : expiration);
1351 }
1352
1353 ct->ct_untimeouts_executing++;
1354 /*
1355 * The callout we want to delete is currently executing.
1356 * The DDI states that we must wait until the callout
1357 * completes before returning, so we block on c_done until the
1358 * callout ID changes (to the old ID if it's on the freelist,
1359 * or to a new callout ID if it's in use). This implicitly
1360 * assumes that callout structures are persistent (they are).
1361 */
1362 if (cp->c_executor == curthread) {
1363 /*
1364 * The timeout handler called untimeout() on itself.
1365 * Stupid, but legal. We can't wait for the timeout
1366 * to complete without deadlocking, so we just return.
1367 */
1368 mutex_exit(&ct->ct_mutex);
1369 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1370 "untimeout_self:ID %x", id);
1371 return (-1);
1372 }
1373 if (nowait == 0) {
1374 /*
1375 * We need to wait. Indicate that we are waiting by
1376 * incrementing c_waiting. This prevents the executor
1377 * from doing a wakeup on c_done if there are no
1378 * waiters.
1379 */
1380 while (cp->c_xid == xid) {
1381 cp->c_waiting = 1;
1382 cv_wait(&cp->c_done, &ct->ct_mutex);
1383 }
1384 }
1385 mutex_exit(&ct->ct_mutex);
1386 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1387 "untimeout_executing:ID %lx", id);
1388 return (-1);
1389 }
1390 ct->ct_untimeouts_expired++;
1391
1392 mutex_exit(&ct->ct_mutex);
1393 TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1394 "untimeout_bogus_id:ID %lx", id);
1395
1396 /*
1397 * We didn't find the specified callout ID. This means either
1398 * (1) the callout already fired, or (2) the caller passed us
1399 * a bogus value. Perform a sanity check to detect case (2).
1400 */
1401 bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1402 if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1403 panic("untimeout: impossible timeout id %llx",
1404 (unsigned long long)id);
1405
1406 return (-1);
1407 }
1408
1409 clock_t
1410 untimeout(timeout_id_t id_arg)
1411 {
1412 hrtime_t hleft;
1413 clock_t tleft;
1414 callout_id_t id;
1415
1416 id = (ulong_t)id_arg;
1417 hleft = untimeout_generic(id, 0);
1418 if (hleft < 0)
1419 tleft = -1;
1420 else if (hleft == 0)
1421 tleft = 0;
1422 else
1423 tleft = NSEC_TO_TICK(hleft);
1424
1425 return (tleft);
1426 }
1427
1428 /*
1429 * Convenience function to untimeout a timeout with a full ID with default
1430 * parameters.
1431 */
1432 clock_t
1433 untimeout_default(callout_id_t id, int nowait)
1434 {
1435 hrtime_t hleft;
1436 clock_t tleft;
1437
1438 hleft = untimeout_generic(id, nowait);
1439 if (hleft < 0)
1440 tleft = -1;
1441 else if (hleft == 0)
1442 tleft = 0;
1443 else
1444 tleft = NSEC_TO_TICK(hleft);
1445
1446 return (tleft);
1447 }
1448
1449 /*
1450 * Expire all the callouts queued in the specified callout list.
1451 */
1452 static void
1453 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1454 {
1455 callout_t *cp, *cnext;
1456
1457 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1458 ASSERT(cl != NULL);
1459
1460 for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1461 /*
1462 * Multiple executor threads could be running at the same
1463 * time. If this callout is already being executed,
1464 * go on to the next one.
1465 */
1466 if (cp->c_xid & CALLOUT_EXECUTING) {
1467 cnext = cp->c_clnext;
1468 continue;
1469 }
1470
1471 /*
1472 * Indicate to untimeout() that a callout is
1473 * being expired by the executor.
1474 */
1475 cp->c_xid |= CALLOUT_EXECUTING;
1476 cp->c_executor = curthread;
1477 mutex_exit(&ct->ct_mutex);
1478
1479 DTRACE_PROBE1(callout__start, callout_t *, cp);
1480 (*cp->c_func)(cp->c_arg);
1481 DTRACE_PROBE1(callout__end, callout_t *, cp);
1482
1483 mutex_enter(&ct->ct_mutex);
1484
1485 ct->ct_expirations++;
1486 ct->ct_timeouts_pending--;
1487 /*
1488 * Indicate completion for c_done.
1489 */
1490 cp->c_xid &= ~CALLOUT_EXECUTING;
1491 cp->c_executor = NULL;
1492 cnext = cp->c_clnext;
1493
1494 /*
1495 * Delete callout from ID hash table and the callout
1496 * list, return to freelist, and tell any untimeout() that
1497 * cares that we're done.
1498 */
1499 CALLOUT_DELETE(ct, cp);
1500 CALLOUT_FREE(ct, cp);
1501
1502 if (cp->c_waiting) {
1503 cp->c_waiting = 0;
1504 cv_broadcast(&cp->c_done);
1505 }
1506 }
1507 }
1508
1509 /*
1510 * Execute all expired callout lists for a callout table.
1511 */
1512 static void
1513 callout_expire(callout_table_t *ct)
1514 {
1515 callout_list_t *cl, *clnext;
1516
1517 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1518
1519 for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1520 /*
1521 * Expire all the callouts in this callout list.
1522 */
1523 callout_list_expire(ct, cl);
1524
1525 clnext = cl->cl_next;
1526 if (cl->cl_callouts.ch_head == NULL) {
1527 /*
1528 * Free the callout list.
1529 */
1530 CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1531 CALLOUT_LIST_FREE(ct, cl);
1532 }
1533 }
1534 }
1535
1536 /*
1537 * The cyclic handlers below process callouts in two steps:
1538 *
1539 * 1. Find all expired callout lists and queue them in a separate
1540 * list of expired callouts.
1541 * 2. Execute the expired callout lists.
1542 *
1543 * This is done for two reasons:
1544 *
1545 * 1. We want to quickly find the next earliest expiration to program
1546 * the cyclic to and reprogram it. We can do this right at the end
1547 * of step 1.
1548 * 2. The realtime cyclic handler expires callouts in place. However,
1549 * for normal callouts, callouts are expired by a taskq thread.
1550 * So, it is simpler and more robust to have the taskq thread just
1551 * do step 2.
1552 */
1553
1554 /*
1555 * Realtime callout cyclic handlers.
1556 */
1557 void
1558 callout_realtime(callout_table_t *ct)
1559 {
1560 mutex_enter(&ct->ct_mutex);
1561 (void) callout_heap_delete(ct);
1562 callout_expire(ct);
1563 mutex_exit(&ct->ct_mutex);
1564 }
1565
1566 void
1567 callout_queue_realtime(callout_table_t *ct)
1568 {
1569 mutex_enter(&ct->ct_mutex);
1570 (void) callout_queue_delete(ct);
1571 callout_expire(ct);
1572 mutex_exit(&ct->ct_mutex);
1573 }
1574
1575 void
1576 callout_execute(callout_table_t *ct)
1577 {
1578 mutex_enter(&ct->ct_mutex);
1579 callout_expire(ct);
1580 mutex_exit(&ct->ct_mutex);
1581 }
1582
1583 /*
1584 * Normal callout cyclic handlers.
1585 */
1586 void
1587 callout_normal(callout_table_t *ct)
1588 {
1589 int i, exec;
1590 hrtime_t exp;
1591
1592 mutex_enter(&ct->ct_mutex);
1593 exp = callout_heap_delete(ct);
1594 CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1595 mutex_exit(&ct->ct_mutex);
1596
1597 for (i = 0; i < exec; i++) {
1598 ASSERT(ct->ct_taskq != NULL);
1599 (void) taskq_dispatch(ct->ct_taskq,
1600 (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1601 }
1602 }
1603
1604 void
1605 callout_queue_normal(callout_table_t *ct)
1606 {
1607 int i, exec;
1608 hrtime_t exp;
1609
1610 mutex_enter(&ct->ct_mutex);
1611 exp = callout_queue_delete(ct);
1612 CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1613 mutex_exit(&ct->ct_mutex);
1614
1615 for (i = 0; i < exec; i++) {
1616 ASSERT(ct->ct_taskq != NULL);
1617 (void) taskq_dispatch(ct->ct_taskq,
1618 (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1619 }
1620 }
1621
1622 /*
1623 * Suspend callout processing.
1624 */
1625 static void
1626 callout_suspend(void)
1627 {
1628 int t, f;
1629 callout_table_t *ct;
1630
1631 /*
1632 * Traverse every callout table in the system and suspend callout
1633 * processing.
1634 *
1635 * We need to suspend all the tables (including the inactive ones)
1636 * so that if a table is made active while the suspend is still on,
1637 * the table remains suspended.
1638 */
1639 for (f = 0; f < max_ncpus; f++) {
1640 for (t = 0; t < CALLOUT_NTYPES; t++) {
1641 ct = &callout_table[CALLOUT_TABLE(t, f)];
1642
1643 mutex_enter(&ct->ct_mutex);
1644 ct->ct_suspend++;
1645 if (ct->ct_cyclic == CYCLIC_NONE) {
1646 mutex_exit(&ct->ct_mutex);
1647 continue;
1648 }
1649 if (ct->ct_suspend == 1) {
1650 (void) cyclic_reprogram(ct->ct_cyclic,
1651 CY_INFINITY);
1652 (void) cyclic_reprogram(ct->ct_qcyclic,
1653 CY_INFINITY);
1654 }
1655 mutex_exit(&ct->ct_mutex);
1656 }
1657 }
1658 }
1659
1660 /*
1661 * Resume callout processing.
1662 */
1663 static void
1664 callout_resume(hrtime_t delta, int timechange)
1665 {
1666 hrtime_t hexp, qexp;
1667 int t, f;
1668 callout_table_t *ct;
1669
1670 /*
1671 * Traverse every callout table in the system and resume callout
1672 * processing. For active tables, perform any hrtime adjustments
1673 * necessary.
1674 */
1675 for (f = 0; f < max_ncpus; f++) {
1676 for (t = 0; t < CALLOUT_NTYPES; t++) {
1677 ct = &callout_table[CALLOUT_TABLE(t, f)];
1678
1679 mutex_enter(&ct->ct_mutex);
1680 if (ct->ct_cyclic == CYCLIC_NONE) {
1681 ct->ct_suspend--;
1682 mutex_exit(&ct->ct_mutex);
1683 continue;
1684 }
1685
1686 /*
1687 * If a delta is specified, adjust the expirations in
1688 * the heap by delta. Also, if the caller indicates
1689 * a timechange, process that. This step also cleans
1690 * out any empty callout lists that might happen to
1691 * be there.
1692 */
1693 hexp = callout_heap_process(ct, delta, timechange);
1694 qexp = callout_queue_process(ct, delta, timechange);
1695
1696 ct->ct_suspend--;
1697 if (ct->ct_suspend == 0) {
1698 (void) cyclic_reprogram(ct->ct_cyclic, hexp);
1699 (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1700 }
1701
1702 mutex_exit(&ct->ct_mutex);
1703 }
1704 }
1705 }
1706
1707 /*
1708 * Callback handler used by CPR to stop and resume callouts.
1709 * The cyclic subsystem saves and restores hrtime during CPR.
1710 * That is why callout_resume() is called with a 0 delta.
1711 * Although hrtime is the same, hrestime (system time) has
1712 * progressed during CPR. So, we have to indicate a time change
1713 * to expire the absolute hrestime timers.
1714 */
1715 /*ARGSUSED*/
1716 static boolean_t
1717 callout_cpr_callb(void *arg, int code)
1718 {
1719 if (code == CB_CODE_CPR_CHKPT)
1720 callout_suspend();
1721 else
1722 callout_resume(0, 1);
1723
1724 return (B_TRUE);
1725 }
1726
1727 /*
1728 * Callback handler invoked when the debugger is entered or exited.
1729 */
1730 /*ARGSUSED*/
1731 static boolean_t
1732 callout_debug_callb(void *arg, int code)
1733 {
1734 hrtime_t delta;
1735
1736 /*
1737 * When the system enters the debugger, make a note of the hrtime.
1738 * When it is resumed, compute how long the system was in the
1739 * debugger. This interval should not be counted for callouts.
1740 */
1741 if (code == 0) {
1742 callout_suspend();
1743 callout_debug_hrtime = gethrtime();
1744 } else {
1745 delta = gethrtime() - callout_debug_hrtime;
1746 callout_resume(delta, 0);
1747 }
1748
1749 return (B_TRUE);
1750 }
1751
1752 /*
1753 * Move the absolute hrestime callouts to the expired list. Then program the
1754 * table's cyclic to expire immediately so that the callouts can be executed
1755 * immediately.
1756 */
1757 static void
1758 callout_hrestime_one(callout_table_t *ct)
1759 {
1760 hrtime_t hexp, qexp;
1761
1762 mutex_enter(&ct->ct_mutex);
1763 if (ct->ct_cyclic == CYCLIC_NONE) {
1764 mutex_exit(&ct->ct_mutex);
1765 return;
1766 }
1767
1768 /*
1769 * Walk the heap and process all the absolute hrestime entries.
1770 */
1771 hexp = callout_heap_process(ct, 0, 1);
1772 qexp = callout_queue_process(ct, 0, 1);
1773
1774 if (ct->ct_suspend == 0) {
1775 (void) cyclic_reprogram(ct->ct_cyclic, hexp);
1776 (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1777 }
1778
1779 mutex_exit(&ct->ct_mutex);
1780 }
1781
1782 /*
1783 * This function is called whenever system time (hrestime) is changed
1784 * explicitly. All the HRESTIME callouts must be expired at once.
1785 */
1786 /*ARGSUSED*/
1787 void
1788 callout_hrestime(void)
1789 {
1790 int t, f;
1791 callout_table_t *ct;
1792
1793 /*
1794 * Traverse every callout table in the system and process the hrestime
1795 * callouts therein.
1796 *
1797 * We look at all the tables because we don't know which ones were
1798 * onlined and offlined in the past. The offlined tables may still
1799 * have active cyclics processing timers somewhere.
1800 */
1801 for (f = 0; f < max_ncpus; f++) {
1802 for (t = 0; t < CALLOUT_NTYPES; t++) {
1803 ct = &callout_table[CALLOUT_TABLE(t, f)];
1804 callout_hrestime_one(ct);
1805 }
1806 }
1807 }
1808
1809 /*
1810 * Create the hash tables for this callout table.
1811 */
1812 static void
1813 callout_hash_init(callout_table_t *ct)
1814 {
1815 size_t size;
1816
1817 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1818 ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1819
1820 size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1821 ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1822 ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1823 }
1824
1825 /*
1826 * Create per-callout table kstats.
1827 */
1828 static void
1829 callout_kstat_init(callout_table_t *ct)
1830 {
1831 callout_stat_type_t stat;
1832 kstat_t *ct_kstats;
1833 int ndx;
1834
1835 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1836 ASSERT(ct->ct_kstats == NULL);
1837
1838 ndx = ct - callout_table;
1839 ct_kstats = kstat_create("unix", ndx, "callout",
1840 "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1841
1842 if (ct_kstats == NULL) {
1843 cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1844 (void *)ct);
1845 } else {
1846 ct_kstats->ks_data = ct->ct_kstat_data;
1847 for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1848 kstat_named_init(&ct->ct_kstat_data[stat],
1849 callout_kstat_names[stat], KSTAT_DATA_INT64);
1850 ct->ct_kstats = ct_kstats;
1851 kstat_install(ct_kstats);
1852 }
1853 }
1854
1855 static void
1856 callout_cyclic_init(callout_table_t *ct)
1857 {
1858 cyc_handler_t hdlr;
1859 cyc_time_t when;
1860 processorid_t seqid;
1861 int t;
1862 cyclic_id_t cyclic, qcyclic;
1863
1864 ASSERT(MUTEX_HELD(&ct->ct_mutex));
1865
1866 t = ct->ct_type;
1867 seqid = CALLOUT_TABLE_SEQID(ct);
1868
1869 /*
1870 * Create the taskq thread if the table type is normal.
1871 * Realtime tables are handled at PIL1 by a softint
1872 * handler.
1873 */
1874 if (t == CALLOUT_NORMAL) {
1875 ASSERT(ct->ct_taskq == NULL);
1876 /*
1877 * Each callout thread consumes exactly one
1878 * task structure while active. Therefore,
1879 * prepopulating with 2 * callout_threads tasks
1880 * ensures that there's at least one task per
1881 * thread that's either scheduled or on the
1882 * freelist. In turn, this guarantees that
1883 * taskq_dispatch() will always either succeed
1884 * (because there's a free task structure) or
1885 * be unnecessary (because "callout_execute(ct)"
1886 * has already been scheduled).
1887 */
1888 ct->ct_taskq =
1889 taskq_create_instance("callout_taskq", seqid,
1890 callout_threads, maxclsyspri,
1891 2 * callout_threads, 2 * callout_threads,
1892 TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1893 }

	/*
	 * callouts can only be created in a table whose
	 * cyclic has been initialized.
	 */
	ASSERT(ct->ct_heap_num == 0);

	/*
	 * Drop the mutex before creating the callout cyclics. cyclic_add()
	 * could potentially expand the cyclic heap. We don't want to be
	 * holding the callout table mutex in that case. Note that this
	 * function is called during CPU online. cpu_lock is held at this
	 * point. So, only one thread can be executing the cyclic add logic
	 * below at any time.
	 */
	mutex_exit(&ct->ct_mutex);

	/*
	 * Create the callout table cyclics.
	 *
	 * The realtime cyclic handler executes at low PIL. The normal cyclic
	 * handler executes at lock PIL. This is because there are cases
	 * where code can block at PIL > 1 waiting for a normal callout handler
	 * to unblock it directly or indirectly. If the normal cyclic were to
	 * be executed at low PIL, it could get blocked out by the waiter
	 * and cause a deadlock.
	 */
	ASSERT(ct->ct_cyclic == CYCLIC_NONE);

	if (t == CALLOUT_REALTIME) {
		hdlr.cyh_level = callout_realtime_level;
		hdlr.cyh_func = (cyc_func_t)callout_realtime;
	} else {
		hdlr.cyh_level = callout_normal_level;
		hdlr.cyh_func = (cyc_func_t)callout_normal;
	}
	hdlr.cyh_arg = ct;
	when.cyt_when = CY_INFINITY;
	when.cyt_interval = CY_INFINITY;
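
	/*
	 * Both cyt_when and cyt_interval are CY_INFINITY, so the cyclics
	 * added below start out dormant: they fire only once the callout
	 * expiration code programs a real expiration time into them
	 * (typically via cyclic_reprogram()) when a callout is scheduled.
	 */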

	cyclic = cyclic_add(&hdlr, &when);

	if (t == CALLOUT_REALTIME)
		hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
	else
		hdlr.cyh_func = (cyc_func_t)callout_queue_normal;

	qcyclic = cyclic_add(&hdlr, &when);

	mutex_enter(&ct->ct_mutex);
	ct->ct_cyclic = cyclic;
	ct->ct_qcyclic = qcyclic;
}

void
callout_cpu_online(cpu_t *cp)
{
	lgrp_handle_t hand;
	callout_cache_t *cache;
	char s[KMEM_CACHE_NAMELEN];
	callout_table_t *ct;
	processorid_t seqid;
	int t;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Locate the cache corresponding to the onlined CPU's lgroup.
	 * Note that access to callout_caches is protected by cpu_lock.
	 */
	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
		if (cache->cc_hand == hand)
			break;
	}

	/*
	 * If not found, create one. The caches are never destroyed.
	 */
	if (cache == NULL) {
		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
		cache->cc_hand = hand;
		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
		    (long)hand);
		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
		    (long)hand);
		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
		cache->cc_next = callout_caches;
		callout_caches = cache;
	}
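
	/*
	 * Illustrative note: the cache names above embed the lgroup handle
	 * in hex, so a handle of 0x1, for example, yields "callout_cache1"
	 * and "callout_lcache1". This makes per-lgroup callout allocations
	 * easy to tell apart in kmem observability tools such as the
	 * ::kmastat dcmd in mdb. The handle value here is only an example.
	 */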

	seqid = cp->cpu_seqid;

	for (t = 0; t < CALLOUT_NTYPES; t++) {
		ct = &callout_table[CALLOUT_TABLE(t, seqid)];

		mutex_enter(&ct->ct_mutex);
		/*
		 * Store convenience pointers to the kmem caches
		 * in the callout table. These assignments must be redone
		 * on every online because a callout table can map to a
		 * different physical CPU each time.
		 */
		ct->ct_cache = cache->cc_cache;
		ct->ct_lcache = cache->cc_lcache;

		/*
		 * We use the heap pointer to check whether this callout
		 * table has already been initialized.
		 */
		if (ct->ct_heap == NULL) {
			callout_heap_init(ct);
			callout_hash_init(ct);
			callout_kstat_init(ct);
			callout_cyclic_init(ct);
		}

		mutex_exit(&ct->ct_mutex);

		/*
		 * Move the cyclics to this CPU by doing a bind.
		 */
		cyclic_bind(ct->ct_cyclic, cp, NULL);
		cyclic_bind(ct->ct_qcyclic, cp, NULL);
	}
}

void
callout_cpu_offline(cpu_t *cp)
{
	callout_table_t *ct;
	processorid_t seqid;
	int t;

	ASSERT(MUTEX_HELD(&cpu_lock));

	seqid = cp->cpu_seqid;

	for (t = 0; t < CALLOUT_NTYPES; t++) {
		ct = &callout_table[CALLOUT_TABLE(t, seqid)];

		/*
		 * Unbind the cyclics. This will allow the cyclic subsystem
		 * to juggle the cyclics during CPU offline.
		 */
		cyclic_bind(ct->ct_cyclic, NULL, NULL);
		cyclic_bind(ct->ct_qcyclic, NULL, NULL);
	}
}

/*
 * This is called to perform per-CPU initialization for slave CPUs at
 * boot time.
 */
void
callout_mp_init(void)
{
	cpu_t *cp;
	size_t min, max;

	if (callout_chunk == CALLOUT_CHUNK) {
		/*
		 * No one has specified a chunk in /etc/system. We need to
		 * compute it here based on the number of online CPUs and
		 * available physical memory.
		 */
		min = CALLOUT_MIN_HEAP_SIZE;
		max = ptob(physmem / CALLOUT_MEM_FRACTION);
		if (min > max)
			min = max;
		callout_chunk = min / sizeof (callout_heap_t);
		callout_chunk /= ncpus_online;
		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
	}
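
	/*
	 * Worked example of the computation above (all constants here are
	 * assumptions for illustration): if CALLOUT_MIN_HEAP_SIZE were 64 KB,
	 * ptob(physmem / CALLOUT_MEM_FRACTION) larger than that,
	 * sizeof (callout_heap_t) 16 bytes and 4 CPUs online, then
	 *
	 *	callout_chunk = (65536 / 16) / 4 = 1024
	 *
	 * before being rounded up to the next multiple of CALLOUT_CHUNK.
	 */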

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		callout_cpu_online(cp);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	mutex_exit(&cpu_lock);
}

/*
 * Initialize all callout tables. Called at boot time just before clkstart().
 */
void
callout_init(void)
{
	int f, t;
	size_t size;
	int table_id;
	callout_table_t *ct;
	long bits, fanout;
	uintptr_t buf;

	/*
	 * Initialize callout globals.
	 */
	bits = 0;
	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
		bits++;
	callout_table_bits = CALLOUT_TYPE_BITS + bits;
	callout_table_mask = (1 << callout_table_bits) - 1;
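	/*
	 * Worked example of the table-bits math above (illustrative; the
	 * value of CALLOUT_TYPE_BITS is an assumption): with max_ncpus = 48,
	 * the loop stops once fanout reaches 64, so bits = 6. If
	 * CALLOUT_TYPE_BITS is 1 (realtime vs. normal), callout_table_bits
	 * becomes 7 and callout_table_mask 0x7f, i.e. the low 7 bits of a
	 * callout ID identify the owning table.
	 */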
	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
	callout_max_ticks = CALLOUT_MAX_TICKS;
	if (callout_min_reap == 0)
		callout_min_reap = CALLOUT_MIN_REAP;

	if (callout_tolerance <= 0)
		callout_tolerance = CALLOUT_TOLERANCE;
	if (callout_threads <= 0)
		callout_threads = CALLOUT_THREADS;
	if (callout_chunk <= 0)
		callout_chunk = CALLOUT_CHUNK;
	else
		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);

	/*
	 * Allocate all the callout tables based on max_ncpus. We have chosen
	 * to do boot-time allocation instead of dynamic allocation because:
	 *
	 * - the size of the callout tables is not too large.
	 * - there are race conditions involved in making this dynamic.
	 * - the hash tables that go with the callout tables consume
	 *   most of the memory and they are only allocated in
	 *   callout_cpu_online().
	 *
	 * Each CPU has two tables that are consecutive in the array. The first
	 * one is for realtime callouts and the second one is for normal ones.
	 *
	 * We do this alignment dance to make sure that callout table
	 * structures will always be on a cache line boundary.
	 */
	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
	size += CALLOUT_ALIGN;
	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
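
	/*
	 * Alignment sketch (illustrative; the value of CALLOUT_ALIGN is an
	 * assumption): kmem_zalloc() does not promise cache-line alignment,
	 * so CALLOUT_ALIGN extra bytes are allocated and the start rounded
	 * up. For example, with CALLOUT_ALIGN at 64 and buf returned as
	 * 0x...1028,
	 *
	 *	callout_table = P2ROUNDUP(0x...1028, 64) = 0x...1040
	 *
	 * which puts the table array on a 64-byte (cache line) boundary.
	 */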

	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
	/*
	 * Now, initialize the tables for all the CPUs.
	 */
	for (f = 0; f < max_ncpus; f++) {
		for (t = 0; t < CALLOUT_NTYPES; t++) {
			table_id = CALLOUT_TABLE(t, f);
			ct = &callout_table[table_id];
			ct->ct_type = t;
			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
			/*
			 * Precompute the base IDs for long and short-term
			 * legacy IDs. This makes ID generation during
			 * timeout() fast.
			 */
			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
			/*
			 * Precompute the base ID for generation-based IDs.
			 * Note that when the first ID gets allocated, the
			 * ID will wrap. This will cause the generation
			 * number to be incremented to 1.
			 */
			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
			/*
			 * Initialize the cyclics as NONE. This will get set
			 * during CPU online. This is so that partially
			 * populated systems will only have the required
			 * number of cyclics, not more.
			 */
			ct->ct_cyclic = CYCLIC_NONE;
			ct->ct_qcyclic = CYCLIC_NONE;
			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
		}
	}

	/*
	 * Add the callback for CPR. This is called during checkpoint
	 * resume to suspend and resume callouts.
	 */
	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
	    "callout_cpr");
	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
	    "callout_debug");

	/*
	 * Call the per-CPU initialization function for the boot CPU. This
	 * is done here because the function is not called automatically for
	 * the boot CPU from the CPU online/offline hooks. Note that
	 * cpu_lock is taken here by convention.
	 */
	mutex_enter(&cpu_lock);
	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
	callout_cpu_online(CPU);
	mutex_exit(&cpu_lock);

	/* heads-up to boot-time clients that timeouts now available */
	callout_init_done = 1;
}
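
/*
 * Usage sketch (illustrative only): once callout_init_done is set, boot-time
 * clients can use the regular timeout(9F) interfaces backed by these tables,
 * e.g.:
 *
 *	void my_expire(void *arg);		-- hypothetical handler
 *	timeout_id_t id;
 *
 *	id = timeout(my_expire, arg, drv_usectohz(1000000));
 *	...
 *	(void) untimeout(id);
 *
 * my_expire and arg are placeholders for a client's own handler and argument;
 * timeout() and untimeout() are the timeout(9F) entry points provided by this
 * subsystem.
 */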