xref: /illumos-gate/usr/src/uts/common/os/callout.c (revision fc8ae2ec4282de7ec96f48e11078345f3dc0ac3d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2016 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 int callout_init_done;				/* useful during boot */
41 
42 /*
43  * Callout tables.  See timeout(9F) for details.
44  */
45 static int callout_threads;			/* callout normal threads */
46 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
47 static int callout_chunk;			/* callout heap chunk size */
48 static int callout_min_reap;			/* callout minimum reap count */
49 static int callout_tolerance;			/* callout hires tolerance */
50 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
51 static clock_t callout_max_ticks;		/* max interval */
52 static hrtime_t callout_longterm;		/* longterm nanoseconds */
53 static ulong_t callout_counter_low;		/* callout ID increment */
54 static ulong_t callout_table_bits;		/* number of table bits in ID */
55 static ulong_t callout_table_mask;		/* mask for the table bits */
56 static callout_cache_t *callout_caches;		/* linked list of caches */
57 #pragma align 64(callout_table)
58 static callout_table_t *callout_table;		/* global callout table array */
59 
60 /*
61  * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
62  * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
63  * via taskq, to a thread that executes at PIL 0 - so we end up running
64  * 'normal' callouts at PIL 0.
65  */
66 static volatile int callout_realtime_level = CY_LOW_LEVEL;
67 static volatile int callout_normal_level = CY_LOCK_LEVEL;
68 
69 static char *callout_kstat_names[] = {
70 	"callout_timeouts",
71 	"callout_timeouts_pending",
72 	"callout_untimeouts_unexpired",
73 	"callout_untimeouts_executing",
74 	"callout_untimeouts_expired",
75 	"callout_expirations",
76 	"callout_allocations",
77 	"callout_cleanups",
78 };
79 
80 static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);
81 
/*
 * Insert cp at the head of the doubly-linked list anchored at hash.
 *
 *	hash	callout_hash_t lvalue holding the list's head and tail
 *	cp	pointer to the element being inserted
 *	cnext	name of the element's forward link field
 *	cprev	name of the element's backward link field
 *
 * cp is parenthesized so that pointer expressions (e.g. base + 1) expand
 * correctly. It is expanded more than once, so it must not have side
 * effects.
 */
#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cprev = NULL;				\
	(cp)->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = (cp);			\
	else						\
		(cp)->cnext->cprev = (cp);		\
	hashp->ch_head = (cp);				\
}
94 
/*
 * Append cp at the tail of the doubly-linked list anchored at hash.
 *
 *	hash	callout_hash_t lvalue holding the list's head and tail
 *	cp	pointer to the element being appended
 *	cnext	name of the element's forward link field
 *	cprev	name of the element's backward link field
 *
 * cp is parenthesized so that pointer expressions (e.g. base + 1) expand
 * correctly. It is expanded more than once, so it must not have side
 * effects.
 */
#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cnext = NULL;				\
	(cp)->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = (cp);			\
	else						\
		(cp)->cprev->cnext = (cp);		\
	hashp->ch_tail = (cp);				\
}
107 
/*
 * Unlink cp from the doubly-linked list anchored at hash. The head and
 * tail of the list are updated when cp is the first or last element.
 * cp's own link fields are left untouched.
 *
 *	hash	callout_hash_t lvalue holding the list's head and tail
 *	cp	pointer to the element being removed
 *	cnext	name of the element's forward link field
 *	cprev	name of the element's backward link field
 *
 * cp is parenthesized so that pointer expressions (e.g. base + 1) expand
 * correctly. It is expanded more than once, so it must not have side
 * effects.
 */
#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if ((cp)->cnext == NULL)			\
		hashp->ch_tail = (cp)->cprev;		\
	else						\
		(cp)->cnext->cprev = (cp)->cprev;	\
	if ((cp)->cprev == NULL)			\
		hashp->ch_head = (cp)->cnext;		\
	else						\
		(cp)->cprev->cnext = (cp)->cnext;	\
}
121 
122 /*
123  * These definitions help us queue callouts and callout lists. Here is
124  * the queueing rationale:
125  *
126  *	- callouts are queued in a FIFO manner in the ID hash table.
127  *	  TCP timers are typically cancelled in the same order that they
128  *	  were issued. The FIFO queueing shortens the search for a callout
129  *	  during untimeout().
130  *
131  *	- callouts are queued in a FIFO manner in their callout lists.
132  *	  This ensures that the callouts are executed in the same order that
133  *	  they were queued. This is fair. Plus, it helps to make each
134  *	  callout expiration timely. It also favors cancellations.
135  *
136  *	- callout lists are queued in the following manner in the callout
137  *	  hash table buckets:
138  *
139  *		- appended, if the callout list is a 1-nanosecond resolution
140  *		  callout list. When a callout is created, we first look for
141  *		  a callout list that has the same expiration so we can avoid
142  *		  allocating a callout list and inserting the expiration into
143  *		  the heap. However, we do not want to look at 1-nanosecond
144  *		  resolution callout lists as we will seldom find a match in
145  *		  them. Keeping these callout lists in the rear of the hash
146  *		  buckets allows us to skip these during the lookup.
147  *
148  *		- inserted at the beginning, if the callout list is not a
149  *		  1-nanosecond resolution callout list. This also has the
150  *		  side-effect of keeping the long term timers away from the
151  *		  front of the buckets.
152  *
153  *	- callout lists are queued in a FIFO manner in the expired callouts
154  *	  list. This ensures that callout lists are executed in the order
155  *	  of expiration.
156  */
/*
 * Queue callout cp at the tail of its ID hash bucket in table ct and at
 * the tail of the callout list it points to (cp->c_list).
 *
 * Both steps are enclosed in one block so that the macro behaves as a
 * single statement (for instance, under an unbraced if). ct and cp are
 * expanded more than once, so they must not have side effects.
 */
#define	CALLOUT_APPEND(ct, cp)						\
{									\
	CALLOUT_HASH_APPEND((ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)], \
	    (cp), c_idnext, c_idprev);					\
	CALLOUT_HASH_APPEND((cp)->c_list->cl_callouts, (cp), c_clnext,	\
	    c_clprev);							\
}
161 
/*
 * Remove callout cp from its ID hash bucket in table ct and from the
 * callout list it points to (cp->c_list).
 *
 * Both steps are enclosed in one block so that the macro behaves as a
 * single statement (for instance, under an unbraced if). ct and cp are
 * expanded more than once, so they must not have side effects.
 */
#define	CALLOUT_DELETE(ct, cp)						\
{									\
	CALLOUT_HASH_DELETE((ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)], \
	    (cp), c_idnext, c_idprev);					\
	CALLOUT_HASH_DELETE((cp)->c_list->cl_callouts, (cp), c_clnext,	\
	    c_clprev);							\
}
166 
/*
 * Callout list flavors of the generic hash macros. Callout lists are
 * linked through their cl_next and cl_prev fields; bucket is the
 * callout_hash_t that anchors the list and clp is the callout list.
 */
#define	CALLOUT_LIST_INSERT(bucket, clp)			\
	CALLOUT_HASH_INSERT(bucket, clp, cl_next, cl_prev)

#define	CALLOUT_LIST_APPEND(bucket, clp)			\
	CALLOUT_HASH_APPEND(bucket, clp, cl_next, cl_prev)

#define	CALLOUT_LIST_DELETE(bucket, clp)			\
	CALLOUT_HASH_DELETE(bucket, clp, cl_next, cl_prev)
175 
/*
 * Link callout list cl immediately in front of nextcl.
 *
 * Only the neighboring links are updated; no list head is touched, so
 * this must not be used to place cl in front of the first element of a
 * list. Both arguments are expanded more than once and must not have side
 * effects.
 */
#define	CALLOUT_LIST_BEFORE(cl, nextcl)			\
{							\
	(cl)->cl_prev = (nextcl)->cl_prev;		\
	(cl)->cl_next = (nextcl);			\
	(nextcl)->cl_prev = (cl);			\
	if ((cl)->cl_prev != NULL)			\
		(cl)->cl_prev->cl_next = (cl);		\
}
184 
/*
 * For normal callouts, there is a deadlock scenario if two callouts that
 * have an inter-dependency end up on the same callout list. To break the
 * deadlock, you need two taskq threads running in parallel. We compute
 * the number of taskq threads here using a bunch of conditions to make
 * it optimal for the common case. This is an ugly hack, but one that is
 * necessary (sigh).
 *
 *	ct	callout table whose expired list is examined
 *	nextexp	next expiration of the cyclic (hrtime)
 *	exec	lvalue that receives the thread count (0, 1 or 2)
 *
 * CALLOUT_THRESHOLD is the distance, in the units returned by gethrtime(),
 * beyond which the next expiration counts as being far in the future.
 */
#define	CALLOUT_THRESHOLD	100000000
#define	CALLOUT_EXEC_COMPUTE(ct, nextexp, exec)				\
{									\
	callout_list_t *cl;						\
									\
	cl = (ct)->ct_expired.ch_head;					\
	if (cl == NULL) {						\
		/*							\
		 * If the expired list is NULL, there is nothing to	\
		 * process.						\
		 */							\
		(exec) = 0;						\
	} else if ((cl->cl_next == NULL) &&				\
	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
		/*							\
		 * If there is only one callout list and it contains	\
		 * only one callout, there is no need for two threads.	\
		 */							\
		(exec) = 1;						\
	} else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) {	\
		/*							\
		 * If the next expiration of the cyclic is way out into	\
		 * the future, we need two threads.			\
		 */							\
		(exec) = 2;						\
	} else {							\
		/*							\
		 * We have multiple callouts to process. But the cyclic	\
		 * will fire in the near future. So, we only need one	\
		 * thread for now.					\
		 */							\
		(exec) = 1;						\
	}								\
}
227 
/*
 * Macro to swap two heap items. h1 and h2 are pointers to callout_heap_t;
 * the pointed-to structures are exchanged by value. Both arguments are
 * parenthesized so that pointer expressions (e.g. base + 1) expand
 * correctly, and both are expanded twice, so they must not have side
 * effects.
 */
#define	CALLOUT_SWAP(h1, h2)		\
{					\
	callout_heap_t tmp;		\
					\
	tmp = *(h1);			\
	*(h1) = *(h2);			\
	*(h2) = tmp;			\
}
239 
/*
 * Macro to free a callout list. The list is pushed onto the front of the
 * table's free list (ct_lfree), which is threaded through cl_next, and is
 * marked with CALLOUT_LIST_FLAG_FREE. Both arguments are expanded more
 * than once and must not have side effects.
 */
#define	CALLOUT_LIST_FREE(ct, cl)			\
{							\
	(cl)->cl_next = (ct)->ct_lfree;			\
	(ct)->ct_lfree = (cl);				\
	(cl)->cl_flags |= CALLOUT_LIST_FLAG_FREE;	\
}
249 
/*
 * Macro to free a callout. The callout is pushed onto the front of the
 * table's free list (ct_free), which is threaded through c_idnext, and its
 * ID is marked with CALLOUT_ID_FREE.
 *
 * The second parameter is named cp to match the body. It was previously
 * declared as cl, which made the macro operate on whatever variable called
 * cp happened to be in scope at the call site. Both arguments are expanded
 * more than once and must not have side effects.
 */
#define	CALLOUT_FREE(ct, cp)			\
{						\
	(cp)->c_idnext = (ct)->ct_free;		\
	(ct)->ct_free = (cp);			\
	(cp)->c_xid |= CALLOUT_ID_FREE;		\
}
259 
260 /*
261  * Allocate a callout structure.  We try quite hard because we
262  * can't sleep, and if we can't do the allocation, we're toast.
263  * Failing all, we try a KM_PANIC allocation. Note that we never
264  * deallocate a callout. See untimeout() for the reasoning.
265  */
266 static callout_t *
267 callout_alloc(callout_table_t *ct)
268 {
269 	size_t size;
270 	callout_t *cp;
271 
272 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
273 	mutex_exit(&ct->ct_mutex);
274 
275 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
276 	if (cp == NULL) {
277 		size = sizeof (callout_t);
278 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
279 	}
280 	cp->c_xid = 0;
281 	cp->c_executor = NULL;
282 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
283 	cp->c_waiting = 0;
284 
285 	mutex_enter(&ct->ct_mutex);
286 	ct->ct_allocations++;
287 	return (cp);
288 }
289 
290 /*
291  * Allocate a callout list structure.  We try quite hard because we
292  * can't sleep, and if we can't do the allocation, we're toast.
293  * Failing all, we try a KM_PANIC allocation. Note that we never
294  * deallocate a callout list.
295  */
296 static void
297 callout_list_alloc(callout_table_t *ct)
298 {
299 	size_t size;
300 	callout_list_t *cl;
301 
302 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
303 	mutex_exit(&ct->ct_mutex);
304 
305 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
306 	if (cl == NULL) {
307 		size = sizeof (callout_list_t);
308 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
309 	}
310 	bzero(cl, sizeof (callout_list_t));
311 
312 	mutex_enter(&ct->ct_mutex);
313 	CALLOUT_LIST_FREE(ct, cl);
314 }
315 
316 /*
317  * Find a callout list that corresponds to an expiration and matching flags.
318  */
319 static callout_list_t *
320 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
321 {
322 	callout_list_t *cl;
323 	int clflags;
324 
325 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
326 
327 	if (flags & CALLOUT_LIST_FLAG_NANO) {
328 		/*
329 		 * This is a 1-nanosecond resolution callout. We will rarely
330 		 * find a match for this. So, bail out.
331 		 */
332 		return (NULL);
333 	}
334 
335 	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
336 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
337 		/*
338 		 * If we have reached a 1-nanosecond resolution callout list,
339 		 * we don't have much hope of finding a match in this hash
340 		 * bucket. So, just bail out.
341 		 */
342 		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
343 			return (NULL);
344 
345 		if ((cl->cl_expiration == expiration) &&
346 		    ((cl->cl_flags & clflags) == (flags & clflags)))
347 			return (cl);
348 	}
349 
350 	return (NULL);
351 }
352 
353 /*
354  * Add a new callout list into a callout table's queue in sorted order by
355  * expiration.
356  */
357 static int
358 callout_queue_add(callout_table_t *ct, callout_list_t *cl)
359 {
360 	callout_list_t *nextcl;
361 	hrtime_t expiration;
362 
363 	expiration = cl->cl_expiration;
364 	nextcl = ct->ct_queue.ch_head;
365 	if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
366 		CALLOUT_LIST_INSERT(ct->ct_queue, cl);
367 		return (1);
368 	}
369 
370 	while (nextcl != NULL) {
371 		if (expiration < nextcl->cl_expiration) {
372 			CALLOUT_LIST_BEFORE(cl, nextcl);
373 			return (0);
374 		}
375 		nextcl = nextcl->cl_next;
376 	}
377 	CALLOUT_LIST_APPEND(ct->ct_queue, cl);
378 
379 	return (0);
380 }
381 
382 /*
383  * Insert a callout list into a callout table's queue and reprogram the queue
384  * cyclic if needed.
385  */
386 static void
387 callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
388 {
389 	cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
390 
391 	/*
392 	 * Add the callout to the callout queue. If it ends up at the head,
393 	 * the cyclic needs to be reprogrammed as we have an earlier
394 	 * expiration.
395 	 *
396 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
397 	 * We don't want any callout activity. When the CPR resume phase is
398 	 * entered, the cyclic will be programmed for the earliest expiration
399 	 * in the queue.
400 	 */
401 	if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
402 		(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
403 }
404 
405 /*
406  * Delete and handle all past expirations in a callout table's queue.
407  */
408 static hrtime_t
409 callout_queue_delete(callout_table_t *ct)
410 {
411 	callout_list_t *cl;
412 	hrtime_t now;
413 
414 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
415 
416 	now = gethrtime();
417 	while ((cl = ct->ct_queue.ch_head) != NULL) {
418 		if (cl->cl_expiration > now)
419 			break;
420 		cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
421 		CALLOUT_LIST_DELETE(ct->ct_queue, cl);
422 		CALLOUT_LIST_APPEND(ct->ct_expired, cl);
423 	}
424 
425 	/*
426 	 * If this callout queue is empty or callouts have been suspended,
427 	 * just return.
428 	 */
429 	if ((cl == NULL) || (ct->ct_suspend > 0))
430 		return (CY_INFINITY);
431 
432 	(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
433 
434 	return (cl->cl_expiration);
435 }
436 
437 static hrtime_t
438 callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
439 {
440 	callout_list_t *firstcl, *cl;
441 	hrtime_t expiration, now;
442 	int clflags;
443 	callout_hash_t temp;
444 
445 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
446 
447 	firstcl = ct->ct_queue.ch_head;
448 	if (firstcl == NULL)
449 		return (CY_INFINITY);
450 
451 	/*
452 	 * We walk the callout queue. If we encounter a hrestime entry that
453 	 * must be removed, we clean it out. Otherwise, we apply any
454 	 * adjustments needed to it. Because of the latter, we need to
455 	 * recreate the list as we go along.
456 	 */
457 	temp = ct->ct_queue;
458 	ct->ct_queue.ch_head = NULL;
459 	ct->ct_queue.ch_tail = NULL;
460 
461 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
462 	now = gethrtime();
463 	while ((cl = temp.ch_head) != NULL) {
464 		CALLOUT_LIST_DELETE(temp, cl);
465 
466 		/*
467 		 * Delete the callout and expire it, if one of the following
468 		 * is true:
469 		 *	- the callout has expired
470 		 *	- the callout is an absolute hrestime one and
471 		 *	  there has been a system time change
472 		 */
473 		if ((cl->cl_expiration <= now) ||
474 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
475 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
476 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
477 			continue;
478 		}
479 
480 		/*
481 		 * Apply adjustments, if any. Adjustments are applied after
482 		 * the system returns from KMDB or OBP. They are only applied
483 		 * to relative callout lists.
484 		 */
485 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
486 			expiration = cl->cl_expiration + delta;
487 			if (expiration <= 0)
488 				expiration = CY_INFINITY;
489 			cl->cl_expiration = expiration;
490 		}
491 
492 		(void) callout_queue_add(ct, cl);
493 	}
494 
495 	/*
496 	 * We need to return the expiration to help program the cyclic.
497 	 * If there are expired callouts, the cyclic needs to go off
498 	 * immediately. If the queue has become empty, then we return infinity.
499 	 * Else, we return the expiration of the earliest callout in the queue.
500 	 */
501 	if (ct->ct_expired.ch_head != NULL)
502 		return (gethrtime());
503 
504 	cl = ct->ct_queue.ch_head;
505 	if (cl == NULL)
506 		return (CY_INFINITY);
507 
508 	return (cl->cl_expiration);
509 }
510 
511 /*
512  * Initialize a callout table's heap, if necessary. Preallocate some free
513  * entries so we don't have to check for NULL elsewhere.
514  */
515 static void
516 callout_heap_init(callout_table_t *ct)
517 {
518 	size_t size;
519 
520 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
521 	ASSERT(ct->ct_heap == NULL);
522 
523 	ct->ct_heap_num = 0;
524 	ct->ct_heap_max = callout_chunk;
525 	size = sizeof (callout_heap_t) * callout_chunk;
526 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
527 }
528 
529 /*
530  * Reallocate the heap. Return 0 if the heap is still full at the end of it.
531  * Return 1 otherwise. Note that the heap only expands, it never contracts.
532  */
533 static int
534 callout_heap_expand(callout_table_t *ct)
535 {
536 	size_t max, size, osize;
537 	callout_heap_t *heap;
538 
539 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
540 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
541 
542 	while (ct->ct_heap_num == ct->ct_heap_max) {
543 		max = ct->ct_heap_max;
544 		mutex_exit(&ct->ct_mutex);
545 
546 		osize = sizeof (callout_heap_t) * max;
547 		size = sizeof (callout_heap_t) * (max + callout_chunk);
548 		heap = kmem_alloc(size, KM_NOSLEEP);
549 
550 		mutex_enter(&ct->ct_mutex);
551 		if (heap == NULL) {
552 			/*
553 			 * We could not allocate memory. If we can free up
554 			 * some entries, that would be great.
555 			 */
556 			if (ct->ct_nreap > 0)
557 				(void) callout_heap_process(ct, 0, 0);
558 			/*
559 			 * If we still have no space in the heap, inform the
560 			 * caller.
561 			 */
562 			if (ct->ct_heap_num == ct->ct_heap_max)
563 				return (0);
564 			return (1);
565 		}
566 		if (max < ct->ct_heap_max) {
567 			/*
568 			 * Someone beat us to the allocation. Free what we
569 			 * just allocated and proceed.
570 			 */
571 			kmem_free(heap, size);
572 			continue;
573 		}
574 
575 		bcopy(ct->ct_heap, heap, osize);
576 		kmem_free(ct->ct_heap, osize);
577 		ct->ct_heap = heap;
578 		ct->ct_heap_max = size / sizeof (callout_heap_t);
579 	}
580 
581 	return (1);
582 }
583 
584 /*
585  * Move an expiration from the bottom of the heap to its correct place
586  * in the heap. If we reached the root doing this, return 1. Else,
587  * return 0.
588  */
589 static int
590 callout_upheap(callout_table_t *ct)
591 {
592 	int current, parent;
593 	callout_heap_t *heap, *hcurrent, *hparent;
594 
595 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
596 	ASSERT(ct->ct_heap_num >= 1);
597 
598 	if (ct->ct_heap_num == 1) {
599 		return (1);
600 	}
601 
602 	heap = ct->ct_heap;
603 	current = ct->ct_heap_num - 1;
604 
605 	for (;;) {
606 		parent = CALLOUT_HEAP_PARENT(current);
607 		hparent = &heap[parent];
608 		hcurrent = &heap[current];
609 
610 		/*
611 		 * We have an expiration later than our parent; we're done.
612 		 */
613 		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
614 			return (0);
615 		}
616 
617 		/*
618 		 * We need to swap with our parent, and continue up the heap.
619 		 */
620 		CALLOUT_SWAP(hparent, hcurrent);
621 
622 		/*
623 		 * If we just reached the root, we're done.
624 		 */
625 		if (parent == 0) {
626 			return (1);
627 		}
628 
629 		current = parent;
630 	}
631 	/*NOTREACHED*/
632 }
633 
634 /*
635  * Insert a new heap item into a callout table's heap.
636  */
637 static void
638 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
639 {
640 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
641 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
642 
643 	cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
644 	/*
645 	 * First, copy the expiration and callout list pointer to the bottom
646 	 * of the heap.
647 	 */
648 	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
649 	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
650 	ct->ct_heap_num++;
651 
652 	/*
653 	 * Now, perform an upheap operation. If we reached the root, then
654 	 * the cyclic needs to be reprogrammed as we have an earlier
655 	 * expiration.
656 	 *
657 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
658 	 * We don't want any callout activity. When the CPR resume phase is
659 	 * entered, the cyclic will be programmed for the earliest expiration
660 	 * in the heap.
661 	 */
662 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
663 		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
664 }
665 
666 /*
667  * Move an expiration from the top of the heap to its correct place
668  * in the heap.
669  */
670 static void
671 callout_downheap(callout_table_t *ct)
672 {
673 	int current, left, right, nelems;
674 	callout_heap_t *heap, *hleft, *hright, *hcurrent;
675 
676 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
677 	ASSERT(ct->ct_heap_num >= 1);
678 
679 	heap = ct->ct_heap;
680 	current = 0;
681 	nelems = ct->ct_heap_num;
682 
683 	for (;;) {
684 		/*
685 		 * If we don't have a left child (i.e., we're a leaf), we're
686 		 * done.
687 		 */
688 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
689 			return;
690 
691 		hleft = &heap[left];
692 		hcurrent = &heap[current];
693 
694 		right = CALLOUT_HEAP_RIGHT(current);
695 
696 		/*
697 		 * Even if we don't have a right child, we still need to compare
698 		 * our expiration against that of our left child.
699 		 */
700 		if (right >= nelems)
701 			goto comp_left;
702 
703 		hright = &heap[right];
704 
705 		/*
706 		 * We have both a left and a right child.  We need to compare
707 		 * the expiration of the children to determine which
708 		 * expires earlier.
709 		 */
710 		if (hright->ch_expiration < hleft->ch_expiration) {
711 			/*
712 			 * Our right child is the earlier of our children.
713 			 * We'll now compare our expiration to its expiration.
714 			 * If ours is the earlier one, we're done.
715 			 */
716 			if (hcurrent->ch_expiration <= hright->ch_expiration)
717 				return;
718 
719 			/*
720 			 * Our right child expires earlier than we do; swap
721 			 * with our right child, and descend right.
722 			 */
723 			CALLOUT_SWAP(hright, hcurrent);
724 			current = right;
725 			continue;
726 		}
727 
728 comp_left:
729 		/*
730 		 * Our left child is the earlier of our children (or we have
731 		 * no right child).  We'll now compare our expiration
732 		 * to its expiration. If ours is the earlier one, we're done.
733 		 */
734 		if (hcurrent->ch_expiration <= hleft->ch_expiration)
735 			return;
736 
737 		/*
738 		 * Our left child expires earlier than we do; swap with our
739 		 * left child, and descend left.
740 		 */
741 		CALLOUT_SWAP(hleft, hcurrent);
742 		current = left;
743 	}
744 }
745 
746 /*
747  * Delete and handle all past expirations in a callout table's heap.
748  */
749 static hrtime_t
750 callout_heap_delete(callout_table_t *ct)
751 {
752 	hrtime_t now, expiration, next;
753 	callout_list_t *cl;
754 	callout_heap_t *heap;
755 	int hash;
756 
757 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
758 
759 	if (CALLOUT_CLEANUP(ct)) {
760 		/*
761 		 * There are too many heap elements pointing to empty callout
762 		 * lists. Clean them out.
763 		 */
764 		(void) callout_heap_process(ct, 0, 0);
765 	}
766 
767 	now = gethrtime();
768 	heap = ct->ct_heap;
769 
770 	while (ct->ct_heap_num > 0) {
771 		expiration = heap->ch_expiration;
772 		hash = CALLOUT_CLHASH(expiration);
773 		cl = heap->ch_list;
774 		ASSERT(expiration == cl->cl_expiration);
775 
776 		if (cl->cl_callouts.ch_head == NULL) {
777 			/*
778 			 * If the callout list is empty, reap it.
779 			 * Decrement the reap count.
780 			 */
781 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
782 			CALLOUT_LIST_FREE(ct, cl);
783 			ct->ct_nreap--;
784 		} else {
785 			/*
786 			 * If the root of the heap expires in the future,
787 			 * bail out.
788 			 */
789 			if (expiration > now)
790 				break;
791 
792 			/*
793 			 * Move the callout list for this expiration to the
794 			 * list of expired callout lists. It will be processed
795 			 * by the callout executor.
796 			 */
797 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
798 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
799 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
800 		}
801 
802 		/*
803 		 * Now delete the root. This is done by swapping the root with
804 		 * the last item in the heap and downheaping the item.
805 		 */
806 		ct->ct_heap_num--;
807 		if (ct->ct_heap_num > 0) {
808 			heap[0] = heap[ct->ct_heap_num];
809 			callout_downheap(ct);
810 		}
811 	}
812 
813 	/*
814 	 * If this callout table is empty or callouts have been suspended,
815 	 * just return. The cyclic has already been programmed to
816 	 * infinity by the cyclic subsystem.
817 	 */
818 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
819 		return (CY_INFINITY);
820 
821 	/*
822 	 * If the top expirations are within callout_tolerance of each other,
823 	 * delay the cyclic expire so that they can be processed together.
824 	 * This is to prevent high resolution timers from swamping the system
825 	 * with cyclic activity.
826 	 */
827 	if (ct->ct_heap_num > 2) {
828 		next = expiration + callout_tolerance;
829 		if ((heap[1].ch_expiration < next) ||
830 		    (heap[2].ch_expiration < next))
831 			expiration = next;
832 	}
833 
834 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
835 
836 	return (expiration);
837 }
838 
839 /*
840  * There are some situations when the entire heap is walked and processed.
841  * This function is called to do the processing. These are the situations:
842  *
843  * 1. When the reap count reaches its threshold, the heap has to be cleared
844  *    of all empty callout lists.
845  *
846  * 2. When the system enters and exits KMDB/OBP, all entries in the heap
847  *    need to be adjusted by the interval spent in KMDB/OBP.
848  *
849  * 3. When system time is changed, the heap has to be scanned for
850  *    absolute hrestime timers. These need to be removed from the heap
851  *    and expired immediately.
852  *
853  * In cases 2 and 3, it is a good idea to do 1 as well since we are
854  * scanning the heap anyway.
855  *
856  * If the root gets changed and/or callout lists are expired, return the
857  * new expiration to the caller so it can reprogram the cyclic accordingly.
858  */
859 static hrtime_t
860 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
861 {
862 	callout_heap_t *heap;
863 	callout_list_t *cl;
864 	hrtime_t expiration, now;
865 	int i, hash, clflags;
866 	ulong_t num;
867 
868 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
869 
870 	if (ct->ct_heap_num == 0)
871 		return (CY_INFINITY);
872 
873 	if (ct->ct_nreap > 0)
874 		ct->ct_cleanups++;
875 
876 	heap = ct->ct_heap;
877 
878 	/*
879 	 * We walk the heap from the top to the bottom. If we encounter
880 	 * a heap item that points to an empty callout list, we clean
881 	 * it out. If we encounter a hrestime entry that must be removed,
882 	 * again we clean it out. Otherwise, we apply any adjustments needed
883 	 * to an element.
884 	 *
885 	 * During the walk, we also compact the heap from the bottom and
886 	 * reconstruct the heap using upheap operations. This is very
887 	 * efficient if the number of elements to be cleaned is greater than
888 	 * or equal to half the heap. This is the common case.
889 	 *
890 	 * Even in the non-common case, the upheap operations should be short
891 	 * as the entries below generally tend to be bigger than the entries
892 	 * above.
893 	 */
894 	num = ct->ct_heap_num;
895 	ct->ct_heap_num = 0;
896 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
897 	now = gethrtime();
898 	for (i = 0; i < num; i++) {
899 		cl = heap[i].ch_list;
900 		/*
901 		 * If the callout list is empty, delete the heap element and
902 		 * free the callout list.
903 		 */
904 		if (cl->cl_callouts.ch_head == NULL) {
905 			hash = CALLOUT_CLHASH(cl->cl_expiration);
906 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
907 			CALLOUT_LIST_FREE(ct, cl);
908 			continue;
909 		}
910 
911 		/*
912 		 * Delete the heap element and expire the callout list, if
913 		 * one of the following is true:
914 		 *	- the callout list has expired
915 		 *	- the callout list is an absolute hrestime one and
916 		 *	  there has been a system time change
917 		 */
918 		if ((cl->cl_expiration <= now) ||
919 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
920 			hash = CALLOUT_CLHASH(cl->cl_expiration);
921 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
922 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
923 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
924 			continue;
925 		}
926 
927 		/*
928 		 * Apply adjustments, if any. Adjustments are applied after
929 		 * the system returns from KMDB or OBP. They are only applied
930 		 * to relative callout lists.
931 		 */
932 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
933 			hash = CALLOUT_CLHASH(cl->cl_expiration);
934 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
935 			expiration = cl->cl_expiration + delta;
936 			if (expiration <= 0)
937 				expiration = CY_INFINITY;
938 			heap[i].ch_expiration = expiration;
939 			cl->cl_expiration = expiration;
940 			hash = CALLOUT_CLHASH(cl->cl_expiration);
941 			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
942 				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
943 			} else {
944 				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
945 			}
946 		}
947 
948 		heap[ct->ct_heap_num] = heap[i];
949 		ct->ct_heap_num++;
950 		(void) callout_upheap(ct);
951 	}
952 
953 	ct->ct_nreap = 0;
954 
955 	/*
956 	 * We need to return the expiration to help program the cyclic.
957 	 * If there are expired callouts, the cyclic needs to go off
958 	 * immediately. If the heap has become empty, then we return infinity.
959 	 * Else, return the expiration of the earliest callout in the heap.
960 	 */
961 	if (ct->ct_expired.ch_head != NULL)
962 		return (gethrtime());
963 
964 	if (ct->ct_heap_num == 0)
965 		return (CY_INFINITY);
966 
967 	return (heap->ch_expiration);
968 }
969 
970 /*
971  * Common function used to create normal and realtime callouts.
972  *
973  * Realtime callouts are handled at CY_LOW_LEVEL (PIL 1) by a cyclic
974  * handler. So, there is one restriction on a realtime callout handler: it
975  * must not directly or indirectly acquire cpu_lock. CPU offline waits for
976  * pending cyclic handlers to complete while holding cpu_lock, so a realtime
977  * callout handler that tried to take cpu_lock would deadlock during CPU
978  * offline.
979  */
980 callout_id_t
981 timeout_generic(int type, void (*func)(void *), void *arg,
982 	hrtime_t expiration, hrtime_t resolution, int flags)
983 {
984 	callout_table_t *ct;
985 	callout_t *cp;
986 	callout_id_t id;
987 	callout_list_t *cl;
988 	hrtime_t now, interval;
989 	int hash, clflags;
990 
991 	ASSERT(resolution > 0);
992 	ASSERT(func != NULL);
993 
994 	/*
995 	 * We get the current hrtime right upfront so that latencies in
996 	 * this function do not affect the accuracy of the callout.
997 	 */
998 	now = gethrtime();
999 
1000 	/*
1001 	 * We disable kernel preemption so that we remain on the same CPU
1002 	 * throughout. If we needed to reprogram the callout table's cyclic,
1003 	 * we can avoid X-calls if we are on the same CPU.
1004 	 *
1005 	 * Note that callout_alloc() releases and reacquires the callout
1006 	 * table mutex. While reacquiring the mutex, it is possible for us
1007 	 * to go to sleep and later migrate to another CPU. This should be
1008 	 * pretty rare, though.
1009 	 */
1010 	kpreempt_disable();
1011 
1012 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
1013 	mutex_enter(&ct->ct_mutex);
1014 
1015 	if (ct->ct_cyclic == CYCLIC_NONE) {
1016 		mutex_exit(&ct->ct_mutex);
1017 		/*
1018 		 * The callout table has not yet been initialized fully.
1019 		 * So, put this one on the boot callout table which is
1020 		 * always initialized.
1021 		 */
1022 		ct = &callout_boot_ct[type];
1023 		mutex_enter(&ct->ct_mutex);
1024 	}
1025 
1026 	if (CALLOUT_CLEANUP(ct)) {
1027 		/*
1028 		 * There are too many heap elements pointing to empty callout
1029 		 * lists. Clean them out. Since cleanup is only done once
1030 		 * in a while, no need to reprogram the cyclic if the root
1031 		 * of the heap gets cleaned out.
1032 		 */
1033 		(void) callout_heap_process(ct, 0, 0);
1034 	}
1035 
1036 	if ((cp = ct->ct_free) == NULL)
1037 		cp = callout_alloc(ct);
1038 	else
1039 		ct->ct_free = cp->c_idnext;
1040 
1041 	cp->c_func = func;
1042 	cp->c_arg = arg;
1043 
1044 	/*
1045 	 * Compute the expiration hrtime.
1046 	 */
1047 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
1048 		interval = expiration - now;
1049 	} else {
1050 		interval = expiration;
1051 		expiration += now;
1052 	}
1053 
1054 	if (resolution > 1) {
1055 		/*
1056 		 * Align expiration to the specified resolution.
1057 		 */
1058 		if (flags & CALLOUT_FLAG_ROUNDUP)
1059 			expiration += resolution - 1;
1060 		expiration = (expiration / resolution) * resolution;
1061 	}
1062 
1063 	if (expiration <= 0) {
1064 		/*
1065 		 * expiration hrtime overflow has occurred. Just set the
1066 		 * expiration to infinity.
1067 		 */
1068 		expiration = CY_INFINITY;
1069 	}
1070 
1071 	/*
1072 	 * Assign an ID to this callout
1073 	 */
1074 	if (flags & CALLOUT_FLAG_32BIT) {
1075 		if (interval > callout_longterm) {
1076 			id = (ct->ct_long_id - callout_counter_low);
1077 			id |= CALLOUT_COUNTER_HIGH;
1078 			ct->ct_long_id = id;
1079 		} else {
1080 			id = (ct->ct_short_id - callout_counter_low);
1081 			id |= CALLOUT_COUNTER_HIGH;
1082 			ct->ct_short_id = id;
1083 		}
1084 	} else {
1085 		id = (ct->ct_gen_id - callout_counter_low);
1086 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
1087 			id |= CALLOUT_COUNTER_HIGH;
1088 			id += CALLOUT_GENERATION_LOW;
1089 		}
1090 		ct->ct_gen_id = id;
1091 	}
1092 
1093 	cp->c_xid = id;
1094 
1095 	clflags = 0;
1096 	if (flags & CALLOUT_FLAG_ABSOLUTE)
1097 		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
1098 	if (flags & CALLOUT_FLAG_HRESTIME)
1099 		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
1100 	if (resolution == 1)
1101 		clflags |= CALLOUT_LIST_FLAG_NANO;
1102 	hash = CALLOUT_CLHASH(expiration);
1103 
1104 again:
1105 	/*
1106 	 * Try to see if a callout list already exists for this expiration.
1107 	 */
1108 	cl = callout_list_get(ct, expiration, clflags, hash);
1109 	if (cl == NULL) {
1110 		/*
1111 		 * Check the free list. If we don't find one, we have to
1112 		 * take the slow path and allocate from kmem.
1113 		 */
1114 		if ((cl = ct->ct_lfree) == NULL) {
1115 			callout_list_alloc(ct);
1116 			/*
1117 			 * In the above call, we drop the lock, allocate and
1118 			 * reacquire the lock. So, we could have been away
1119 			 * for a while. In the meantime, someone could have
1120 			 * inserted a callout list with the same expiration.
1121 			 * Plus, the heap could have become full. So, the best
1122 			 * course is to repeat the steps. This should be an
1123 			 * infrequent event.
1124 			 */
1125 			goto again;
1126 		}
1127 		ct->ct_lfree = cl->cl_next;
1128 		cl->cl_expiration = expiration;
1129 		cl->cl_flags = clflags;
1130 
1131 		/*
1132 		 * Check if we have enough space in the heap to insert one
1133 		 * expiration. If not, expand the heap.
1134 		 */
1135 		if (ct->ct_heap_num == ct->ct_heap_max) {
1136 			if (callout_heap_expand(ct) == 0) {
1137 				/*
1138 				 * Could not expand the heap. Just queue it.
1139 				 */
1140 				callout_queue_insert(ct, cl);
1141 				goto out;
1142 			}
1143 
1144 			/*
1145 			 * In the above call, we drop the lock, allocate and
1146 			 * reacquire the lock. So, we could have been away
1147 			 * for a while. In the meantime, someone could have
1148 			 * inserted a callout list with the same expiration.
1149 			 * But we will not go back and check for it as this
1150 			 * should be a really infrequent event. There is no
1151 			 * point.
1152 			 */
1153 		}
1154 
1155 		if (clflags & CALLOUT_LIST_FLAG_NANO) {
1156 			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
1157 		} else {
1158 			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
1159 		}
1160 
1161 		/*
1162 		 * This is a new expiration. So, insert it into the heap.
1163 		 * This will also reprogram the cyclic, if the expiration
1164 		 * propagated to the root of the heap.
1165 		 */
1166 		callout_heap_insert(ct, cl);
1167 	} else {
1168 		/*
1169 		 * If the callout list was empty, untimeout_generic() would
1170 		 * have incremented a reap count. Decrement the reap count
1171 		 * as we are going to insert a callout into this list.
1172 		 */
1173 		if (cl->cl_callouts.ch_head == NULL)
1174 			ct->ct_nreap--;
1175 	}
1176 out:
1177 	cp->c_list = cl;
1178 	CALLOUT_APPEND(ct, cp);
1179 
1180 	ct->ct_timeouts++;
1181 	ct->ct_timeouts_pending++;
1182 
1183 	mutex_exit(&ct->ct_mutex);
1184 
1185 	kpreempt_enable();
1186 
1187 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
1188 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
1189 	    cp);
1190 
1191 	return (id);
1192 }
1193 
1194 timeout_id_t
1195 timeout(void (*func)(void *), void *arg, clock_t delta)
1196 {
1197 	ulong_t id;
1198 
1199 	/*
1200 	 * Make sure the callout runs at least 1 tick in the future.
1201 	 */
1202 	if (delta <= 0)
1203 		delta = 1;
1204 	else if (delta > callout_max_ticks)
1205 		delta = callout_max_ticks;
1206 
1207 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1208 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1209 
1210 	return ((timeout_id_t)id);
1211 }
1212 
1213 /*
1214  * Convenience function that creates a normal callout with default parameters
1215  * and returns a full ID.
1216  */
1217 callout_id_t
1218 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1219 {
1220 	callout_id_t id;
1221 
1222 	/*
1223 	 * Make sure the callout runs at least 1 tick in the future.
1224 	 */
1225 	if (delta <= 0)
1226 		delta = 1;
1227 	else if (delta > callout_max_ticks)
1228 		delta = callout_max_ticks;
1229 
1230 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1231 	    nsec_per_tick, 0);
1232 
1233 	return (id);
1234 }
1235 
1236 timeout_id_t
1237 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1238 {
1239 	ulong_t id;
1240 
1241 	/*
1242 	 * Make sure the callout runs at least 1 tick in the future.
1243 	 */
1244 	if (delta <= 0)
1245 		delta = 1;
1246 	else if (delta > callout_max_ticks)
1247 		delta = callout_max_ticks;
1248 
1249 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1250 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1251 
1252 	return ((timeout_id_t)id);
1253 }
1254 
1255 /*
1256  * Convenience function that creates a realtime callout with default parameters
1257  * and returns a full ID.
1258  */
1259 callout_id_t
1260 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1261 {
1262 	callout_id_t id;
1263 
1264 	/*
1265 	 * Make sure the callout runs at least 1 tick in the future.
1266 	 */
1267 	if (delta <= 0)
1268 		delta = 1;
1269 	else if (delta > callout_max_ticks)
1270 		delta = callout_max_ticks;
1271 
1272 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1273 	    nsec_per_tick, 0);
1274 
1275 	return (id);
1276 }
1277 
1278 hrtime_t
1279 untimeout_generic(callout_id_t id, int nowait)
1280 {
1281 	callout_table_t *ct;
1282 	callout_t *cp;
1283 	callout_id_t xid;
1284 	callout_list_t *cl;
1285 	int hash, flags;
1286 	callout_id_t bogus;
1287 
1288 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1289 	hash = CALLOUT_IDHASH(id);
1290 
1291 	mutex_enter(&ct->ct_mutex);
1292 
1293 	/*
1294 	 * Search the ID hash table for the callout.
1295 	 */
1296 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1297 
1298 		xid = cp->c_xid;
1299 
1300 		/*
1301 		 * Match the ID and generation number.
1302 		 */
1303 		if ((xid & CALLOUT_ID_MASK) != id)
1304 			continue;
1305 
1306 		if ((xid & CALLOUT_EXECUTING) == 0) {
1307 			hrtime_t expiration;
1308 
1309 			/*
1310 			 * Delete the callout. If the callout list becomes
1311 			 * NULL, we don't remove it from the table. This is
1312 			 * so it can be reused. If the empty callout list
1313 			 * corresponds to the top of the the callout heap, we
1314 			 * don't reprogram the table cyclic here. This is in
1315 			 * order to avoid lots of X-calls to the CPU associated
1316 			 * with the callout table.
1317 			 */
1318 			cl = cp->c_list;
1319 			expiration = cl->cl_expiration;
1320 			CALLOUT_DELETE(ct, cp);
1321 			CALLOUT_FREE(ct, cp);
1322 			ct->ct_untimeouts_unexpired++;
1323 			ct->ct_timeouts_pending--;
1324 
1325 			/*
1326 			 * If the callout list has become empty, there are 3
1327 			 * possibilities. If it is present:
1328 			 *	- in the heap, it needs to be cleaned along
1329 			 *	  with its heap entry. Increment a reap count.
1330 			 *	- in the callout queue, free it.
1331 			 *	- in the expired list, free it.
1332 			 */
1333 			if (cl->cl_callouts.ch_head == NULL) {
1334 				flags = cl->cl_flags;
1335 				if (flags & CALLOUT_LIST_FLAG_HEAPED) {
1336 					ct->ct_nreap++;
1337 				} else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
1338 					CALLOUT_LIST_DELETE(ct->ct_queue, cl);
1339 					CALLOUT_LIST_FREE(ct, cl);
1340 				} else {
1341 					CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1342 					CALLOUT_LIST_FREE(ct, cl);
1343 				}
1344 			}
1345 			mutex_exit(&ct->ct_mutex);
1346 
1347 			expiration -= gethrtime();
1348 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1349 			    "untimeout:ID %lx hrtime left %llx", id,
1350 			    expiration);
1351 			return (expiration < 0 ? 0 : expiration);
1352 		}
1353 
1354 		ct->ct_untimeouts_executing++;
1355 		/*
1356 		 * The callout we want to delete is currently executing.
1357 		 * The DDI states that we must wait until the callout
1358 		 * completes before returning, so we block on c_done until the
1359 		 * callout ID changes (to the old ID if it's on the freelist,
1360 		 * or to a new callout ID if it's in use).  This implicitly
1361 		 * assumes that callout structures are persistent (they are).
1362 		 */
1363 		if (cp->c_executor == curthread) {
1364 			/*
1365 			 * The timeout handler called untimeout() on itself.
1366 			 * Stupid, but legal.  We can't wait for the timeout
1367 			 * to complete without deadlocking, so we just return.
1368 			 */
1369 			mutex_exit(&ct->ct_mutex);
1370 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1371 			    "untimeout_self:ID %x", id);
1372 			return (-1);
1373 		}
1374 		if (nowait == 0) {
1375 			/*
1376 			 * We need to wait. Indicate that we are waiting by
1377 			 * incrementing c_waiting. This prevents the executor
1378 			 * from doing a wakeup on c_done if there are no
1379 			 * waiters.
1380 			 */
1381 			while (cp->c_xid == xid) {
1382 				cp->c_waiting = 1;
1383 				cv_wait(&cp->c_done, &ct->ct_mutex);
1384 			}
1385 		}
1386 		mutex_exit(&ct->ct_mutex);
1387 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1388 		    "untimeout_executing:ID %lx", id);
1389 		return (-1);
1390 	}
1391 	ct->ct_untimeouts_expired++;
1392 
1393 	mutex_exit(&ct->ct_mutex);
1394 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1395 	    "untimeout_bogus_id:ID %lx", id);
1396 
1397 	/*
1398 	 * We didn't find the specified callout ID.  This means either
1399 	 * (1) the callout already fired, or (2) the caller passed us
1400 	 * a bogus value.  Perform a sanity check to detect case (2).
1401 	 */
1402 	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1403 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1404 		panic("untimeout: impossible timeout id %llx",
1405 		    (unsigned long long)id);
1406 
1407 	return (-1);
1408 }
1409 
1410 clock_t
1411 untimeout(timeout_id_t id_arg)
1412 {
1413 	hrtime_t hleft;
1414 	clock_t tleft;
1415 	callout_id_t id;
1416 
1417 	id = (ulong_t)id_arg;
1418 	hleft = untimeout_generic(id, 0);
1419 	if (hleft < 0)
1420 		tleft = -1;
1421 	else if (hleft == 0)
1422 		tleft = 0;
1423 	else
1424 		tleft = NSEC_TO_TICK(hleft);
1425 
1426 	return (tleft);
1427 }
1428 
1429 /*
1430  * Convenience function to untimeout a timeout with a full ID with default
1431  * parameters.
1432  */
1433 clock_t
1434 untimeout_default(callout_id_t id, int nowait)
1435 {
1436 	hrtime_t hleft;
1437 	clock_t tleft;
1438 
1439 	hleft = untimeout_generic(id, nowait);
1440 	if (hleft < 0)
1441 		tleft = -1;
1442 	else if (hleft == 0)
1443 		tleft = 0;
1444 	else
1445 		tleft = NSEC_TO_TICK(hleft);
1446 
1447 	return (tleft);
1448 }
1449 
1450 /*
1451  * Expire all the callouts queued in the specified callout list.
1452  */
1453 static void
1454 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1455 {
1456 	callout_t *cp, *cnext;
1457 
1458 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1459 	ASSERT(cl != NULL);
1460 
1461 	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1462 		/*
1463 		 * Multiple executor threads could be running at the same
1464 		 * time. If this callout is already being executed,
1465 		 * go on to the next one.
1466 		 */
1467 		if (cp->c_xid & CALLOUT_EXECUTING) {
1468 			cnext = cp->c_clnext;
1469 			continue;
1470 		}
1471 
1472 		/*
1473 		 * Indicate to untimeout() that a callout is
1474 		 * being expired by the executor.
1475 		 */
1476 		cp->c_xid |= CALLOUT_EXECUTING;
1477 		cp->c_executor = curthread;
1478 		mutex_exit(&ct->ct_mutex);
1479 
1480 		DTRACE_PROBE1(callout__start, callout_t *, cp);
1481 		(*cp->c_func)(cp->c_arg);
1482 		DTRACE_PROBE1(callout__end, callout_t *, cp);
1483 
1484 		mutex_enter(&ct->ct_mutex);
1485 
1486 		ct->ct_expirations++;
1487 		ct->ct_timeouts_pending--;
1488 		/*
1489 		 * Indicate completion for c_done.
1490 		 */
1491 		cp->c_xid &= ~CALLOUT_EXECUTING;
1492 		cp->c_executor = NULL;
1493 		cnext = cp->c_clnext;
1494 
1495 		/*
1496 		 * Delete callout from ID hash table and the callout
1497 		 * list, return to freelist, and tell any untimeout() that
1498 		 * cares that we're done.
1499 		 */
1500 		CALLOUT_DELETE(ct, cp);
1501 		CALLOUT_FREE(ct, cp);
1502 
1503 		if (cp->c_waiting) {
1504 			cp->c_waiting = 0;
1505 			cv_broadcast(&cp->c_done);
1506 		}
1507 	}
1508 }
1509 
1510 /*
1511  * Execute all expired callout lists for a callout table.
1512  */
1513 static void
1514 callout_expire(callout_table_t *ct)
1515 {
1516 	callout_list_t *cl, *clnext;
1517 
1518 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1519 
1520 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1521 		/*
1522 		 * Expire all the callouts in this callout list.
1523 		 */
1524 		callout_list_expire(ct, cl);
1525 
1526 		clnext = cl->cl_next;
1527 		if (cl->cl_callouts.ch_head == NULL) {
1528 			/*
1529 			 * Free the callout list.
1530 			 */
1531 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1532 			CALLOUT_LIST_FREE(ct, cl);
1533 		}
1534 	}
1535 }
1536 
1537 /*
1538  * The cyclic handlers below process callouts in two steps:
1539  *
1540  *	1. Find all expired callout lists and queue them in a separate
1541  *	   list of expired callouts.
1542  *	2. Execute the expired callout lists.
1543  *
1544  * This is done for two reasons:
1545  *
1546  *	1. We want to quickly find the next earliest expiration to program
1547  *	   the cyclic to and reprogram it. We can do this right at the end
1548  *	   of step 1.
1549  *	2. The realtime cyclic handler expires callouts in place. However,
1550  *	   for normal callouts, callouts are expired by a taskq thread.
1551  *	   So, it is simpler and more robust to have the taskq thread just
1552  *	   do step 2.
1553  */
1554 
1555 /*
1556  * Realtime callout cyclic handlers.
1557  */
1558 void
1559 callout_realtime(callout_table_t *ct)
1560 {
1561 	mutex_enter(&ct->ct_mutex);
1562 	(void) callout_heap_delete(ct);
1563 	callout_expire(ct);
1564 	mutex_exit(&ct->ct_mutex);
1565 }
1566 
1567 void
1568 callout_queue_realtime(callout_table_t *ct)
1569 {
1570 	mutex_enter(&ct->ct_mutex);
1571 	(void) callout_queue_delete(ct);
1572 	callout_expire(ct);
1573 	mutex_exit(&ct->ct_mutex);
1574 }
1575 
1576 void
1577 callout_execute(callout_table_t *ct)
1578 {
1579 	mutex_enter(&ct->ct_mutex);
1580 	callout_expire(ct);
1581 	mutex_exit(&ct->ct_mutex);
1582 }
1583 
1584 /*
1585  * Normal callout cyclic handlers.
1586  */
1587 void
1588 callout_normal(callout_table_t *ct)
1589 {
1590 	int i, exec;
1591 	hrtime_t exp;
1592 
1593 	mutex_enter(&ct->ct_mutex);
1594 	exp = callout_heap_delete(ct);
1595 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1596 	mutex_exit(&ct->ct_mutex);
1597 
1598 	for (i = 0; i < exec; i++) {
1599 		ASSERT(ct->ct_taskq != NULL);
1600 		(void) taskq_dispatch(ct->ct_taskq,
1601 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1602 	}
1603 }
1604 
1605 void
1606 callout_queue_normal(callout_table_t *ct)
1607 {
1608 	int i, exec;
1609 	hrtime_t exp;
1610 
1611 	mutex_enter(&ct->ct_mutex);
1612 	exp = callout_queue_delete(ct);
1613 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1614 	mutex_exit(&ct->ct_mutex);
1615 
1616 	for (i = 0; i < exec; i++) {
1617 		ASSERT(ct->ct_taskq != NULL);
1618 		(void) taskq_dispatch(ct->ct_taskq,
1619 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1620 	}
1621 }
1622 
1623 /*
1624  * Suspend callout processing.
1625  */
1626 static void
1627 callout_suspend(void)
1628 {
1629 	int t, f;
1630 	callout_table_t *ct;
1631 
1632 	/*
1633 	 * Traverse every callout table in the system and suspend callout
1634 	 * processing.
1635 	 *
1636 	 * We need to suspend all the tables (including the inactive ones)
1637 	 * so that if a table is made active while the suspend is still on,
1638 	 * the table remains suspended.
1639 	 */
1640 	for (f = 0; f < max_ncpus; f++) {
1641 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1642 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1643 
1644 			mutex_enter(&ct->ct_mutex);
1645 			ct->ct_suspend++;
1646 			if (ct->ct_cyclic == CYCLIC_NONE) {
1647 				mutex_exit(&ct->ct_mutex);
1648 				continue;
1649 			}
1650 			if (ct->ct_suspend == 1) {
1651 				(void) cyclic_reprogram(ct->ct_cyclic,
1652 				    CY_INFINITY);
1653 				(void) cyclic_reprogram(ct->ct_qcyclic,
1654 				    CY_INFINITY);
1655 			}
1656 			mutex_exit(&ct->ct_mutex);
1657 		}
1658 	}
1659 }
1660 
1661 /*
1662  * Resume callout processing.
1663  */
1664 static void
1665 callout_resume(hrtime_t delta, int timechange)
1666 {
1667 	hrtime_t hexp, qexp;
1668 	int t, f;
1669 	callout_table_t *ct;
1670 
1671 	/*
1672 	 * Traverse every callout table in the system and resume callout
1673 	 * processing. For active tables, perform any hrtime adjustments
1674 	 * necessary.
1675 	 */
1676 	for (f = 0; f < max_ncpus; f++) {
1677 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1678 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1679 
1680 			mutex_enter(&ct->ct_mutex);
1681 			if (ct->ct_cyclic == CYCLIC_NONE) {
1682 				ct->ct_suspend--;
1683 				mutex_exit(&ct->ct_mutex);
1684 				continue;
1685 			}
1686 
1687 			/*
1688 			 * If a delta is specified, adjust the expirations in
1689 			 * the heap by delta. Also, if the caller indicates
1690 			 * a timechange, process that. This step also cleans
1691 			 * out any empty callout lists that might happen to
1692 			 * be there.
1693 			 */
1694 			hexp = callout_heap_process(ct, delta, timechange);
1695 			qexp = callout_queue_process(ct, delta, timechange);
1696 
1697 			ct->ct_suspend--;
1698 			if (ct->ct_suspend == 0) {
1699 				(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1700 				(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1701 			}
1702 
1703 			mutex_exit(&ct->ct_mutex);
1704 		}
1705 	}
1706 }
1707 
1708 /*
1709  * Callback handler used by CPR to stop and resume callouts.
1710  * The cyclic subsystem saves and restores hrtime during CPR.
1711  * That is why callout_resume() is called with a 0 delta.
1712  * Although hrtime is the same, hrestime (system time) has
1713  * progressed during CPR. So, we have to indicate a time change
1714  * to expire the absolute hrestime timers.
1715  */
1716 /*ARGSUSED*/
1717 static boolean_t
1718 callout_cpr_callb(void *arg, int code)
1719 {
1720 	if (code == CB_CODE_CPR_CHKPT)
1721 		callout_suspend();
1722 	else
1723 		callout_resume(0, 1);
1724 
1725 	return (B_TRUE);
1726 }
1727 
1728 /*
1729  * Callback handler invoked when the debugger is entered or exited.
1730  */
1731 /*ARGSUSED*/
1732 static boolean_t
1733 callout_debug_callb(void *arg, int code)
1734 {
1735 	hrtime_t delta;
1736 
1737 	/*
1738 	 * When the system enters the debugger. make a note of the hrtime.
1739 	 * When it is resumed, compute how long the system was in the
1740 	 * debugger. This interval should not be counted for callouts.
1741 	 */
1742 	if (code == 0) {
1743 		callout_suspend();
1744 		callout_debug_hrtime = gethrtime();
1745 	} else {
1746 		delta = gethrtime() - callout_debug_hrtime;
1747 		callout_resume(delta, 0);
1748 	}
1749 
1750 	return (B_TRUE);
1751 }
1752 
1753 /*
1754  * Move the absolute hrestime callouts to the expired list. Then program the
1755  * table's cyclic to expire immediately so that the callouts can be executed
1756  * immediately.
1757  */
1758 static void
1759 callout_hrestime_one(callout_table_t *ct)
1760 {
1761 	hrtime_t hexp, qexp;
1762 
1763 	mutex_enter(&ct->ct_mutex);
1764 	if (ct->ct_cyclic == CYCLIC_NONE) {
1765 		mutex_exit(&ct->ct_mutex);
1766 		return;
1767 	}
1768 
1769 	/*
1770 	 * Walk the heap and process all the absolute hrestime entries.
1771 	 */
1772 	hexp = callout_heap_process(ct, 0, 1);
1773 	qexp = callout_queue_process(ct, 0, 1);
1774 
1775 	if (ct->ct_suspend == 0) {
1776 		(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1777 		(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1778 	}
1779 
1780 	mutex_exit(&ct->ct_mutex);
1781 }
1782 
1783 /*
1784  * This function is called whenever system time (hrestime) is changed
1785  * explicitly. All the HRESTIME callouts must be expired at once.
1786  */
1787 /*ARGSUSED*/
1788 void
1789 callout_hrestime(void)
1790 {
1791 	int t, f;
1792 	callout_table_t *ct;
1793 
1794 	/*
1795 	 * Traverse every callout table in the system and process the hrestime
1796 	 * callouts therein.
1797 	 *
1798 	 * We look at all the tables because we don't know which ones were
1799 	 * onlined and offlined in the past. The offlined tables may still
1800 	 * have active cyclics processing timers somewhere.
1801 	 */
1802 	for (f = 0; f < max_ncpus; f++) {
1803 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1804 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1805 			callout_hrestime_one(ct);
1806 		}
1807 	}
1808 }
1809 
1810 /*
1811  * Create the hash tables for this callout table.
1812  */
1813 static void
1814 callout_hash_init(callout_table_t *ct)
1815 {
1816 	size_t size;
1817 
1818 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1819 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1820 
1821 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1822 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1823 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1824 }
1825 
1826 /*
1827  * Create per-callout table kstats.
1828  */
1829 static void
1830 callout_kstat_init(callout_table_t *ct)
1831 {
1832 	callout_stat_type_t stat;
1833 	kstat_t *ct_kstats;
1834 	int ndx;
1835 
1836 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1837 	ASSERT(ct->ct_kstats == NULL);
1838 
1839 	ndx = ct - callout_table;
1840 	ct_kstats = kstat_create("unix", ndx, "callout",
1841 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1842 
1843 	if (ct_kstats == NULL) {
1844 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1845 		    (void *)ct);
1846 	} else {
1847 		ct_kstats->ks_data = ct->ct_kstat_data;
1848 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1849 			kstat_named_init(&ct->ct_kstat_data[stat],
1850 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1851 		ct->ct_kstats = ct_kstats;
1852 		kstat_install(ct_kstats);
1853 	}
1854 }
1855 
1856 static void
1857 callout_cyclic_init(callout_table_t *ct)
1858 {
1859 	cyc_handler_t hdlr;
1860 	cyc_time_t when;
1861 	processorid_t seqid;
1862 	int t;
1863 	cyclic_id_t cyclic, qcyclic;
1864 
1865 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1866 
1867 	t = ct->ct_type;
1868 	seqid = CALLOUT_TABLE_SEQID(ct);
1869 
1870 	/*
1871 	 * Create the taskq thread if the table type is normal.
1872 	 * Realtime tables are handled at PIL1 by a softint
1873 	 * handler.
1874 	 */
1875 	if (t == CALLOUT_NORMAL) {
1876 		ASSERT(ct->ct_taskq == NULL);
1877 		/*
1878 		 * Each callout thread consumes exactly one
1879 		 * task structure while active.  Therefore,
1880 		 * prepopulating with 2 * callout_threads tasks
1881 		 * ensures that there's at least one task per
1882 		 * thread that's either scheduled or on the
1883 		 * freelist.  In turn, this guarantees that
1884 		 * taskq_dispatch() will always either succeed
1885 		 * (because there's a free task structure) or
1886 		 * be unnecessary (because "callout_excute(ct)"
1887 		 * has already scheduled).
1888 		 */
1889 		ct->ct_taskq =
1890 		    taskq_create_instance("callout_taskq", seqid,
1891 		    callout_threads, maxclsyspri,
1892 		    2 * callout_threads, 2 * callout_threads,
1893 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1894 	}
1895 
1896 	/*
1897 	 * callouts can only be created in a table whose
1898 	 * cyclic has been initialized.
1899 	 */
1900 	ASSERT(ct->ct_heap_num == 0);
1901 
1902 	/*
1903 	 * Drop the mutex before creating the callout cyclics. cyclic_add()
1904 	 * could potentially expand the cyclic heap. We don't want to be
1905 	 * holding the callout table mutex in that case. Note that this
1906 	 * function is called during CPU online. cpu_lock is held at this
1907 	 * point. So, only one thread can be executing the cyclic add logic
1908 	 * below at any time.
1909 	 */
1910 	mutex_exit(&ct->ct_mutex);
1911 
1912 	/*
1913 	 * Create the callout table cyclics.
1914 	 *
1915 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
1916 	 * handler executes at lock PIL. This is because there are cases
1917 	 * where code can block at PIL > 1 waiting for a normal callout handler
1918 	 * to unblock it directly or indirectly. If the normal cyclic were to
1919 	 * be executed at low PIL, it could get blocked out by the waiter
1920 	 * and cause a deadlock.
1921 	 */
1922 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1923 
1924 	if (t == CALLOUT_REALTIME) {
1925 		hdlr.cyh_level = callout_realtime_level;
1926 		hdlr.cyh_func = (cyc_func_t)callout_realtime;
1927 	} else {
1928 		hdlr.cyh_level = callout_normal_level;
1929 		hdlr.cyh_func = (cyc_func_t)callout_normal;
1930 	}
1931 	hdlr.cyh_arg = ct;
1932 	when.cyt_when = CY_INFINITY;
1933 	when.cyt_interval = CY_INFINITY;
1934 
1935 	cyclic = cyclic_add(&hdlr, &when);
1936 
1937 	if (t == CALLOUT_REALTIME)
1938 		hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
1939 	else
1940 		hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
1941 
1942 	qcyclic = cyclic_add(&hdlr, &when);
1943 
1944 	mutex_enter(&ct->ct_mutex);
1945 	ct->ct_cyclic = cyclic;
1946 	ct->ct_qcyclic = qcyclic;
1947 }
1948 
1949 void
1950 callout_cpu_online(cpu_t *cp)
1951 {
1952 	lgrp_handle_t hand;
1953 	callout_cache_t *cache;
1954 	char s[KMEM_CACHE_NAMELEN];
1955 	callout_table_t *ct;
1956 	processorid_t seqid;
1957 	int t;
1958 
1959 	ASSERT(MUTEX_HELD(&cpu_lock));
1960 
1961 	/*
1962 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1963 	 * Note that access to callout_caches is protected by cpu_lock.
1964 	 */
1965 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1966 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1967 		if (cache->cc_hand == hand)
1968 			break;
1969 	}
1970 
1971 	/*
1972 	 * If not found, create one. The caches are never destroyed.
1973 	 */
1974 	if (cache == NULL) {
1975 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1976 		cache->cc_hand = hand;
1977 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1978 		    (long)hand);
1979 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1980 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1981 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1982 		    (long)hand);
1983 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1984 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1985 		cache->cc_next = callout_caches;
1986 		callout_caches = cache;
1987 	}
1988 
1989 	seqid = cp->cpu_seqid;
1990 
1991 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1992 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1993 
1994 		mutex_enter(&ct->ct_mutex);
1995 		/*
1996 		 * Store convinience pointers to the kmem caches
1997 		 * in the callout table. These assignments should always be
1998 		 * done as callout tables can map to different physical
1999 		 * CPUs each time.
2000 		 */
2001 		ct->ct_cache = cache->cc_cache;
2002 		ct->ct_lcache = cache->cc_lcache;
2003 
2004 		/*
2005 		 * We use the heap pointer to check if stuff has been
2006 		 * initialized for this callout table.
2007 		 */
2008 		if (ct->ct_heap == NULL) {
2009 			callout_heap_init(ct);
2010 			callout_hash_init(ct);
2011 			callout_kstat_init(ct);
2012 			callout_cyclic_init(ct);
2013 		}
2014 
2015 		mutex_exit(&ct->ct_mutex);
2016 
2017 		/*
2018 		 * Move the cyclics to this CPU by doing a bind.
2019 		 */
2020 		cyclic_bind(ct->ct_cyclic, cp, NULL);
2021 		cyclic_bind(ct->ct_qcyclic, cp, NULL);
2022 	}
2023 }
2024 
2025 void
2026 callout_cpu_offline(cpu_t *cp)
2027 {
2028 	callout_table_t *ct;
2029 	processorid_t seqid;
2030 	int t;
2031 
2032 	ASSERT(MUTEX_HELD(&cpu_lock));
2033 
2034 	seqid = cp->cpu_seqid;
2035 
2036 	for (t = 0; t < CALLOUT_NTYPES; t++) {
2037 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
2038 
2039 		/*
2040 		 * Unbind the cyclics. This will allow the cyclic subsystem
2041 		 * to juggle the cyclics during CPU offline.
2042 		 */
2043 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
2044 		cyclic_bind(ct->ct_qcyclic, NULL, NULL);
2045 	}
2046 }
2047 
2048 /*
2049  * This is called to perform per-CPU initialization for slave CPUs at
2050  * boot time.
2051  */
2052 void
2053 callout_mp_init(void)
2054 {
2055 	cpu_t *cp;
2056 	size_t min, max;
2057 
2058 	if (callout_chunk == CALLOUT_CHUNK) {
2059 		/*
2060 		 * No one has specified a chunk in /etc/system. We need to
2061 		 * compute it here based on the number of online CPUs and
2062 		 * available physical memory.
2063 		 */
2064 		min = CALLOUT_MIN_HEAP_SIZE;
2065 		max = ptob(physmem / CALLOUT_MEM_FRACTION);
2066 		if (min > max)
2067 			min = max;
2068 		callout_chunk = min / sizeof (callout_heap_t);
2069 		callout_chunk /= ncpus_online;
2070 		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2071 	}
2072 
2073 	mutex_enter(&cpu_lock);
2074 
2075 	cp = cpu_active;
2076 	do {
2077 		callout_cpu_online(cp);
2078 	} while ((cp = cp->cpu_next_onln) != cpu_active);
2079 
2080 	mutex_exit(&cpu_lock);
2081 }
2082 
/*
 * Boot-time initialization of the callout subsystem. Called just before
 * clkstart().
 *
 * In order, this routine:
 *	- sets up the callout globals and defaults any unset tunables,
 *	- allocates and initializes CALLOUT_NTYPES callout tables for each of
 *	  max_ncpus CPUs,
 *	- registers the CPR and debugger-entry callbacks,
 *	- runs the per-CPU online processing for the boot CPU,
 *	- sets callout_init_done.
 */
void
callout_init(void)
{
	callout_table_t *tbl;
	uintptr_t raw;
	size_t tables_size, kstat_size;
	long nbits, span;
	int id, cpu, type;

	/*
	 * Callout globals. nbits ends up as the smallest shift count for
	 * which (1 << nbits) is at least max_ncpus.
	 */
	nbits = 0;
	span = 1;
	while (span < max_ncpus) {
		span <<= 1;
		nbits++;
	}
	callout_table_bits = CALLOUT_TYPE_BITS + nbits;
	callout_table_mask = (1 << callout_table_bits) - 1;
	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
	callout_max_ticks = CALLOUT_MAX_TICKS;

	/*
	 * Tunables that have not been given a usable value get their
	 * defaults. A chunk size that was supplied is rounded up to a
	 * multiple of CALLOUT_CHUNK.
	 */
	if (callout_min_reap == 0)
		callout_min_reap = CALLOUT_MIN_REAP;
	if (callout_tolerance <= 0)
		callout_tolerance = CALLOUT_TOLERANCE;
	if (callout_threads <= 0)
		callout_threads = CALLOUT_THREADS;
	callout_chunk = (callout_chunk <= 0) ? CALLOUT_CHUNK :
	    P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);

	/*
	 * All callout tables are allocated up front, sized by max_ncpus.
	 * Boot-time allocation was chosen over dynamic allocation because:
	 *
	 *	- the callout tables themselves are not very large.
	 *	- making the allocation dynamic involves race conditions.
	 *	- most of the memory goes to the hash tables that accompany
	 *	  the callout tables, and those are allocated only in
	 *	  callout_cpu_online().
	 *
	 * The two tables of a CPU sit next to each other in the array: the
	 * realtime one first, followed by the normal one.
	 *
	 * CALLOUT_ALIGN extra bytes are requested and the returned address
	 * is rounded up, so that the callout table structures always start
	 * on a cache line boundary.
	 */
	tables_size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus +
	    CALLOUT_ALIGN;
	raw = (uintptr_t)kmem_zalloc(tables_size, KM_SLEEP);
	callout_table = (callout_table_t *)P2ROUNDUP(raw, CALLOUT_ALIGN);

	/*
	 * Initialize the tables of every possible CPU. Each table gets its
	 * own zeroed kstat data area.
	 */
	kstat_size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
	for (cpu = 0; cpu < max_ncpus; cpu++) {
		for (type = 0; type < CALLOUT_NTYPES; type++) {
			id = CALLOUT_TABLE(type, cpu);
			tbl = &callout_table[id];
			tbl->ct_type = type;
			mutex_init(&tbl->ct_mutex, NULL, MUTEX_DEFAULT, NULL);

			/*
			 * The base IDs for short-term and long-term legacy
			 * IDs are computed once here so that timeout() can
			 * generate IDs quickly.
			 */
			tbl->ct_short_id = CALLOUT_SHORT_ID(id);
			tbl->ct_long_id = CALLOUT_LONG_ID(id);

			/*
			 * Base ID for generation-based IDs. Allocating the
			 * first ID makes the ID wrap, which bumps the
			 * generation number to 1.
			 */
			tbl->ct_gen_id = CALLOUT_SHORT_ID(id);

			/*
			 * The cyclics start out as NONE and are set when the
			 * CPU comes online, so a partially populated system
			 * ends up with only as many cyclics as it needs.
			 */
			tbl->ct_cyclic = CYCLIC_NONE;
			tbl->ct_qcyclic = CYCLIC_NONE;
			tbl->ct_kstat_data = kmem_zalloc(kstat_size, KM_SLEEP);
		}
	}

	/*
	 * Register the CPR callback, which is invoked during checkpoint
	 * resume to suspend and resume callouts, and the callback for
	 * debugger entry.
	 */
	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
	    "callout_cpr");
	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
	    "callout_debug");

	/*
	 * The CPU online/offline hooks do not run the per-CPU initialization
	 * for the boot CPU automatically, so do it by hand here. The CPU
	 * lock is taken as a matter of convention.
	 */
	mutex_enter(&cpu_lock);
	callout_boot_ct = callout_table + CALLOUT_TABLE(0, CPU->cpu_seqid);
	callout_cpu_online(CPU);
	mutex_exit(&cpu_lock);

	/* Let boot-time clients know that timeouts are now available. */
	callout_init_done = 1;
}
2199