xref: /titanic_52/usr/src/uts/common/os/callout.c (revision efd4c9b63ad77503c101fc6c2ed8ba96c9d52964)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/callo.h>
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/cpuvar.h>
29 #include <sys/thread.h>
30 #include <sys/kmem.h>
31 #include <sys/kmem_impl.h>
32 #include <sys/cmn_err.h>
33 #include <sys/callb.h>
34 #include <sys/debug.h>
35 #include <sys/vtrace.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 
39 int callout_init_done;				/* useful during boot */
40 
41 /*
42  * Callout tables.  See timeout(9F) for details.
43  */
44 static int callout_threads;			/* callout normal threads */
45 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
46 static int callout_chunk;			/* callout heap chunk size */
47 static int callout_min_reap;			/* callout minimum reap count */
48 static int callout_tolerance;			/* callout hires tolerance */
49 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
50 static clock_t callout_max_ticks;		/* max interval */
51 static hrtime_t callout_longterm;		/* longterm nanoseconds */
52 static ulong_t callout_counter_low;		/* callout ID increment */
53 static ulong_t callout_table_bits;		/* number of table bits in ID */
54 static ulong_t callout_table_mask;		/* mask for the table bits */
55 static callout_cache_t *callout_caches;		/* linked list of caches */
56 #pragma align 64(callout_table)
57 static callout_table_t *callout_table;		/* global callout table array */
58 
59 /*
60  * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
61  * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
62  * via taskq, to a thread that executes at PIL 0 - so we end up running
63  * 'normal' callouts at PIL 0.
64  */
65 static volatile int callout_realtime_level = CY_LOW_LEVEL;
66 static volatile int callout_normal_level = CY_LOCK_LEVEL;
67 
68 static char *callout_kstat_names[] = {
69 	"callout_timeouts",
70 	"callout_timeouts_pending",
71 	"callout_untimeouts_unexpired",
72 	"callout_untimeouts_executing",
73 	"callout_untimeouts_expired",
74 	"callout_expirations",
75 	"callout_allocations",
76 	"callout_cleanups",
77 };
78 
79 static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);
80 
81 #define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
82 {							\
83 	callout_hash_t *hashp = &(hash);		\
84 							\
85 	cp->cprev = NULL;				\
86 	cp->cnext = hashp->ch_head;			\
87 	if (hashp->ch_head == NULL)			\
88 		hashp->ch_tail = cp;			\
89 	else						\
90 		cp->cnext->cprev = cp;			\
91 	hashp->ch_head = cp;				\
92 }
93 
94 #define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
95 {							\
96 	callout_hash_t *hashp = &(hash);		\
97 							\
98 	cp->cnext = NULL;				\
99 	cp->cprev = hashp->ch_tail;			\
100 	if (hashp->ch_tail == NULL)			\
101 		hashp->ch_head = cp;			\
102 	else						\
103 		cp->cprev->cnext = cp;			\
104 	hashp->ch_tail = cp;				\
105 }
106 
107 #define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
108 {							\
109 	callout_hash_t *hashp = &(hash);		\
110 							\
111 	if (cp->cnext == NULL)				\
112 		hashp->ch_tail = cp->cprev;		\
113 	else						\
114 		cp->cnext->cprev = cp->cprev;		\
115 	if (cp->cprev == NULL)				\
116 		hashp->ch_head = cp->cnext;		\
117 	else						\
118 		cp->cprev->cnext = cp->cnext;		\
119 }
120 
121 /*
122  * These definitions help us queue callouts and callout lists. Here is
123  * the queueing rationale:
124  *
125  *	- callouts are queued in a FIFO manner in the ID hash table.
126  *	  TCP timers are typically cancelled in the same order that they
127  *	  were issued. The FIFO queueing shortens the search for a callout
128  *	  during untimeout().
129  *
130  *	- callouts are queued in a FIFO manner in their callout lists.
131  *	  This ensures that the callouts are executed in the same order that
132  *	  they were queued. This is fair. Plus, it helps to make each
133  *	  callout expiration timely. It also favors cancellations.
134  *
135  *	- callout lists are queued in the following manner in the callout
136  *	  hash table buckets:
137  *
138  *		- appended, if the callout list is a 1-nanosecond resolution
139  *		  callout list. When a callout is created, we first look for
140  *		  a callout list that has the same expiration so we can avoid
141  *		  allocating a callout list and inserting the expiration into
142  *		  the heap. However, we do not want to look at 1-nanosecond
143  *		  resolution callout lists as we will seldom find a match in
144  *		  them. Keeping these callout lists in the rear of the hash
145  *		  buckets allows us to skip these during the lookup.
146  *
147  *		- inserted at the beginning, if the callout list is not a
148  *		  1-nanosecond resolution callout list. This also has the
149  *		  side-effect of keeping the long term timers away from the
150  *		  front of the buckets.
151  *
152  *	- callout lists are queued in a FIFO manner in the expired callouts
153  *	  list. This ensures that callout lists are executed in the order
154  *	  of expiration.
155  */
156 #define	CALLOUT_APPEND(ct, cp)						\
157 	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
158 		cp, c_idnext, c_idprev);				\
159 	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
160 
161 #define	CALLOUT_DELETE(ct, cp)						\
162 	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
163 		cp, c_idnext, c_idprev);				\
164 	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
165 
166 #define	CALLOUT_LIST_INSERT(hash, cl)				\
167 	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
168 
169 #define	CALLOUT_LIST_APPEND(hash, cl)				\
170 	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
171 
172 #define	CALLOUT_LIST_DELETE(hash, cl)				\
173 	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
174 
175 #define	CALLOUT_LIST_BEFORE(cl, nextcl)			\
176 {							\
177 	(cl)->cl_prev = (nextcl)->cl_prev;		\
178 	(cl)->cl_next = (nextcl);			\
179 	(nextcl)->cl_prev = (cl);			\
180 	if (cl->cl_prev != NULL)			\
181 		cl->cl_prev->cl_next = cl;		\
182 }
183 
184 /*
185  * For normal callouts, there is a deadlock scenario if two callouts that
186  * have an inter-dependency end up on the same callout list. To break the
187  * deadlock, you need two taskq threads running in parallel. We compute
188  * the number of taskq threads here using a bunch of conditions to make
189  * it optimal for the common case. This is an ugly hack, but one that is
190  * necessary (sigh).
191  */
192 #define	CALLOUT_THRESHOLD	100000000
193 #define	CALLOUT_EXEC_COMPUTE(ct, nextexp, exec)				\
194 {									\
195 	callout_list_t *cl;						\
196 									\
197 	cl = ct->ct_expired.ch_head;					\
198 	if (cl == NULL) {						\
199 		/*							\
200 		 * If the expired list is NULL, there is nothing to	\
201 		 * process.						\
202 		 */							\
203 		exec = 0;						\
204 	} else if ((cl->cl_next == NULL) &&				\
205 	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
206 		/*							\
207 		 * If there is only one callout list and it contains	\
208 		 * only one callout, there is no need for two threads.	\
209 		 */							\
210 		exec = 1;						\
211 	} else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) {	\
212 		/*							\
213 		 * If the next expiration of the cyclic is way out into	\
214 		 * the future, we need two threads.			\
215 		 */							\
216 		exec = 2;						\
217 	} else {							\
218 		/*							\
219 		 * We have multiple callouts to process. But the cyclic	\
220 		 * will fire in the near future. So, we only need one	\
221 		 * thread for now.					\
222 		 */							\
223 		exec = 1;						\
224 	}								\
225 }
226 
227 /*
228  * Macro to swap two heap items.
229  */
230 #define	CALLOUT_SWAP(h1, h2)		\
231 {					\
232 	callout_heap_t tmp;		\
233 					\
234 	tmp = *h1;			\
235 	*h1 = *h2;			\
236 	*h2 = tmp;			\
237 }
238 
239 /*
240  * Macro to free a callout list.
241  */
242 #define	CALLOUT_LIST_FREE(ct, cl)			\
243 {							\
244 	cl->cl_next = ct->ct_lfree;			\
245 	ct->ct_lfree = cl;				\
246 	cl->cl_flags |= CALLOUT_LIST_FLAG_FREE;		\
247 }
248 
249 /*
250  * Macro to free a callout.
251  */
252 #define	CALLOUT_FREE(ct, cl)			\
253 {						\
254 	cp->c_idnext = ct->ct_free;		\
255 	ct->ct_free = cp;			\
256 	cp->c_xid |= CALLOUT_ID_FREE;		\
257 }
258 
259 /*
260  * Allocate a callout structure.  We try quite hard because we
261  * can't sleep, and if we can't do the allocation, we're toast.
262  * Failing all, we try a KM_PANIC allocation. Note that we never
263  * deallocate a callout. See untimeout() for the reasoning.
264  */
265 static callout_t *
266 callout_alloc(callout_table_t *ct)
267 {
268 	size_t size;
269 	callout_t *cp;
270 
271 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
272 	mutex_exit(&ct->ct_mutex);
273 
274 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
275 	if (cp == NULL) {
276 		size = sizeof (callout_t);
277 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
278 	}
279 	cp->c_xid = 0;
280 	cp->c_executor = NULL;
281 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
282 	cp->c_waiting = 0;
283 
284 	mutex_enter(&ct->ct_mutex);
285 	ct->ct_allocations++;
286 	return (cp);
287 }
288 
289 /*
290  * Allocate a callout list structure.  We try quite hard because we
291  * can't sleep, and if we can't do the allocation, we're toast.
292  * Failing all, we try a KM_PANIC allocation. Note that we never
293  * deallocate a callout list.
294  */
295 static void
296 callout_list_alloc(callout_table_t *ct)
297 {
298 	size_t size;
299 	callout_list_t *cl;
300 
301 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
302 	mutex_exit(&ct->ct_mutex);
303 
304 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
305 	if (cl == NULL) {
306 		size = sizeof (callout_list_t);
307 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
308 	}
309 	bzero(cl, sizeof (callout_list_t));
310 
311 	mutex_enter(&ct->ct_mutex);
312 	CALLOUT_LIST_FREE(ct, cl);
313 }
314 
315 /*
316  * Find a callout list that corresponds to an expiration and matching flags.
317  */
318 static callout_list_t *
319 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
320 {
321 	callout_list_t *cl;
322 	int clflags;
323 
324 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
325 
326 	if (flags & CALLOUT_LIST_FLAG_NANO) {
327 		/*
328 		 * This is a 1-nanosecond resolution callout. We will rarely
329 		 * find a match for this. So, bail out.
330 		 */
331 		return (NULL);
332 	}
333 
334 	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
335 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
336 		/*
337 		 * If we have reached a 1-nanosecond resolution callout list,
338 		 * we don't have much hope of finding a match in this hash
339 		 * bucket. So, just bail out.
340 		 */
341 		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
342 			return (NULL);
343 
344 		if ((cl->cl_expiration == expiration) &&
345 		    ((cl->cl_flags & clflags) == (flags & clflags)))
346 			return (cl);
347 	}
348 
349 	return (NULL);
350 }
351 
352 /*
353  * Add a new callout list into a callout table's queue in sorted order by
354  * expiration.
355  */
356 static int
357 callout_queue_add(callout_table_t *ct, callout_list_t *cl)
358 {
359 	callout_list_t *nextcl;
360 	hrtime_t expiration;
361 
362 	expiration = cl->cl_expiration;
363 	nextcl = ct->ct_queue.ch_head;
364 	if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
365 		CALLOUT_LIST_INSERT(ct->ct_queue, cl);
366 		return (1);
367 	}
368 
369 	while (nextcl != NULL) {
370 		if (expiration < nextcl->cl_expiration) {
371 			CALLOUT_LIST_BEFORE(cl, nextcl);
372 			return (0);
373 		}
374 		nextcl = nextcl->cl_next;
375 	}
376 	CALLOUT_LIST_APPEND(ct->ct_queue, cl);
377 
378 	return (0);
379 }
380 
381 /*
382  * Insert a callout list into a callout table's queue and reprogram the queue
383  * cyclic if needed.
384  */
385 static void
386 callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
387 {
388 	cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
389 
390 	/*
391 	 * Add the callout to the callout queue. If it ends up at the head,
392 	 * the cyclic needs to be reprogrammed as we have an earlier
393 	 * expiration.
394 	 *
395 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
396 	 * We don't want any callout activity. When the CPR resume phase is
397 	 * entered, the cyclic will be programmed for the earliest expiration
398 	 * in the queue.
399 	 */
400 	if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
401 		(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
402 }
403 
404 /*
405  * Delete and handle all past expirations in a callout table's queue.
406  */
407 static hrtime_t
408 callout_queue_delete(callout_table_t *ct)
409 {
410 	callout_list_t *cl;
411 	hrtime_t now;
412 
413 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
414 
415 	now = gethrtime();
416 	while ((cl = ct->ct_queue.ch_head) != NULL) {
417 		if (cl->cl_expiration > now)
418 			break;
419 		cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
420 		CALLOUT_LIST_DELETE(ct->ct_queue, cl);
421 		CALLOUT_LIST_APPEND(ct->ct_expired, cl);
422 	}
423 
424 	/*
425 	 * If this callout queue is empty or callouts have been suspended,
426 	 * just return.
427 	 */
428 	if ((cl == NULL) || (ct->ct_suspend > 0))
429 		return (CY_INFINITY);
430 
431 	(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
432 
433 	return (cl->cl_expiration);
434 }
435 
436 static hrtime_t
437 callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
438 {
439 	callout_list_t *firstcl, *cl;
440 	hrtime_t expiration, now;
441 	int clflags;
442 	callout_hash_t temp;
443 
444 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
445 
446 	firstcl = ct->ct_queue.ch_head;
447 	if (firstcl == NULL)
448 		return (CY_INFINITY);
449 
450 	/*
451 	 * We walk the callout queue. If we encounter a hrestime entry that
452 	 * must be removed, we clean it out. Otherwise, we apply any
453 	 * adjustments needed to it. Because of the latter, we need to
454 	 * recreate the list as we go along.
455 	 */
456 	temp = ct->ct_queue;
457 	ct->ct_queue.ch_head = NULL;
458 	ct->ct_queue.ch_tail = NULL;
459 
460 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
461 	now = gethrtime();
462 	while ((cl = temp.ch_head) != NULL) {
463 		CALLOUT_LIST_DELETE(temp, cl);
464 
465 		/*
466 		 * Delete the callout and expire it, if one of the following
467 		 * is true:
468 		 *	- the callout has expired
469 		 *	- the callout is an absolute hrestime one and
470 		 *	  there has been a system time change
471 		 */
472 		if ((cl->cl_expiration <= now) ||
473 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
474 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
475 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
476 			continue;
477 		}
478 
479 		/*
480 		 * Apply adjustments, if any. Adjustments are applied after
481 		 * the system returns from KMDB or OBP. They are only applied
482 		 * to relative callout lists.
483 		 */
484 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
485 			expiration = cl->cl_expiration + delta;
486 			if (expiration <= 0)
487 				expiration = CY_INFINITY;
488 			cl->cl_expiration = expiration;
489 		}
490 
491 		(void) callout_queue_add(ct, cl);
492 	}
493 
494 	/*
495 	 * We need to return the expiration to help program the cyclic.
496 	 * If there are expired callouts, the cyclic needs to go off
497 	 * immediately. If the queue has become empty, then we return infinity.
498 	 * Else, we return the expiration of the earliest callout in the queue.
499 	 */
500 	if (ct->ct_expired.ch_head != NULL)
501 		return (gethrtime());
502 
503 	cl = ct->ct_queue.ch_head;
504 	if (cl == NULL)
505 		return (CY_INFINITY);
506 
507 	return (cl->cl_expiration);
508 }
509 
510 /*
511  * Initialize a callout table's heap, if necessary. Preallocate some free
512  * entries so we don't have to check for NULL elsewhere.
513  */
514 static void
515 callout_heap_init(callout_table_t *ct)
516 {
517 	size_t size;
518 
519 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
520 	ASSERT(ct->ct_heap == NULL);
521 
522 	ct->ct_heap_num = 0;
523 	ct->ct_heap_max = callout_chunk;
524 	size = sizeof (callout_heap_t) * callout_chunk;
525 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
526 }
527 
528 /*
529  * Reallocate the heap. Return 0 if the heap is still full at the end of it.
530  * Return 1 otherwise. Note that the heap only expands, it never contracts.
531  */
532 static int
533 callout_heap_expand(callout_table_t *ct)
534 {
535 	size_t max, size, osize;
536 	callout_heap_t *heap;
537 
538 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
539 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
540 
541 	while (ct->ct_heap_num == ct->ct_heap_max) {
542 		max = ct->ct_heap_max;
543 		mutex_exit(&ct->ct_mutex);
544 
545 		osize = sizeof (callout_heap_t) * max;
546 		size = sizeof (callout_heap_t) * (max + callout_chunk);
547 		heap = kmem_alloc(size, KM_NOSLEEP);
548 
549 		mutex_enter(&ct->ct_mutex);
550 		if (heap == NULL) {
551 			/*
552 			 * We could not allocate memory. If we can free up
553 			 * some entries, that would be great.
554 			 */
555 			if (ct->ct_nreap > 0)
556 				(void) callout_heap_process(ct, 0, 0);
557 			/*
558 			 * If we still have no space in the heap, inform the
559 			 * caller.
560 			 */
561 			if (ct->ct_heap_num == ct->ct_heap_max)
562 				return (0);
563 			return (1);
564 		}
565 		if (max < ct->ct_heap_max) {
566 			/*
567 			 * Someone beat us to the allocation. Free what we
568 			 * just allocated and proceed.
569 			 */
570 			kmem_free(heap, size);
571 			continue;
572 		}
573 
574 		bcopy(ct->ct_heap, heap, osize);
575 		kmem_free(ct->ct_heap, osize);
576 		ct->ct_heap = heap;
577 		ct->ct_heap_max = size / sizeof (callout_heap_t);
578 	}
579 
580 	return (1);
581 }
582 
583 /*
584  * Move an expiration from the bottom of the heap to its correct place
585  * in the heap. If we reached the root doing this, return 1. Else,
586  * return 0.
587  */
588 static int
589 callout_upheap(callout_table_t *ct)
590 {
591 	int current, parent;
592 	callout_heap_t *heap, *hcurrent, *hparent;
593 
594 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
595 	ASSERT(ct->ct_heap_num >= 1);
596 
597 	if (ct->ct_heap_num == 1) {
598 		return (1);
599 	}
600 
601 	heap = ct->ct_heap;
602 	current = ct->ct_heap_num - 1;
603 
604 	for (;;) {
605 		parent = CALLOUT_HEAP_PARENT(current);
606 		hparent = &heap[parent];
607 		hcurrent = &heap[current];
608 
609 		/*
610 		 * We have an expiration later than our parent; we're done.
611 		 */
612 		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
613 			return (0);
614 		}
615 
616 		/*
617 		 * We need to swap with our parent, and continue up the heap.
618 		 */
619 		CALLOUT_SWAP(hparent, hcurrent);
620 
621 		/*
622 		 * If we just reached the root, we're done.
623 		 */
624 		if (parent == 0) {
625 			return (1);
626 		}
627 
628 		current = parent;
629 	}
630 	/*NOTREACHED*/
631 }
632 
633 /*
634  * Insert a new heap item into a callout table's heap.
635  */
636 static void
637 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
638 {
639 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
640 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
641 
642 	cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
643 	/*
644 	 * First, copy the expiration and callout list pointer to the bottom
645 	 * of the heap.
646 	 */
647 	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
648 	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
649 	ct->ct_heap_num++;
650 
651 	/*
652 	 * Now, perform an upheap operation. If we reached the root, then
653 	 * the cyclic needs to be reprogrammed as we have an earlier
654 	 * expiration.
655 	 *
656 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
657 	 * We don't want any callout activity. When the CPR resume phase is
658 	 * entered, the cyclic will be programmed for the earliest expiration
659 	 * in the heap.
660 	 */
661 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
662 		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
663 }
664 
665 /*
666  * Move an expiration from the top of the heap to its correct place
667  * in the heap.
668  */
669 static void
670 callout_downheap(callout_table_t *ct)
671 {
672 	int current, left, right, nelems;
673 	callout_heap_t *heap, *hleft, *hright, *hcurrent;
674 
675 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
676 	ASSERT(ct->ct_heap_num >= 1);
677 
678 	heap = ct->ct_heap;
679 	current = 0;
680 	nelems = ct->ct_heap_num;
681 
682 	for (;;) {
683 		/*
684 		 * If we don't have a left child (i.e., we're a leaf), we're
685 		 * done.
686 		 */
687 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
688 			return;
689 
690 		hleft = &heap[left];
691 		hcurrent = &heap[current];
692 
693 		right = CALLOUT_HEAP_RIGHT(current);
694 
695 		/*
696 		 * Even if we don't have a right child, we still need to compare
697 		 * our expiration against that of our left child.
698 		 */
699 		if (right >= nelems)
700 			goto comp_left;
701 
702 		hright = &heap[right];
703 
704 		/*
705 		 * We have both a left and a right child.  We need to compare
706 		 * the expiration of the children to determine which
707 		 * expires earlier.
708 		 */
709 		if (hright->ch_expiration < hleft->ch_expiration) {
710 			/*
711 			 * Our right child is the earlier of our children.
712 			 * We'll now compare our expiration to its expiration.
713 			 * If ours is the earlier one, we're done.
714 			 */
715 			if (hcurrent->ch_expiration <= hright->ch_expiration)
716 				return;
717 
718 			/*
719 			 * Our right child expires earlier than we do; swap
720 			 * with our right child, and descend right.
721 			 */
722 			CALLOUT_SWAP(hright, hcurrent);
723 			current = right;
724 			continue;
725 		}
726 
727 comp_left:
728 		/*
729 		 * Our left child is the earlier of our children (or we have
730 		 * no right child).  We'll now compare our expiration
731 		 * to its expiration. If ours is the earlier one, we're done.
732 		 */
733 		if (hcurrent->ch_expiration <= hleft->ch_expiration)
734 			return;
735 
736 		/*
737 		 * Our left child expires earlier than we do; swap with our
738 		 * left child, and descend left.
739 		 */
740 		CALLOUT_SWAP(hleft, hcurrent);
741 		current = left;
742 	}
743 }
744 
745 /*
746  * Delete and handle all past expirations in a callout table's heap.
747  */
748 static hrtime_t
749 callout_heap_delete(callout_table_t *ct)
750 {
751 	hrtime_t now, expiration, next;
752 	callout_list_t *cl;
753 	callout_heap_t *heap;
754 	int hash;
755 
756 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
757 
758 	if (CALLOUT_CLEANUP(ct)) {
759 		/*
760 		 * There are too many heap elements pointing to empty callout
761 		 * lists. Clean them out.
762 		 */
763 		(void) callout_heap_process(ct, 0, 0);
764 	}
765 
766 	now = gethrtime();
767 	heap = ct->ct_heap;
768 
769 	while (ct->ct_heap_num > 0) {
770 		expiration = heap->ch_expiration;
771 		hash = CALLOUT_CLHASH(expiration);
772 		cl = heap->ch_list;
773 		ASSERT(expiration == cl->cl_expiration);
774 
775 		if (cl->cl_callouts.ch_head == NULL) {
776 			/*
777 			 * If the callout list is empty, reap it.
778 			 * Decrement the reap count.
779 			 */
780 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
781 			CALLOUT_LIST_FREE(ct, cl);
782 			ct->ct_nreap--;
783 		} else {
784 			/*
785 			 * If the root of the heap expires in the future,
786 			 * bail out.
787 			 */
788 			if (expiration > now)
789 				break;
790 
791 			/*
792 			 * Move the callout list for this expiration to the
793 			 * list of expired callout lists. It will be processed
794 			 * by the callout executor.
795 			 */
796 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
797 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
798 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
799 		}
800 
801 		/*
802 		 * Now delete the root. This is done by swapping the root with
803 		 * the last item in the heap and downheaping the item.
804 		 */
805 		ct->ct_heap_num--;
806 		if (ct->ct_heap_num > 0) {
807 			heap[0] = heap[ct->ct_heap_num];
808 			callout_downheap(ct);
809 		}
810 	}
811 
812 	/*
813 	 * If this callout table is empty or callouts have been suspended,
814 	 * just return. The cyclic has already been programmed to
815 	 * infinity by the cyclic subsystem.
816 	 */
817 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
818 		return (CY_INFINITY);
819 
820 	/*
821 	 * If the top expirations are within callout_tolerance of each other,
822 	 * delay the cyclic expire so that they can be processed together.
823 	 * This is to prevent high resolution timers from swamping the system
824 	 * with cyclic activity.
825 	 */
826 	if (ct->ct_heap_num > 2) {
827 		next = expiration + callout_tolerance;
828 		if ((heap[1].ch_expiration < next) ||
829 		    (heap[2].ch_expiration < next))
830 			expiration = next;
831 	}
832 
833 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
834 
835 	return (expiration);
836 }
837 
838 /*
839  * There are some situations when the entire heap is walked and processed.
840  * This function is called to do the processing. These are the situations:
841  *
842  * 1. When the reap count reaches its threshold, the heap has to be cleared
843  *    of all empty callout lists.
844  *
845  * 2. When the system enters and exits KMDB/OBP, all entries in the heap
846  *    need to be adjusted by the interval spent in KMDB/OBP.
847  *
848  * 3. When system time is changed, the heap has to be scanned for
849  *    absolute hrestime timers. These need to be removed from the heap
850  *    and expired immediately.
851  *
852  * In cases 2 and 3, it is a good idea to do 1 as well since we are
853  * scanning the heap anyway.
854  *
855  * If the root gets changed and/or callout lists are expired, return the
856  * new expiration to the caller so he can reprogram the cyclic accordingly.
857  */
858 static hrtime_t
859 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
860 {
861 	callout_heap_t *heap;
862 	callout_list_t *cl;
863 	hrtime_t expiration, now;
864 	int i, hash, clflags;
865 	ulong_t num;
866 
867 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
868 
869 	if (ct->ct_heap_num == 0)
870 		return (CY_INFINITY);
871 
872 	if (ct->ct_nreap > 0)
873 		ct->ct_cleanups++;
874 
875 	heap = ct->ct_heap;
876 
877 	/*
878 	 * We walk the heap from the top to the bottom. If we encounter
879 	 * a heap item that points to an empty callout list, we clean
880 	 * it out. If we encounter a hrestime entry that must be removed,
881 	 * again we clean it out. Otherwise, we apply any adjustments needed
882 	 * to an element.
883 	 *
884 	 * During the walk, we also compact the heap from the bottom and
885 	 * reconstruct the heap using upheap operations. This is very
886 	 * efficient if the number of elements to be cleaned is greater than
887 	 * or equal to half the heap. This is the common case.
888 	 *
889 	 * Even in the non-common case, the upheap operations should be short
890 	 * as the entries below generally tend to be bigger than the entries
891 	 * above.
892 	 */
893 	num = ct->ct_heap_num;
894 	ct->ct_heap_num = 0;
895 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
896 	now = gethrtime();
897 	for (i = 0; i < num; i++) {
898 		cl = heap[i].ch_list;
899 		/*
900 		 * If the callout list is empty, delete the heap element and
901 		 * free the callout list.
902 		 */
903 		if (cl->cl_callouts.ch_head == NULL) {
904 			hash = CALLOUT_CLHASH(cl->cl_expiration);
905 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
906 			CALLOUT_LIST_FREE(ct, cl);
907 			continue;
908 		}
909 
910 		/*
911 		 * Delete the heap element and expire the callout list, if
912 		 * one of the following is true:
913 		 *	- the callout list has expired
914 		 *	- the callout list is an absolute hrestime one and
915 		 *	  there has been a system time change
916 		 */
917 		if ((cl->cl_expiration <= now) ||
918 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
919 			hash = CALLOUT_CLHASH(cl->cl_expiration);
920 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
921 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
922 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
923 			continue;
924 		}
925 
926 		/*
927 		 * Apply adjustments, if any. Adjustments are applied after
928 		 * the system returns from KMDB or OBP. They are only applied
929 		 * to relative callout lists.
930 		 */
931 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
932 			hash = CALLOUT_CLHASH(cl->cl_expiration);
933 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
934 			expiration = cl->cl_expiration + delta;
935 			if (expiration <= 0)
936 				expiration = CY_INFINITY;
937 			heap[i].ch_expiration = expiration;
938 			cl->cl_expiration = expiration;
939 			hash = CALLOUT_CLHASH(cl->cl_expiration);
940 			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
941 				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
942 			} else {
943 				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
944 			}
945 		}
946 
947 		heap[ct->ct_heap_num] = heap[i];
948 		ct->ct_heap_num++;
949 		(void) callout_upheap(ct);
950 	}
951 
952 	ct->ct_nreap = 0;
953 
954 	/*
955 	 * We need to return the expiration to help program the cyclic.
956 	 * If there are expired callouts, the cyclic needs to go off
957 	 * immediately. If the heap has become empty, then we return infinity.
958 	 * Else, return the expiration of the earliest callout in the heap.
959 	 */
960 	if (ct->ct_expired.ch_head != NULL)
961 		return (gethrtime());
962 
963 	if (ct->ct_heap_num == 0)
964 		return (CY_INFINITY);
965 
966 	return (heap->ch_expiration);
967 }
968 
969 /*
970  * Common function used to create normal and realtime callouts.
971  *
972  * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
973  * there is one restriction on a realtime callout handler - it should not
974  * directly or indirectly acquire cpu_lock. CPU offline waits for pending
975  * cyclic handlers to complete while holding cpu_lock. So, if a realtime
976  * callout handler were to try to get cpu_lock, there would be a deadlock
977  * during CPU offline.
978  */
979 callout_id_t
980 timeout_generic(int type, void (*func)(void *), void *arg,
981 	hrtime_t expiration, hrtime_t resolution, int flags)
982 {
983 	callout_table_t *ct;
984 	callout_t *cp;
985 	callout_id_t id;
986 	callout_list_t *cl;
987 	hrtime_t now, interval;
988 	int hash, clflags;
989 
990 	ASSERT(resolution > 0);
991 	ASSERT(func != NULL);
992 
993 	/*
994 	 * We get the current hrtime right upfront so that latencies in
995 	 * this function do not affect the accuracy of the callout.
996 	 */
997 	now = gethrtime();
998 
999 	/*
1000 	 * We disable kernel preemption so that we remain on the same CPU
1001 	 * throughout. If we needed to reprogram the callout table's cyclic,
1002 	 * we can avoid X-calls if we are on the same CPU.
1003 	 *
1004 	 * Note that callout_alloc() releases and reacquires the callout
1005 	 * table mutex. While reacquiring the mutex, it is possible for us
1006 	 * to go to sleep and later migrate to another CPU. This should be
1007 	 * pretty rare, though.
1008 	 */
1009 	kpreempt_disable();
1010 
1011 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
1012 	mutex_enter(&ct->ct_mutex);
1013 
1014 	if (ct->ct_cyclic == CYCLIC_NONE) {
1015 		mutex_exit(&ct->ct_mutex);
1016 		/*
1017 		 * The callout table has not yet been initialized fully.
1018 		 * So, put this one on the boot callout table which is
1019 		 * always initialized.
1020 		 */
1021 		ct = &callout_boot_ct[type];
1022 		mutex_enter(&ct->ct_mutex);
1023 	}
1024 
1025 	if (CALLOUT_CLEANUP(ct)) {
1026 		/*
1027 		 * There are too many heap elements pointing to empty callout
1028 		 * lists. Clean them out. Since cleanup is only done once
1029 		 * in a while, no need to reprogram the cyclic if the root
1030 		 * of the heap gets cleaned out.
1031 		 */
1032 		(void) callout_heap_process(ct, 0, 0);
1033 	}
1034 
1035 	if ((cp = ct->ct_free) == NULL)
1036 		cp = callout_alloc(ct);
1037 	else
1038 		ct->ct_free = cp->c_idnext;
1039 
1040 	cp->c_func = func;
1041 	cp->c_arg = arg;
1042 
1043 	/*
1044 	 * Compute the expiration hrtime.
1045 	 */
1046 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
1047 		interval = expiration - now;
1048 	} else {
1049 		interval = expiration;
1050 		expiration += now;
1051 	}
1052 
1053 	if (resolution > 1) {
1054 		/*
1055 		 * Align expiration to the specified resolution.
1056 		 */
1057 		if (flags & CALLOUT_FLAG_ROUNDUP)
1058 			expiration += resolution - 1;
1059 		expiration = (expiration / resolution) * resolution;
1060 	}
1061 
1062 	if (expiration <= 0) {
1063 		/*
1064 		 * expiration hrtime overflow has occurred. Just set the
1065 		 * expiration to infinity.
1066 		 */
1067 		expiration = CY_INFINITY;
1068 	}
1069 
1070 	/*
1071 	 * Assign an ID to this callout
1072 	 */
1073 	if (flags & CALLOUT_FLAG_32BIT) {
1074 		if (interval > callout_longterm) {
1075 			id = (ct->ct_long_id - callout_counter_low);
1076 			id |= CALLOUT_COUNTER_HIGH;
1077 			ct->ct_long_id = id;
1078 		} else {
1079 			id = (ct->ct_short_id - callout_counter_low);
1080 			id |= CALLOUT_COUNTER_HIGH;
1081 			ct->ct_short_id = id;
1082 		}
1083 	} else {
1084 		id = (ct->ct_gen_id - callout_counter_low);
1085 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
1086 			id |= CALLOUT_COUNTER_HIGH;
1087 			id += CALLOUT_GENERATION_LOW;
1088 		}
1089 		ct->ct_gen_id = id;
1090 	}
1091 
1092 	cp->c_xid = id;
1093 
1094 	clflags = 0;
1095 	if (flags & CALLOUT_FLAG_ABSOLUTE)
1096 		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
1097 	if (flags & CALLOUT_FLAG_HRESTIME)
1098 		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
1099 	if (resolution == 1)
1100 		clflags |= CALLOUT_LIST_FLAG_NANO;
1101 	hash = CALLOUT_CLHASH(expiration);
1102 
1103 again:
1104 	/*
1105 	 * Try to see if a callout list already exists for this expiration.
1106 	 */
1107 	cl = callout_list_get(ct, expiration, clflags, hash);
1108 	if (cl == NULL) {
1109 		/*
1110 		 * Check the free list. If we don't find one, we have to
1111 		 * take the slow path and allocate from kmem.
1112 		 */
1113 		if ((cl = ct->ct_lfree) == NULL) {
1114 			callout_list_alloc(ct);
1115 			/*
1116 			 * In the above call, we drop the lock, allocate and
1117 			 * reacquire the lock. So, we could have been away
1118 			 * for a while. In the meantime, someone could have
1119 			 * inserted a callout list with the same expiration.
1120 			 * Plus, the heap could have become full. So, the best
1121 			 * course is to repeat the steps. This should be an
1122 			 * infrequent event.
1123 			 */
1124 			goto again;
1125 		}
1126 		ct->ct_lfree = cl->cl_next;
1127 		cl->cl_expiration = expiration;
1128 		cl->cl_flags = clflags;
1129 
1130 		/*
1131 		 * Check if we have enough space in the heap to insert one
1132 		 * expiration. If not, expand the heap.
1133 		 */
1134 		if (ct->ct_heap_num == ct->ct_heap_max) {
1135 			if (callout_heap_expand(ct) == 0) {
1136 				/*
1137 				 * Could not expand the heap. Just queue it.
1138 				 */
1139 				callout_queue_insert(ct, cl);
1140 				goto out;
1141 			}
1142 
1143 			/*
1144 			 * In the above call, we drop the lock, allocate and
1145 			 * reacquire the lock. So, we could have been away
1146 			 * for a while. In the meantime, someone could have
1147 			 * inserted a callout list with the same expiration.
1148 			 * But we will not go back and check for it as this
1149 			 * should be a really infrequent event. There is no
1150 			 * point.
1151 			 */
1152 		}
1153 
1154 		if (clflags & CALLOUT_LIST_FLAG_NANO) {
1155 			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
1156 		} else {
1157 			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
1158 		}
1159 
1160 		/*
1161 		 * This is a new expiration. So, insert it into the heap.
1162 		 * This will also reprogram the cyclic, if the expiration
1163 		 * propagated to the root of the heap.
1164 		 */
1165 		callout_heap_insert(ct, cl);
1166 	} else {
1167 		/*
1168 		 * If the callout list was empty, untimeout_generic() would
1169 		 * have incremented a reap count. Decrement the reap count
1170 		 * as we are going to insert a callout into this list.
1171 		 */
1172 		if (cl->cl_callouts.ch_head == NULL)
1173 			ct->ct_nreap--;
1174 	}
1175 out:
1176 	cp->c_list = cl;
1177 	CALLOUT_APPEND(ct, cp);
1178 
1179 	ct->ct_timeouts++;
1180 	ct->ct_timeouts_pending++;
1181 
1182 	mutex_exit(&ct->ct_mutex);
1183 
1184 	kpreempt_enable();
1185 
1186 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
1187 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
1188 	    cp);
1189 
1190 	return (id);
1191 }
1192 
1193 timeout_id_t
1194 timeout(void (*func)(void *), void *arg, clock_t delta)
1195 {
1196 	ulong_t id;
1197 
1198 	/*
1199 	 * Make sure the callout runs at least 1 tick in the future.
1200 	 */
1201 	if (delta <= 0)
1202 		delta = 1;
1203 	else if (delta > callout_max_ticks)
1204 		delta = callout_max_ticks;
1205 
1206 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1207 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1208 
1209 	return ((timeout_id_t)id);
1210 }
1211 
1212 /*
1213  * Convenience function that creates a normal callout with default parameters
1214  * and returns a full ID.
1215  */
1216 callout_id_t
1217 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1218 {
1219 	callout_id_t id;
1220 
1221 	/*
1222 	 * Make sure the callout runs at least 1 tick in the future.
1223 	 */
1224 	if (delta <= 0)
1225 		delta = 1;
1226 	else if (delta > callout_max_ticks)
1227 		delta = callout_max_ticks;
1228 
1229 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1230 	    nsec_per_tick, 0);
1231 
1232 	return (id);
1233 }
1234 
1235 timeout_id_t
1236 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1237 {
1238 	ulong_t id;
1239 
1240 	/*
1241 	 * Make sure the callout runs at least 1 tick in the future.
1242 	 */
1243 	if (delta <= 0)
1244 		delta = 1;
1245 	else if (delta > callout_max_ticks)
1246 		delta = callout_max_ticks;
1247 
1248 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1249 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1250 
1251 	return ((timeout_id_t)id);
1252 }
1253 
1254 /*
1255  * Convenience function that creates a realtime callout with default parameters
1256  * and returns a full ID.
1257  */
1258 callout_id_t
1259 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1260 {
1261 	callout_id_t id;
1262 
1263 	/*
1264 	 * Make sure the callout runs at least 1 tick in the future.
1265 	 */
1266 	if (delta <= 0)
1267 		delta = 1;
1268 	else if (delta > callout_max_ticks)
1269 		delta = callout_max_ticks;
1270 
1271 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1272 	    nsec_per_tick, 0);
1273 
1274 	return (id);
1275 }
1276 
1277 hrtime_t
1278 untimeout_generic(callout_id_t id, int nowait)
1279 {
1280 	callout_table_t *ct;
1281 	callout_t *cp;
1282 	callout_id_t xid;
1283 	callout_list_t *cl;
1284 	int hash, flags;
1285 	callout_id_t bogus;
1286 
1287 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1288 	hash = CALLOUT_IDHASH(id);
1289 
1290 	mutex_enter(&ct->ct_mutex);
1291 
1292 	/*
1293 	 * Search the ID hash table for the callout.
1294 	 */
1295 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1296 
1297 		xid = cp->c_xid;
1298 
1299 		/*
1300 		 * Match the ID and generation number.
1301 		 */
1302 		if ((xid & CALLOUT_ID_MASK) != id)
1303 			continue;
1304 
1305 		if ((xid & CALLOUT_EXECUTING) == 0) {
1306 			hrtime_t expiration;
1307 
1308 			/*
1309 			 * Delete the callout. If the callout list becomes
1310 			 * NULL, we don't remove it from the table. This is
1311 			 * so it can be reused. If the empty callout list
1312 			 * corresponds to the top of the the callout heap, we
1313 			 * don't reprogram the table cyclic here. This is in
1314 			 * order to avoid lots of X-calls to the CPU associated
1315 			 * with the callout table.
1316 			 */
1317 			cl = cp->c_list;
1318 			expiration = cl->cl_expiration;
1319 			CALLOUT_DELETE(ct, cp);
1320 			CALLOUT_FREE(ct, cp);
1321 			ct->ct_untimeouts_unexpired++;
1322 			ct->ct_timeouts_pending--;
1323 
1324 			/*
1325 			 * If the callout list has become empty, there are 3
1326 			 * possibilities. If it is present:
1327 			 *	- in the heap, it needs to be cleaned along
1328 			 *	  with its heap entry. Increment a reap count.
1329 			 *	- in the callout queue, free it.
1330 			 *	- in the expired list, free it.
1331 			 */
1332 			if (cl->cl_callouts.ch_head == NULL) {
1333 				flags = cl->cl_flags;
1334 				if (flags & CALLOUT_LIST_FLAG_HEAPED) {
1335 					ct->ct_nreap++;
1336 				} else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
1337 					CALLOUT_LIST_DELETE(ct->ct_queue, cl);
1338 					CALLOUT_LIST_FREE(ct, cl);
1339 				} else {
1340 					CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1341 					CALLOUT_LIST_FREE(ct, cl);
1342 				}
1343 			}
1344 			mutex_exit(&ct->ct_mutex);
1345 
1346 			expiration -= gethrtime();
1347 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1348 			    "untimeout:ID %lx hrtime left %llx", id,
1349 			    expiration);
1350 			return (expiration < 0 ? 0 : expiration);
1351 		}
1352 
1353 		ct->ct_untimeouts_executing++;
1354 		/*
1355 		 * The callout we want to delete is currently executing.
1356 		 * The DDI states that we must wait until the callout
1357 		 * completes before returning, so we block on c_done until the
1358 		 * callout ID changes (to the old ID if it's on the freelist,
1359 		 * or to a new callout ID if it's in use).  This implicitly
1360 		 * assumes that callout structures are persistent (they are).
1361 		 */
1362 		if (cp->c_executor == curthread) {
1363 			/*
1364 			 * The timeout handler called untimeout() on itself.
1365 			 * Stupid, but legal.  We can't wait for the timeout
1366 			 * to complete without deadlocking, so we just return.
1367 			 */
1368 			mutex_exit(&ct->ct_mutex);
1369 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1370 			    "untimeout_self:ID %x", id);
1371 			return (-1);
1372 		}
1373 		if (nowait == 0) {
1374 			/*
1375 			 * We need to wait. Indicate that we are waiting by
1376 			 * incrementing c_waiting. This prevents the executor
1377 			 * from doing a wakeup on c_done if there are no
1378 			 * waiters.
1379 			 */
1380 			while (cp->c_xid == xid) {
1381 				cp->c_waiting = 1;
1382 				cv_wait(&cp->c_done, &ct->ct_mutex);
1383 			}
1384 		}
1385 		mutex_exit(&ct->ct_mutex);
1386 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1387 		    "untimeout_executing:ID %lx", id);
1388 		return (-1);
1389 	}
1390 	ct->ct_untimeouts_expired++;
1391 
1392 	mutex_exit(&ct->ct_mutex);
1393 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1394 	    "untimeout_bogus_id:ID %lx", id);
1395 
1396 	/*
1397 	 * We didn't find the specified callout ID.  This means either
1398 	 * (1) the callout already fired, or (2) the caller passed us
1399 	 * a bogus value.  Perform a sanity check to detect case (2).
1400 	 */
1401 	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1402 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1403 		panic("untimeout: impossible timeout id %llx",
1404 		    (unsigned long long)id);
1405 
1406 	return (-1);
1407 }
1408 
1409 clock_t
1410 untimeout(timeout_id_t id_arg)
1411 {
1412 	hrtime_t hleft;
1413 	clock_t tleft;
1414 	callout_id_t id;
1415 
1416 	id = (ulong_t)id_arg;
1417 	hleft = untimeout_generic(id, 0);
1418 	if (hleft < 0)
1419 		tleft = -1;
1420 	else if (hleft == 0)
1421 		tleft = 0;
1422 	else
1423 		tleft = NSEC_TO_TICK(hleft);
1424 
1425 	return (tleft);
1426 }
1427 
1428 /*
1429  * Convenience function to untimeout a timeout with a full ID with default
1430  * parameters.
1431  */
1432 clock_t
1433 untimeout_default(callout_id_t id, int nowait)
1434 {
1435 	hrtime_t hleft;
1436 	clock_t tleft;
1437 
1438 	hleft = untimeout_generic(id, nowait);
1439 	if (hleft < 0)
1440 		tleft = -1;
1441 	else if (hleft == 0)
1442 		tleft = 0;
1443 	else
1444 		tleft = NSEC_TO_TICK(hleft);
1445 
1446 	return (tleft);
1447 }
1448 
1449 /*
1450  * Expire all the callouts queued in the specified callout list.
1451  */
1452 static void
1453 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1454 {
1455 	callout_t *cp, *cnext;
1456 
1457 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1458 	ASSERT(cl != NULL);
1459 
1460 	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1461 		/*
1462 		 * Multiple executor threads could be running at the same
1463 		 * time. If this callout is already being executed,
1464 		 * go on to the next one.
1465 		 */
1466 		if (cp->c_xid & CALLOUT_EXECUTING) {
1467 			cnext = cp->c_clnext;
1468 			continue;
1469 		}
1470 
1471 		/*
1472 		 * Indicate to untimeout() that a callout is
1473 		 * being expired by the executor.
1474 		 */
1475 		cp->c_xid |= CALLOUT_EXECUTING;
1476 		cp->c_executor = curthread;
1477 		mutex_exit(&ct->ct_mutex);
1478 
1479 		DTRACE_PROBE1(callout__start, callout_t *, cp);
1480 		(*cp->c_func)(cp->c_arg);
1481 		DTRACE_PROBE1(callout__end, callout_t *, cp);
1482 
1483 		mutex_enter(&ct->ct_mutex);
1484 
1485 		ct->ct_expirations++;
1486 		ct->ct_timeouts_pending--;
1487 		/*
1488 		 * Indicate completion for c_done.
1489 		 */
1490 		cp->c_xid &= ~CALLOUT_EXECUTING;
1491 		cp->c_executor = NULL;
1492 		cnext = cp->c_clnext;
1493 
1494 		/*
1495 		 * Delete callout from ID hash table and the callout
1496 		 * list, return to freelist, and tell any untimeout() that
1497 		 * cares that we're done.
1498 		 */
1499 		CALLOUT_DELETE(ct, cp);
1500 		CALLOUT_FREE(ct, cp);
1501 
1502 		if (cp->c_waiting) {
1503 			cp->c_waiting = 0;
1504 			cv_broadcast(&cp->c_done);
1505 		}
1506 	}
1507 }
1508 
1509 /*
1510  * Execute all expired callout lists for a callout table.
1511  */
1512 static void
1513 callout_expire(callout_table_t *ct)
1514 {
1515 	callout_list_t *cl, *clnext;
1516 
1517 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1518 
1519 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1520 		/*
1521 		 * Expire all the callouts in this callout list.
1522 		 */
1523 		callout_list_expire(ct, cl);
1524 
1525 		clnext = cl->cl_next;
1526 		if (cl->cl_callouts.ch_head == NULL) {
1527 			/*
1528 			 * Free the callout list.
1529 			 */
1530 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1531 			CALLOUT_LIST_FREE(ct, cl);
1532 		}
1533 	}
1534 }
1535 
1536 /*
1537  * The cyclic handlers below process callouts in two steps:
1538  *
1539  *	1. Find all expired callout lists and queue them in a separate
1540  *	   list of expired callouts.
1541  *	2. Execute the expired callout lists.
1542  *
1543  * This is done for two reasons:
1544  *
1545  *	1. We want to quickly find the next earliest expiration to program
1546  *	   the cyclic to and reprogram it. We can do this right at the end
1547  *	   of step 1.
1548  *	2. The realtime cyclic handler expires callouts in place. However,
1549  *	   for normal callouts, callouts are expired by a taskq thread.
1550  *	   So, it is simpler and more robust to have the taskq thread just
1551  *	   do step 2.
1552  */
1553 
1554 /*
1555  * Realtime callout cyclic handlers.
1556  */
1557 void
1558 callout_realtime(callout_table_t *ct)
1559 {
1560 	mutex_enter(&ct->ct_mutex);
1561 	(void) callout_heap_delete(ct);
1562 	callout_expire(ct);
1563 	mutex_exit(&ct->ct_mutex);
1564 }
1565 
1566 void
1567 callout_queue_realtime(callout_table_t *ct)
1568 {
1569 	mutex_enter(&ct->ct_mutex);
1570 	(void) callout_queue_delete(ct);
1571 	callout_expire(ct);
1572 	mutex_exit(&ct->ct_mutex);
1573 }
1574 
1575 void
1576 callout_execute(callout_table_t *ct)
1577 {
1578 	mutex_enter(&ct->ct_mutex);
1579 	callout_expire(ct);
1580 	mutex_exit(&ct->ct_mutex);
1581 }
1582 
1583 /*
1584  * Normal callout cyclic handlers.
1585  */
1586 void
1587 callout_normal(callout_table_t *ct)
1588 {
1589 	int i, exec;
1590 	hrtime_t exp;
1591 
1592 	mutex_enter(&ct->ct_mutex);
1593 	exp = callout_heap_delete(ct);
1594 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1595 	mutex_exit(&ct->ct_mutex);
1596 
1597 	for (i = 0; i < exec; i++) {
1598 		ASSERT(ct->ct_taskq != NULL);
1599 		(void) taskq_dispatch(ct->ct_taskq,
1600 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1601 	}
1602 }
1603 
1604 void
1605 callout_queue_normal(callout_table_t *ct)
1606 {
1607 	int i, exec;
1608 	hrtime_t exp;
1609 
1610 	mutex_enter(&ct->ct_mutex);
1611 	exp = callout_queue_delete(ct);
1612 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1613 	mutex_exit(&ct->ct_mutex);
1614 
1615 	for (i = 0; i < exec; i++) {
1616 		ASSERT(ct->ct_taskq != NULL);
1617 		(void) taskq_dispatch(ct->ct_taskq,
1618 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1619 	}
1620 }
1621 
1622 /*
1623  * Suspend callout processing.
1624  */
1625 static void
1626 callout_suspend(void)
1627 {
1628 	int t, f;
1629 	callout_table_t *ct;
1630 
1631 	/*
1632 	 * Traverse every callout table in the system and suspend callout
1633 	 * processing.
1634 	 *
1635 	 * We need to suspend all the tables (including the inactive ones)
1636 	 * so that if a table is made active while the suspend is still on,
1637 	 * the table remains suspended.
1638 	 */
1639 	for (f = 0; f < max_ncpus; f++) {
1640 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1641 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1642 
1643 			mutex_enter(&ct->ct_mutex);
1644 			ct->ct_suspend++;
1645 			if (ct->ct_cyclic == CYCLIC_NONE) {
1646 				mutex_exit(&ct->ct_mutex);
1647 				continue;
1648 			}
1649 			if (ct->ct_suspend == 1) {
1650 				(void) cyclic_reprogram(ct->ct_cyclic,
1651 				    CY_INFINITY);
1652 				(void) cyclic_reprogram(ct->ct_qcyclic,
1653 				    CY_INFINITY);
1654 			}
1655 			mutex_exit(&ct->ct_mutex);
1656 		}
1657 	}
1658 }
1659 
1660 /*
1661  * Resume callout processing.
1662  */
1663 static void
1664 callout_resume(hrtime_t delta, int timechange)
1665 {
1666 	hrtime_t hexp, qexp;
1667 	int t, f;
1668 	callout_table_t *ct;
1669 
1670 	/*
1671 	 * Traverse every callout table in the system and resume callout
1672 	 * processing. For active tables, perform any hrtime adjustments
1673 	 * necessary.
1674 	 */
1675 	for (f = 0; f < max_ncpus; f++) {
1676 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1677 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1678 
1679 			mutex_enter(&ct->ct_mutex);
1680 			if (ct->ct_cyclic == CYCLIC_NONE) {
1681 				ct->ct_suspend--;
1682 				mutex_exit(&ct->ct_mutex);
1683 				continue;
1684 			}
1685 
1686 			/*
1687 			 * If a delta is specified, adjust the expirations in
1688 			 * the heap by delta. Also, if the caller indicates
1689 			 * a timechange, process that. This step also cleans
1690 			 * out any empty callout lists that might happen to
1691 			 * be there.
1692 			 */
1693 			hexp = callout_heap_process(ct, delta, timechange);
1694 			qexp = callout_queue_process(ct, delta, timechange);
1695 
1696 			ct->ct_suspend--;
1697 			if (ct->ct_suspend == 0) {
1698 				(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1699 				(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1700 			}
1701 
1702 			mutex_exit(&ct->ct_mutex);
1703 		}
1704 	}
1705 }
1706 
1707 /*
1708  * Callback handler used by CPR to stop and resume callouts.
1709  * The cyclic subsystem saves and restores hrtime during CPR.
1710  * That is why callout_resume() is called with a 0 delta.
1711  * Although hrtime is the same, hrestime (system time) has
1712  * progressed during CPR. So, we have to indicate a time change
1713  * to expire the absolute hrestime timers.
1714  */
1715 /*ARGSUSED*/
1716 static boolean_t
1717 callout_cpr_callb(void *arg, int code)
1718 {
1719 	if (code == CB_CODE_CPR_CHKPT)
1720 		callout_suspend();
1721 	else
1722 		callout_resume(0, 1);
1723 
1724 	return (B_TRUE);
1725 }
1726 
1727 /*
1728  * Callback handler invoked when the debugger is entered or exited.
1729  */
1730 /*ARGSUSED*/
1731 static boolean_t
1732 callout_debug_callb(void *arg, int code)
1733 {
1734 	hrtime_t delta;
1735 
1736 	/*
1737 	 * When the system enters the debugger. make a note of the hrtime.
1738 	 * When it is resumed, compute how long the system was in the
1739 	 * debugger. This interval should not be counted for callouts.
1740 	 */
1741 	if (code == 0) {
1742 		callout_suspend();
1743 		callout_debug_hrtime = gethrtime();
1744 	} else {
1745 		delta = gethrtime() - callout_debug_hrtime;
1746 		callout_resume(delta, 0);
1747 	}
1748 
1749 	return (B_TRUE);
1750 }
1751 
1752 /*
1753  * Move the absolute hrestime callouts to the expired list. Then program the
1754  * table's cyclic to expire immediately so that the callouts can be executed
1755  * immediately.
1756  */
1757 static void
1758 callout_hrestime_one(callout_table_t *ct)
1759 {
1760 	hrtime_t hexp, qexp;
1761 
1762 	mutex_enter(&ct->ct_mutex);
1763 	if (ct->ct_cyclic == CYCLIC_NONE) {
1764 		mutex_exit(&ct->ct_mutex);
1765 		return;
1766 	}
1767 
1768 	/*
1769 	 * Walk the heap and process all the absolute hrestime entries.
1770 	 */
1771 	hexp = callout_heap_process(ct, 0, 1);
1772 	qexp = callout_queue_process(ct, 0, 1);
1773 
1774 	if (ct->ct_suspend == 0) {
1775 		(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1776 		(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1777 	}
1778 
1779 	mutex_exit(&ct->ct_mutex);
1780 }
1781 
1782 /*
1783  * This function is called whenever system time (hrestime) is changed
1784  * explicitly. All the HRESTIME callouts must be expired at once.
1785  */
1786 /*ARGSUSED*/
1787 void
1788 callout_hrestime(void)
1789 {
1790 	int t, f;
1791 	callout_table_t *ct;
1792 
1793 	/*
1794 	 * Traverse every callout table in the system and process the hrestime
1795 	 * callouts therein.
1796 	 *
1797 	 * We look at all the tables because we don't know which ones were
1798 	 * onlined and offlined in the past. The offlined tables may still
1799 	 * have active cyclics processing timers somewhere.
1800 	 */
1801 	for (f = 0; f < max_ncpus; f++) {
1802 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1803 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1804 			callout_hrestime_one(ct);
1805 		}
1806 	}
1807 }
1808 
1809 /*
1810  * Create the hash tables for this callout table.
1811  */
1812 static void
1813 callout_hash_init(callout_table_t *ct)
1814 {
1815 	size_t size;
1816 
1817 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1818 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1819 
1820 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1821 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1822 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1823 }
1824 
1825 /*
1826  * Create per-callout table kstats.
1827  */
1828 static void
1829 callout_kstat_init(callout_table_t *ct)
1830 {
1831 	callout_stat_type_t stat;
1832 	kstat_t *ct_kstats;
1833 	int ndx;
1834 
1835 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1836 	ASSERT(ct->ct_kstats == NULL);
1837 
1838 	ndx = ct - callout_table;
1839 	ct_kstats = kstat_create("unix", ndx, "callout",
1840 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1841 
1842 	if (ct_kstats == NULL) {
1843 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1844 		    (void *)ct);
1845 	} else {
1846 		ct_kstats->ks_data = ct->ct_kstat_data;
1847 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1848 			kstat_named_init(&ct->ct_kstat_data[stat],
1849 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1850 		ct->ct_kstats = ct_kstats;
1851 		kstat_install(ct_kstats);
1852 	}
1853 }
1854 
1855 static void
1856 callout_cyclic_init(callout_table_t *ct)
1857 {
1858 	cyc_handler_t hdlr;
1859 	cyc_time_t when;
1860 	processorid_t seqid;
1861 	int t;
1862 	cyclic_id_t cyclic, qcyclic;
1863 
1864 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1865 
1866 	t = ct->ct_type;
1867 	seqid = CALLOUT_TABLE_SEQID(ct);
1868 
1869 	/*
1870 	 * Create the taskq thread if the table type is normal.
1871 	 * Realtime tables are handled at PIL1 by a softint
1872 	 * handler.
1873 	 */
1874 	if (t == CALLOUT_NORMAL) {
1875 		ASSERT(ct->ct_taskq == NULL);
1876 		/*
1877 		 * Each callout thread consumes exactly one
1878 		 * task structure while active.  Therefore,
1879 		 * prepopulating with 2 * callout_threads tasks
1880 		 * ensures that there's at least one task per
1881 		 * thread that's either scheduled or on the
1882 		 * freelist.  In turn, this guarantees that
1883 		 * taskq_dispatch() will always either succeed
1884 		 * (because there's a free task structure) or
1885 		 * be unnecessary (because "callout_excute(ct)"
1886 		 * has already scheduled).
1887 		 */
1888 		ct->ct_taskq =
1889 		    taskq_create_instance("callout_taskq", seqid,
1890 		    callout_threads, maxclsyspri,
1891 		    2 * callout_threads, 2 * callout_threads,
1892 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1893 	}
1894 
1895 	/*
1896 	 * callouts can only be created in a table whose
1897 	 * cyclic has been initialized.
1898 	 */
1899 	ASSERT(ct->ct_heap_num == 0);
1900 
1901 	/*
1902 	 * Drop the mutex before creating the callout cyclics. cyclic_add()
1903 	 * could potentially expand the cyclic heap. We don't want to be
1904 	 * holding the callout table mutex in that case. Note that this
1905 	 * function is called during CPU online. cpu_lock is held at this
1906 	 * point. So, only one thread can be executing the cyclic add logic
1907 	 * below at any time.
1908 	 */
1909 	mutex_exit(&ct->ct_mutex);
1910 
1911 	/*
1912 	 * Create the callout table cyclics.
1913 	 *
1914 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
1915 	 * handler executes at lock PIL. This is because there are cases
1916 	 * where code can block at PIL > 1 waiting for a normal callout handler
1917 	 * to unblock it directly or indirectly. If the normal cyclic were to
1918 	 * be executed at low PIL, it could get blocked out by the waiter
1919 	 * and cause a deadlock.
1920 	 */
1921 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1922 
1923 	if (t == CALLOUT_REALTIME) {
1924 		hdlr.cyh_level = callout_realtime_level;
1925 		hdlr.cyh_func = (cyc_func_t)callout_realtime;
1926 	} else {
1927 		hdlr.cyh_level = callout_normal_level;
1928 		hdlr.cyh_func = (cyc_func_t)callout_normal;
1929 	}
1930 	hdlr.cyh_arg = ct;
1931 	when.cyt_when = CY_INFINITY;
1932 	when.cyt_interval = CY_INFINITY;
1933 
1934 	cyclic = cyclic_add(&hdlr, &when);
1935 
1936 	if (t == CALLOUT_REALTIME)
1937 		hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
1938 	else
1939 		hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
1940 
1941 	qcyclic = cyclic_add(&hdlr, &when);
1942 
1943 	mutex_enter(&ct->ct_mutex);
1944 	ct->ct_cyclic = cyclic;
1945 	ct->ct_qcyclic = qcyclic;
1946 }
1947 
1948 void
1949 callout_cpu_online(cpu_t *cp)
1950 {
1951 	lgrp_handle_t hand;
1952 	callout_cache_t *cache;
1953 	char s[KMEM_CACHE_NAMELEN];
1954 	callout_table_t *ct;
1955 	processorid_t seqid;
1956 	int t;
1957 
1958 	ASSERT(MUTEX_HELD(&cpu_lock));
1959 
1960 	/*
1961 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1962 	 * Note that access to callout_caches is protected by cpu_lock.
1963 	 */
1964 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1965 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1966 		if (cache->cc_hand == hand)
1967 			break;
1968 	}
1969 
1970 	/*
1971 	 * If not found, create one. The caches are never destroyed.
1972 	 */
1973 	if (cache == NULL) {
1974 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1975 		cache->cc_hand = hand;
1976 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1977 		    (long)hand);
1978 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1979 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1980 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1981 		    (long)hand);
1982 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1983 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1984 		cache->cc_next = callout_caches;
1985 		callout_caches = cache;
1986 	}
1987 
1988 	seqid = cp->cpu_seqid;
1989 
1990 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1991 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1992 
1993 		mutex_enter(&ct->ct_mutex);
1994 		/*
1995 		 * Store convinience pointers to the kmem caches
1996 		 * in the callout table. These assignments should always be
1997 		 * done as callout tables can map to different physical
1998 		 * CPUs each time.
1999 		 */
2000 		ct->ct_cache = cache->cc_cache;
2001 		ct->ct_lcache = cache->cc_lcache;
2002 
2003 		/*
2004 		 * We use the heap pointer to check if stuff has been
2005 		 * initialized for this callout table.
2006 		 */
2007 		if (ct->ct_heap == NULL) {
2008 			callout_heap_init(ct);
2009 			callout_hash_init(ct);
2010 			callout_kstat_init(ct);
2011 			callout_cyclic_init(ct);
2012 		}
2013 
2014 		mutex_exit(&ct->ct_mutex);
2015 
2016 		/*
2017 		 * Move the cyclics to this CPU by doing a bind.
2018 		 */
2019 		cyclic_bind(ct->ct_cyclic, cp, NULL);
2020 		cyclic_bind(ct->ct_qcyclic, cp, NULL);
2021 	}
2022 }
2023 
2024 void
2025 callout_cpu_offline(cpu_t *cp)
2026 {
2027 	callout_table_t *ct;
2028 	processorid_t seqid;
2029 	int t;
2030 
2031 	ASSERT(MUTEX_HELD(&cpu_lock));
2032 
2033 	seqid = cp->cpu_seqid;
2034 
2035 	for (t = 0; t < CALLOUT_NTYPES; t++) {
2036 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
2037 
2038 		/*
2039 		 * Unbind the cyclics. This will allow the cyclic subsystem
2040 		 * to juggle the cyclics during CPU offline.
2041 		 */
2042 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
2043 		cyclic_bind(ct->ct_qcyclic, NULL, NULL);
2044 	}
2045 }
2046 
2047 /*
2048  * This is called to perform per-CPU initialization for slave CPUs at
2049  * boot time.
2050  */
2051 void
2052 callout_mp_init(void)
2053 {
2054 	cpu_t *cp;
2055 	size_t min, max;
2056 
2057 	if (callout_chunk == CALLOUT_CHUNK) {
2058 		/*
2059 		 * No one has specified a chunk in /etc/system. We need to
2060 		 * compute it here based on the number of online CPUs and
2061 		 * available physical memory.
2062 		 */
2063 		min = CALLOUT_MIN_HEAP_SIZE;
2064 		max = ptob(physmem / CALLOUT_MEM_FRACTION);
2065 		if (min > max)
2066 			min = max;
2067 		callout_chunk = min / sizeof (callout_heap_t);
2068 		callout_chunk /= ncpus_online;
2069 		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2070 	}
2071 
2072 	mutex_enter(&cpu_lock);
2073 
2074 	cp = cpu_active;
2075 	do {
2076 		callout_cpu_online(cp);
2077 	} while ((cp = cp->cpu_next_onln) != cpu_active);
2078 
2079 	mutex_exit(&cpu_lock);
2080 }
2081 
2082 /*
2083  * Initialize all callout tables.  Called at boot time just before clkstart().
2084  */
2085 void
2086 callout_init(void)
2087 {
2088 	int f, t;
2089 	size_t size;
2090 	int table_id;
2091 	callout_table_t *ct;
2092 	long bits, fanout;
2093 	uintptr_t buf;
2094 
2095 	/*
2096 	 * Initialize callout globals.
2097 	 */
2098 	bits = 0;
2099 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
2100 		bits++;
2101 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
2102 	callout_table_mask = (1 << callout_table_bits) - 1;
2103 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
2104 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
2105 	callout_max_ticks = CALLOUT_MAX_TICKS;
2106 	if (callout_min_reap == 0)
2107 		callout_min_reap = CALLOUT_MIN_REAP;
2108 
2109 	if (callout_tolerance <= 0)
2110 		callout_tolerance = CALLOUT_TOLERANCE;
2111 	if (callout_threads <= 0)
2112 		callout_threads = CALLOUT_THREADS;
2113 	if (callout_chunk <= 0)
2114 		callout_chunk = CALLOUT_CHUNK;
2115 	else
2116 		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2117 
2118 	/*
2119 	 * Allocate all the callout tables based on max_ncpus. We have chosen
2120 	 * to do boot-time allocation instead of dynamic allocation because:
2121 	 *
2122 	 *	- the size of the callout tables is not too large.
2123 	 *	- there are race conditions involved in making this dynamic.
2124 	 *	- the hash tables that go with the callout tables consume
2125 	 *	  most of the memory and they are only allocated in
2126 	 *	  callout_cpu_online().
2127 	 *
2128 	 * Each CPU has two tables that are consecutive in the array. The first
2129 	 * one is for realtime callouts and the second one is for normal ones.
2130 	 *
2131 	 * We do this alignment dance to make sure that callout table
2132 	 * structures will always be on a cache line boundary.
2133 	 */
2134 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
2135 	size += CALLOUT_ALIGN;
2136 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
2137 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
2138 
2139 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
2140 	/*
2141 	 * Now, initialize the tables for all the CPUs.
2142 	 */
2143 	for (f = 0; f < max_ncpus; f++) {
2144 		for (t = 0; t < CALLOUT_NTYPES; t++) {
2145 			table_id = CALLOUT_TABLE(t, f);
2146 			ct = &callout_table[table_id];
2147 			ct->ct_type = t;
2148 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
2149 			/*
2150 			 * Precompute the base IDs for long and short-term
2151 			 * legacy IDs. This makes ID generation during
2152 			 * timeout() fast.
2153 			 */
2154 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
2155 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
2156 			/*
2157 			 * Precompute the base ID for generation-based IDs.
2158 			 * Note that when the first ID gets allocated, the
2159 			 * ID will wrap. This will cause the generation
2160 			 * number to be incremented to 1.
2161 			 */
2162 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
2163 			/*
2164 			 * Initialize the cyclics as NONE. This will get set
2165 			 * during CPU online. This is so that partially
2166 			 * populated systems will only have the required
2167 			 * number of cyclics, not more.
2168 			 */
2169 			ct->ct_cyclic = CYCLIC_NONE;
2170 			ct->ct_qcyclic = CYCLIC_NONE;
2171 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
2172 		}
2173 	}
2174 
2175 	/*
2176 	 * Add the callback for CPR. This is called during checkpoint
2177 	 * resume to suspend and resume callouts.
2178 	 */
2179 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
2180 	    "callout_cpr");
2181 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
2182 	    "callout_debug");
2183 
2184 	/*
2185 	 * Call the per-CPU initialization function for the boot CPU. This
2186 	 * is done here because the function is not called automatically for
2187 	 * the boot CPU from the CPU online/offline hooks. Note that the
2188 	 * CPU lock is taken here because of convention.
2189 	 */
2190 	mutex_enter(&cpu_lock);
2191 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
2192 	callout_cpu_online(CPU);
2193 	mutex_exit(&cpu_lock);
2194 
2195 	/* heads-up to boot-time clients that timeouts now available */
2196 	callout_init_done = 1;
2197 }
2198