xref: /illumos-gate/usr/src/uts/common/os/callout.c (revision 44ed9dbbfa620821ecf59a131462082f628dd0f3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 /*
41  * Callout tables.  See timeout(9F) for details.
42  */
43 static int callout_threads;			/* callout normal threads */
44 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
45 static int callout_chunk;			/* callout heap chunk size */
46 static int callout_min_reap;			/* callout minimum reap count */
47 static int callout_tolerance;			/* callout hires tolerance */
48 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
49 static clock_t callout_max_ticks;		/* max interval */
50 static hrtime_t callout_longterm;		/* longterm nanoseconds */
51 static ulong_t callout_counter_low;		/* callout ID increment */
52 static ulong_t callout_table_bits;		/* number of table bits in ID */
53 static ulong_t callout_table_mask;		/* mask for the table bits */
54 static callout_cache_t *callout_caches;		/* linked list of caches */
55 #pragma align 64(callout_table)
56 static callout_table_t *callout_table;		/* global callout table array */
57 
58 /*
59  * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
60  * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
61  * via taskq, to a thread that executes at PIL 0 - so we end up running
62  * 'normal' callouts at PIL 0.
63  */
64 static volatile int callout_realtime_level = CY_LOW_LEVEL;
65 static volatile int callout_normal_level = CY_LOCK_LEVEL;
66 
67 static char *callout_kstat_names[] = {
68 	"callout_timeouts",
69 	"callout_timeouts_pending",
70 	"callout_untimeouts_unexpired",
71 	"callout_untimeouts_executing",
72 	"callout_untimeouts_expired",
73 	"callout_expirations",
74 	"callout_allocations",
75 	"callout_cleanups",
76 };
77 
78 static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);
79 
/*
 * Link 'cp' in at the head of the doubly-linked list anchored by 'hash'.
 * 'cnext' and 'cprev' name the link fields of 'cp' to use.
 */
#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cnext = hashp->ch_head;			\
	(cp)->cprev = NULL;				\
	if (hashp->ch_head != NULL)			\
		(cp)->cnext->cprev = (cp);		\
	else						\
		hashp->ch_tail = (cp);			\
	hashp->ch_head = (cp);				\
}
92 
/*
 * Link 'cp' in at the tail of the doubly-linked list anchored by 'hash'.
 * 'cnext' and 'cprev' name the link fields of 'cp' to use.
 */
#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cprev = hashp->ch_tail;			\
	(cp)->cnext = NULL;				\
	if (hashp->ch_tail != NULL)			\
		(cp)->cprev->cnext = (cp);		\
	else						\
		hashp->ch_head = (cp);			\
	hashp->ch_tail = (cp);				\
}
105 
/*
 * Unlink 'cp' from the doubly-linked list anchored by 'hash', fixing up the
 * list head and/or tail when 'cp' is at either end. The link fields of 'cp'
 * itself are left untouched.
 */
#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if ((cp)->cprev != NULL)			\
		(cp)->cprev->cnext = (cp)->cnext;	\
	else						\
		hashp->ch_head = (cp)->cnext;		\
	if ((cp)->cnext != NULL)			\
		(cp)->cnext->cprev = (cp)->cprev;	\
	else						\
		hashp->ch_tail = (cp)->cprev;		\
}
119 
120 /*
121  * These definitions help us queue callouts and callout lists. Here is
122  * the queueing rationale:
123  *
124  *	- callouts are queued in a FIFO manner in the ID hash table.
125  *	  TCP timers are typically cancelled in the same order that they
126  *	  were issued. The FIFO queueing shortens the search for a callout
127  *	  during untimeout().
128  *
129  *	- callouts are queued in a FIFO manner in their callout lists.
130  *	  This ensures that the callouts are executed in the same order that
131  *	  they were queued. This is fair. Plus, it helps to make each
132  *	  callout expiration timely. It also favors cancellations.
133  *
134  *	- callout lists are queued in the following manner in the callout
135  *	  hash table buckets:
136  *
137  *		- appended, if the callout list is a 1-nanosecond resolution
138  *		  callout list. When a callout is created, we first look for
139  *		  a callout list that has the same expiration so we can avoid
140  *		  allocating a callout list and inserting the expiration into
141  *		  the heap. However, we do not want to look at 1-nanosecond
142  *		  resolution callout lists as we will seldom find a match in
143  *		  them. Keeping these callout lists in the rear of the hash
144  *		  buckets allows us to skip these during the lookup.
145  *
146  *		- inserted at the beginning, if the callout list is not a
147  *		  1-nanosecond resolution callout list. This also has the
148  *		  side-effect of keeping the long term timers away from the
149  *		  front of the buckets.
150  *
151  *	- callout lists are queued in a FIFO manner in the expired callouts
152  *	  list. This ensures that callout lists are executed in the order
153  *	  of expiration.
154  */
/*
 * Queue (CALLOUT_APPEND) or unqueue (CALLOUT_DELETE) a callout on both of
 * the lists it lives on: its ID hash bucket in 'ct' and the callout list
 * it points to via c_list.
 *
 * Each macro is wrapped in a single brace block so that it expands to one
 * compound statement. Without the wrapper, using it as the body of an
 * unbraced 'if' or loop would guard only the first list operation and run
 * the second one unconditionally.
 */
#define	CALLOUT_APPEND(ct, cp)						\
{									\
	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
		cp, c_idnext, c_idprev);				\
	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp,		\
		c_clnext, c_clprev);					\
}

#define	CALLOUT_DELETE(ct, cp)						\
{									\
	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
		cp, c_idnext, c_idprev);				\
	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp,		\
		c_clnext, c_clprev);					\
}
164 
/*
 * Callout list variants of the generic hash list operations; they link
 * callout lists through their cl_next/cl_prev fields.
 */
#define	CALLOUT_LIST_INSERT(hash, cl)				\
	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_APPEND(hash, cl)				\
	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_DELETE(hash, cl)				\
	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)

/*
 * Link 'cl' in immediately before 'nextcl'. No list head is updated here,
 * so this is only suitable when 'nextcl' is not the first element of a
 * list whose head pointer must stay accurate.
 */
#define	CALLOUT_LIST_BEFORE(cl, nextcl)			\
{							\
	(cl)->cl_next = (nextcl);			\
	(cl)->cl_prev = (nextcl)->cl_prev;		\
	if ((cl)->cl_prev != NULL)			\
		(cl)->cl_prev->cl_next = (cl);		\
	(nextcl)->cl_prev = (cl);			\
}
182 
/*
 * Normal callouts that depend on one another can deadlock if they end up on
 * the same callout list; breaking that deadlock takes two taskq threads
 * running in parallel. CALLOUT_EXEC_COMPUTE() stores the number of taskq
 * threads to use in 'exec', using a handful of conditions to keep the
 * common case cheap. This is an ugly hack, but a necessary one (sigh).
 *
 *	0 - the expired list is empty, so there is nothing to process.
 *	2 - there is more than one callout to process and the next cyclic
 *	    expiration ('nextexp') is more than CALLOUT_THRESHOLD beyond
 *	    the current gethrtime().
 *	1 - everything else: either there is exactly one callout list
 *	    holding at most one callout, or the cyclic will fire again in
 *	    the near future.
 */
#define	CALLOUT_THRESHOLD	100000000
#define	CALLOUT_EXEC_COMPUTE(ct, nextexp, exec)				\
{									\
	callout_list_t *cl;						\
									\
	cl = (ct)->ct_expired.ch_head;					\
	if (cl == NULL) {						\
		exec = 0;						\
	} else if (((cl->cl_next != NULL) ||				\
	    (cl->cl_callouts.ch_head != cl->cl_callouts.ch_tail)) &&	\
	    ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD))) {		\
		/*							\
		 * Multiple callouts and the cyclic is way out into	\
		 * the future: we need two threads.			\
		 */							\
		exec = 2;						\
	} else {							\
		exec = 1;						\
	}								\
}
225 
/*
 * Exchange the contents of the two heap items that 'h1' and 'h2' point to.
 */
#define	CALLOUT_SWAP(h1, h2)		\
{					\
	callout_heap_t tmp = *(h2);	\
					\
	*(h2) = *(h1);			\
	*(h1) = tmp;			\
}
237 
/*
 * Put callout list 'cl' on the front of the table's free list (ct_lfree)
 * and mark it as free.
 */
#define	CALLOUT_LIST_FREE(ct, cl)			\
{							\
	(cl)->cl_flags |= CALLOUT_LIST_FLAG_FREE;	\
	(cl)->cl_next = (ct)->ct_lfree;			\
	(ct)->ct_lfree = (cl);				\
}
247 
/*
 * Put callout 'cp' on the front of the table's free list (ct_free), linked
 * through c_idnext, and mark its ID as free.
 *
 * The second parameter is named 'cp' to match the body, so the macro
 * operates on the argument it is given rather than on whatever variable
 * called 'cp' happens to be in scope at the call site.
 */
#define	CALLOUT_FREE(ct, cp)			\
{						\
	(cp)->c_idnext = (ct)->ct_free;		\
	(ct)->ct_free = (cp);			\
	(cp)->c_xid |= CALLOUT_ID_FREE;		\
}
257 
258 /*
259  * Allocate a callout structure.  We try quite hard because we
260  * can't sleep, and if we can't do the allocation, we're toast.
261  * Failing all, we try a KM_PANIC allocation. Note that we never
262  * deallocate a callout. See untimeout() for the reasoning.
263  */
264 static callout_t *
265 callout_alloc(callout_table_t *ct)
266 {
267 	size_t size;
268 	callout_t *cp;
269 
270 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
271 	mutex_exit(&ct->ct_mutex);
272 
273 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
274 	if (cp == NULL) {
275 		size = sizeof (callout_t);
276 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
277 	}
278 	cp->c_xid = 0;
279 	cp->c_executor = NULL;
280 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
281 	cp->c_waiting = 0;
282 
283 	mutex_enter(&ct->ct_mutex);
284 	ct->ct_allocations++;
285 	return (cp);
286 }
287 
288 /*
289  * Allocate a callout list structure.  We try quite hard because we
290  * can't sleep, and if we can't do the allocation, we're toast.
291  * Failing all, we try a KM_PANIC allocation. Note that we never
292  * deallocate a callout list.
293  */
294 static void
295 callout_list_alloc(callout_table_t *ct)
296 {
297 	size_t size;
298 	callout_list_t *cl;
299 
300 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
301 	mutex_exit(&ct->ct_mutex);
302 
303 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
304 	if (cl == NULL) {
305 		size = sizeof (callout_list_t);
306 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
307 	}
308 	bzero(cl, sizeof (callout_list_t));
309 
310 	mutex_enter(&ct->ct_mutex);
311 	CALLOUT_LIST_FREE(ct, cl);
312 }
313 
314 /*
315  * Find a callout list that corresponds to an expiration and matching flags.
316  */
317 static callout_list_t *
318 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
319 {
320 	callout_list_t *cl;
321 	int clflags;
322 
323 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
324 
325 	if (flags & CALLOUT_LIST_FLAG_NANO) {
326 		/*
327 		 * This is a 1-nanosecond resolution callout. We will rarely
328 		 * find a match for this. So, bail out.
329 		 */
330 		return (NULL);
331 	}
332 
333 	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
334 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
335 		/*
336 		 * If we have reached a 1-nanosecond resolution callout list,
337 		 * we don't have much hope of finding a match in this hash
338 		 * bucket. So, just bail out.
339 		 */
340 		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
341 			return (NULL);
342 
343 		if ((cl->cl_expiration == expiration) &&
344 		    ((cl->cl_flags & clflags) == (flags & clflags)))
345 			return (cl);
346 	}
347 
348 	return (NULL);
349 }
350 
351 /*
352  * Add a new callout list into a callout table's queue in sorted order by
353  * expiration.
354  */
355 static int
356 callout_queue_add(callout_table_t *ct, callout_list_t *cl)
357 {
358 	callout_list_t *nextcl;
359 	hrtime_t expiration;
360 
361 	expiration = cl->cl_expiration;
362 	nextcl = ct->ct_queue.ch_head;
363 	if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
364 		CALLOUT_LIST_INSERT(ct->ct_queue, cl);
365 		return (1);
366 	}
367 
368 	while (nextcl != NULL) {
369 		if (expiration < nextcl->cl_expiration) {
370 			CALLOUT_LIST_BEFORE(cl, nextcl);
371 			return (0);
372 		}
373 		nextcl = nextcl->cl_next;
374 	}
375 	CALLOUT_LIST_APPEND(ct->ct_queue, cl);
376 
377 	return (0);
378 }
379 
380 /*
381  * Insert a callout list into a callout table's queue and reprogram the queue
382  * cyclic if needed.
383  */
384 static void
385 callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
386 {
387 	cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
388 
389 	/*
390 	 * Add the callout to the callout queue. If it ends up at the head,
391 	 * the cyclic needs to be reprogrammed as we have an earlier
392 	 * expiration.
393 	 *
394 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
395 	 * We don't want any callout activity. When the CPR resume phase is
396 	 * entered, the cyclic will be programmed for the earliest expiration
397 	 * in the queue.
398 	 */
399 	if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
400 		(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
401 }
402 
403 /*
404  * Delete and handle all past expirations in a callout table's queue.
405  */
406 static hrtime_t
407 callout_queue_delete(callout_table_t *ct)
408 {
409 	callout_list_t *cl;
410 	hrtime_t now;
411 
412 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
413 
414 	now = gethrtime();
415 	while ((cl = ct->ct_queue.ch_head) != NULL) {
416 		if (cl->cl_expiration > now)
417 			break;
418 		cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
419 		CALLOUT_LIST_DELETE(ct->ct_queue, cl);
420 		CALLOUT_LIST_APPEND(ct->ct_expired, cl);
421 	}
422 
423 	/*
424 	 * If this callout queue is empty or callouts have been suspended,
425 	 * just return.
426 	 */
427 	if ((cl == NULL) || (ct->ct_suspend > 0))
428 		return (CY_INFINITY);
429 
430 	(void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
431 
432 	return (cl->cl_expiration);
433 }
434 
435 static hrtime_t
436 callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
437 {
438 	callout_list_t *firstcl, *cl;
439 	hrtime_t expiration, now;
440 	int clflags;
441 	callout_hash_t temp;
442 
443 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
444 
445 	firstcl = ct->ct_queue.ch_head;
446 	if (firstcl == NULL)
447 		return (CY_INFINITY);
448 
449 	/*
450 	 * We walk the callout queue. If we encounter a hrestime entry that
451 	 * must be removed, we clean it out. Otherwise, we apply any
452 	 * adjustments needed to it. Because of the latter, we need to
453 	 * recreate the list as we go along.
454 	 */
455 	temp = ct->ct_queue;
456 	ct->ct_queue.ch_head = NULL;
457 	ct->ct_queue.ch_tail = NULL;
458 
459 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
460 	now = gethrtime();
461 	while ((cl = temp.ch_head) != NULL) {
462 		CALLOUT_LIST_DELETE(temp, cl);
463 
464 		/*
465 		 * Delete the callout and expire it, if one of the following
466 		 * is true:
467 		 *	- the callout has expired
468 		 *	- the callout is an absolute hrestime one and
469 		 *	  there has been a system time change
470 		 */
471 		if ((cl->cl_expiration <= now) ||
472 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
473 			cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
474 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
475 			continue;
476 		}
477 
478 		/*
479 		 * Apply adjustments, if any. Adjustments are applied after
480 		 * the system returns from KMDB or OBP. They are only applied
481 		 * to relative callout lists.
482 		 */
483 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
484 			expiration = cl->cl_expiration + delta;
485 			if (expiration <= 0)
486 				expiration = CY_INFINITY;
487 			cl->cl_expiration = expiration;
488 		}
489 
490 		(void) callout_queue_add(ct, cl);
491 	}
492 
493 	/*
494 	 * We need to return the expiration to help program the cyclic.
495 	 * If there are expired callouts, the cyclic needs to go off
496 	 * immediately. If the queue has become empty, then we return infinity.
497 	 * Else, we return the expiration of the earliest callout in the queue.
498 	 */
499 	if (ct->ct_expired.ch_head != NULL)
500 		return (gethrtime());
501 
502 	cl = ct->ct_queue.ch_head;
503 	if (cl == NULL)
504 		return (CY_INFINITY);
505 
506 	return (cl->cl_expiration);
507 }
508 
509 /*
510  * Initialize a callout table's heap, if necessary. Preallocate some free
511  * entries so we don't have to check for NULL elsewhere.
512  */
513 static void
514 callout_heap_init(callout_table_t *ct)
515 {
516 	size_t size;
517 
518 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
519 	ASSERT(ct->ct_heap == NULL);
520 
521 	ct->ct_heap_num = 0;
522 	ct->ct_heap_max = callout_chunk;
523 	size = sizeof (callout_heap_t) * callout_chunk;
524 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
525 }
526 
527 /*
528  * Reallocate the heap. Return 0 if the heap is still full at the end of it.
529  * Return 1 otherwise. Note that the heap only expands, it never contracts.
530  */
531 static int
532 callout_heap_expand(callout_table_t *ct)
533 {
534 	size_t max, size, osize;
535 	callout_heap_t *heap;
536 
537 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
538 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
539 
540 	while (ct->ct_heap_num == ct->ct_heap_max) {
541 		max = ct->ct_heap_max;
542 		mutex_exit(&ct->ct_mutex);
543 
544 		osize = sizeof (callout_heap_t) * max;
545 		size = sizeof (callout_heap_t) * (max + callout_chunk);
546 		heap = kmem_alloc(size, KM_NOSLEEP);
547 
548 		mutex_enter(&ct->ct_mutex);
549 		if (heap == NULL) {
550 			/*
551 			 * We could not allocate memory. If we can free up
552 			 * some entries, that would be great.
553 			 */
554 			if (ct->ct_nreap > 0)
555 				(void) callout_heap_process(ct, 0, 0);
556 			/*
557 			 * If we still have no space in the heap, inform the
558 			 * caller.
559 			 */
560 			if (ct->ct_heap_num == ct->ct_heap_max)
561 				return (0);
562 			return (1);
563 		}
564 		if (max < ct->ct_heap_max) {
565 			/*
566 			 * Someone beat us to the allocation. Free what we
567 			 * just allocated and proceed.
568 			 */
569 			kmem_free(heap, size);
570 			continue;
571 		}
572 
573 		bcopy(ct->ct_heap, heap, osize);
574 		kmem_free(ct->ct_heap, osize);
575 		ct->ct_heap = heap;
576 		ct->ct_heap_max = size / sizeof (callout_heap_t);
577 	}
578 
579 	return (1);
580 }
581 
582 /*
583  * Move an expiration from the bottom of the heap to its correct place
584  * in the heap. If we reached the root doing this, return 1. Else,
585  * return 0.
586  */
587 static int
588 callout_upheap(callout_table_t *ct)
589 {
590 	int current, parent;
591 	callout_heap_t *heap, *hcurrent, *hparent;
592 
593 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
594 	ASSERT(ct->ct_heap_num >= 1);
595 
596 	if (ct->ct_heap_num == 1) {
597 		return (1);
598 	}
599 
600 	heap = ct->ct_heap;
601 	current = ct->ct_heap_num - 1;
602 
603 	for (;;) {
604 		parent = CALLOUT_HEAP_PARENT(current);
605 		hparent = &heap[parent];
606 		hcurrent = &heap[current];
607 
608 		/*
609 		 * We have an expiration later than our parent; we're done.
610 		 */
611 		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
612 			return (0);
613 		}
614 
615 		/*
616 		 * We need to swap with our parent, and continue up the heap.
617 		 */
618 		CALLOUT_SWAP(hparent, hcurrent);
619 
620 		/*
621 		 * If we just reached the root, we're done.
622 		 */
623 		if (parent == 0) {
624 			return (1);
625 		}
626 
627 		current = parent;
628 	}
629 	/*NOTREACHED*/
630 }
631 
632 /*
633  * Insert a new heap item into a callout table's heap.
634  */
635 static void
636 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
637 {
638 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
639 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
640 
641 	cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
642 	/*
643 	 * First, copy the expiration and callout list pointer to the bottom
644 	 * of the heap.
645 	 */
646 	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
647 	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
648 	ct->ct_heap_num++;
649 
650 	/*
651 	 * Now, perform an upheap operation. If we reached the root, then
652 	 * the cyclic needs to be reprogrammed as we have an earlier
653 	 * expiration.
654 	 *
655 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
656 	 * We don't want any callout activity. When the CPR resume phase is
657 	 * entered, the cyclic will be programmed for the earliest expiration
658 	 * in the heap.
659 	 */
660 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
661 		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
662 }
663 
664 /*
665  * Move an expiration from the top of the heap to its correct place
666  * in the heap.
667  */
668 static void
669 callout_downheap(callout_table_t *ct)
670 {
671 	int current, left, right, nelems;
672 	callout_heap_t *heap, *hleft, *hright, *hcurrent;
673 
674 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
675 	ASSERT(ct->ct_heap_num >= 1);
676 
677 	heap = ct->ct_heap;
678 	current = 0;
679 	nelems = ct->ct_heap_num;
680 
681 	for (;;) {
682 		/*
683 		 * If we don't have a left child (i.e., we're a leaf), we're
684 		 * done.
685 		 */
686 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
687 			return;
688 
689 		hleft = &heap[left];
690 		hcurrent = &heap[current];
691 
692 		right = CALLOUT_HEAP_RIGHT(current);
693 
694 		/*
695 		 * Even if we don't have a right child, we still need to compare
696 		 * our expiration against that of our left child.
697 		 */
698 		if (right >= nelems)
699 			goto comp_left;
700 
701 		hright = &heap[right];
702 
703 		/*
704 		 * We have both a left and a right child.  We need to compare
705 		 * the expiration of the children to determine which
706 		 * expires earlier.
707 		 */
708 		if (hright->ch_expiration < hleft->ch_expiration) {
709 			/*
710 			 * Our right child is the earlier of our children.
711 			 * We'll now compare our expiration to its expiration.
712 			 * If ours is the earlier one, we're done.
713 			 */
714 			if (hcurrent->ch_expiration <= hright->ch_expiration)
715 				return;
716 
717 			/*
718 			 * Our right child expires earlier than we do; swap
719 			 * with our right child, and descend right.
720 			 */
721 			CALLOUT_SWAP(hright, hcurrent);
722 			current = right;
723 			continue;
724 		}
725 
726 comp_left:
727 		/*
728 		 * Our left child is the earlier of our children (or we have
729 		 * no right child).  We'll now compare our expiration
730 		 * to its expiration. If ours is the earlier one, we're done.
731 		 */
732 		if (hcurrent->ch_expiration <= hleft->ch_expiration)
733 			return;
734 
735 		/*
736 		 * Our left child expires earlier than we do; swap with our
737 		 * left child, and descend left.
738 		 */
739 		CALLOUT_SWAP(hleft, hcurrent);
740 		current = left;
741 	}
742 }
743 
/*
 * Delete and handle all past expirations in a callout table's heap.
 *
 * Starting at the root, heap entries are removed for as long as they either
 * point to an empty callout list (which is freed) or have an expiration
 * that is not in the future (in which case the callout list is moved to the
 * table's expired list). The table's cyclic is then reprogrammed for the
 * remaining root.
 *
 * Returns the expiration that the cyclic was reprogrammed to, or
 * CY_INFINITY if the heap is empty or callouts are suspended.
 */
static hrtime_t
callout_heap_delete(callout_table_t *ct)
{
	hrtime_t now, expiration, next;
	callout_list_t *cl;
	callout_heap_t *heap;
	int hash;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	if (CALLOUT_CLEANUP(ct)) {
		/*
		 * There are too many heap elements pointing to empty callout
		 * lists. Clean them out.
		 */
		(void) callout_heap_process(ct, 0, 0);
	}

	now = gethrtime();
	heap = ct->ct_heap;

	while (ct->ct_heap_num > 0) {
		/*
		 * 'heap' points at the root, so these are the expiration
		 * and callout list of the earliest heap entry.
		 */
		expiration = heap->ch_expiration;
		hash = CALLOUT_CLHASH(expiration);
		cl = heap->ch_list;
		ASSERT(expiration == cl->cl_expiration);

		if (cl->cl_callouts.ch_head == NULL) {
			/*
			 * If the callout list is empty, reap it.
			 * Decrement the reap count.
			 */
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_FREE(ct, cl);
			ct->ct_nreap--;
		} else {
			/*
			 * If the root of the heap expires in the future,
			 * bail out.
			 */
			if (expiration > now)
				break;

			/*
			 * Move the callout list for this expiration to the
			 * list of expired callout lists. It will be processed
			 * by the callout executor.
			 */
			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
		}

		/*
		 * Now delete the root. This is done by copying the last item
		 * in the heap over the root and downheaping the item.
		 */
		ct->ct_heap_num--;
		if (ct->ct_heap_num > 0) {
			heap[0] = heap[ct->ct_heap_num];
			callout_downheap(ct);
		}
	}

	/*
	 * If this callout table is empty or callouts have been suspended,
	 * just return. The cyclic has already been programmed to
	 * infinity by the cyclic subsystem.
	 */
	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
		return (CY_INFINITY);

	/*
	 * The heap is not empty, so the loop above was left via the break
	 * and 'expiration' holds the expiration of the current root.
	 *
	 * If the top expirations are within callout_tolerance of each other,
	 * delay the cyclic expire so that they can be processed together.
	 * This is to prevent high resolution timers from swamping the system
	 * with cyclic activity. Only the root's two children are examined,
	 * and only when both of them exist.
	 */
	if (ct->ct_heap_num > 2) {
		next = expiration + callout_tolerance;
		if ((heap[1].ch_expiration < next) ||
		    (heap[2].ch_expiration < next))
			expiration = next;
	}

	(void) cyclic_reprogram(ct->ct_cyclic, expiration);

	return (expiration);
}
836 
/*
 * There are some situations when the entire heap is walked and processed.
 * This function is called to do the processing. These are the situations:
 *
 * 1. When the reap count reaches its threshold, the heap has to be cleared
 *    of all empty callout lists.
 *
 * 2. When the system enters and exits KMDB/OBP, all entries in the heap
 *    need to be adjusted by the interval spent in KMDB/OBP.
 *
 * 3. When system time is changed, the heap has to be scanned for
 *    absolute hrestime timers. These need to be removed from the heap
 *    and expired immediately.
 *
 * In cases 2 and 3, it is a good idea to do 1 as well since we are
 * scanning the heap anyway.
 *
 * Arguments:
 *	ct		the callout table whose heap is processed; its
 *			ct_mutex must be held
 *	delta		adjustment added to the expiration of every relative
 *			(non-absolute) callout list; 0 for no adjustment
 *	timechange	non-zero if the system time has changed
 *
 * The return value is the time the caller should program the cyclic for:
 * the current time if any callout lists are on the expired list,
 * CY_INFINITY if the heap is empty, else the expiration at the root of the
 * heap. This function does not reprogram the cyclic itself.
 */
static hrtime_t
callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
{
	callout_heap_t *heap;
	callout_list_t *cl;
	hrtime_t expiration, now;
	int i, hash, clflags;
	ulong_t num;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));

	if (ct->ct_heap_num == 0)
		return (CY_INFINITY);

	/*
	 * Count this pass as a cleanup only if there is something to reap.
	 */
	if (ct->ct_nreap > 0)
		ct->ct_cleanups++;

	heap = ct->ct_heap;

	/*
	 * We walk the heap from the top to the bottom. If we encounter
	 * a heap item that points to an empty callout list, we clean
	 * it out. If we encounter a hrestime entry that must be removed,
	 * again we clean it out. Otherwise, we apply any adjustments needed
	 * to an element.
	 *
	 * During the walk, we also compact the heap from the bottom and
	 * reconstruct the heap using upheap operations. This is very
	 * efficient if the number of elements to be cleaned is greater than
	 * or equal to half the heap. This is the common case.
	 *
	 * Even in the non-common case, the upheap operations should be short
	 * as the entries below generally tend to be bigger than the entries
	 * above.
	 *
	 * The heap is rebuilt in place: ct_heap_num restarts at zero and
	 * grows by at most one per element visited, so the slot a surviving
	 * element is copied to never lies beyond the element being visited.
	 */
	num = ct->ct_heap_num;
	ct->ct_heap_num = 0;
	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
	now = gethrtime();
	for (i = 0; i < num; i++) {
		cl = heap[i].ch_list;
		/*
		 * If the callout list is empty, delete the heap element and
		 * free the callout list.
		 */
		if (cl->cl_callouts.ch_head == NULL) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_FREE(ct, cl);
			continue;
		}

		/*
		 * Delete the heap element and expire the callout list, if
		 * one of the following is true:
		 *	- the callout list has expired
		 *	- the callout list is an absolute hrestime one and
		 *	  there has been a system time change
		 */
		if ((cl->cl_expiration <= now) ||
		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
			continue;
		}

		/*
		 * Apply adjustments, if any. Adjustments are applied after
		 * the system returns from KMDB or OBP. They are only applied
		 * to relative callout lists.
		 *
		 * The hash bucket is derived from the expiration, so the
		 * callout list is taken out of its old bucket first and put
		 * into the bucket for the new expiration afterwards. An
		 * adjusted expiration that is not positive is set to
		 * infinity.
		 */
		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
			expiration = cl->cl_expiration + delta;
			if (expiration <= 0)
				expiration = CY_INFINITY;
			heap[i].ch_expiration = expiration;
			cl->cl_expiration = expiration;
			hash = CALLOUT_CLHASH(cl->cl_expiration);
			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
			} else {
				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
			}
		}

		/*
		 * This element survives. Put it at the bottom of the
		 * rebuilt heap and sift it up into place.
		 */
		heap[ct->ct_heap_num] = heap[i];
		ct->ct_heap_num++;
		(void) callout_upheap(ct);
	}

	/*
	 * Every empty callout list has been removed from the heap.
	 */
	ct->ct_nreap = 0;

	/*
	 * We need to return the expiration to help program the cyclic.
	 * If there are expired callouts, the cyclic needs to go off
	 * immediately. If the heap has become empty, then we return infinity.
	 * Else, return the expiration of the earliest callout in the heap.
	 */
	if (ct->ct_expired.ch_head != NULL)
		return (gethrtime());

	if (ct->ct_heap_num == 0)
		return (CY_INFINITY);

	return (heap->ch_expiration);
}
967 
968 /*
969  * Common function used to create normal and realtime callouts.
970  *
971  * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
972  * there is one restriction on a realtime callout handler - it should not
973  * directly or indirectly acquire cpu_lock. CPU offline waits for pending
974  * cyclic handlers to complete while holding cpu_lock. So, if a realtime
975  * callout handler were to try to get cpu_lock, there would be a deadlock
976  * during CPU offline.
977  */
978 callout_id_t
979 timeout_generic(int type, void (*func)(void *), void *arg,
980 	hrtime_t expiration, hrtime_t resolution, int flags)
981 {
982 	callout_table_t *ct;
983 	callout_t *cp;
984 	callout_id_t id;
985 	callout_list_t *cl;
986 	hrtime_t now, interval;
987 	int hash, clflags;
988 
989 	ASSERT(resolution > 0);
990 	ASSERT(func != NULL);
991 
992 	/*
993 	 * We get the current hrtime right upfront so that latencies in
994 	 * this function do not affect the accuracy of the callout.
995 	 */
996 	now = gethrtime();
997 
998 	/*
999 	 * We disable kernel preemption so that we remain on the same CPU
1000 	 * throughout. If we needed to reprogram the callout table's cyclic,
1001 	 * we can avoid X-calls if we are on the same CPU.
1002 	 *
1003 	 * Note that callout_alloc() releases and reacquires the callout
1004 	 * table mutex. While reacquiring the mutex, it is possible for us
1005 	 * to go to sleep and later migrate to another CPU. This should be
1006 	 * pretty rare, though.
1007 	 */
1008 	kpreempt_disable();
1009 
1010 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
1011 	mutex_enter(&ct->ct_mutex);
1012 
1013 	if (ct->ct_cyclic == CYCLIC_NONE) {
1014 		mutex_exit(&ct->ct_mutex);
1015 		/*
1016 		 * The callout table has not yet been initialized fully.
1017 		 * So, put this one on the boot callout table which is
1018 		 * always initialized.
1019 		 */
1020 		ct = &callout_boot_ct[type];
1021 		mutex_enter(&ct->ct_mutex);
1022 	}
1023 
1024 	if (CALLOUT_CLEANUP(ct)) {
1025 		/*
1026 		 * There are too many heap elements pointing to empty callout
1027 		 * lists. Clean them out. Since cleanup is only done once
1028 		 * in a while, no need to reprogram the cyclic if the root
1029 		 * of the heap gets cleaned out.
1030 		 */
1031 		(void) callout_heap_process(ct, 0, 0);
1032 	}
1033 
1034 	if ((cp = ct->ct_free) == NULL)
1035 		cp = callout_alloc(ct);
1036 	else
1037 		ct->ct_free = cp->c_idnext;
1038 
1039 	cp->c_func = func;
1040 	cp->c_arg = arg;
1041 
1042 	/*
1043 	 * Compute the expiration hrtime.
1044 	 */
1045 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
1046 		interval = expiration - now;
1047 	} else {
1048 		interval = expiration;
1049 		expiration += now;
1050 	}
1051 
1052 	if (resolution > 1) {
1053 		/*
1054 		 * Align expiration to the specified resolution.
1055 		 */
1056 		if (flags & CALLOUT_FLAG_ROUNDUP)
1057 			expiration += resolution - 1;
1058 		expiration = (expiration / resolution) * resolution;
1059 	}
1060 
1061 	if (expiration <= 0) {
1062 		/*
1063 		 * expiration hrtime overflow has occurred. Just set the
1064 		 * expiration to infinity.
1065 		 */
1066 		expiration = CY_INFINITY;
1067 	}
1068 
1069 	/*
1070 	 * Assign an ID to this callout
1071 	 */
1072 	if (flags & CALLOUT_FLAG_32BIT) {
1073 		if (interval > callout_longterm) {
1074 			id = (ct->ct_long_id - callout_counter_low);
1075 			id |= CALLOUT_COUNTER_HIGH;
1076 			ct->ct_long_id = id;
1077 		} else {
1078 			id = (ct->ct_short_id - callout_counter_low);
1079 			id |= CALLOUT_COUNTER_HIGH;
1080 			ct->ct_short_id = id;
1081 		}
1082 	} else {
1083 		id = (ct->ct_gen_id - callout_counter_low);
1084 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
1085 			id |= CALLOUT_COUNTER_HIGH;
1086 			id += CALLOUT_GENERATION_LOW;
1087 		}
1088 		ct->ct_gen_id = id;
1089 	}
1090 
1091 	cp->c_xid = id;
1092 
1093 	clflags = 0;
1094 	if (flags & CALLOUT_FLAG_ABSOLUTE)
1095 		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
1096 	if (flags & CALLOUT_FLAG_HRESTIME)
1097 		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
1098 	if (resolution == 1)
1099 		clflags |= CALLOUT_LIST_FLAG_NANO;
1100 	hash = CALLOUT_CLHASH(expiration);
1101 
1102 again:
1103 	/*
1104 	 * Try to see if a callout list already exists for this expiration.
1105 	 */
1106 	cl = callout_list_get(ct, expiration, clflags, hash);
1107 	if (cl == NULL) {
1108 		/*
1109 		 * Check the free list. If we don't find one, we have to
1110 		 * take the slow path and allocate from kmem.
1111 		 */
1112 		if ((cl = ct->ct_lfree) == NULL) {
1113 			callout_list_alloc(ct);
1114 			/*
1115 			 * In the above call, we drop the lock, allocate and
1116 			 * reacquire the lock. So, we could have been away
1117 			 * for a while. In the meantime, someone could have
1118 			 * inserted a callout list with the same expiration.
1119 			 * Plus, the heap could have become full. So, the best
1120 			 * course is to repeat the steps. This should be an
1121 			 * infrequent event.
1122 			 */
1123 			goto again;
1124 		}
1125 		ct->ct_lfree = cl->cl_next;
1126 		cl->cl_expiration = expiration;
1127 		cl->cl_flags = clflags;
1128 
1129 		/*
1130 		 * Check if we have enough space in the heap to insert one
1131 		 * expiration. If not, expand the heap.
1132 		 */
1133 		if (ct->ct_heap_num == ct->ct_heap_max) {
1134 			if (callout_heap_expand(ct) == 0) {
1135 				/*
1136 				 * Could not expand the heap. Just queue it.
1137 				 */
1138 				callout_queue_insert(ct, cl);
1139 				goto out;
1140 			}
1141 
1142 			/*
1143 			 * In the above call, we drop the lock, allocate and
1144 			 * reacquire the lock. So, we could have been away
1145 			 * for a while. In the meantime, someone could have
1146 			 * inserted a callout list with the same expiration.
1147 			 * But we will not go back and check for it as this
1148 			 * should be a really infrequent event. There is no
1149 			 * point.
1150 			 */
1151 		}
1152 
1153 		if (clflags & CALLOUT_LIST_FLAG_NANO) {
1154 			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
1155 		} else {
1156 			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
1157 		}
1158 
1159 		/*
1160 		 * This is a new expiration. So, insert it into the heap.
1161 		 * This will also reprogram the cyclic, if the expiration
1162 		 * propagated to the root of the heap.
1163 		 */
1164 		callout_heap_insert(ct, cl);
1165 	} else {
1166 		/*
1167 		 * If the callout list was empty, untimeout_generic() would
1168 		 * have incremented a reap count. Decrement the reap count
1169 		 * as we are going to insert a callout into this list.
1170 		 */
1171 		if (cl->cl_callouts.ch_head == NULL)
1172 			ct->ct_nreap--;
1173 	}
1174 out:
1175 	cp->c_list = cl;
1176 	CALLOUT_APPEND(ct, cp);
1177 
1178 	ct->ct_timeouts++;
1179 	ct->ct_timeouts_pending++;
1180 
1181 	mutex_exit(&ct->ct_mutex);
1182 
1183 	kpreempt_enable();
1184 
1185 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
1186 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
1187 	    cp);
1188 
1189 	return (id);
1190 }
1191 
1192 timeout_id_t
1193 timeout(void (*func)(void *), void *arg, clock_t delta)
1194 {
1195 	ulong_t id;
1196 
1197 	/*
1198 	 * Make sure the callout runs at least 1 tick in the future.
1199 	 */
1200 	if (delta <= 0)
1201 		delta = 1;
1202 	else if (delta > callout_max_ticks)
1203 		delta = callout_max_ticks;
1204 
1205 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1206 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1207 
1208 	return ((timeout_id_t)id);
1209 }
1210 
1211 /*
1212  * Convenience function that creates a normal callout with default parameters
1213  * and returns a full ID.
1214  */
1215 callout_id_t
1216 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1217 {
1218 	callout_id_t id;
1219 
1220 	/*
1221 	 * Make sure the callout runs at least 1 tick in the future.
1222 	 */
1223 	if (delta <= 0)
1224 		delta = 1;
1225 	else if (delta > callout_max_ticks)
1226 		delta = callout_max_ticks;
1227 
1228 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1229 	    nsec_per_tick, 0);
1230 
1231 	return (id);
1232 }
1233 
1234 timeout_id_t
1235 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1236 {
1237 	ulong_t id;
1238 
1239 	/*
1240 	 * Make sure the callout runs at least 1 tick in the future.
1241 	 */
1242 	if (delta <= 0)
1243 		delta = 1;
1244 	else if (delta > callout_max_ticks)
1245 		delta = callout_max_ticks;
1246 
1247 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1248 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1249 
1250 	return ((timeout_id_t)id);
1251 }
1252 
1253 /*
1254  * Convenience function that creates a realtime callout with default parameters
1255  * and returns a full ID.
1256  */
1257 callout_id_t
1258 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1259 {
1260 	callout_id_t id;
1261 
1262 	/*
1263 	 * Make sure the callout runs at least 1 tick in the future.
1264 	 */
1265 	if (delta <= 0)
1266 		delta = 1;
1267 	else if (delta > callout_max_ticks)
1268 		delta = callout_max_ticks;
1269 
1270 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1271 	    nsec_per_tick, 0);
1272 
1273 	return (id);
1274 }
1275 
1276 hrtime_t
1277 untimeout_generic(callout_id_t id, int nowait)
1278 {
1279 	callout_table_t *ct;
1280 	callout_t *cp;
1281 	callout_id_t xid;
1282 	callout_list_t *cl;
1283 	int hash, flags;
1284 	callout_id_t bogus;
1285 
1286 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1287 	hash = CALLOUT_IDHASH(id);
1288 
1289 	mutex_enter(&ct->ct_mutex);
1290 
1291 	/*
1292 	 * Search the ID hash table for the callout.
1293 	 */
1294 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1295 
1296 		xid = cp->c_xid;
1297 
1298 		/*
1299 		 * Match the ID and generation number.
1300 		 */
1301 		if ((xid & CALLOUT_ID_MASK) != id)
1302 			continue;
1303 
1304 		if ((xid & CALLOUT_EXECUTING) == 0) {
1305 			hrtime_t expiration;
1306 
1307 			/*
1308 			 * Delete the callout. If the callout list becomes
1309 			 * NULL, we don't remove it from the table. This is
1310 			 * so it can be reused. If the empty callout list
1311 			 * corresponds to the top of the the callout heap, we
1312 			 * don't reprogram the table cyclic here. This is in
1313 			 * order to avoid lots of X-calls to the CPU associated
1314 			 * with the callout table.
1315 			 */
1316 			cl = cp->c_list;
1317 			expiration = cl->cl_expiration;
1318 			CALLOUT_DELETE(ct, cp);
1319 			CALLOUT_FREE(ct, cp);
1320 			ct->ct_untimeouts_unexpired++;
1321 			ct->ct_timeouts_pending--;
1322 
1323 			/*
1324 			 * If the callout list has become empty, there are 3
1325 			 * possibilities. If it is present:
1326 			 *	- in the heap, it needs to be cleaned along
1327 			 *	  with its heap entry. Increment a reap count.
1328 			 *	- in the callout queue, free it.
1329 			 *	- in the expired list, free it.
1330 			 */
1331 			if (cl->cl_callouts.ch_head == NULL) {
1332 				flags = cl->cl_flags;
1333 				if (flags & CALLOUT_LIST_FLAG_HEAPED) {
1334 					ct->ct_nreap++;
1335 				} else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
1336 					CALLOUT_LIST_DELETE(ct->ct_queue, cl);
1337 					CALLOUT_LIST_FREE(ct, cl);
1338 				} else {
1339 					CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1340 					CALLOUT_LIST_FREE(ct, cl);
1341 				}
1342 			}
1343 			mutex_exit(&ct->ct_mutex);
1344 
1345 			expiration -= gethrtime();
1346 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1347 			    "untimeout:ID %lx hrtime left %llx", id,
1348 			    expiration);
1349 			return (expiration < 0 ? 0 : expiration);
1350 		}
1351 
1352 		ct->ct_untimeouts_executing++;
1353 		/*
1354 		 * The callout we want to delete is currently executing.
1355 		 * The DDI states that we must wait until the callout
1356 		 * completes before returning, so we block on c_done until the
1357 		 * callout ID changes (to the old ID if it's on the freelist,
1358 		 * or to a new callout ID if it's in use).  This implicitly
1359 		 * assumes that callout structures are persistent (they are).
1360 		 */
1361 		if (cp->c_executor == curthread) {
1362 			/*
1363 			 * The timeout handler called untimeout() on itself.
1364 			 * Stupid, but legal.  We can't wait for the timeout
1365 			 * to complete without deadlocking, so we just return.
1366 			 */
1367 			mutex_exit(&ct->ct_mutex);
1368 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1369 			    "untimeout_self:ID %x", id);
1370 			return (-1);
1371 		}
1372 		if (nowait == 0) {
1373 			/*
1374 			 * We need to wait. Indicate that we are waiting by
1375 			 * incrementing c_waiting. This prevents the executor
1376 			 * from doing a wakeup on c_done if there are no
1377 			 * waiters.
1378 			 */
1379 			while (cp->c_xid == xid) {
1380 				cp->c_waiting = 1;
1381 				cv_wait(&cp->c_done, &ct->ct_mutex);
1382 			}
1383 		}
1384 		mutex_exit(&ct->ct_mutex);
1385 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1386 		    "untimeout_executing:ID %lx", id);
1387 		return (-1);
1388 	}
1389 	ct->ct_untimeouts_expired++;
1390 
1391 	mutex_exit(&ct->ct_mutex);
1392 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1393 	    "untimeout_bogus_id:ID %lx", id);
1394 
1395 	/*
1396 	 * We didn't find the specified callout ID.  This means either
1397 	 * (1) the callout already fired, or (2) the caller passed us
1398 	 * a bogus value.  Perform a sanity check to detect case (2).
1399 	 */
1400 	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1401 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1402 		panic("untimeout: impossible timeout id %llx",
1403 		    (unsigned long long)id);
1404 
1405 	return (-1);
1406 }
1407 
1408 clock_t
1409 untimeout(timeout_id_t id_arg)
1410 {
1411 	hrtime_t hleft;
1412 	clock_t tleft;
1413 	callout_id_t id;
1414 
1415 	id = (ulong_t)id_arg;
1416 	hleft = untimeout_generic(id, 0);
1417 	if (hleft < 0)
1418 		tleft = -1;
1419 	else if (hleft == 0)
1420 		tleft = 0;
1421 	else
1422 		tleft = NSEC_TO_TICK(hleft);
1423 
1424 	return (tleft);
1425 }
1426 
1427 /*
1428  * Convenience function to untimeout a timeout with a full ID with default
1429  * parameters.
1430  */
1431 clock_t
1432 untimeout_default(callout_id_t id, int nowait)
1433 {
1434 	hrtime_t hleft;
1435 	clock_t tleft;
1436 
1437 	hleft = untimeout_generic(id, nowait);
1438 	if (hleft < 0)
1439 		tleft = -1;
1440 	else if (hleft == 0)
1441 		tleft = 0;
1442 	else
1443 		tleft = NSEC_TO_TICK(hleft);
1444 
1445 	return (tleft);
1446 }
1447 
1448 /*
1449  * Expire all the callouts queued in the specified callout list.
1450  */
1451 static void
1452 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1453 {
1454 	callout_t *cp, *cnext;
1455 
1456 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1457 	ASSERT(cl != NULL);
1458 
1459 	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1460 		/*
1461 		 * Multiple executor threads could be running at the same
1462 		 * time. If this callout is already being executed,
1463 		 * go on to the next one.
1464 		 */
1465 		if (cp->c_xid & CALLOUT_EXECUTING) {
1466 			cnext = cp->c_clnext;
1467 			continue;
1468 		}
1469 
1470 		/*
1471 		 * Indicate to untimeout() that a callout is
1472 		 * being expired by the executor.
1473 		 */
1474 		cp->c_xid |= CALLOUT_EXECUTING;
1475 		cp->c_executor = curthread;
1476 		mutex_exit(&ct->ct_mutex);
1477 
1478 		DTRACE_PROBE1(callout__start, callout_t *, cp);
1479 		(*cp->c_func)(cp->c_arg);
1480 		DTRACE_PROBE1(callout__end, callout_t *, cp);
1481 
1482 		mutex_enter(&ct->ct_mutex);
1483 
1484 		ct->ct_expirations++;
1485 		ct->ct_timeouts_pending--;
1486 		/*
1487 		 * Indicate completion for c_done.
1488 		 */
1489 		cp->c_xid &= ~CALLOUT_EXECUTING;
1490 		cp->c_executor = NULL;
1491 		cnext = cp->c_clnext;
1492 
1493 		/*
1494 		 * Delete callout from ID hash table and the callout
1495 		 * list, return to freelist, and tell any untimeout() that
1496 		 * cares that we're done.
1497 		 */
1498 		CALLOUT_DELETE(ct, cp);
1499 		CALLOUT_FREE(ct, cp);
1500 
1501 		if (cp->c_waiting) {
1502 			cp->c_waiting = 0;
1503 			cv_broadcast(&cp->c_done);
1504 		}
1505 	}
1506 }
1507 
1508 /*
1509  * Execute all expired callout lists for a callout table.
1510  */
1511 static void
1512 callout_expire(callout_table_t *ct)
1513 {
1514 	callout_list_t *cl, *clnext;
1515 
1516 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1517 
1518 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1519 		/*
1520 		 * Expire all the callouts in this callout list.
1521 		 */
1522 		callout_list_expire(ct, cl);
1523 
1524 		clnext = cl->cl_next;
1525 		if (cl->cl_callouts.ch_head == NULL) {
1526 			/*
1527 			 * Free the callout list.
1528 			 */
1529 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1530 			CALLOUT_LIST_FREE(ct, cl);
1531 		}
1532 	}
1533 }
1534 
1535 /*
1536  * The cyclic handlers below process callouts in two steps:
1537  *
1538  *	1. Find all expired callout lists and queue them in a separate
1539  *	   list of expired callouts.
1540  *	2. Execute the expired callout lists.
1541  *
1542  * This is done for two reasons:
1543  *
1544  *	1. We want to quickly find the next earliest expiration to program
1545  *	   the cyclic to and reprogram it. We can do this right at the end
1546  *	   of step 1.
1547  *	2. The realtime cyclic handler expires callouts in place. However,
1548  *	   for normal callouts, callouts are expired by a taskq thread.
1549  *	   So, it is simpler and more robust to have the taskq thread just
1550  *	   do step 2.
1551  */
1552 
1553 /*
1554  * Realtime callout cyclic handlers.
1555  */
1556 void
1557 callout_realtime(callout_table_t *ct)
1558 {
1559 	mutex_enter(&ct->ct_mutex);
1560 	(void) callout_heap_delete(ct);
1561 	callout_expire(ct);
1562 	mutex_exit(&ct->ct_mutex);
1563 }
1564 
1565 void
1566 callout_queue_realtime(callout_table_t *ct)
1567 {
1568 	mutex_enter(&ct->ct_mutex);
1569 	(void) callout_queue_delete(ct);
1570 	callout_expire(ct);
1571 	mutex_exit(&ct->ct_mutex);
1572 }
1573 
1574 void
1575 callout_execute(callout_table_t *ct)
1576 {
1577 	mutex_enter(&ct->ct_mutex);
1578 	callout_expire(ct);
1579 	mutex_exit(&ct->ct_mutex);
1580 }
1581 
1582 /*
1583  * Normal callout cyclic handlers.
1584  */
1585 void
1586 callout_normal(callout_table_t *ct)
1587 {
1588 	int i, exec;
1589 	hrtime_t exp;
1590 
1591 	mutex_enter(&ct->ct_mutex);
1592 	exp = callout_heap_delete(ct);
1593 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1594 	mutex_exit(&ct->ct_mutex);
1595 
1596 	for (i = 0; i < exec; i++) {
1597 		ASSERT(ct->ct_taskq != NULL);
1598 		(void) taskq_dispatch(ct->ct_taskq,
1599 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1600 	}
1601 }
1602 
1603 void
1604 callout_queue_normal(callout_table_t *ct)
1605 {
1606 	int i, exec;
1607 	hrtime_t exp;
1608 
1609 	mutex_enter(&ct->ct_mutex);
1610 	exp = callout_queue_delete(ct);
1611 	CALLOUT_EXEC_COMPUTE(ct, exp, exec);
1612 	mutex_exit(&ct->ct_mutex);
1613 
1614 	for (i = 0; i < exec; i++) {
1615 		ASSERT(ct->ct_taskq != NULL);
1616 		(void) taskq_dispatch(ct->ct_taskq,
1617 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1618 	}
1619 }
1620 
1621 /*
1622  * Suspend callout processing.
1623  */
1624 static void
1625 callout_suspend(void)
1626 {
1627 	int t, f;
1628 	callout_table_t *ct;
1629 
1630 	/*
1631 	 * Traverse every callout table in the system and suspend callout
1632 	 * processing.
1633 	 *
1634 	 * We need to suspend all the tables (including the inactive ones)
1635 	 * so that if a table is made active while the suspend is still on,
1636 	 * the table remains suspended.
1637 	 */
1638 	for (f = 0; f < max_ncpus; f++) {
1639 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1640 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1641 
1642 			mutex_enter(&ct->ct_mutex);
1643 			ct->ct_suspend++;
1644 			if (ct->ct_cyclic == CYCLIC_NONE) {
1645 				mutex_exit(&ct->ct_mutex);
1646 				continue;
1647 			}
1648 			if (ct->ct_suspend == 1) {
1649 				(void) cyclic_reprogram(ct->ct_cyclic,
1650 				    CY_INFINITY);
1651 				(void) cyclic_reprogram(ct->ct_qcyclic,
1652 				    CY_INFINITY);
1653 			}
1654 			mutex_exit(&ct->ct_mutex);
1655 		}
1656 	}
1657 }
1658 
1659 /*
1660  * Resume callout processing.
1661  */
1662 static void
1663 callout_resume(hrtime_t delta, int timechange)
1664 {
1665 	hrtime_t hexp, qexp;
1666 	int t, f;
1667 	callout_table_t *ct;
1668 
1669 	/*
1670 	 * Traverse every callout table in the system and resume callout
1671 	 * processing. For active tables, perform any hrtime adjustments
1672 	 * necessary.
1673 	 */
1674 	for (f = 0; f < max_ncpus; f++) {
1675 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1676 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1677 
1678 			mutex_enter(&ct->ct_mutex);
1679 			if (ct->ct_cyclic == CYCLIC_NONE) {
1680 				ct->ct_suspend--;
1681 				mutex_exit(&ct->ct_mutex);
1682 				continue;
1683 			}
1684 
1685 			/*
1686 			 * If a delta is specified, adjust the expirations in
1687 			 * the heap by delta. Also, if the caller indicates
1688 			 * a timechange, process that. This step also cleans
1689 			 * out any empty callout lists that might happen to
1690 			 * be there.
1691 			 */
1692 			hexp = callout_heap_process(ct, delta, timechange);
1693 			qexp = callout_queue_process(ct, delta, timechange);
1694 
1695 			ct->ct_suspend--;
1696 			if (ct->ct_suspend == 0) {
1697 				(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1698 				(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1699 			}
1700 
1701 			mutex_exit(&ct->ct_mutex);
1702 		}
1703 	}
1704 }
1705 
1706 /*
1707  * Callback handler used by CPR to stop and resume callouts.
1708  * The cyclic subsystem saves and restores hrtime during CPR.
1709  * That is why callout_resume() is called with a 0 delta.
1710  * Although hrtime is the same, hrestime (system time) has
1711  * progressed during CPR. So, we have to indicate a time change
1712  * to expire the absolute hrestime timers.
1713  */
1714 /*ARGSUSED*/
1715 static boolean_t
1716 callout_cpr_callb(void *arg, int code)
1717 {
1718 	if (code == CB_CODE_CPR_CHKPT)
1719 		callout_suspend();
1720 	else
1721 		callout_resume(0, 1);
1722 
1723 	return (B_TRUE);
1724 }
1725 
1726 /*
1727  * Callback handler invoked when the debugger is entered or exited.
1728  */
1729 /*ARGSUSED*/
1730 static boolean_t
1731 callout_debug_callb(void *arg, int code)
1732 {
1733 	hrtime_t delta;
1734 
1735 	/*
1736 	 * When the system enters the debugger. make a note of the hrtime.
1737 	 * When it is resumed, compute how long the system was in the
1738 	 * debugger. This interval should not be counted for callouts.
1739 	 */
1740 	if (code == 0) {
1741 		callout_suspend();
1742 		callout_debug_hrtime = gethrtime();
1743 	} else {
1744 		delta = gethrtime() - callout_debug_hrtime;
1745 		callout_resume(delta, 0);
1746 	}
1747 
1748 	return (B_TRUE);
1749 }
1750 
1751 /*
1752  * Move the absolute hrestime callouts to the expired list. Then program the
1753  * table's cyclic to expire immediately so that the callouts can be executed
1754  * immediately.
1755  */
1756 static void
1757 callout_hrestime_one(callout_table_t *ct)
1758 {
1759 	hrtime_t hexp, qexp;
1760 
1761 	mutex_enter(&ct->ct_mutex);
1762 	if (ct->ct_cyclic == CYCLIC_NONE) {
1763 		mutex_exit(&ct->ct_mutex);
1764 		return;
1765 	}
1766 
1767 	/*
1768 	 * Walk the heap and process all the absolute hrestime entries.
1769 	 */
1770 	hexp = callout_heap_process(ct, 0, 1);
1771 	qexp = callout_queue_process(ct, 0, 1);
1772 
1773 	if (ct->ct_suspend == 0) {
1774 		(void) cyclic_reprogram(ct->ct_cyclic, hexp);
1775 		(void) cyclic_reprogram(ct->ct_qcyclic, qexp);
1776 	}
1777 
1778 	mutex_exit(&ct->ct_mutex);
1779 }
1780 
1781 /*
1782  * This function is called whenever system time (hrestime) is changed
1783  * explicitly. All the HRESTIME callouts must be expired at once.
1784  */
1785 /*ARGSUSED*/
1786 void
1787 callout_hrestime(void)
1788 {
1789 	int t, f;
1790 	callout_table_t *ct;
1791 
1792 	/*
1793 	 * Traverse every callout table in the system and process the hrestime
1794 	 * callouts therein.
1795 	 *
1796 	 * We look at all the tables because we don't know which ones were
1797 	 * onlined and offlined in the past. The offlined tables may still
1798 	 * have active cyclics processing timers somewhere.
1799 	 */
1800 	for (f = 0; f < max_ncpus; f++) {
1801 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1802 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1803 			callout_hrestime_one(ct);
1804 		}
1805 	}
1806 }
1807 
1808 /*
1809  * Create the hash tables for this callout table.
1810  */
1811 static void
1812 callout_hash_init(callout_table_t *ct)
1813 {
1814 	size_t size;
1815 
1816 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1817 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1818 
1819 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1820 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1821 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1822 }
1823 
1824 /*
1825  * Create per-callout table kstats.
1826  */
1827 static void
1828 callout_kstat_init(callout_table_t *ct)
1829 {
1830 	callout_stat_type_t stat;
1831 	kstat_t *ct_kstats;
1832 	int ndx;
1833 
1834 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1835 	ASSERT(ct->ct_kstats == NULL);
1836 
1837 	ndx = ct - callout_table;
1838 	ct_kstats = kstat_create("unix", ndx, "callout",
1839 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1840 
1841 	if (ct_kstats == NULL) {
1842 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1843 		    (void *)ct);
1844 	} else {
1845 		ct_kstats->ks_data = ct->ct_kstat_data;
1846 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1847 			kstat_named_init(&ct->ct_kstat_data[stat],
1848 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1849 		ct->ct_kstats = ct_kstats;
1850 		kstat_install(ct_kstats);
1851 	}
1852 }
1853 
1854 static void
1855 callout_cyclic_init(callout_table_t *ct)
1856 {
1857 	cyc_handler_t hdlr;
1858 	cyc_time_t when;
1859 	processorid_t seqid;
1860 	int t;
1861 	cyclic_id_t cyclic, qcyclic;
1862 
1863 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1864 
1865 	t = ct->ct_type;
1866 	seqid = CALLOUT_TABLE_SEQID(ct);
1867 
1868 	/*
1869 	 * Create the taskq thread if the table type is normal.
1870 	 * Realtime tables are handled at PIL1 by a softint
1871 	 * handler.
1872 	 */
1873 	if (t == CALLOUT_NORMAL) {
1874 		ASSERT(ct->ct_taskq == NULL);
1875 		/*
1876 		 * Each callout thread consumes exactly one
1877 		 * task structure while active.  Therefore,
1878 		 * prepopulating with 2 * callout_threads tasks
1879 		 * ensures that there's at least one task per
1880 		 * thread that's either scheduled or on the
1881 		 * freelist.  In turn, this guarantees that
1882 		 * taskq_dispatch() will always either succeed
1883 		 * (because there's a free task structure) or
1884 		 * be unnecessary (because "callout_excute(ct)"
1885 		 * has already scheduled).
1886 		 */
1887 		ct->ct_taskq =
1888 		    taskq_create_instance("callout_taskq", seqid,
1889 		    callout_threads, maxclsyspri,
1890 		    2 * callout_threads, 2 * callout_threads,
1891 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1892 	}
1893 
1894 	/*
1895 	 * callouts can only be created in a table whose
1896 	 * cyclic has been initialized.
1897 	 */
1898 	ASSERT(ct->ct_heap_num == 0);
1899 
1900 	/*
1901 	 * Drop the mutex before creating the callout cyclics. cyclic_add()
1902 	 * could potentially expand the cyclic heap. We don't want to be
1903 	 * holding the callout table mutex in that case. Note that this
1904 	 * function is called during CPU online. cpu_lock is held at this
1905 	 * point. So, only one thread can be executing the cyclic add logic
1906 	 * below at any time.
1907 	 */
1908 	mutex_exit(&ct->ct_mutex);
1909 
1910 	/*
1911 	 * Create the callout table cyclics.
1912 	 *
1913 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
1914 	 * handler executes at lock PIL. This is because there are cases
1915 	 * where code can block at PIL > 1 waiting for a normal callout handler
1916 	 * to unblock it directly or indirectly. If the normal cyclic were to
1917 	 * be executed at low PIL, it could get blocked out by the waiter
1918 	 * and cause a deadlock.
1919 	 */
1920 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1921 
1922 	if (t == CALLOUT_REALTIME) {
1923 		hdlr.cyh_level = callout_realtime_level;
1924 		hdlr.cyh_func = (cyc_func_t)callout_realtime;
1925 	} else {
1926 		hdlr.cyh_level = callout_normal_level;
1927 		hdlr.cyh_func = (cyc_func_t)callout_normal;
1928 	}
1929 	hdlr.cyh_arg = ct;
1930 	when.cyt_when = CY_INFINITY;
1931 	when.cyt_interval = CY_INFINITY;
1932 
1933 	cyclic = cyclic_add(&hdlr, &when);
1934 
1935 	if (t == CALLOUT_REALTIME)
1936 		hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
1937 	else
1938 		hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
1939 
1940 	qcyclic = cyclic_add(&hdlr, &when);
1941 
1942 	mutex_enter(&ct->ct_mutex);
1943 	ct->ct_cyclic = cyclic;
1944 	ct->ct_qcyclic = qcyclic;
1945 }
1946 
1947 void
1948 callout_cpu_online(cpu_t *cp)
1949 {
1950 	lgrp_handle_t hand;
1951 	callout_cache_t *cache;
1952 	char s[KMEM_CACHE_NAMELEN];
1953 	callout_table_t *ct;
1954 	processorid_t seqid;
1955 	int t;
1956 
1957 	ASSERT(MUTEX_HELD(&cpu_lock));
1958 
1959 	/*
1960 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1961 	 * Note that access to callout_caches is protected by cpu_lock.
1962 	 */
1963 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1964 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1965 		if (cache->cc_hand == hand)
1966 			break;
1967 	}
1968 
1969 	/*
1970 	 * If not found, create one. The caches are never destroyed.
1971 	 */
1972 	if (cache == NULL) {
1973 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1974 		cache->cc_hand = hand;
1975 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1976 		    (long)hand);
1977 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1978 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1979 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1980 		    (long)hand);
1981 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1982 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1983 		cache->cc_next = callout_caches;
1984 		callout_caches = cache;
1985 	}
1986 
1987 	seqid = cp->cpu_seqid;
1988 
1989 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1990 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1991 
1992 		mutex_enter(&ct->ct_mutex);
1993 		/*
1994 		 * Store convinience pointers to the kmem caches
1995 		 * in the callout table. These assignments should always be
1996 		 * done as callout tables can map to different physical
1997 		 * CPUs each time.
1998 		 */
1999 		ct->ct_cache = cache->cc_cache;
2000 		ct->ct_lcache = cache->cc_lcache;
2001 
2002 		/*
2003 		 * We use the heap pointer to check if stuff has been
2004 		 * initialized for this callout table.
2005 		 */
2006 		if (ct->ct_heap == NULL) {
2007 			callout_heap_init(ct);
2008 			callout_hash_init(ct);
2009 			callout_kstat_init(ct);
2010 			callout_cyclic_init(ct);
2011 		}
2012 
2013 		mutex_exit(&ct->ct_mutex);
2014 
2015 		/*
2016 		 * Move the cyclics to this CPU by doing a bind.
2017 		 */
2018 		cyclic_bind(ct->ct_cyclic, cp, NULL);
2019 		cyclic_bind(ct->ct_qcyclic, cp, NULL);
2020 	}
2021 }
2022 
2023 void
2024 callout_cpu_offline(cpu_t *cp)
2025 {
2026 	callout_table_t *ct;
2027 	processorid_t seqid;
2028 	int t;
2029 
2030 	ASSERT(MUTEX_HELD(&cpu_lock));
2031 
2032 	seqid = cp->cpu_seqid;
2033 
2034 	for (t = 0; t < CALLOUT_NTYPES; t++) {
2035 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
2036 
2037 		/*
2038 		 * Unbind the cyclics. This will allow the cyclic subsystem
2039 		 * to juggle the cyclics during CPU offline.
2040 		 */
2041 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
2042 		cyclic_bind(ct->ct_qcyclic, NULL, NULL);
2043 	}
2044 }
2045 
2046 /*
2047  * This is called to perform per-CPU initialization for slave CPUs at
2048  * boot time.
2049  */
2050 void
2051 callout_mp_init(void)
2052 {
2053 	cpu_t *cp;
2054 	size_t min, max;
2055 
2056 	if (callout_chunk == CALLOUT_CHUNK) {
2057 		/*
2058 		 * No one has specified a chunk in /etc/system. We need to
2059 		 * compute it here based on the number of online CPUs and
2060 		 * available physical memory.
2061 		 */
2062 		min = CALLOUT_MIN_HEAP_SIZE;
2063 		max = ptob(physmem / CALLOUT_MEM_FRACTION);
2064 		if (min > max)
2065 			min = max;
2066 		callout_chunk = min / sizeof (callout_heap_t);
2067 		callout_chunk /= ncpus_online;
2068 		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2069 	}
2070 
2071 	mutex_enter(&cpu_lock);
2072 
2073 	cp = cpu_active;
2074 	do {
2075 		callout_cpu_online(cp);
2076 	} while ((cp = cp->cpu_next_onln) != cpu_active);
2077 
2078 	mutex_exit(&cpu_lock);
2079 }
2080 
2081 /*
2082  * Initialize all callout tables.  Called at boot time just before clkstart().
2083  */
2084 void
2085 callout_init(void)
2086 {
2087 	int f, t;
2088 	size_t size;
2089 	int table_id;
2090 	callout_table_t *ct;
2091 	long bits, fanout;
2092 	uintptr_t buf;
2093 
2094 	/*
2095 	 * Initialize callout globals.
2096 	 */
2097 	bits = 0;
2098 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
2099 		bits++;
2100 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
2101 	callout_table_mask = (1 << callout_table_bits) - 1;
2102 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
2103 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
2104 	callout_max_ticks = CALLOUT_MAX_TICKS;
2105 	if (callout_min_reap == 0)
2106 		callout_min_reap = CALLOUT_MIN_REAP;
2107 
2108 	if (callout_tolerance <= 0)
2109 		callout_tolerance = CALLOUT_TOLERANCE;
2110 	if (callout_threads <= 0)
2111 		callout_threads = CALLOUT_THREADS;
2112 	if (callout_chunk <= 0)
2113 		callout_chunk = CALLOUT_CHUNK;
2114 	else
2115 		callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
2116 
2117 	/*
2118 	 * Allocate all the callout tables based on max_ncpus. We have chosen
2119 	 * to do boot-time allocation instead of dynamic allocation because:
2120 	 *
2121 	 *	- the size of the callout tables is not too large.
2122 	 *	- there are race conditions involved in making this dynamic.
2123 	 *	- the hash tables that go with the callout tables consume
2124 	 *	  most of the memory and they are only allocated in
2125 	 *	  callout_cpu_online().
2126 	 *
2127 	 * Each CPU has two tables that are consecutive in the array. The first
2128 	 * one is for realtime callouts and the second one is for normal ones.
2129 	 *
2130 	 * We do this alignment dance to make sure that callout table
2131 	 * structures will always be on a cache line boundary.
2132 	 */
2133 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
2134 	size += CALLOUT_ALIGN;
2135 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
2136 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
2137 
2138 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
2139 	/*
2140 	 * Now, initialize the tables for all the CPUs.
2141 	 */
2142 	for (f = 0; f < max_ncpus; f++) {
2143 		for (t = 0; t < CALLOUT_NTYPES; t++) {
2144 			table_id = CALLOUT_TABLE(t, f);
2145 			ct = &callout_table[table_id];
2146 			ct->ct_type = t;
2147 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
2148 			/*
2149 			 * Precompute the base IDs for long and short-term
2150 			 * legacy IDs. This makes ID generation during
2151 			 * timeout() fast.
2152 			 */
2153 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
2154 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
2155 			/*
2156 			 * Precompute the base ID for generation-based IDs.
2157 			 * Note that when the first ID gets allocated, the
2158 			 * ID will wrap. This will cause the generation
2159 			 * number to be incremented to 1.
2160 			 */
2161 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
2162 			/*
2163 			 * Initialize the cyclics as NONE. This will get set
2164 			 * during CPU online. This is so that partially
2165 			 * populated systems will only have the required
2166 			 * number of cyclics, not more.
2167 			 */
2168 			ct->ct_cyclic = CYCLIC_NONE;
2169 			ct->ct_qcyclic = CYCLIC_NONE;
2170 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
2171 		}
2172 	}
2173 
2174 	/*
2175 	 * Add the callback for CPR. This is called during checkpoint
2176 	 * resume to suspend and resume callouts.
2177 	 */
2178 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
2179 	    "callout_cpr");
2180 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
2181 	    "callout_debug");
2182 
2183 	/*
2184 	 * Call the per-CPU initialization function for the boot CPU. This
2185 	 * is done here because the function is not called automatically for
2186 	 * the boot CPU from the CPU online/offline hooks. Note that the
2187 	 * CPU lock is taken here because of convention.
2188 	 */
2189 	mutex_enter(&cpu_lock);
2190 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
2191 	callout_cpu_online(CPU);
2192 	mutex_exit(&cpu_lock);
2193 }
2194