xref: /titanic_44/usr/src/uts/common/os/callout.c (revision 73a0bd151c1115bf39cc2caa30c7cbfdd86361c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 /*
41  * Callout tables.  See timeout(9F) for details.
42  */
static hrtime_t callout_debug_hrtime;		/* debugger entry time */
/*
 * Smallest resolution accepted by timeout_generic(); any smaller request
 * is raised to this value.
 */
static int callout_min_resolution;		/* Minimum resolution */
/*
 * Indexed by callout type. timeout_generic() falls back to these tables
 * when the current CPU's table has no cyclic yet (CYCLIC_NONE).
 */
static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
/*
 * Upper bound applied to the tick delta by timeout(), timeout_default(),
 * realtime_timeout() and realtime_timeout_default().
 */
static clock_t callout_max_ticks;		/* max interval */
/*
 * For CALLOUT_FLAG_32BIT callouts, an interval larger than this selects
 * the table's long-term ID sequence instead of the short-term one.
 */
static hrtime_t callout_longterm;		/* longterm nanoseconds */
/*
 * timeout_generic() derives each new ID by subtracting this value from
 * the previous ID of the selected sequence.
 */
static ulong_t callout_counter_low;		/* callout ID increment */
static ulong_t callout_table_bits;		/* number of table bits in ID */
static ulong_t callout_table_mask;		/* mask for the table bits */
static callout_cache_t *callout_caches;		/* linked list of caches */
#pragma align 64(callout_table)
/*
 * Indexed with CALLOUT_TABLE(type, cpu_seqid) when creating a callout and
 * with CALLOUT_ID_TO_TABLE(id) when cancelling one.
 */
static callout_table_t *callout_table;		/* global callout table array */
54 
/*
 * We run normal callouts from PIL 10. This means that no other handler that
 * runs at PIL 10 is allowed to wait for normal callouts directly or indirectly
 * as it will cause a deadlock. This has always been an unwritten rule.
 * We are making it explicit here.
 *
 * NOTE(review): the code that consumes these two levels is outside this
 * part of the file; presumably they are the cyclic levels at which the
 * realtime and normal callout handlers are installed -- confirm.
 */
static int callout_realtime_level = CY_LOW_LEVEL;
static int callout_normal_level = CY_LOCK_LEVEL;
63 
/*
 * Statistic names. Each entry matches, in name, one of the per-table
 * counters updated in this file (ct_timeouts, ct_timeouts_pending,
 * ct_untimeouts_unexpired, ct_untimeouts_executing, ct_untimeouts_expired,
 * ct_expirations, ct_allocations).
 *
 * NOTE(review): the consumer of this array is outside this part of the
 * file; presumably the order must match the order of the counters in the
 * table's kstat data -- confirm before reordering.
 */
static char *callout_kstat_names[] = {
	"callout_timeouts",
	"callout_timeouts_pending",
	"callout_untimeouts_unexpired",
	"callout_untimeouts_executing",
	"callout_untimeouts_expired",
	"callout_expirations",
	"callout_allocations",
};
73 
/*
 * Link cp at the head of the doubly linked list anchored at hash.
 *
 *	hash	callout_hash_t lvalue; its address is taken exactly once
 *	cp	pointer to the element being linked in
 *	cnext	name of the element's "next" link member
 *	cprev	name of the element's "previous" link member
 *
 * cp is parenthesized on every use, so any pointer-valued expression
 * (e.g. &elem) may be passed. It is evaluated several times and must
 * therefore be free of side effects. The expansion is a brace-enclosed
 * block that declares a local named hashp.
 */
#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cprev = NULL;				\
	(cp)->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = (cp);			\
	else						\
		(cp)->cnext->cprev = (cp);		\
	hashp->ch_head = (cp);				\
}
86 
/*
 * Link cp at the tail of the doubly linked list anchored at hash.
 *
 *	hash	callout_hash_t lvalue; its address is taken exactly once
 *	cp	pointer to the element being linked in
 *	cnext	name of the element's "next" link member
 *	cprev	name of the element's "previous" link member
 *
 * cp is parenthesized on every use, so any pointer-valued expression
 * (e.g. &elem) may be passed. It is evaluated several times and must
 * therefore be free of side effects. The expansion is a brace-enclosed
 * block that declares a local named hashp.
 */
#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cnext = NULL;				\
	(cp)->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = (cp);			\
	else						\
		(cp)->cprev->cnext = (cp);		\
	hashp->ch_tail = (cp);				\
}
99 
/*
 * Unlink cp from the doubly linked list anchored at hash. The head and
 * tail of the list are updated when cp is the first or last element.
 * cp's own link members are left untouched.
 *
 *	hash	callout_hash_t lvalue; its address is taken exactly once
 *	cp	pointer to the element being unlinked
 *	cnext	name of the element's "next" link member
 *	cprev	name of the element's "previous" link member
 *
 * cp is parenthesized on every use, so any pointer-valued expression
 * (e.g. &elem) may be passed. It is evaluated several times and must
 * therefore be free of side effects. The expansion is a brace-enclosed
 * block that declares a local named hashp.
 */
#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if ((cp)->cnext == NULL)			\
		hashp->ch_tail = (cp)->cprev;		\
	else						\
		(cp)->cnext->cprev = (cp)->cprev;	\
	if ((cp)->cprev == NULL)			\
		hashp->ch_head = (cp)->cnext;		\
	else						\
		(cp)->cprev->cnext = (cp)->cnext;	\
}
113 
114 /*
115  * These definitions help us queue callouts and callout lists. Here is
116  * the queueing rationale:
117  *
118  *	- callouts are queued in a FIFO manner in the ID hash table.
119  *	  TCP timers are typically cancelled in the same order that they
120  *	  were issued. The FIFO queueing shortens the search for a callout
121  *	  during untimeout().
122  *
123  *	- callouts are queued in a FIFO manner in their callout lists.
124  *	  This ensures that the callouts are executed in the same order that
125  *	  they were queued. This is fair. Plus, it helps to make each
126  *	  callout expiration timely. It also favors cancellations.
127  *
128  *	- callout lists are queued in a LIFO manner in the callout list hash
129  *	  table. This ensures that long term timers stay at the rear of the
130  *	  hash lists.
131  *
132  *	- callout lists are queued in a FIFO manner in the expired callouts
133  *	  list. This ensures that callout lists are executed in the order
134  *	  of expiration.
135  */
/*
 * Queue callout cp at the tail of its ID hash chain in table ct (chain
 * selected by CALLOUT_IDHASH(cp->c_xid)) and at the tail of the callout
 * list it belongs to (cp->c_list). cp->c_xid and cp->c_list must already
 * be set.
 *
 * Both list operations are enclosed in a single brace block so that the
 * macro behaves as one statement (e.g. under an unbraced "if"). The
 * arguments are evaluated more than once and must be free of side effects.
 */
#define	CALLOUT_APPEND(ct, cp)						\
{									\
	CALLOUT_HASH_APPEND(						\
	    (ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)],		\
	    (cp), c_idnext, c_idprev);					\
	CALLOUT_HASH_APPEND((cp)->c_list->cl_callouts,			\
	    (cp), c_clnext, c_clprev);					\
}
140 
/*
 * Remove callout cp from its ID hash chain in table ct (chain selected by
 * CALLOUT_IDHASH(cp->c_xid)) and from the callout list it belongs to
 * (cp->c_list).
 *
 * Both list operations are enclosed in a single brace block so that the
 * macro behaves as one statement (e.g. under an unbraced "if"). The
 * arguments are evaluated more than once and must be free of side effects.
 */
#define	CALLOUT_DELETE(ct, cp)						\
{									\
	CALLOUT_HASH_DELETE(						\
	    (ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)],		\
	    (cp), c_idnext, c_idprev);					\
	CALLOUT_HASH_DELETE((cp)->c_list->cl_callouts,			\
	    (cp), c_clnext, c_clprev);					\
}
145 
/*
 * Callout list (callout_list_t) variants of the generic list macros; they
 * fix the link members to cl_next/cl_prev.
 *
 * CALLOUT_LIST_INSERT: link cl at the head of hash.
 */
#define	CALLOUT_LIST_INSERT(hash, cl)				\
	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)

/*
 * CALLOUT_LIST_APPEND: link cl at the tail of hash.
 */
#define	CALLOUT_LIST_APPEND(hash, cl)				\
	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)

/*
 * CALLOUT_LIST_DELETE: unlink cl from hash.
 */
#define	CALLOUT_LIST_DELETE(hash, cl)				\
	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
154 
/*
 * For normal callouts, there is a deadlock scenario if two callouts that
 * have an inter-dependency end up on the same callout list. To break the
 * deadlock, you need two taskq threads running in parallel. We compute
 * the number of taskq threads here using a bunch of conditions to make
 * it optimal for the common case. This is an ugly hack, but one that is
 * necessary (sigh).
 *
 * CALLOUT_THRESHOLD is the "near future" window, in the units returned by
 * gethrtime(), that is added to the current time when the top of the heap
 * is examined.
 *
 * CALLOUT_EXEC_COMPUTE(ct, exec) stores 0, 1 or 2 into the lvalue exec:
 *
 *	0	ct's expired list is empty
 *	1	a single expired list holding at most one callout, or the
 *		top of the heap expires within CALLOUT_THRESHOLD from now
 *	2	more than one callout is expired and the heap is either
 *		empty or its top lies beyond CALLOUT_THRESHOLD from now
 *
 * Both arguments are parenthesized on use. ct is evaluated more than once
 * and must be free of side effects. The macro's local is named exec_cl so
 * that it does not shadow a caller's variable named "cl".
 */
#define	CALLOUT_THRESHOLD	100000000
#define	CALLOUT_EXEC_COMPUTE(ct, exec)					\
{									\
	callout_list_t *exec_cl;					\
									\
	exec_cl = (ct)->ct_expired.ch_head;				\
	if (exec_cl == NULL) {						\
		/*							\
		 * If the expired list is NULL, there is nothing to	\
		 * process.						\
		 */							\
		(exec) = 0;						\
	} else if ((exec_cl->cl_next == NULL) &&			\
	    (exec_cl->cl_callouts.ch_head ==				\
	    exec_cl->cl_callouts.ch_tail)) {				\
		/*							\
		 * If there is only one callout list and it contains	\
		 * only one callout, there is no need for two threads.	\
		 */							\
		(exec) = 1;						\
	} else if (((ct)->ct_heap_num == 0) ||				\
	    ((ct)->ct_heap[0] > gethrtime() + CALLOUT_THRESHOLD)) {	\
		/*							\
		 * If the heap has become empty, we need two threads as	\
		 * there is no one to kick off the second thread in the	\
		 * future. If the heap is not empty and the top of the	\
		 * heap does not expire in the near future, we need two	\
		 * threads.						\
		 */							\
		(exec) = 2;						\
	} else {							\
		/*							\
		 * We have multiple callouts to process. But the cyclic	\
		 * will fire in the near future. So, we only need one	\
		 * thread for now.					\
		 */							\
		(exec) = 1;						\
	}								\
}
201 
202 /*
203  * Allocate a callout structure.  We try quite hard because we
204  * can't sleep, and if we can't do the allocation, we're toast.
205  * Failing all, we try a KM_PANIC allocation. Note that we never
206  * deallocate a callout. See untimeout() for the reasoning.
207  */
208 static callout_t *
209 callout_alloc(callout_table_t *ct)
210 {
211 	size_t size;
212 	callout_t *cp;
213 
214 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
215 	mutex_exit(&ct->ct_mutex);
216 
217 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
218 	if (cp == NULL) {
219 		size = sizeof (callout_t);
220 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
221 	}
222 	cp->c_xid = 0;
223 	cp->c_executor = NULL;
224 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
225 	cp->c_waiting = 0;
226 
227 	mutex_enter(&ct->ct_mutex);
228 	ct->ct_allocations++;
229 	return (cp);
230 }
231 
232 /*
233  * Allocate a callout list structure.  We try quite hard because we
234  * can't sleep, and if we can't do the allocation, we're toast.
235  * Failing all, we try a KM_PANIC allocation. Note that we never
236  * deallocate a callout list.
237  */
238 static void
239 callout_list_alloc(callout_table_t *ct)
240 {
241 	size_t size;
242 	callout_list_t *cl;
243 
244 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
245 	mutex_exit(&ct->ct_mutex);
246 
247 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
248 	if (cl == NULL) {
249 		size = sizeof (callout_list_t);
250 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
251 	}
252 	bzero(cl, sizeof (callout_list_t));
253 
254 	mutex_enter(&ct->ct_mutex);
255 	cl->cl_next = ct->ct_lfree;
256 	ct->ct_lfree = cl;
257 }
258 
259 /*
260  * Find a callout list that corresponds to an expiration.
261  */
262 static callout_list_t *
263 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
264 {
265 	callout_list_t *cl;
266 
267 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
268 
269 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
270 		if ((cl->cl_expiration == expiration) &&
271 		    (cl->cl_flags == flags))
272 			return (cl);
273 	}
274 
275 	return (NULL);
276 }
277 
278 /*
279  * Find the callout list that corresponds to an expiration.
280  * If the callout list is null, free it. Else, return it.
281  */
282 static callout_list_t *
283 callout_list_check(callout_table_t *ct, hrtime_t expiration, int hash)
284 {
285 	callout_list_t *cl;
286 
287 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
288 
289 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
290 		if (cl->cl_expiration == expiration) {
291 			if (cl->cl_callouts.ch_head != NULL) {
292 				/*
293 				 * Found a match.
294 				 */
295 				return (cl);
296 			}
297 
298 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
299 			cl->cl_next = ct->ct_lfree;
300 			ct->ct_lfree = cl;
301 
302 			return (NULL);
303 		}
304 	}
305 
306 	return (NULL);
307 }
308 /*
309  * Initialize a callout table's heap, if necessary. Preallocate some free
310  * entries so we don't have to check for NULL elsewhere.
311  */
312 static void
313 callout_heap_init(callout_table_t *ct)
314 {
315 	size_t size;
316 
317 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
318 	ASSERT(ct->ct_heap == NULL);
319 
320 	ct->ct_heap_num = 0;
321 	ct->ct_heap_max = CALLOUT_CHUNK;
322 	size = sizeof (hrtime_t) * CALLOUT_CHUNK;
323 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
324 }
325 
326 /*
327  * Reallocate the heap. We try quite hard because we can't sleep, and if
328  * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
329  * allocation. Note that the heap only expands, it never contracts.
330  */
331 static void
332 callout_heap_expand(callout_table_t *ct)
333 {
334 	size_t max, size, osize;
335 	hrtime_t *heap;
336 
337 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
338 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
339 
340 	while (ct->ct_heap_num == ct->ct_heap_max) {
341 		max = ct->ct_heap_max;
342 		mutex_exit(&ct->ct_mutex);
343 
344 		osize = sizeof (hrtime_t) * max;
345 		size = sizeof (hrtime_t) * (max + CALLOUT_CHUNK);
346 		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
347 
348 		mutex_enter(&ct->ct_mutex);
349 		if (max < ct->ct_heap_max) {
350 			/*
351 			 * Someone beat us to the allocation. Free what we
352 			 * just allocated and proceed.
353 			 */
354 			kmem_free(heap, size);
355 			continue;
356 		}
357 
358 		bcopy(ct->ct_heap, heap, osize);
359 		kmem_free(ct->ct_heap, osize);
360 		ct->ct_heap = heap;
361 		ct->ct_heap_max = size / sizeof (hrtime_t);
362 	}
363 }
364 
365 /*
366  * Move an expiration from the bottom of the heap to its correct place
367  * in the heap. If we reached the root doing this, return 1. Else,
368  * return 0.
369  */
370 static int
371 callout_upheap(callout_table_t *ct)
372 {
373 	int current, parent;
374 	hrtime_t *heap, current_expiration, parent_expiration;
375 
376 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
377 	ASSERT(ct->ct_heap_num >= 1);
378 
379 	if (ct->ct_heap_num == 1) {
380 		return (1);
381 	}
382 
383 	heap = ct->ct_heap;
384 	current = ct->ct_heap_num - 1;
385 
386 	for (;;) {
387 		parent = CALLOUT_HEAP_PARENT(current);
388 		current_expiration = heap[current];
389 		parent_expiration = heap[parent];
390 
391 		/*
392 		 * We have an expiration later than our parent; we're done.
393 		 */
394 		if (current_expiration >= parent_expiration) {
395 			return (0);
396 		}
397 
398 		/*
399 		 * We need to swap with our parent, and continue up the heap.
400 		 */
401 		heap[parent] = current_expiration;
402 		heap[current] = parent_expiration;
403 
404 		/*
405 		 * If we just reached the root, we're done.
406 		 */
407 		if (parent == 0) {
408 			return (1);
409 		}
410 
411 		current = parent;
412 	}
413 	/*NOTREACHED*/
414 }
415 
416 /*
417  * Insert a new expiration into a callout table's heap.
418  */
419 static void
420 callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
421 {
422 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
423 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
424 
425 	/*
426 	 * First, copy the expiration to the bottom of the heap.
427 	 */
428 	ct->ct_heap[ct->ct_heap_num] = expiration;
429 	ct->ct_heap_num++;
430 
431 	/*
432 	 * Now, perform an upheap operation. If we reached the root, then
433 	 * the cyclic needs to be reprogrammed as we have an earlier
434 	 * expiration.
435 	 *
436 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
437 	 * We don't want any callout activity. When the CPR resume phase is
438 	 * entered, the cyclic will be programmed for the earliest expiration
439 	 * in the heap.
440 	 */
441 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
442 		(void) cyclic_reprogram(ct->ct_cyclic, expiration);
443 }
444 
445 /*
446  * Move an expiration from the top of the heap to its correct place
447  * in the heap.
448  */
449 static void
450 callout_downheap(callout_table_t *ct)
451 {
452 	int left, right, current, nelems;
453 	hrtime_t *heap, left_expiration, right_expiration, current_expiration;
454 
455 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
456 	ASSERT(ct->ct_heap_num >= 1);
457 
458 	heap = ct->ct_heap;
459 	current = 0;
460 	nelems = ct->ct_heap_num;
461 
462 	for (;;) {
463 		/*
464 		 * If we don't have a left child (i.e., we're a leaf), we're
465 		 * done.
466 		 */
467 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
468 			return;
469 
470 		left_expiration = heap[left];
471 		current_expiration = heap[current];
472 
473 		right = CALLOUT_HEAP_RIGHT(current);
474 
475 		/*
476 		 * Even if we don't have a right child, we still need to compare
477 		 * our expiration against that of our left child.
478 		 */
479 		if (right >= nelems)
480 			goto comp_left;
481 
482 		right_expiration = heap[right];
483 
484 		/*
485 		 * We have both a left and a right child.  We need to compare
486 		 * the expiration of the children to determine which
487 		 * expires earlier.
488 		 */
489 		if (right_expiration < left_expiration) {
490 			/*
491 			 * Our right child is the earlier of our children.
492 			 * We'll now compare our expiration to its expiration.
493 			 * If ours is the earlier one, we're done.
494 			 */
495 			if (current_expiration <= right_expiration)
496 				return;
497 
498 			/*
499 			 * Our right child expires earlier than we do; swap
500 			 * with our right child, and descend right.
501 			 */
502 			heap[right] = current_expiration;
503 			heap[current] = right_expiration;
504 			current = right;
505 			continue;
506 		}
507 
508 comp_left:
509 		/*
510 		 * Our left child is the earlier of our children (or we have
511 		 * no right child).  We'll now compare our expiration
512 		 * to its expiration. If ours is the earlier one, we're done.
513 		 */
514 		if (current_expiration <= left_expiration)
515 			return;
516 
517 		/*
518 		 * Our left child expires earlier than we do; swap with our
519 		 * left child, and descend left.
520 		 */
521 		heap[left] = current_expiration;
522 		heap[current] = left_expiration;
523 		current = left;
524 	}
525 }
526 
527 /*
528  * Delete and handle all past expirations in a callout table's heap.
529  */
530 static void
531 callout_heap_delete(callout_table_t *ct)
532 {
533 	hrtime_t now, expiration;
534 	callout_list_t *cl;
535 	int hash;
536 
537 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
538 
539 	now = gethrtime();
540 
541 	while (ct->ct_heap_num > 0) {
542 		expiration = ct->ct_heap[0];
543 		/*
544 		 * Find the callout list that corresponds to the expiration.
545 		 * If the callout list is empty, callout_list_check()
546 		 * will free the callout list and return NULL.
547 		 */
548 		hash = CALLOUT_CLHASH(expiration);
549 		cl = callout_list_check(ct, expiration, hash);
550 		if (cl != NULL) {
551 			/*
552 			 * If the root of the heap expires in the future, we are
553 			 * done. We are doing this check here instead of at the
554 			 * beginning because we want to first free all the
555 			 * empty callout lists at the top of the heap.
556 			 */
557 			if (expiration > now)
558 				break;
559 
560 			/*
561 			 * Move the callout list for this expiration to the
562 			 * list of expired callout lists. It will be processed
563 			 * by the callout executor.
564 			 */
565 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
566 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
567 		}
568 
569 		/*
570 		 * Now delete the root. This is done by swapping the root with
571 		 * the last item in the heap and downheaping the item.
572 		 */
573 		ct->ct_heap_num--;
574 		if (ct->ct_heap_num > 0) {
575 			ct->ct_heap[0] = ct->ct_heap[ct->ct_heap_num];
576 			callout_downheap(ct);
577 		}
578 	}
579 
580 	/*
581 	 * If this callout table is empty or callouts have been suspended
582 	 * by CPR, just return. The cyclic has already been programmed to
583 	 * infinity by the cyclic subsystem.
584 	 */
585 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
586 		return;
587 
588 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
589 }
590 
/*
 * Common function used to create normal and realtime callouts.
 *
 * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
 * there is one restriction on a realtime callout handler - it should not
 * directly or indirectly acquire cpu_lock. CPU offline waits for pending
 * cyclic handlers to complete while holding cpu_lock. So, if a realtime
 * callout handler were to try to get cpu_lock, there would be a deadlock
 * during CPU offline.
 *
 * Arguments:
 *
 *	type		callout type; selects the callout table of the
 *			current CPU (or the boot table of that type if the
 *			CPU's table has no cyclic yet)
 *	func, arg	handler to run at expiration and its argument
 *	expiration	absolute hrtime if CALLOUT_FLAG_ABSOLUTE is set in
 *			flags, otherwise an interval relative to the
 *			current gethrtime()
 *	resolution	the expiration is aligned to a multiple of this
 *			value (rounded down, or up with
 *			CALLOUT_FLAG_ROUNDUP); values below
 *			callout_min_resolution are raised to it
 *	flags		CALLOUT_FLAG_* bits; CALLOUT_FLAG_32BIT selects the
 *			short-term/long-term ID sequences instead of the
 *			generation-based one
 *
 * Returns the ID stored in the new callout's c_xid.
 */
callout_id_t
timeout_generic(int type, void (*func)(void *), void *arg,
	hrtime_t expiration, hrtime_t resolution, int flags)
{
	callout_table_t *ct;
	callout_t *cp;
	callout_id_t id;
	callout_list_t *cl;
	hrtime_t now, interval;
	int hash;

	ASSERT(resolution > 0);
	ASSERT(func != NULL);

	/*
	 * Please see comment about minimum resolution in callout_init().
	 */
	if (resolution < callout_min_resolution)
		resolution = callout_min_resolution;

	/*
	 * We disable kernel preemption so that we remain on the same CPU
	 * throughout. If we needed to reprogram the callout table's cyclic,
	 * we can avoid X-calls if we are on the same CPU.
	 *
	 * Note that callout_alloc() releases and reacquires the callout
	 * table mutex. While reacquiring the mutex, it is possible for us
	 * to go to sleep and later migrate to another CPU. This should be
	 * pretty rare, though.
	 */
	kpreempt_disable();

	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
	mutex_enter(&ct->ct_mutex);

	if (ct->ct_cyclic == CYCLIC_NONE) {
		mutex_exit(&ct->ct_mutex);
		/*
		 * The callout table has not yet been initialized fully.
		 * So, put this one on the boot callout table which is
		 * always initialized.
		 */
		ct = &callout_boot_ct[type];
		mutex_enter(&ct->ct_mutex);
	}

	/*
	 * Take a callout from the table's free list (linked through
	 * c_idnext); allocate a new one only if the free list is empty.
	 */
	if ((cp = ct->ct_free) == NULL)
		cp = callout_alloc(ct);
	else
		ct->ct_free = cp->c_idnext;

	cp->c_func = func;
	cp->c_arg = arg;

	/*
	 * Compute the expiration hrtime. "interval" is the distance from
	 * now; it is only used below to choose between the short-term and
	 * long-term ID sequences.
	 */
	now = gethrtime();
	if (flags & CALLOUT_FLAG_ABSOLUTE) {
		interval = expiration - now;
	} else {
		interval = expiration;
		expiration += now;
	}
	if (flags & CALLOUT_FLAG_ROUNDUP)
		expiration += resolution - 1;
	/* Align the expiration down to a multiple of the resolution. */
	expiration = (expiration / resolution) * resolution;
	if (expiration <= 0) {
		/*
		 * expiration hrtime overflow has occurred. Just set the
		 * expiration to infinity.
		 *
		 * NOTE(review): this detection relies on the additions
		 * above wrapping to a non-positive value; signed overflow
		 * is undefined in ISO C, so confirm the build guarantees
		 * wrapping semantics.
		 */
		expiration = CY_INFINITY;
	}

	/*
	 * Assign an ID to this callout
	 */
	if (flags & CALLOUT_FLAG_32BIT) {
		/*
		 * Separate ID sequences are kept for long-term and
		 * short-term callouts. The next ID is the previous one of
		 * that sequence minus callout_counter_low, with
		 * CALLOUT_COUNTER_HIGH forced on.
		 */
		if (interval > callout_longterm) {
			id = (ct->ct_long_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_long_id = id;
		} else {
			id = (ct->ct_short_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_short_id = id;
		}
	} else {
		/*
		 * Generation-based ID. When the subtraction clears
		 * CALLOUT_COUNTER_HIGH, the bit is set again and
		 * CALLOUT_GENERATION_LOW is added to the ID.
		 */
		id = (ct->ct_gen_id - callout_counter_low);
		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
			id |= CALLOUT_COUNTER_HIGH;
			id += CALLOUT_GENERATION_LOW;
		}
		ct->ct_gen_id = id;
	}

	cp->c_xid = id;

	/*
	 * Only the CALLOUT_LIST_FLAGS bits take part in matching a callout
	 * list.
	 */
	flags &= CALLOUT_LIST_FLAGS;
	hash = CALLOUT_CLHASH(expiration);

again:
	/*
	 * Try to see if a callout list already exists for this expiration.
	 * Most of the time, this will be the case.
	 */
	cl = callout_list_get(ct, expiration, flags, hash);
	if (cl == NULL) {
		/*
		 * Check if we have enough space in the heap to insert one
		 * expiration. If not, expand the heap.
		 */
		if (ct->ct_heap_num == ct->ct_heap_max) {
			callout_heap_expand(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * So, the best course is to repeat the steps. This
			 * should be an infrequent event.
			 */
			goto again;
		}

		/*
		 * Check the free list. If we don't find one, we have to
		 * take the slow path and allocate from kmem.
		 */
		if ((cl = ct->ct_lfree) == NULL) {
			callout_list_alloc(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * Plus, the heap could have become full. So, the best
			 * course is to repeat the steps. This should be an
			 * infrequent event.
			 */
			goto again;
		}
		ct->ct_lfree = cl->cl_next;
		cl->cl_expiration = expiration;
		cl->cl_flags = flags;

		CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);

		/*
		 * This is a new expiration. So, insert it into the heap.
		 * This will also reprogram the cyclic, if the expiration
		 * propagated to the root of the heap.
		 */
		callout_heap_insert(ct, expiration);
	}
	/*
	 * Queue the callout on its callout list and in the ID hash table.
	 */
	cp->c_list = cl;
	CALLOUT_APPEND(ct, cp);

	ct->ct_timeouts++;
	ct->ct_timeouts_pending++;

	mutex_exit(&ct->ct_mutex);

	kpreempt_enable();

	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
	    cp);

	return (id);
}
773 
774 timeout_id_t
775 timeout(void (*func)(void *), void *arg, clock_t delta)
776 {
777 	ulong_t id;
778 
779 	/*
780 	 * Make sure the callout runs at least 1 tick in the future.
781 	 */
782 	if (delta <= 0)
783 		delta = 1;
784 	else if (delta > callout_max_ticks)
785 		delta = callout_max_ticks;
786 
787 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
788 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
789 
790 	return ((timeout_id_t)id);
791 }
792 
793 /*
794  * Convenience function that creates a normal callout with default parameters
795  * and returns a full ID.
796  */
797 callout_id_t
798 timeout_default(void (*func)(void *), void *arg, clock_t delta)
799 {
800 	callout_id_t id;
801 
802 	/*
803 	 * Make sure the callout runs at least 1 tick in the future.
804 	 */
805 	if (delta <= 0)
806 		delta = 1;
807 	else if (delta > callout_max_ticks)
808 		delta = callout_max_ticks;
809 
810 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
811 	    nsec_per_tick, 0);
812 
813 	return (id);
814 }
815 
816 timeout_id_t
817 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
818 {
819 	ulong_t id;
820 
821 	/*
822 	 * Make sure the callout runs at least 1 tick in the future.
823 	 */
824 	if (delta <= 0)
825 		delta = 1;
826 	else if (delta > callout_max_ticks)
827 		delta = callout_max_ticks;
828 
829 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
830 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
831 
832 	return ((timeout_id_t)id);
833 }
834 
835 /*
836  * Convenience function that creates a realtime callout with default parameters
837  * and returns a full ID.
838  */
839 callout_id_t
840 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
841 {
842 	callout_id_t id;
843 
844 	/*
845 	 * Make sure the callout runs at least 1 tick in the future.
846 	 */
847 	if (delta <= 0)
848 		delta = 1;
849 	else if (delta > callout_max_ticks)
850 		delta = callout_max_ticks;
851 
852 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
853 	    nsec_per_tick, 0);
854 
855 	return (id);
856 }
857 
/*
 * Cancel the callout with the given ID.
 *
 *	id	callout ID as returned when the callout was created
 *	nowait	if non-zero, do not wait for a handler that is currently
 *		executing
 *
 * Returns the hrtime that was left until expiration (never negative; 0 if
 * the expiration had already passed) when the callout was removed before
 * its handler started. Returns -1 when the handler is currently executing
 * (after waiting for it to finish unless nowait is set or the caller is
 * the executing thread itself) and when no callout with this ID is found.
 * Panics if an ID that is not found fails the sanity check at the end.
 */
hrtime_t
untimeout_generic(callout_id_t id, int nowait)
{
	callout_table_t *ct;
	callout_t *cp;
	callout_id_t xid;
	int hash;
	callout_id_t bogus;

	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
	hash = CALLOUT_IDHASH(id);

	mutex_enter(&ct->ct_mutex);

	/*
	 * Search the ID hash table for the callout.
	 */
	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {

		xid = cp->c_xid;

		/*
		 * Match the ID and generation number.
		 */
		if ((xid & CALLOUT_ID_MASK) != id)
			continue;

		if ((xid & CALLOUT_EXECUTING) == 0) {
			hrtime_t expiration;

			/*
			 * Delete the callout. If the callout list becomes
			 * NULL, we don't remove it from the table. This is
			 * so it can be reused. If the empty callout list
			 * corresponds to the top of the callout heap, we
			 * don't reprogram the table cyclic here. This is in
			 * order to avoid lots of X-calls to the CPU associated
			 * with the callout table.
			 */
			expiration = cp->c_list->cl_expiration;
			CALLOUT_DELETE(ct, cp);
			/* Return the callout to the table's free list. */
			cp->c_idnext = ct->ct_free;
			ct->ct_free = cp;
			ct->ct_untimeouts_unexpired++;
			ct->ct_timeouts_pending--;
			mutex_exit(&ct->ct_mutex);

			/* Convert the expiration into the time remaining. */
			expiration -= gethrtime();
			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
			    "untimeout:ID %lx hrtime left %llx", id,
			    expiration);
			return (expiration < 0 ? 0 : expiration);
		}

		ct->ct_untimeouts_executing++;
		/*
		 * The callout we want to delete is currently executing.
		 * The DDI states that we must wait until the callout
		 * completes before returning, so we block on c_done until the
		 * callout ID changes (to the old ID if it's on the freelist,
		 * or to a new callout ID if it's in use).  This implicitly
		 * assumes that callout structures are persistent (they are).
		 */
		if (cp->c_executor == curthread) {
			/*
			 * The timeout handler called untimeout() on itself.
			 * Stupid, but legal.  We can't wait for the timeout
			 * to complete without deadlocking, so we just return.
			 */
			mutex_exit(&ct->ct_mutex);
			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
			    "untimeout_self:ID %x", id);
			return (-1);
		}
		if (nowait == 0) {
			/*
			 * We need to wait. Indicate that we are waiting by
			 * setting c_waiting. This prevents the executor
			 * from doing a wakeup on c_done if there are no
			 * waiters.
			 */
			while (cp->c_xid == xid) {
				cp->c_waiting = 1;
				cv_wait(&cp->c_done, &ct->ct_mutex);
			}
		}
		mutex_exit(&ct->ct_mutex);
		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
		    "untimeout_executing:ID %lx", id);
		return (-1);
	}
	ct->ct_untimeouts_expired++;

	mutex_exit(&ct->ct_mutex);
	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
	    "untimeout_bogus_id:ID %lx", id);

	/*
	 * We didn't find the specified callout ID.  This means either
	 * (1) the callout already fired, or (2) the caller passed us
	 * a bogus value.  Perform a sanity check to detect case (2).
	 *
	 * A non-zero ID is accepted only if it has CALLOUT_COUNTER_HIGH set
	 * and CALLOUT_EXECUTING clear; an ID of 0 is always tolerated.
	 */
	bogus = (CALLOUT_EXECUTING | CALLOUT_COUNTER_HIGH);
	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
		panic("untimeout: impossible timeout id %llx",
		    (unsigned long long)id);

	return (-1);
}
967 
968 clock_t
969 untimeout(timeout_id_t id_arg)
970 {
971 	hrtime_t hleft;
972 	clock_t tleft;
973 	callout_id_t id;
974 
975 	id = (ulong_t)id_arg;
976 	hleft = untimeout_generic(id, 0);
977 	if (hleft < 0)
978 		tleft = -1;
979 	else if (hleft == 0)
980 		tleft = 0;
981 	else
982 		tleft = NSEC_TO_TICK(hleft);
983 
984 	return (tleft);
985 }
986 
987 /*
988  * Convenience function to untimeout a timeout with a full ID with default
989  * parameters.
990  */
991 clock_t
992 untimeout_default(callout_id_t id, int nowait)
993 {
994 	hrtime_t hleft;
995 	clock_t tleft;
996 
997 	hleft = untimeout_generic(id, nowait);
998 	if (hleft < 0)
999 		tleft = -1;
1000 	else if (hleft == 0)
1001 		tleft = 0;
1002 	else
1003 		tleft = NSEC_TO_TICK(hleft);
1004 
1005 	return (tleft);
1006 }
1007 
/*
 * Expire all the callouts queued in the specified callout list.
 *
 * The caller must hold ct->ct_mutex. The mutex is dropped around each
 * handler invocation and held again on return. Callouts that another
 * executor has already marked CALLOUT_EXECUTING are skipped and stay on
 * the list; every callout run here is removed from the list and the ID
 * hash table and returned to the table's free list.
 */
static void
callout_list_expire(callout_table_t *ct, callout_list_t *cl)
{
	callout_t *cp, *cnext;

	ASSERT(MUTEX_HELD(&ct->ct_mutex));
	ASSERT(cl != NULL);

	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
		/*
		 * Multiple executor threads could be running at the same
		 * time. If this callout is already being executed,
		 * go on to the next one.
		 */
		if (cp->c_xid & CALLOUT_EXECUTING) {
			cnext = cp->c_clnext;
			continue;
		}

		/*
		 * Indicate to untimeout() that a callout is
		 * being expired by the executor.
		 */
		cp->c_xid |= CALLOUT_EXECUTING;
		cp->c_executor = curthread;
		mutex_exit(&ct->ct_mutex);

		/*
		 * Run the handler without holding the table lock.
		 */
		DTRACE_PROBE1(callout__start, callout_t *, cp);
		(*cp->c_func)(cp->c_arg);
		DTRACE_PROBE1(callout__end, callout_t *, cp);

		mutex_enter(&ct->ct_mutex);

		ct->ct_expirations++;
		ct->ct_timeouts_pending--;
		/*
		 * Indicate completion for c_done.
		 */
		cp->c_xid &= ~CALLOUT_EXECUTING;
		cp->c_executor = NULL;
		/*
		 * Pick up the successor only now: the list may have changed
		 * while the lock was dropped, and cp is about to be unlinked.
		 */
		cnext = cp->c_clnext;

		/*
		 * Delete callout from ID hash table and the callout
		 * list, return to freelist, and tell any untimeout() that
		 * cares that we're done.
		 */
		CALLOUT_DELETE(ct, cp);
		cp->c_idnext = ct->ct_free;
		ct->ct_free = cp;

		if (cp->c_waiting) {
			cp->c_waiting = 0;
			cv_broadcast(&cp->c_done);
		}
	}
}
1068 
1069 /*
1070  * Execute all expired callout lists for a callout table.
1071  */
1072 static void
1073 callout_expire(callout_table_t *ct)
1074 {
1075 	callout_list_t *cl, *clnext;
1076 
1077 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1078 
1079 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1080 		/*
1081 		 * Expire all the callouts in this callout list.
1082 		 */
1083 		callout_list_expire(ct, cl);
1084 
1085 		clnext = cl->cl_next;
1086 		if (cl->cl_callouts.ch_head == NULL) {
1087 			/*
1088 			 * Free the callout list.
1089 			 */
1090 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1091 			cl->cl_next = ct->ct_lfree;
1092 			ct->ct_lfree = cl;
1093 		}
1094 	}
1095 }
1096 
1097 /*
1098  * The cyclic handlers below process callouts in two steps:
1099  *
1100  *	1. Find all expired callout lists and queue them in a separate
1101  *	   list of expired callouts.
1102  *	2. Execute the expired callout lists.
1103  *
1104  * This is done for two reasons:
1105  *
1106  *	1. We want to quickly find the next earliest expiration to program
1107  *	   the cyclic to and reprogram it. We can do this right at the end
1108  *	   of step 1.
1109  *	2. The realtime cyclic handler expires callouts in place. However,
1110  *	   for normal callouts, callouts are expired by a taskq thread.
1111  *	   So, it is simpler and more robust to have the taskq thread just
1112  *	   do step 2.
1113  */
1114 
1115 /*
1116  * Realtime callout cyclic handler.
1117  */
1118 void
1119 callout_realtime(callout_table_t *ct)
1120 {
1121 	mutex_enter(&ct->ct_mutex);
1122 	callout_heap_delete(ct);
1123 	callout_expire(ct);
1124 	mutex_exit(&ct->ct_mutex);
1125 }
1126 
1127 void
1128 callout_execute(callout_table_t *ct)
1129 {
1130 	mutex_enter(&ct->ct_mutex);
1131 	callout_expire(ct);
1132 	mutex_exit(&ct->ct_mutex);
1133 }
1134 
1135 /*
1136  * Normal callout cyclic handler.
1137  */
1138 void
1139 callout_normal(callout_table_t *ct)
1140 {
1141 	int i, exec;
1142 
1143 	mutex_enter(&ct->ct_mutex);
1144 	callout_heap_delete(ct);
1145 	CALLOUT_EXEC_COMPUTE(ct, exec);
1146 	mutex_exit(&ct->ct_mutex);
1147 
1148 	for (i = 0; i < exec; i++) {
1149 		ASSERT(ct->ct_taskq != NULL);
1150 		(void) taskq_dispatch(ct->ct_taskq,
1151 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1152 	}
1153 }
1154 
1155 /*
1156  * Suspend callout processing.
1157  */
1158 static void
1159 callout_suspend(void)
1160 {
1161 	int t, f;
1162 	callout_table_t *ct;
1163 
1164 	/*
1165 	 * Traverse every callout table in the system and suspend callout
1166 	 * processing.
1167 	 *
1168 	 * We need to suspend all the tables (including the inactive ones)
1169 	 * so that if a table is made active while the suspend is still on,
1170 	 * the table remains suspended.
1171 	 */
1172 	for (f = 0; f < max_ncpus; f++) {
1173 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1174 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1175 
1176 			mutex_enter(&ct->ct_mutex);
1177 			ct->ct_suspend++;
1178 			if (ct->ct_cyclic == CYCLIC_NONE) {
1179 				mutex_exit(&ct->ct_mutex);
1180 				continue;
1181 			}
1182 			if (ct->ct_suspend == 1)
1183 				(void) cyclic_reprogram(ct->ct_cyclic,
1184 				    CY_INFINITY);
1185 			mutex_exit(&ct->ct_mutex);
1186 		}
1187 	}
1188 }
1189 
1190 static void
1191 callout_adjust(callout_table_t *ct, hrtime_t delta)
1192 {
1193 	int hash, newhash;
1194 	hrtime_t expiration;
1195 	callout_list_t *cl;
1196 	callout_hash_t list;
1197 
1198 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1199 
1200 	/*
1201 	 * In order to adjust the expirations, we null out the heap. Then,
1202 	 * we reinsert adjusted expirations in the heap. Keeps it simple.
1203 	 * Note that since the CALLOUT_TABLE_SUSPENDED flag is set by the
1204 	 * caller, the heap insert does not result in cyclic reprogramming.
1205 	 */
1206 	ct->ct_heap_num = 0;
1207 
1208 	/*
1209 	 * First, remove all the callout lists from the table and string them
1210 	 * in a list.
1211 	 */
1212 	list.ch_head = list.ch_tail = NULL;
1213 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1214 		while ((cl = ct->ct_clhash[hash].ch_head) != NULL) {
1215 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
1216 			CALLOUT_LIST_APPEND(list, cl);
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * Now, traverse the callout lists and adjust their expirations.
1222 	 */
1223 	while ((cl = list.ch_head) != NULL) {
1224 		CALLOUT_LIST_DELETE(list, cl);
1225 		/*
1226 		 * Set the new expiration and reinsert in the right
1227 		 * hash bucket.
1228 		 */
1229 		expiration = cl->cl_expiration;
1230 		expiration += delta;
1231 		cl->cl_expiration = expiration;
1232 		newhash = CALLOUT_CLHASH(expiration);
1233 		CALLOUT_LIST_INSERT(ct->ct_clhash[newhash], cl);
1234 		callout_heap_insert(ct, expiration);
1235 	}
1236 }
1237 
1238 /*
1239  * Resume callout processing.
1240  */
1241 static void
1242 callout_resume(hrtime_t delta)
1243 {
1244 	hrtime_t exp;
1245 	int t, f;
1246 	callout_table_t *ct;
1247 
1248 	/*
1249 	 * Traverse every callout table in the system and resume callout
1250 	 * processing. For active tables, perform any hrtime adjustments
1251 	 * necessary.
1252 	 */
1253 	for (f = 0; f < max_ncpus; f++) {
1254 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1255 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1256 
1257 			mutex_enter(&ct->ct_mutex);
1258 			if (ct->ct_cyclic == CYCLIC_NONE) {
1259 				ct->ct_suspend--;
1260 				mutex_exit(&ct->ct_mutex);
1261 				continue;
1262 			}
1263 
1264 			if (delta)
1265 				callout_adjust(ct, delta);
1266 
1267 			ct->ct_suspend--;
1268 			if (ct->ct_suspend == 0) {
1269 				/*
1270 				 * If the expired list is non-empty, then have
1271 				 * the cyclic expire immediately. Else, program
1272 				 * the cyclic based on the heap.
1273 				 */
1274 				if (ct->ct_expired.ch_head != NULL)
1275 					exp = gethrtime();
1276 				else if (ct->ct_heap_num > 0)
1277 					exp = ct->ct_heap[0];
1278 				else
1279 					exp = 0;
1280 				if (exp != 0)
1281 					(void) cyclic_reprogram(ct->ct_cyclic,
1282 					    exp);
1283 			}
1284 			mutex_exit(&ct->ct_mutex);
1285 		}
1286 	}
1287 }
1288 
1289 /*
1290  * Callback handler used by CPR to stop and resume callouts.
1291  */
1292 /*ARGSUSED*/
1293 static boolean_t
1294 callout_cpr_callb(void *arg, int code)
1295 {
1296 	if (code == CB_CODE_CPR_CHKPT)
1297 		callout_suspend();
1298 	else
1299 		callout_resume(0);
1300 
1301 	return (B_TRUE);
1302 }
1303 
1304 /*
1305  * Callback handler invoked when the debugger is entered or exited.
1306  */
1307 /*ARGSUSED*/
1308 static boolean_t
1309 callout_debug_callb(void *arg, int code)
1310 {
1311 	hrtime_t delta;
1312 
1313 	/*
1314 	 * When the system enters the debugger. make a note of the hrtime.
1315 	 * When it is resumed, compute how long the system was in the
1316 	 * debugger. This interval should not be counted for callouts.
1317 	 */
1318 	if (code == 0) {
1319 		callout_suspend();
1320 		callout_debug_hrtime = gethrtime();
1321 	} else {
1322 		delta = gethrtime() - callout_debug_hrtime;
1323 		callout_resume(delta);
1324 	}
1325 
1326 	return (B_TRUE);
1327 }
1328 
1329 /*
1330  * Move the absolute hrestime callouts to the expired list. Then program the
1331  * table's cyclic to expire immediately so that the callouts can be executed
1332  * immediately.
1333  */
1334 static void
1335 callout_hrestime_one(callout_table_t *ct)
1336 {
1337 	callout_list_t *cl, *clnext;
1338 	int hash, flags;
1339 
1340 	mutex_enter(&ct->ct_mutex);
1341 	if (ct->ct_heap_num == 0) {
1342 		mutex_exit(&ct->ct_mutex);
1343 		return;
1344 	}
1345 
1346 	flags = CALLOUT_LIST_FLAGS;
1347 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1348 		for (cl = ct->ct_clhash[hash].ch_head; cl; cl = clnext) {
1349 			clnext = cl->cl_next;
1350 			if (cl->cl_flags == flags) {
1351 				CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
1352 				CALLOUT_LIST_APPEND(ct->ct_expired, cl);
1353 			}
1354 		}
1355 	}
1356 
1357 	if ((ct->ct_expired.ch_head != NULL) && (ct->ct_suspend == 0))
1358 		(void) cyclic_reprogram(ct->ct_cyclic, gethrtime());
1359 
1360 	mutex_exit(&ct->ct_mutex);
1361 }
1362 
1363 /*
1364  * This function is called whenever system time (hrestime) is changed
1365  * explicitly. All the HRESTIME callouts must be expired at once.
1366  */
1367 /*ARGSUSED*/
1368 void
1369 callout_hrestime(void)
1370 {
1371 	int t, f;
1372 	callout_table_t *ct;
1373 
1374 	/*
1375 	 * Traverse every callout table in the system and process the hrestime
1376 	 * callouts therein.
1377 	 *
1378 	 * We look at all the tables because we don't know which ones were
1379 	 * onlined and offlined in the past. The offlined tables may still
1380 	 * have active cyclics processing timers somewhere.
1381 	 */
1382 	for (f = 0; f < max_ncpus; f++) {
1383 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1384 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1385 			callout_hrestime_one(ct);
1386 		}
1387 	}
1388 }
1389 
1390 /*
1391  * Create the hash tables for this callout table.
1392  */
1393 static void
1394 callout_hash_init(callout_table_t *ct)
1395 {
1396 	size_t size;
1397 
1398 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1399 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1400 
1401 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1402 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1403 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1404 }
1405 
1406 /*
1407  * Create per-callout table kstats.
1408  */
1409 static void
1410 callout_kstat_init(callout_table_t *ct)
1411 {
1412 	callout_stat_type_t stat;
1413 	kstat_t *ct_kstats;
1414 	int ndx;
1415 
1416 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1417 	ASSERT(ct->ct_kstats == NULL);
1418 
1419 	ndx = ct - callout_table;
1420 	ct_kstats = kstat_create("unix", ndx, "callout",
1421 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1422 
1423 	if (ct_kstats == NULL) {
1424 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1425 		    (void *)ct);
1426 	} else {
1427 		ct_kstats->ks_data = ct->ct_kstat_data;
1428 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1429 			kstat_named_init(&ct->ct_kstat_data[stat],
1430 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1431 		ct->ct_kstats = ct_kstats;
1432 		kstat_install(ct_kstats);
1433 	}
1434 }
1435 
1436 static void
1437 callout_cyclic_init(callout_table_t *ct)
1438 {
1439 	cyc_handler_t hdlr;
1440 	cyc_time_t when;
1441 	processorid_t seqid;
1442 	int t;
1443 
1444 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1445 
1446 	t = CALLOUT_TABLE_TYPE(ct);
1447 	seqid = CALLOUT_TABLE_SEQID(ct);
1448 
1449 	/*
1450 	 * Create the taskq thread if the table type is normal.
1451 	 * Realtime tables are handled at PIL1 by a softint
1452 	 * handler.
1453 	 */
1454 	if (t == CALLOUT_NORMAL) {
1455 		ASSERT(ct->ct_taskq == NULL);
1456 		/*
1457 		 * Each callout thread consumes exactly one
1458 		 * task structure while active.  Therefore,
1459 		 * prepopulating with 2 * CALLOUT_THREADS tasks
1460 		 * ensures that there's at least one task per
1461 		 * thread that's either scheduled or on the
1462 		 * freelist.  In turn, this guarantees that
1463 		 * taskq_dispatch() will always either succeed
1464 		 * (because there's a free task structure) or
1465 		 * be unnecessary (because "callout_excute(ct)"
1466 		 * has already scheduled).
1467 		 */
1468 		ct->ct_taskq =
1469 		    taskq_create_instance("callout_taskq", seqid,
1470 		    CALLOUT_THREADS, maxclsyspri,
1471 		    2 * CALLOUT_THREADS, 2 * CALLOUT_THREADS,
1472 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1473 	}
1474 
1475 	/*
1476 	 * callouts can only be created in a table whose
1477 	 * cyclic has been initialized.
1478 	 */
1479 	ASSERT(ct->ct_heap_num == 0);
1480 
1481 	/*
1482 	 * Create the callout table cyclics.
1483 	 *
1484 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
1485 	 * handler executes at lock PIL. This is because there are cases
1486 	 * where code can block at PIL > 1 waiting for a normal callout handler
1487 	 * to unblock it directly or indirectly. If the normal cyclic were to
1488 	 * be executed at low PIL, it could get blocked out by the waiter
1489 	 * and cause a deadlock.
1490 	 */
1491 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1492 
1493 	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
1494 	if (ct->ct_type == CALLOUT_REALTIME)
1495 		hdlr.cyh_level = callout_realtime_level;
1496 	else
1497 		hdlr.cyh_level = callout_normal_level;
1498 	hdlr.cyh_arg = ct;
1499 	when.cyt_when = CY_INFINITY;
1500 	when.cyt_interval = CY_INFINITY;
1501 
1502 	ct->ct_cyclic = cyclic_add(&hdlr, &when);
1503 }
1504 
1505 void
1506 callout_cpu_online(cpu_t *cp)
1507 {
1508 	lgrp_handle_t hand;
1509 	callout_cache_t *cache;
1510 	char s[KMEM_CACHE_NAMELEN];
1511 	callout_table_t *ct;
1512 	processorid_t seqid;
1513 	int t;
1514 
1515 	ASSERT(MUTEX_HELD(&cpu_lock));
1516 
1517 	/*
1518 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1519 	 * Note that access to callout_caches is protected by cpu_lock.
1520 	 */
1521 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1522 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1523 		if (cache->cc_hand == hand)
1524 			break;
1525 	}
1526 
1527 	/*
1528 	 * If not found, create one. The caches are never destroyed.
1529 	 */
1530 	if (cache == NULL) {
1531 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1532 		cache->cc_hand = hand;
1533 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1534 		    (long)hand);
1535 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1536 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1537 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1538 		    (long)hand);
1539 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1540 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1541 		cache->cc_next = callout_caches;
1542 		callout_caches = cache;
1543 	}
1544 
1545 	seqid = cp->cpu_seqid;
1546 
1547 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1548 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1549 
1550 		mutex_enter(&ct->ct_mutex);
1551 		/*
1552 		 * Store convinience pointers to the kmem caches
1553 		 * in the callout table. These assignments should always be
1554 		 * done as callout tables can map to different physical
1555 		 * CPUs each time.
1556 		 */
1557 		ct->ct_cache = cache->cc_cache;
1558 		ct->ct_lcache = cache->cc_lcache;
1559 
1560 		/*
1561 		 * We use the heap pointer to check if stuff has been
1562 		 * initialized for this callout table.
1563 		 */
1564 		if (ct->ct_heap == NULL) {
1565 			callout_heap_init(ct);
1566 			callout_hash_init(ct);
1567 			callout_kstat_init(ct);
1568 			callout_cyclic_init(ct);
1569 		}
1570 
1571 		mutex_exit(&ct->ct_mutex);
1572 
1573 		/*
1574 		 * Move the cyclic to this CPU by doing a bind.
1575 		 */
1576 		cyclic_bind(ct->ct_cyclic, cp, NULL);
1577 	}
1578 }
1579 
1580 void
1581 callout_cpu_offline(cpu_t *cp)
1582 {
1583 	callout_table_t *ct;
1584 	processorid_t seqid;
1585 	int t;
1586 
1587 	ASSERT(MUTEX_HELD(&cpu_lock));
1588 
1589 	seqid = cp->cpu_seqid;
1590 
1591 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1592 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1593 
1594 		/*
1595 		 * Unbind the cyclic. This will allow the cyclic subsystem
1596 		 * to juggle the cyclic during CPU offline.
1597 		 */
1598 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
1599 	}
1600 }
1601 
1602 /*
1603  * This is called to perform per-CPU initialization for slave CPUs at
1604  * boot time.
1605  */
1606 void
1607 callout_mp_init(void)
1608 {
1609 	cpu_t *cp;
1610 
1611 	mutex_enter(&cpu_lock);
1612 
1613 	cp = cpu_active;
1614 	do {
1615 		callout_cpu_online(cp);
1616 	} while ((cp = cp->cpu_next_onln) != cpu_active);
1617 
1618 	mutex_exit(&cpu_lock);
1619 }
1620 
1621 /*
1622  * Initialize all callout tables.  Called at boot time just before clkstart().
1623  */
1624 void
1625 callout_init(void)
1626 {
1627 	int f, t;
1628 	size_t size;
1629 	int table_id;
1630 	callout_table_t *ct;
1631 	long bits, fanout;
1632 	uintptr_t buf;
1633 
1634 	/*
1635 	 * Initialize callout globals.
1636 	 */
1637 	bits = 0;
1638 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
1639 		bits++;
1640 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
1641 	callout_table_mask = (1 << callout_table_bits) - 1;
1642 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
1643 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
1644 	callout_max_ticks = CALLOUT_MAX_TICKS;
1645 
1646 	/*
1647 	 * Because of the variability in timing behavior across systems with
1648 	 * different architectures, we cannot allow arbitrarily low
1649 	 * resolutions. The minimum resolution has to be determined in a
1650 	 * platform-specific way. Until then, we define a blanket minimum
1651 	 * resolution for callouts of CALLOUT_MIN_RESOLUTION.
1652 	 *
1653 	 * If, in the future, someone requires lower resolution timers, they
1654 	 * can do one of two things:
1655 	 *
1656 	 *	- Define a lower value for callout_min_resolution. This would
1657 	 *	  affect all clients of the callout subsystem. If this done
1658 	 *	  via /etc/system, then no code changes are required and it
1659 	 *	  would affect only that customer.
1660 	 *
1661 	 *	- Define a flag to be passed to timeout creation that allows
1662 	 *	  the lower resolution. This involves code changes. But it
1663 	 *	  would affect only the calling module. It is the developer's
1664 	 *	  responsibility to test on all systems and make sure that
1665 	 *	  everything works.
1666 	 */
1667 	if (callout_min_resolution <= 0)
1668 		callout_min_resolution = CALLOUT_MIN_RESOLUTION;
1669 
1670 	/*
1671 	 * Allocate all the callout tables based on max_ncpus. We have chosen
1672 	 * to do boot-time allocation instead of dynamic allocation because:
1673 	 *
1674 	 *	- the size of the callout tables is not too large.
1675 	 *	- there are race conditions involved in making this dynamic.
1676 	 *	- the hash tables that go with the callout tables consume
1677 	 *	  most of the memory and they are only allocated in
1678 	 *	  callout_cpu_online().
1679 	 *
1680 	 * Each CPU has two tables that are consecutive in the array. The first
1681 	 * one is for realtime callouts and the second one is for normal ones.
1682 	 *
1683 	 * We do this alignment dance to make sure that callout table
1684 	 * structures will always be on a cache line boundary.
1685 	 */
1686 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
1687 	size += CALLOUT_ALIGN;
1688 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
1689 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
1690 
1691 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
1692 	/*
1693 	 * Now, initialize the tables for all the CPUs.
1694 	 */
1695 	for (f = 0; f < max_ncpus; f++) {
1696 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1697 			table_id = CALLOUT_TABLE(t, f);
1698 			ct = &callout_table[table_id];
1699 			ct->ct_type = t;
1700 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1701 			/*
1702 			 * Precompute the base IDs for long and short-term
1703 			 * legacy IDs. This makes ID generation during
1704 			 * timeout() fast.
1705 			 */
1706 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
1707 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
1708 			/*
1709 			 * Precompute the base ID for generation-based IDs.
1710 			 * Note that when the first ID gets allocated, the
1711 			 * ID will wrap. This will cause the generation
1712 			 * number to be incremented to 1.
1713 			 */
1714 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
1715 			/*
1716 			 * Initialize the cyclic as NONE. This will get set
1717 			 * during CPU online. This is so that partially
1718 			 * populated systems will only have the required
1719 			 * number of cyclics, not more.
1720 			 */
1721 			ct->ct_cyclic = CYCLIC_NONE;
1722 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
1723 		}
1724 	}
1725 
1726 	/*
1727 	 * Add the callback for CPR. This is called during checkpoint
1728 	 * resume to suspend and resume callouts.
1729 	 */
1730 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
1731 	    "callout_cpr");
1732 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
1733 	    "callout_debug");
1734 
1735 	/*
1736 	 * Call the per-CPU initialization function for the boot CPU. This
1737 	 * is done here because the function is not called automatically for
1738 	 * the boot CPU from the CPU online/offline hooks. Note that the
1739 	 * CPU lock is taken here because of convention.
1740 	 */
1741 	mutex_enter(&cpu_lock);
1742 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
1743 	callout_cpu_online(CPU);
1744 	mutex_exit(&cpu_lock);
1745 }
1746