xref: /illumos-gate/usr/src/uts/common/os/callout.c (revision d0f3f37e7f24f68fdbd85386c60e576883622762)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 /*
41  * Callout tables.  See timeout(9F) for details.
42  */
static hrtime_t callout_debug_hrtime;		/* debugger entry time */
/* timeout_generic() raises any smaller requested resolution to this value */
static int callout_min_resolution;		/* Minimum resolution */
/* fallback tables, indexed by type, used when a CPU's table has no cyclic */
static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
/* 32-bit callouts with a longer interval draw IDs from the long-term space */
static hrtime_t callout_longterm;		/* longterm nanoseconds */
/* subtracted from a table's previous ID to produce the next ID */
static ulong_t callout_counter_low;		/* callout ID increment */
static ulong_t callout_table_bits;		/* number of table bits in ID */
static ulong_t callout_table_mask;		/* mask for the table bits */
static callout_cache_t *callout_caches;		/* linked list of caches */
#pragma align 64(callout_table)
/* indexed by CALLOUT_TABLE(type, seqid) and by CALLOUT_ID_TO_TABLE(id) */
static callout_table_t *callout_table;		/* global callout table array */
53 
/*
 * Names of the callout statistics.
 * NOTE(review): presumably consumed by kstat setup code elsewhere in this
 * file, with the order matching the per-table counters (ct_timeouts,
 * ct_timeouts_pending, ...) -- confirm before reordering or adding entries.
 */
static char *callout_kstat_names[] = {
	"callout_timeouts",
	"callout_timeouts_pending",
	"callout_untimeouts_unexpired",
	"callout_untimeouts_executing",
	"callout_untimeouts_expired",
	"callout_expirations",
	"callout_allocations",
};
63 
/*
 * Insert cp at the head of the doubly-linked list anchored at hash.
 * cnext and cprev name the link members of *cp to use, so that a single
 * macro can serve every list a structure is threaded on.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon forms exactly one statement and is safe in an unbraced
 * if/else. cp is parenthesized so that any pointer-valued expression may be
 * passed; it is evaluated several times and must be free of side effects.
 */
#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
do {							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cprev = NULL;				\
	(cp)->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = (cp);			\
	else						\
		(cp)->cnext->cprev = (cp);		\
	hashp->ch_head = (cp);				\
} while (0)
76 
/*
 * Append cp at the tail of the doubly-linked list anchored at hash.
 * cnext and cprev name the link members of *cp to use.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon forms exactly one statement and is safe in an unbraced
 * if/else. cp is parenthesized so that any pointer-valued expression may be
 * passed; it is evaluated several times and must be free of side effects.
 */
#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
do {							\
	callout_hash_t *hashp = &(hash);		\
							\
	(cp)->cnext = NULL;				\
	(cp)->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = (cp);			\
	else						\
		(cp)->cprev->cnext = (cp);		\
	hashp->ch_tail = (cp);				\
} while (0)
89 
/*
 * Unlink cp from the doubly-linked list anchored at hash. cnext and cprev
 * name the link members of *cp to use. The link members of cp itself are
 * left untouched; only its neighbours and the list anchor are updated.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon forms exactly one statement and is safe in an unbraced
 * if/else. cp is parenthesized so that any pointer-valued expression may be
 * passed; it is evaluated several times and must be free of side effects.
 */
#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
do {							\
	callout_hash_t *hashp = &(hash);		\
							\
	if ((cp)->cnext == NULL)			\
		hashp->ch_tail = (cp)->cprev;		\
	else						\
		(cp)->cnext->cprev = (cp)->cprev;	\
	if ((cp)->cprev == NULL)			\
		hashp->ch_head = (cp)->cnext;		\
	else						\
		(cp)->cprev->cnext = (cp)->cnext;	\
} while (0)
103 
104 /*
105  * These definitions help us queue callouts and callout lists. Here is
106  * the queueing rationale:
107  *
108  *	- callouts are queued in a FIFO manner in the ID hash table.
109  *	  TCP timers are typically cancelled in the same order that they
110  *	  were issued. The FIFO queueing shortens the search for a callout
111  *	  during untimeout().
112  *
113  *	- callouts are queued in a FIFO manner in their callout lists.
114  *	  This ensures that the callouts are executed in the same order that
115  *	  they were queued. This is fair. Plus, it helps to make each
116  *	  callout expiration timely. It also favors cancellations.
117  *
118  *	- callout lists are queued in a LIFO manner in the callout list hash
119  *	  table. This ensures that long term timers stay at the rear of the
120  *	  hash lists.
121  *
122  *	- callout lists are queued in a FIFO manner in the expired callouts
123  *	  list. This ensures that callout lists are executed in the order
124  *	  of expiration.
125  */
/*
 * Queue callout cp at the tail of both lists it lives on: its bucket in the
 * table's ID hash and the callout list it points to (cp->c_list).
 *
 * Both appends are wrapped in a single do { } while (0) so the macro expands
 * to one statement; without it only the first append would be controlled by
 * an unbraced if. ct and cp are evaluated more than once.
 */
#define	CALLOUT_APPEND(ct, cp)						\
do {									\
	CALLOUT_HASH_APPEND((ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)], \
	    cp, c_idnext, c_idprev);					\
	CALLOUT_HASH_APPEND((cp)->c_list->cl_callouts, cp, c_clnext,	\
	    c_clprev);							\
} while (0)
130 
/*
 * Unlink callout cp from both lists it lives on: its bucket in the table's
 * ID hash and the callout list it points to (cp->c_list).
 *
 * Both deletions are wrapped in a single do { } while (0) so the macro
 * expands to one statement; without it only the first deletion would be
 * controlled by an unbraced if. ct and cp are evaluated more than once.
 */
#define	CALLOUT_DELETE(ct, cp)						\
do {									\
	CALLOUT_HASH_DELETE((ct)->ct_idhash[CALLOUT_IDHASH((cp)->c_xid)], \
	    cp, c_idnext, c_idprev);					\
	CALLOUT_HASH_DELETE((cp)->c_list->cl_callouts, cp, c_clnext,	\
	    c_clprev);							\
} while (0)
135 
/*
 * Callout list variants of the generic list macros. They link a
 * callout_list_t through its cl_next/cl_prev members.
 */

/* Insert callout list cl at the head of hash (LIFO order). */
#define	CALLOUT_LIST_INSERT(hash, cl)				\
	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)

/* Append callout list cl at the tail of hash (FIFO order). */
#define	CALLOUT_LIST_APPEND(hash, cl)				\
	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)

/* Unlink callout list cl from hash. */
#define	CALLOUT_LIST_DELETE(hash, cl)				\
	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
144 
145 /*
146  * Allocate a callout structure.  We try quite hard because we
147  * can't sleep, and if we can't do the allocation, we're toast.
148  * Failing all, we try a KM_PANIC allocation. Note that we never
149  * deallocate a callout. See untimeout() for the reasoning.
150  */
151 static callout_t *
152 callout_alloc(callout_table_t *ct)
153 {
154 	size_t size;
155 	callout_t *cp;
156 
157 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
158 	mutex_exit(&ct->ct_mutex);
159 
160 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
161 	if (cp == NULL) {
162 		size = sizeof (callout_t);
163 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
164 	}
165 	cp->c_xid = 0;
166 
167 	mutex_enter(&ct->ct_mutex);
168 	ct->ct_allocations++;
169 	return (cp);
170 }
171 
172 /*
173  * Allocate a callout list structure.  We try quite hard because we
174  * can't sleep, and if we can't do the allocation, we're toast.
175  * Failing all, we try a KM_PANIC allocation. Note that we never
176  * deallocate a callout list.
177  */
178 static void
179 callout_list_alloc(callout_table_t *ct)
180 {
181 	size_t size;
182 	callout_list_t *cl;
183 
184 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
185 	mutex_exit(&ct->ct_mutex);
186 
187 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
188 	if (cl == NULL) {
189 		size = sizeof (callout_list_t);
190 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
191 	}
192 	bzero(cl, sizeof (callout_list_t));
193 
194 	mutex_enter(&ct->ct_mutex);
195 	cl->cl_next = ct->ct_lfree;
196 	ct->ct_lfree = cl;
197 }
198 
199 /*
200  * Find the callout list that corresponds to an expiration. There can
201  * be only one.
202  */
203 static callout_list_t *
204 callout_list_get(callout_table_t *ct, hrtime_t expiration, int hash)
205 {
206 	callout_list_t *cl;
207 
208 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
209 
210 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
211 		if (cl->cl_expiration == expiration)
212 			return (cl);
213 	}
214 
215 	return (NULL);
216 }
217 
218 /*
219  * Find the callout list that corresponds to an expiration. There can
220  * be only one. If the callout list is null, free it. Else, return it.
221  */
222 static callout_list_t *
223 callout_list_check(callout_table_t *ct, hrtime_t expiration, int hash)
224 {
225 	callout_list_t *cl;
226 
227 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
228 
229 	cl = callout_list_get(ct, expiration, hash);
230 	if (cl != NULL) {
231 		if (cl->cl_callouts.ch_head != NULL) {
232 			/*
233 			 * There is exactly one callout list for every
234 			 * unique expiration. So, we are done.
235 			 */
236 			return (cl);
237 		}
238 
239 		CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
240 		cl->cl_next = ct->ct_lfree;
241 		ct->ct_lfree = cl;
242 	}
243 
244 	return (NULL);
245 }
246 
247 /*
248  * Initialize a callout table's heap, if necessary. Preallocate some free
249  * entries so we don't have to check for NULL elsewhere.
250  */
251 static void
252 callout_heap_init(callout_table_t *ct)
253 {
254 	size_t size;
255 
256 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
257 	ASSERT(ct->ct_heap == NULL);
258 
259 	ct->ct_heap_num = 0;
260 	ct->ct_heap_max = CALLOUT_CHUNK;
261 	size = sizeof (hrtime_t) * CALLOUT_CHUNK;
262 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
263 }
264 
265 /*
266  * Reallocate the heap. We try quite hard because we can't sleep, and if
267  * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
268  * allocation. Note that the heap only expands, it never contracts.
269  */
270 static void
271 callout_heap_expand(callout_table_t *ct)
272 {
273 	size_t max, size, osize;
274 	hrtime_t *heap;
275 
276 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
277 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
278 
279 	while (ct->ct_heap_num == ct->ct_heap_max) {
280 		max = ct->ct_heap_max;
281 		mutex_exit(&ct->ct_mutex);
282 
283 		osize = sizeof (hrtime_t) * max;
284 		size = sizeof (hrtime_t) * (max + CALLOUT_CHUNK);
285 		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
286 
287 		mutex_enter(&ct->ct_mutex);
288 		if (max < ct->ct_heap_max) {
289 			/*
290 			 * Someone beat us to the allocation. Free what we
291 			 * just allocated and proceed.
292 			 */
293 			kmem_free(heap, size);
294 			continue;
295 		}
296 
297 		bcopy(ct->ct_heap, heap, osize);
298 		kmem_free(ct->ct_heap, osize);
299 		ct->ct_heap = heap;
300 		ct->ct_heap_max = size / sizeof (hrtime_t);
301 	}
302 }
303 
304 /*
305  * Move an expiration from the bottom of the heap to its correct place
306  * in the heap. If we reached the root doing this, return 1. Else,
307  * return 0.
308  */
309 static int
310 callout_upheap(callout_table_t *ct)
311 {
312 	int current, parent;
313 	hrtime_t *heap, current_expiration, parent_expiration;
314 
315 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
316 	ASSERT(ct->ct_heap_num >= 1);
317 
318 	if (ct->ct_heap_num == 1) {
319 		return (1);
320 	}
321 
322 	heap = ct->ct_heap;
323 	current = ct->ct_heap_num - 1;
324 
325 	for (;;) {
326 		parent = CALLOUT_HEAP_PARENT(current);
327 		current_expiration = heap[current];
328 		parent_expiration = heap[parent];
329 
330 		/*
331 		 * We have an expiration later than our parent; we're done.
332 		 */
333 		if (current_expiration >= parent_expiration) {
334 			return (0);
335 		}
336 
337 		/*
338 		 * We need to swap with our parent, and continue up the heap.
339 		 */
340 		heap[parent] = current_expiration;
341 		heap[current] = parent_expiration;
342 
343 		/*
344 		 * If we just reached the root, we're done.
345 		 */
346 		if (parent == 0) {
347 			return (1);
348 		}
349 
350 		current = parent;
351 	}
352 	/*NOTREACHED*/
353 }
354 
355 /*
356  * Insert a new, unique expiration into a callout table's heap.
357  */
358 static void
359 callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
360 {
361 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
362 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
363 
364 	/*
365 	 * First, copy the expiration to the bottom of the heap.
366 	 */
367 	ct->ct_heap[ct->ct_heap_num] = expiration;
368 	ct->ct_heap_num++;
369 
370 	/*
371 	 * Now, perform an upheap operation. If we reached the root, then
372 	 * the cyclic needs to be reprogrammed as we have an earlier
373 	 * expiration.
374 	 *
375 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
376 	 * We don't want any callout activity. When the CPR resume phase is
377 	 * entered, the cyclic will be programmed for the earliest expiration
378 	 * in the heap.
379 	 */
380 	if (callout_upheap(ct) && !(ct->ct_flags & CALLOUT_TABLE_SUSPENDED))
381 		(void) cyclic_reprogram(ct->ct_cyclic, expiration);
382 }
383 
384 /*
385  * Move an expiration from the top of the heap to its correct place
386  * in the heap.
387  */
388 static void
389 callout_downheap(callout_table_t *ct)
390 {
391 	int left, right, current, nelems;
392 	hrtime_t *heap, left_expiration, right_expiration, current_expiration;
393 
394 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
395 	ASSERT(ct->ct_heap_num >= 1);
396 
397 	heap = ct->ct_heap;
398 	current = 0;
399 	nelems = ct->ct_heap_num;
400 
401 	for (;;) {
402 		/*
403 		 * If we don't have a left child (i.e., we're a leaf), we're
404 		 * done.
405 		 */
406 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
407 			return;
408 
409 		left_expiration = heap[left];
410 		current_expiration = heap[current];
411 
412 		right = CALLOUT_HEAP_RIGHT(current);
413 
414 		/*
415 		 * Even if we don't have a right child, we still need to compare
416 		 * our expiration against that of our left child.
417 		 */
418 		if (right >= nelems)
419 			goto comp_left;
420 
421 		right_expiration = heap[right];
422 
423 		/*
424 		 * We have both a left and a right child.  We need to compare
425 		 * the expiration of the children to determine which
426 		 * expires earlier.
427 		 */
428 		if (right_expiration < left_expiration) {
429 			/*
430 			 * Our right child is the earlier of our children.
431 			 * We'll now compare our expiration to its expiration.
432 			 * If ours is the earlier one, we're done.
433 			 */
434 			if (current_expiration <= right_expiration)
435 				return;
436 
437 			/*
438 			 * Our right child expires earlier than we do; swap
439 			 * with our right child, and descend right.
440 			 */
441 			heap[right] = current_expiration;
442 			heap[current] = right_expiration;
443 			current = right;
444 			continue;
445 		}
446 
447 comp_left:
448 		/*
449 		 * Our left child is the earlier of our children (or we have
450 		 * no right child).  We'll now compare our expiration
451 		 * to its expiration. If ours is the earlier one, we're done.
452 		 */
453 		if (current_expiration <= left_expiration)
454 			return;
455 
456 		/*
457 		 * Our left child expires earlier than we do; swap with our
458 		 * left child, and descend left.
459 		 */
460 		heap[left] = current_expiration;
461 		heap[current] = left_expiration;
462 		current = left;
463 	}
464 }
465 
466 /*
467  * Delete and handle all past expirations in a callout table's heap.
468  */
469 static void
470 callout_heap_delete(callout_table_t *ct)
471 {
472 	hrtime_t now, expiration;
473 	callout_list_t *cl;
474 	int hash;
475 
476 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
477 
478 	now = gethrtime();
479 
480 	while (ct->ct_heap_num > 0) {
481 		expiration = ct->ct_heap[0];
482 		/*
483 		 * Find the callout list that corresponds to the expiration.
484 		 * If the callout list is empty, callout_list_check()
485 		 * will free the callout list and return NULL.
486 		 */
487 		hash = CALLOUT_CLHASH(expiration);
488 		cl = callout_list_check(ct, expiration, hash);
489 		if (cl != NULL) {
490 			/*
491 			 * If the root of the heap expires in the future, we are
492 			 * done. We are doing this check here instead of at the
493 			 * beginning because we want to first free all the
494 			 * empty callout lists at the top of the heap.
495 			 */
496 			if (expiration > now)
497 				break;
498 
499 			/*
500 			 * Move the callout list for this expiration to the
501 			 * list of expired callout lists. It will be processed
502 			 * by the callout executor.
503 			 */
504 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
505 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
506 		}
507 
508 		/*
509 		 * Now delete the root. This is done by swapping the root with
510 		 * the last item in the heap and downheaping the item.
511 		 */
512 		ct->ct_heap_num--;
513 		if (ct->ct_heap_num > 0) {
514 			ct->ct_heap[0] = ct->ct_heap[ct->ct_heap_num];
515 			callout_downheap(ct);
516 		}
517 	}
518 
519 	/*
520 	 * If this callout table is empty or callouts have been suspended
521 	 * by CPR, just return. The cyclic has already been programmed to
522 	 * infinity by the cyclic subsystem.
523 	 */
524 	if ((ct->ct_heap_num == 0) || (ct->ct_flags & CALLOUT_TABLE_SUSPENDED))
525 		return;
526 
527 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
528 }
529 
/*
 * Schedule func(arg) to be called at a given expiration and return the
 * callout ID that identifies the request.
 *
 *	type		selects which of the current CPU's callout tables
 *			is used (see CALLOUT_TABLE()).
 *	func, arg	handler and its argument; func must not be NULL.
 *	expiration	interval from now in nanoseconds, or an absolute
 *			hrtime if CALLOUT_FLAG_ABSOLUTE is set.
 *	resolution	granularity, in nanoseconds, to which the expiration
 *			is rounded; must be positive and is raised to
 *			callout_min_resolution if smaller.
 *	flags		CALLOUT_FLAG_ABSOLUTE, CALLOUT_FLAG_ROUNDUP,
 *			CALLOUT_FLAG_32BIT and CALLOUT_FLAG_HRESTIME are
 *			examined here.
 *
 * The table mutex may be dropped and reacquired while allocating a callout,
 * a callout list or heap space.
 */
callout_id_t
timeout_generic(int type, void (*func)(void *), void *arg,
	hrtime_t expiration, hrtime_t resolution, int flags)
{
	callout_table_t *ct;
	callout_t *cp;
	callout_id_t id;
	callout_list_t *cl;
	hrtime_t now, interval;
	int hash;

	ASSERT(resolution > 0);
	ASSERT(func != NULL);

	/*
	 * Please see comment about minimum resolution in callout_init().
	 */
	if (resolution < callout_min_resolution)
		resolution = callout_min_resolution;

	/*
	 * We disable kernel preemption so that we remain on the same CPU
	 * throughout. If we needed to reprogram the callout table's cyclic,
	 * we can avoid X-calls if we are on the same CPU.
	 *
	 * Note that callout_alloc() releases and reacquires the callout
	 * table mutex. While reacquiring the mutex, it is possible for us
	 * to go to sleep and later migrate to another CPU. This should be
	 * pretty rare, though.
	 */
	kpreempt_disable();

	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
	mutex_enter(&ct->ct_mutex);

	if (ct->ct_cyclic == CYCLIC_NONE) {
		mutex_exit(&ct->ct_mutex);
		/*
		 * The callout table has not yet been initialized fully.
		 * So, put this one on the boot callout table which is
		 * always initialized.
		 */
		ct = &callout_boot_ct[type];
		mutex_enter(&ct->ct_mutex);
	}

	/*
	 * Take a callout from the table's free list (linked through
	 * c_idnext), or allocate a new one if the free list is empty.
	 */
	if ((cp = ct->ct_free) == NULL)
		cp = callout_alloc(ct);
	else
		ct->ct_free = cp->c_idnext;

	cp->c_func = func;
	cp->c_arg = arg;

	/*
	 * Compute the expiration hrtime. interval is the distance from now
	 * and is only used below to classify 32-bit IDs as short or long
	 * term.
	 */
	now = gethrtime();
	if (flags & CALLOUT_FLAG_ABSOLUTE) {
		ASSERT(expiration > 0);
		interval = expiration - now;
	} else {
		interval = expiration;
		expiration += now;
		ASSERT(expiration > 0);
	}
	/*
	 * Round the expiration to a multiple of the resolution: up if
	 * CALLOUT_FLAG_ROUNDUP is set, down otherwise.
	 */
	if (flags & CALLOUT_FLAG_ROUNDUP)
		expiration += resolution - 1;
	expiration = (expiration / resolution) * resolution;

	/*
	 * Assign an ID to this callout. Each ID is derived from the table's
	 * previous ID of the same kind by subtracting callout_counter_low.
	 */
	if (flags & CALLOUT_FLAG_32BIT) {
		if (interval > callout_longterm) {
			id = (ct->ct_long_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_long_id = id;
		} else {
			id = (ct->ct_short_id - callout_counter_low);
			id |= CALLOUT_COUNTER_HIGH;
			ct->ct_short_id = id;
		}
	} else {
		id = (ct->ct_gen_id - callout_counter_low);
		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
			id |= CALLOUT_COUNTER_HIGH;
			id += CALLOUT_GENERATION_LOW;
		}
		ct->ct_gen_id = id;
	}

	/*
	 * c_xid is the ID plus state bits; the caller only ever sees id.
	 */
	cp->c_xid = id;
	if (flags & CALLOUT_FLAG_HRESTIME)
		cp->c_xid |= CALLOUT_HRESTIME;

	hash = CALLOUT_CLHASH(expiration);

again:
	/*
	 * Try to see if a callout list already exists for this expiration.
	 * Most of the time, this will be the case.
	 */
	cl = callout_list_get(ct, expiration, hash);
	if (cl == NULL) {
		/*
		 * Check if we have enough space in the heap to insert one
		 * expiration. If not, expand the heap.
		 */
		if (ct->ct_heap_num == ct->ct_heap_max) {
			callout_heap_expand(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * So, the best course is to repeat the steps. This
			 * should be an infrequent event.
			 */
			goto again;
		}

		/*
		 * Check the free list. If we don't find one, we have to
		 * take the slow path and allocate from kmem.
		 */
		if ((cl = ct->ct_lfree) == NULL) {
			callout_list_alloc(ct);
			/*
			 * In the above call, we drop the lock, allocate and
			 * reacquire the lock. So, we could have been away
			 * for a while. In the meantime, someone could have
			 * inserted a callout list with the same expiration.
			 * Plus, the heap could have become full. So, the best
			 * course is to repeat the steps. This should be an
			 * infrequent event.
			 */
			goto again;
		}
		ct->ct_lfree = cl->cl_next;
		cl->cl_expiration = expiration;

		CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);

		/*
		 * This is a new expiration. So, insert it into the heap.
		 * This will also reprogram the cyclic, if the expiration
		 * propagated to the root of the heap.
		 */
		callout_heap_insert(ct, expiration);
	}
	/*
	 * Queue the callout on its callout list and in the ID hash.
	 */
	cp->c_list = cl;
	CALLOUT_APPEND(ct, cp);

	ct->ct_timeouts++;
	ct->ct_timeouts_pending++;

	mutex_exit(&ct->ct_mutex);

	kpreempt_enable();

	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
	    cp);

	return (id);
}
697 
698 timeout_id_t
699 timeout(void (*func)(void *), void *arg, clock_t delta)
700 {
701 	ulong_t id;
702 
703 	/*
704 	 * Make sure the callout runs at least 1 tick in the future.
705 	 */
706 	if (delta <= 0)
707 		delta = 1;
708 
709 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
710 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
711 
712 	return ((timeout_id_t)id);
713 }
714 
715 /*
716  * Convenience function that creates a normal callout with default parameters
717  * and returns a full ID.
718  */
719 callout_id_t
720 timeout_default(void (*func)(void *), void *arg, clock_t delta)
721 {
722 	callout_id_t id;
723 
724 	/*
725 	 * Make sure the callout runs at least 1 tick in the future.
726 	 */
727 	if (delta <= 0)
728 		delta = 1;
729 
730 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
731 	    nsec_per_tick, 0);
732 
733 	return (id);
734 }
735 
736 timeout_id_t
737 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
738 {
739 	ulong_t id;
740 
741 	/*
742 	 * Make sure the callout runs at least 1 tick in the future.
743 	 */
744 	if (delta <= 0)
745 		delta = 1;
746 
747 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
748 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
749 
750 	return ((timeout_id_t)id);
751 }
752 
753 /*
754  * Convenience function that creates a realtime callout with default parameters
755  * and returns a full ID.
756  */
757 callout_id_t
758 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
759 {
760 	callout_id_t id;
761 
762 	/*
763 	 * Make sure the callout runs at least 1 tick in the future.
764 	 */
765 	if (delta <= 0)
766 		delta = 1;
767 
768 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
769 	    nsec_per_tick, 0);
770 
771 	return (id);
772 }
773 
774 hrtime_t
775 untimeout_generic(callout_id_t id, int nowait)
776 {
777 	callout_table_t *ct;
778 	callout_t *cp;
779 	callout_id_t xid;
780 	callout_list_t *cl;
781 	int hash;
782 	callout_id_t bogus;
783 
784 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
785 	hash = CALLOUT_IDHASH(id);
786 
787 	mutex_enter(&ct->ct_mutex);
788 
789 	/*
790 	 * Search the ID hash table for the callout.
791 	 */
792 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
793 
794 		xid = cp->c_xid;
795 
796 		/*
797 		 * Match the ID and generation number.
798 		 */
799 		if ((xid & CALLOUT_ID_MASK) != id)
800 			continue;
801 
802 		cl = cp->c_list;
803 		if ((xid & CALLOUT_EXECUTING) == 0) {
804 			hrtime_t expiration;
805 
806 			/*
807 			 * Delete the callout. If the callout list becomes
808 			 * NULL, we don't remove it from the table. This is
809 			 * so it can be reused. If the empty callout list
810 			 * corresponds to the top of the the callout heap, we
811 			 * don't reprogram the table cyclic here. This is in
812 			 * order to avoid lots of X-calls to the CPU associated
813 			 * with the callout table.
814 			 */
815 			expiration = cl->cl_expiration;
816 			CALLOUT_DELETE(ct, cp);
817 			cp->c_idnext = ct->ct_free;
818 			ct->ct_free = cp;
819 			ct->ct_untimeouts_unexpired++;
820 			ct->ct_timeouts_pending--;
821 			mutex_exit(&ct->ct_mutex);
822 
823 			expiration -= gethrtime();
824 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
825 			    "untimeout:ID %lx hrtime left %llx", id,
826 			    expiration);
827 			return (expiration < 0 ? 0 : expiration);
828 		}
829 
830 		ct->ct_untimeouts_executing++;
831 		/*
832 		 * The callout we want to delete is currently executing.
833 		 * The DDI states that we must wait until the callout
834 		 * completes before returning, so we block on cl_done until the
835 		 * callout ID changes (to the old ID if it's on the freelist,
836 		 * or to a new callout ID if it's in use).  This implicitly
837 		 * assumes that callout structures are persistent (they are).
838 		 */
839 		if (cl->cl_executor == curthread) {
840 			/*
841 			 * The timeout handler called untimeout() on itself.
842 			 * Stupid, but legal.  We can't wait for the timeout
843 			 * to complete without deadlocking, so we just return.
844 			 */
845 			mutex_exit(&ct->ct_mutex);
846 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
847 			    "untimeout_self:ID %x", id);
848 			return (-1);
849 		}
850 		if (nowait == 0) {
851 			/*
852 			 * We need to wait. Indicate that we are waiting by
853 			 * incrementing cl_waiting. This prevents the executor
854 			 * from doing a wakeup on cl_done if there are no
855 			 * waiters.
856 			 */
857 			while (cp->c_xid == xid) {
858 				cl->cl_waiting = 1;
859 				cv_wait(&cl->cl_done, &ct->ct_mutex);
860 			}
861 		}
862 		mutex_exit(&ct->ct_mutex);
863 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
864 		    "untimeout_executing:ID %lx", id);
865 		return (-1);
866 	}
867 	ct->ct_untimeouts_expired++;
868 
869 	mutex_exit(&ct->ct_mutex);
870 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
871 	    "untimeout_bogus_id:ID %lx", id);
872 
873 	/*
874 	 * We didn't find the specified callout ID.  This means either
875 	 * (1) the callout already fired, or (2) the caller passed us
876 	 * a bogus value.  Perform a sanity check to detect case (2).
877 	 */
878 	bogus = (CALLOUT_EXECUTING | CALLOUT_HRESTIME | CALLOUT_COUNTER_HIGH);
879 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
880 		panic("untimeout: impossible timeout id %llx",
881 		    (unsigned long long)id);
882 
883 	return (-1);
884 }
885 
886 clock_t
887 untimeout(timeout_id_t id_arg)
888 {
889 	hrtime_t hleft;
890 	clock_t tleft;
891 	callout_id_t id;
892 
893 	id = (ulong_t)id_arg;
894 	hleft = untimeout_generic(id, 0);
895 	if (hleft < 0)
896 		tleft = -1;
897 	else if (hleft == 0)
898 		tleft = 0;
899 	else
900 		tleft = NSEC_TO_TICK(hleft);
901 
902 	return (tleft);
903 }
904 
905 /*
906  * Convenience function to untimeout a timeout with a full ID with default
907  * parameters.
908  */
909 clock_t
910 untimeout_default(callout_id_t id, int nowait)
911 {
912 	hrtime_t hleft;
913 	clock_t tleft;
914 
915 	hleft = untimeout_generic(id, nowait);
916 	if (hleft < 0)
917 		tleft = -1;
918 	else if (hleft == 0)
919 		tleft = 0;
920 	else
921 		tleft = NSEC_TO_TICK(hleft);
922 
923 	return (tleft);
924 }
925 
926 /*
927  * Expire all the callouts queued in the specified callout list.
928  */
929 static void
930 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
931 {
932 	callout_t *cp;
933 
934 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
935 	ASSERT(cl != NULL);
936 
937 	cl->cl_executor = curthread;
938 
939 	while ((cp = cl->cl_callouts.ch_head) != NULL) {
940 		/*
941 		 * Indicate to untimeout() that a callout is
942 		 * being expired by the executor.
943 		 */
944 		cp->c_xid |= CALLOUT_EXECUTING;
945 		mutex_exit(&ct->ct_mutex);
946 
947 		DTRACE_PROBE1(callout__start, callout_t *, cp);
948 		(*cp->c_func)(cp->c_arg);
949 		DTRACE_PROBE1(callout__end, callout_t *, cp);
950 
951 		mutex_enter(&ct->ct_mutex);
952 
953 		ct->ct_expirations++;
954 		ct->ct_timeouts_pending--;
955 		/*
956 		 * Indicate completion for cl_done.
957 		 */
958 		cp->c_xid &= ~CALLOUT_EXECUTING;
959 
960 		/*
961 		 * Delete callout from ID hash table and the callout
962 		 * list, return to freelist, and tell any untimeout() that
963 		 * cares that we're done.
964 		 */
965 		CALLOUT_DELETE(ct, cp);
966 		cp->c_idnext = ct->ct_free;
967 		ct->ct_free = cp;
968 
969 		if (cl->cl_waiting) {
970 			cl->cl_waiting = 0;
971 			cv_broadcast(&cl->cl_done);
972 		}
973 	}
974 
975 	cl->cl_executor = NULL;
976 }
977 
978 /*
979  * Execute all expired callout lists for a callout table.
980  */
981 static void
982 callout_expire(callout_table_t *ct)
983 {
984 	callout_list_t *cl, *clnext;
985 
986 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
987 
988 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
989 		/*
990 		 * Multiple executor threads could be running at the same
991 		 * time. Each callout list is processed by only one thread.
992 		 * If this callout list is already being processed by another
993 		 * executor, go on to the next one.
994 		 */
995 		if (cl->cl_executor != NULL) {
996 			clnext = cl->cl_next;
997 			continue;
998 		}
999 
1000 		/*
1001 		 * Expire all the callouts in this callout list.
1002 		 */
1003 		callout_list_expire(ct, cl);
1004 
1005 		/*
1006 		 * Free the callout list.
1007 		 */
1008 		clnext = cl->cl_next;
1009 		CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1010 		cl->cl_next = ct->ct_lfree;
1011 		ct->ct_lfree = cl;
1012 	}
1013 }
1014 
1015 /*
1016  * The cyclic handlers below process callouts in two steps:
1017  *
1018  *	1. Find all expired callout lists and queue them in a separate
1019  *	   list of expired callouts.
1020  *	2. Execute the expired callout lists.
1021  *
1022  * This is done for two reasons:
1023  *
1024  *	1. We want to quickly find the next earliest expiration to program
1025  *	   the cyclic to and reprogram it. We can do this right at the end
1026  *	   of step 1.
1027  *	2. The realtime cyclic handler expires callouts in place. However,
1028  *	   for normal callouts, callouts are expired by a taskq thread.
1029  *	   So, it is simpler and more robust to have the taskq thread just
1030  *	   do step 2.
1031  */
1032 
1033 /*
1034  * Realtime callout cyclic handler.
1035  */
1036 void
1037 callout_realtime(callout_table_t *ct)
1038 {
1039 	mutex_enter(&ct->ct_mutex);
1040 	callout_heap_delete(ct);
1041 	callout_expire(ct);
1042 	mutex_exit(&ct->ct_mutex);
1043 }
1044 
1045 void
1046 callout_execute(callout_table_t *ct)
1047 {
1048 	mutex_enter(&ct->ct_mutex);
1049 	callout_expire(ct);
1050 	mutex_exit(&ct->ct_mutex);
1051 }
1052 
1053 /*
1054  * Normal callout cyclic handler.
1055  */
1056 void
1057 callout_normal(callout_table_t *ct)
1058 {
1059 	int exec;
1060 
1061 	mutex_enter(&ct->ct_mutex);
1062 	callout_heap_delete(ct);
1063 	exec = (ct->ct_expired.ch_head != NULL);
1064 	mutex_exit(&ct->ct_mutex);
1065 
1066 	if (exec) {
1067 		ASSERT(ct->ct_taskq != NULL);
1068 		(void) taskq_dispatch(ct->ct_taskq,
1069 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1070 	}
1071 }
1072 
1073 /*
1074  * Suspend callout processing.
1075  */
1076 static void
1077 callout_suspend(void)
1078 {
1079 	int t, f;
1080 	callout_table_t *ct;
1081 
1082 	/*
1083 	 * Traverse every callout table in the system and suspend callout
1084 	 * processing.
1085 	 *
1086 	 * We need to suspend all the tables (including the inactive ones)
1087 	 * so that if a table is made active while the suspend is still on,
1088 	 * the table remains suspended.
1089 	 */
1090 	for (f = 0; f < max_ncpus; f++) {
1091 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1092 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1093 
1094 			mutex_enter(&ct->ct_mutex);
1095 			ct->ct_flags |= CALLOUT_TABLE_SUSPENDED;
1096 			if (ct->ct_cyclic == CYCLIC_NONE) {
1097 				mutex_exit(&ct->ct_mutex);
1098 				continue;
1099 			}
1100 			(void) cyclic_reprogram(ct->ct_cyclic, CY_INFINITY);
1101 			mutex_exit(&ct->ct_mutex);
1102 		}
1103 	}
1104 }
1105 
1106 static void
1107 callout_adjust(callout_table_t *ct, hrtime_t delta)
1108 {
1109 	int hash, newhash;
1110 	hrtime_t expiration;
1111 	callout_list_t *cl;
1112 	callout_hash_t list;
1113 
1114 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1115 
1116 	/*
1117 	 * In order to adjust the expirations, we null out the heap. Then,
1118 	 * we reinsert adjusted expirations in the heap. Keeps it simple.
1119 	 * Note that since the CALLOUT_TABLE_SUSPENDED flag is set by the
1120 	 * caller, the heap insert does not result in cyclic reprogramming.
1121 	 */
1122 	ct->ct_heap_num = 0;
1123 
1124 	/*
1125 	 * First, remove all the callout lists from the table and string them
1126 	 * in a list.
1127 	 */
1128 	list.ch_head = list.ch_tail = NULL;
1129 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1130 		while ((cl = ct->ct_clhash[hash].ch_head) != NULL) {
1131 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
1132 			CALLOUT_LIST_APPEND(list, cl);
1133 		}
1134 	}
1135 
1136 	/*
1137 	 * Now, traverse the callout lists and adjust their expirations.
1138 	 */
1139 	while ((cl = list.ch_head) != NULL) {
1140 		CALLOUT_LIST_DELETE(list, cl);
1141 		/*
1142 		 * Set the new expiration and reinsert in the right
1143 		 * hash bucket.
1144 		 */
1145 		expiration = cl->cl_expiration;
1146 		expiration += delta;
1147 		cl->cl_expiration = expiration;
1148 		newhash = CALLOUT_CLHASH(expiration);
1149 		CALLOUT_LIST_INSERT(ct->ct_clhash[newhash], cl);
1150 		callout_heap_insert(ct, expiration);
1151 	}
1152 }
1153 
1154 /*
1155  * Resume callout processing.
1156  */
1157 static void
1158 callout_resume(hrtime_t delta)
1159 {
1160 	hrtime_t exp;
1161 	int t, f;
1162 	callout_table_t *ct;
1163 
1164 	/*
1165 	 * Traverse every callout table in the system and resume callout
1166 	 * processing. For active tables, perform any hrtime adjustments
1167 	 * necessary.
1168 	 */
1169 	for (f = 0; f < max_ncpus; f++) {
1170 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1171 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1172 
1173 			mutex_enter(&ct->ct_mutex);
1174 			if (ct->ct_cyclic == CYCLIC_NONE) {
1175 				ct->ct_flags &= ~CALLOUT_TABLE_SUSPENDED;
1176 				mutex_exit(&ct->ct_mutex);
1177 				continue;
1178 			}
1179 
1180 			if (delta)
1181 				callout_adjust(ct, delta);
1182 
1183 			ct->ct_flags &= ~CALLOUT_TABLE_SUSPENDED;
1184 
1185 			/*
1186 			 * If the expired list is non-empty, then have the
1187 			 * cyclic expire immediately. Else, program the
1188 			 * cyclic based on the heap.
1189 			 */
1190 			if (ct->ct_expired.ch_head != NULL)
1191 				exp = gethrtime();
1192 			else if (ct->ct_heap_num > 0)
1193 				exp = ct->ct_heap[0];
1194 			else
1195 				exp = 0;
1196 			if (exp != 0)
1197 				(void) cyclic_reprogram(ct->ct_cyclic, exp);
1198 			mutex_exit(&ct->ct_mutex);
1199 		}
1200 	}
1201 }
1202 
1203 /*
1204  * Callback handler used by CPR to stop and resume callouts.
1205  */
1206 /*ARGSUSED*/
1207 static boolean_t
1208 callout_cpr_callb(void *arg, int code)
1209 {
1210 	if (code == CB_CODE_CPR_CHKPT)
1211 		callout_suspend();
1212 	else
1213 		callout_resume(0);
1214 
1215 	return (B_TRUE);
1216 }
1217 
1218 /*
1219  * Callback handler invoked when the debugger is entered or exited.
1220  */
1221 /*ARGSUSED*/
1222 static boolean_t
1223 callout_debug_callb(void *arg, int code)
1224 {
1225 	hrtime_t delta;
1226 
1227 	/*
1228 	 * When the system enters the debugger. make a note of the hrtime.
1229 	 * When it is resumed, compute how long the system was in the
1230 	 * debugger. This interval should not be counted for callouts.
1231 	 */
1232 	if (code == 0) {
1233 		callout_suspend();
1234 		callout_debug_hrtime = gethrtime();
1235 	} else {
1236 		delta = gethrtime() - callout_debug_hrtime;
1237 		callout_resume(delta);
1238 	}
1239 
1240 	return (B_TRUE);
1241 }
1242 
1243 /*
1244  * Move the hrestime callouts to the expired list. Then program the table's
1245  * cyclic to expire immediately so that the callouts can be executed
1246  * immediately.
1247  */
1248 static void
1249 callout_hrestime_one(callout_table_t *ct)
1250 {
1251 	callout_list_t *cl, *ecl;
1252 	callout_t *cp;
1253 	int hash;
1254 
1255 	mutex_enter(&ct->ct_mutex);
1256 	if (ct->ct_heap_num == 0) {
1257 		mutex_exit(&ct->ct_mutex);
1258 		return;
1259 	}
1260 
1261 	if (ct->ct_lfree == NULL)
1262 		callout_list_alloc(ct);
1263 	ecl = ct->ct_lfree;
1264 	ct->ct_lfree = ecl->cl_next;
1265 
1266 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1267 		for (cl = ct->ct_clhash[hash].ch_head; cl; cl = cl->cl_next) {
1268 			for (cp = cl->cl_callouts.ch_head; cp;
1269 			    cp = cp->c_clnext) {
1270 				if ((cp->c_xid & CALLOUT_HRESTIME) == 0)
1271 					continue;
1272 				CALLOUT_HASH_DELETE(cl->cl_callouts, cp,
1273 				    c_clnext, c_clprev);
1274 				cp->c_list = ecl;
1275 				CALLOUT_HASH_APPEND(ecl->cl_callouts, cp,
1276 				    c_clnext, c_clprev);
1277 			}
1278 		}
1279 	}
1280 
1281 	if (ecl->cl_callouts.ch_head != NULL) {
1282 		CALLOUT_LIST_APPEND(ct->ct_expired, ecl);
1283 		if (!(ct->ct_flags & CALLOUT_TABLE_SUSPENDED))
1284 			(void) cyclic_reprogram(ct->ct_cyclic, gethrtime());
1285 	} else {
1286 		ecl->cl_next = ct->ct_lfree;
1287 		ct->ct_lfree = ecl;
1288 	}
1289 	mutex_exit(&ct->ct_mutex);
1290 }
1291 
1292 /*
1293  * This function is called whenever system time (hrestime) is changed
1294  * explicitly. All the HRESTIME callouts must be expired at once.
1295  */
1296 /*ARGSUSED*/
1297 void
1298 callout_hrestime(void)
1299 {
1300 	int t, f;
1301 	callout_table_t *ct;
1302 
1303 	/*
1304 	 * Traverse every callout table in the system and process the hrestime
1305 	 * callouts therein.
1306 	 *
1307 	 * We look at all the tables because we don't know which ones were
1308 	 * onlined and offlined in the past. The offlined tables may still
1309 	 * have active cyclics processing timers somewhere.
1310 	 */
1311 	for (f = 0; f < max_ncpus; f++) {
1312 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1313 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1314 			callout_hrestime_one(ct);
1315 		}
1316 	}
1317 }
1318 
1319 /*
1320  * Create the hash tables for this callout table.
1321  */
1322 static void
1323 callout_hash_init(callout_table_t *ct)
1324 {
1325 	size_t size;
1326 
1327 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1328 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1329 
1330 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1331 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1332 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1333 }
1334 
1335 /*
1336  * Create per-callout table kstats.
1337  */
1338 static void
1339 callout_kstat_init(callout_table_t *ct)
1340 {
1341 	callout_stat_type_t stat;
1342 	kstat_t *ct_kstats;
1343 	int ndx;
1344 
1345 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1346 	ASSERT(ct->ct_kstats == NULL);
1347 
1348 	ndx = ct - callout_table;
1349 	ct_kstats = kstat_create("unix", ndx, "callout",
1350 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1351 
1352 	if (ct_kstats == NULL) {
1353 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1354 		    (void *)ct);
1355 	} else {
1356 		ct_kstats->ks_data = ct->ct_kstat_data;
1357 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1358 			kstat_named_init(&ct->ct_kstat_data[stat],
1359 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1360 		ct->ct_kstats = ct_kstats;
1361 		kstat_install(ct_kstats);
1362 	}
1363 }
1364 
1365 static void
1366 callout_cyclic_init(callout_table_t *ct)
1367 {
1368 	cyc_handler_t hdlr;
1369 	cyc_time_t when;
1370 	processorid_t seqid;
1371 	int t;
1372 
1373 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1374 
1375 	t = CALLOUT_TABLE_TYPE(ct);
1376 	seqid = CALLOUT_TABLE_SEQID(ct);
1377 
1378 	/*
1379 	 * Create the taskq thread if the table type is normal.
1380 	 * Realtime tables are handled at PIL1 by a softint
1381 	 * handler.
1382 	 */
1383 	if (t == CALLOUT_NORMAL) {
1384 		ASSERT(ct->ct_taskq == NULL);
1385 		/*
1386 		 * Each callout thread consumes exactly one
1387 		 * task structure while active.  Therefore,
1388 		 * prepopulating with 2 * CALLOUT_THREADS tasks
1389 		 * ensures that there's at least one task per
1390 		 * thread that's either scheduled or on the
1391 		 * freelist.  In turn, this guarantees that
1392 		 * taskq_dispatch() will always either succeed
1393 		 * (because there's a free task structure) or
1394 		 * be unnecessary (because "callout_excute(ct)"
1395 		 * has already scheduled).
1396 		 */
1397 		ct->ct_taskq =
1398 		    taskq_create_instance("callout_taskq", seqid,
1399 		    CALLOUT_THREADS, maxclsyspri,
1400 		    2 * CALLOUT_THREADS, 2 * CALLOUT_THREADS,
1401 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1402 	}
1403 
1404 	/*
1405 	 * callouts can only be created in a table whose
1406 	 * cyclic has been initialized.
1407 	 */
1408 	ASSERT(ct->ct_heap_num == 0);
1409 
1410 	/*
1411 	 * Create the callout table cyclics.
1412 	 */
1413 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1414 
1415 	/*
1416 	 * Ideally, the handlers for CALLOUT_REALTIME and CALLOUT_NORMAL should
1417 	 * be run at CY_LOW_LEVEL. But there are some callers of the delay(9F)
1418 	 * function that call delay(9F) illegally from PIL > 0. delay(9F) uses
1419 	 * normal callouts. In order to avoid a deadlock, we run the normal
1420 	 * handler from LOCK level. When the delay(9F) issue is fixed, this
1421 	 * should be fixed as well.
1422 	 */
1423 	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
1424 	hdlr.cyh_level = (t == CALLOUT_REALTIME) ? CY_LOW_LEVEL : CY_LOCK_LEVEL;
1425 	hdlr.cyh_arg = ct;
1426 	when.cyt_when = CY_INFINITY;
1427 	when.cyt_interval = CY_INFINITY;
1428 
1429 	ct->ct_cyclic = cyclic_add(&hdlr, &when);
1430 }
1431 
1432 void
1433 callout_cpu_online(cpu_t *cp)
1434 {
1435 	lgrp_handle_t hand;
1436 	callout_cache_t *cache;
1437 	char s[KMEM_CACHE_NAMELEN];
1438 	callout_table_t *ct;
1439 	processorid_t seqid;
1440 	int t;
1441 
1442 	ASSERT(MUTEX_HELD(&cpu_lock));
1443 
1444 	/*
1445 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1446 	 * Note that access to callout_caches is protected by cpu_lock.
1447 	 */
1448 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1449 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1450 		if (cache->cc_hand == hand)
1451 			break;
1452 	}
1453 
1454 	/*
1455 	 * If not found, create one. The caches are never destroyed.
1456 	 */
1457 	if (cache == NULL) {
1458 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1459 		cache->cc_hand = hand;
1460 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1461 		    (long)hand);
1462 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1463 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1464 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1465 		    (long)hand);
1466 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1467 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1468 		cache->cc_next = callout_caches;
1469 		callout_caches = cache;
1470 	}
1471 
1472 	seqid = cp->cpu_seqid;
1473 
1474 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1475 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1476 
1477 		mutex_enter(&ct->ct_mutex);
1478 		/*
1479 		 * Store convinience pointers to the kmem caches
1480 		 * in the callout table. These assignments should always be
1481 		 * done as callout tables can map to different physical
1482 		 * CPUs each time.
1483 		 */
1484 		ct->ct_cache = cache->cc_cache;
1485 		ct->ct_lcache = cache->cc_lcache;
1486 
1487 		/*
1488 		 * We use the heap pointer to check if stuff has been
1489 		 * initialized for this callout table.
1490 		 */
1491 		if (ct->ct_heap == NULL) {
1492 			callout_heap_init(ct);
1493 			callout_hash_init(ct);
1494 			callout_kstat_init(ct);
1495 			callout_cyclic_init(ct);
1496 		}
1497 
1498 		mutex_exit(&ct->ct_mutex);
1499 
1500 		/*
1501 		 * Move the cyclic to this CPU by doing a bind. Then unbind
1502 		 * the cyclic. This will allow the cyclic subsystem to juggle
1503 		 * the cyclic during CPU offline.
1504 		 */
1505 		cyclic_bind(ct->ct_cyclic, cp, NULL);
1506 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
1507 	}
1508 }
1509 
1510 /*
1511  * This is called to perform per-CPU initialization for slave CPUs at
1512  * boot time.
1513  */
1514 void
1515 callout_mp_init(void)
1516 {
1517 	cpu_t *cp;
1518 
1519 	mutex_enter(&cpu_lock);
1520 
1521 	cp = cpu_active;
1522 	do {
1523 		callout_cpu_online(cp);
1524 	} while ((cp = cp->cpu_next_onln) != cpu_active);
1525 
1526 	mutex_exit(&cpu_lock);
1527 }
1528 
1529 /*
1530  * Initialize all callout tables.  Called at boot time just before clkstart().
1531  */
1532 void
1533 callout_init(void)
1534 {
1535 	int f, t;
1536 	size_t size;
1537 	int table_id;
1538 	callout_table_t *ct;
1539 	long bits, fanout;
1540 	uintptr_t buf;
1541 
1542 	/*
1543 	 * Initialize callout globals.
1544 	 */
1545 	bits = 0;
1546 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
1547 		bits++;
1548 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
1549 	callout_table_mask = (1 << callout_table_bits) - 1;
1550 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
1551 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
1552 
1553 	/*
1554 	 * Because of the variability in timing behavior across systems with
1555 	 * different architectures, we cannot allow arbitrarily low
1556 	 * resolutions. The minimum resolution has to be determined in a
1557 	 * platform-specific way. Until then, we define a blanket minimum
1558 	 * resolution for callouts of CALLOUT_MIN_RESOLUTION.
1559 	 *
1560 	 * If, in the future, someone requires lower resolution timers, they
1561 	 * can do one of two things:
1562 	 *
1563 	 *	- Define a lower value for callout_min_resolution. This would
1564 	 *	  affect all clients of the callout subsystem. If this done
1565 	 *	  via /etc/system, then no code changes are required and it
1566 	 *	  would affect only that customer.
1567 	 *
1568 	 *	- Define a flag to be passed to timeout creation that allows
1569 	 *	  the lower resolution. This involves code changes. But it
1570 	 *	  would affect only the calling module. It is the developer's
1571 	 *	  responsibility to test on all systems and make sure that
1572 	 *	  everything works.
1573 	 */
1574 	if (callout_min_resolution <= 0)
1575 		callout_min_resolution = CALLOUT_MIN_RESOLUTION;
1576 
1577 	/*
1578 	 * Allocate all the callout tables based on max_ncpus. We have chosen
1579 	 * to do boot-time allocation instead of dynamic allocation because:
1580 	 *
1581 	 *	- the size of the callout tables is not too large.
1582 	 *	- there are race conditions involved in making this dynamic.
1583 	 *	- the hash tables that go with the callout tables consume
1584 	 *	  most of the memory and they are only allocated in
1585 	 *	  callout_cpu_online().
1586 	 *
1587 	 * Each CPU has two tables that are consecutive in the array. The first
1588 	 * one is for realtime callouts and the second one is for normal ones.
1589 	 *
1590 	 * We do this alignment dance to make sure that callout table
1591 	 * structures will always be on a cache line boundary.
1592 	 */
1593 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
1594 	size += CALLOUT_ALIGN;
1595 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
1596 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
1597 
1598 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
1599 	/*
1600 	 * Now, initialize the tables for all the CPUs.
1601 	 */
1602 	for (f = 0; f < max_ncpus; f++) {
1603 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1604 			table_id = CALLOUT_TABLE(t, f);
1605 			ct = &callout_table[table_id];
1606 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1607 			/*
1608 			 * Precompute the base IDs for long and short-term
1609 			 * legacy IDs. This makes ID generation during
1610 			 * timeout() fast.
1611 			 */
1612 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
1613 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
1614 			/*
1615 			 * Precompute the base ID for generation-based IDs.
1616 			 * Note that when the first ID gets allocated, the
1617 			 * ID will wrap. This will cause the generation
1618 			 * number to be incremented to 1.
1619 			 */
1620 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
1621 			/*
1622 			 * Initialize the cyclic as NONE. This will get set
1623 			 * during CPU online. This is so that partially
1624 			 * populated systems will only have the required
1625 			 * number of cyclics, not more.
1626 			 */
1627 			ct->ct_cyclic = CYCLIC_NONE;
1628 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
1629 		}
1630 	}
1631 
1632 	/*
1633 	 * Add the callback for CPR. This is called during checkpoint
1634 	 * resume to suspend and resume callouts.
1635 	 */
1636 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
1637 	    "callout_cpr");
1638 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
1639 	    "callout_debug");
1640 
1641 	/*
1642 	 * Call the per-CPU initialization function for the boot CPU. This
1643 	 * is done here because the function is not called automatically for
1644 	 * the boot CPU from the CPU online/offline hooks. Note that the
1645 	 * CPU lock is taken here because of convention.
1646 	 */
1647 	mutex_enter(&cpu_lock);
1648 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
1649 	callout_cpu_online(CPU);
1650 	mutex_exit(&cpu_lock);
1651 }
1652