xref: /illumos-gate/usr/src/uts/common/os/callout.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 /*
41  * Callout tables.  See timeout(9F) for details.
42  */
43 static int callout_threads;			/* callout normal threads */
44 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
45 static int callout_min_reap;			/* callout minimum reap count */
46 static int callout_tolerance;			/* callout hires tolerance */
47 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
48 static clock_t callout_max_ticks;		/* max interval */
49 static hrtime_t callout_longterm;		/* longterm nanoseconds */
50 static ulong_t callout_counter_low;		/* callout ID increment */
51 static ulong_t callout_table_bits;		/* number of table bits in ID */
52 static ulong_t callout_table_mask;		/* mask for the table bits */
53 static callout_cache_t *callout_caches;		/* linked list of caches */
54 #pragma align 64(callout_table)
55 static callout_table_t *callout_table;		/* global callout table array */
56 
57 /*
58  * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
59  * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
60  * via taskq, to a thread that executes at PIL 0 - so we end up running
61  * 'normal' callouts at PIL 0.
62  */
63 static volatile int callout_realtime_level = CY_LOW_LEVEL;
64 static volatile int callout_normal_level = CY_LOCK_LEVEL;
65 
66 static char *callout_kstat_names[] = {
67 	"callout_timeouts",
68 	"callout_timeouts_pending",
69 	"callout_untimeouts_unexpired",
70 	"callout_untimeouts_executing",
71 	"callout_untimeouts_expired",
72 	"callout_expirations",
73 	"callout_allocations",
74 	"callout_cleanups",
75 };
76 
77 static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);
78 
79 #define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
80 {							\
81 	callout_hash_t *hashp = &(hash);		\
82 							\
83 	cp->cprev = NULL;				\
84 	cp->cnext = hashp->ch_head;			\
85 	if (hashp->ch_head == NULL)			\
86 		hashp->ch_tail = cp;			\
87 	else						\
88 		cp->cnext->cprev = cp;			\
89 	hashp->ch_head = cp;				\
90 }
91 
92 #define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
93 {							\
94 	callout_hash_t *hashp = &(hash);		\
95 							\
96 	cp->cnext = NULL;				\
97 	cp->cprev = hashp->ch_tail;			\
98 	if (hashp->ch_tail == NULL)			\
99 		hashp->ch_head = cp;			\
100 	else						\
101 		cp->cprev->cnext = cp;			\
102 	hashp->ch_tail = cp;				\
103 }
104 
105 #define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
106 {							\
107 	callout_hash_t *hashp = &(hash);		\
108 							\
109 	if (cp->cnext == NULL)				\
110 		hashp->ch_tail = cp->cprev;		\
111 	else						\
112 		cp->cnext->cprev = cp->cprev;		\
113 	if (cp->cprev == NULL)				\
114 		hashp->ch_head = cp->cnext;		\
115 	else						\
116 		cp->cprev->cnext = cp->cnext;		\
117 }
118 
119 /*
120  * These definitions help us queue callouts and callout lists. Here is
121  * the queueing rationale:
122  *
123  *	- callouts are queued in a FIFO manner in the ID hash table.
124  *	  TCP timers are typically cancelled in the same order that they
125  *	  were issued. The FIFO queueing shortens the search for a callout
126  *	  during untimeout().
127  *
128  *	- callouts are queued in a FIFO manner in their callout lists.
129  *	  This ensures that the callouts are executed in the same order that
130  *	  they were queued. This is fair. Plus, it helps to make each
131  *	  callout expiration timely. It also favors cancellations.
132  *
133  *	- callout lists are queued in the following manner in the callout
134  *	  hash table buckets:
135  *
136  *		- appended, if the callout list is a 1-nanosecond resolution
137  *		  callout list. When a callout is created, we first look for
138  *		  a callout list that has the same expiration so we can avoid
139  *		  allocating a callout list and inserting the expiration into
140  *		  the heap. However, we do not want to look at 1-nanosecond
141  *		  resolution callout lists as we will seldom find a match in
142  *		  them. Keeping these callout lists in the rear of the hash
143  *		  buckets allows us to skip these during the lookup.
144  *
145  *		- inserted at the beginning, if the callout list is not a
146  *		  1-nanosecond resolution callout list. This also has the
147  *		  side-effect of keeping the long term timers away from the
148  *		  front of the buckets.
149  *
150  *	- callout lists are queued in a FIFO manner in the expired callouts
151  *	  list. This ensures that callout lists are executed in the order
152  *	  of expiration.
153  */
154 #define	CALLOUT_APPEND(ct, cp)						\
155 	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
156 		cp, c_idnext, c_idprev);				\
157 	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
158 
159 #define	CALLOUT_DELETE(ct, cp)						\
160 	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
161 		cp, c_idnext, c_idprev);				\
162 	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
163 
164 #define	CALLOUT_LIST_INSERT(hash, cl)				\
165 	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
166 
167 #define	CALLOUT_LIST_APPEND(hash, cl)				\
168 	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
169 
170 #define	CALLOUT_LIST_DELETE(hash, cl)				\
171 	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
172 
173 /*
174  * For normal callouts, there is a deadlock scenario if two callouts that
175  * have an inter-dependency end up on the same callout list. To break the
176  * deadlock, you need two taskq threads running in parallel. We compute
177  * the number of taskq threads here using a bunch of conditions to make
178  * it optimal for the common case. This is an ugly hack, but one that is
179  * necessary (sigh).
180  */
181 #define	CALLOUT_THRESHOLD	100000000
182 #define	CALLOUT_EXEC_COMPUTE(ct, exec)					\
183 {									\
184 	callout_list_t *cl;						\
185 									\
186 	cl = ct->ct_expired.ch_head;					\
187 	if (cl == NULL) {						\
188 		/*							\
189 		 * If the expired list is NULL, there is nothing to	\
190 		 * process.						\
191 		 */							\
192 		exec = 0;						\
193 	} else if ((cl->cl_next == NULL) &&				\
194 	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
195 		/*							\
196 		 * If there is only one callout list and it contains	\
197 		 * only one callout, there is no need for two threads.	\
198 		 */							\
199 		exec = 1;						\
200 	} else if ((ct->ct_heap_num == 0) ||				\
201 	    (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
202 		/*							\
203 		 * If the heap has become empty, we need two threads as	\
204 		 * there is no one to kick off the second thread in the	\
205 		 * future. If the heap is not empty and the top of the	\
206 		 * heap does not expire in the near future, we need two	\
207 		 * threads.						\
208 		 */							\
209 		exec = 2;						\
210 	} else {							\
211 		/*							\
212 		 * We have multiple callouts to process. But the cyclic	\
213 		 * will fire in the near future. So, we only need one	\
214 		 * thread for now.					\
215 		 */							\
216 		exec = 1;						\
217 	}								\
218 }
219 
220 /*
221  * Macro to swap two heap items.
222  */
223 #define	CALLOUT_SWAP(h1, h2)		\
224 {					\
225 	callout_heap_t tmp;		\
226 					\
227 	tmp = *h1;			\
228 	*h1 = *h2;			\
229 	*h2 = tmp;			\
230 }
231 
232 /*
233  * Macro to free a callout list.
234  */
235 #define	CALLOUT_LIST_FREE(ct, cl)			\
236 {							\
237 	cl->cl_next = ct->ct_lfree;			\
238 	ct->ct_lfree = cl;				\
239 	cl->cl_flags |= CALLOUT_LIST_FLAG_FREE;		\
240 }
241 
242 /*
243  * Allocate a callout structure.  We try quite hard because we
244  * can't sleep, and if we can't do the allocation, we're toast.
245  * Failing all, we try a KM_PANIC allocation. Note that we never
246  * deallocate a callout. See untimeout() for the reasoning.
247  */
248 static callout_t *
249 callout_alloc(callout_table_t *ct)
250 {
251 	size_t size;
252 	callout_t *cp;
253 
254 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
255 	mutex_exit(&ct->ct_mutex);
256 
257 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
258 	if (cp == NULL) {
259 		size = sizeof (callout_t);
260 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
261 	}
262 	cp->c_xid = 0;
263 	cp->c_executor = NULL;
264 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
265 	cp->c_waiting = 0;
266 
267 	mutex_enter(&ct->ct_mutex);
268 	ct->ct_allocations++;
269 	return (cp);
270 }
271 
272 /*
273  * Allocate a callout list structure.  We try quite hard because we
274  * can't sleep, and if we can't do the allocation, we're toast.
275  * Failing all, we try a KM_PANIC allocation. Note that we never
276  * deallocate a callout list.
277  */
278 static void
279 callout_list_alloc(callout_table_t *ct)
280 {
281 	size_t size;
282 	callout_list_t *cl;
283 
284 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
285 	mutex_exit(&ct->ct_mutex);
286 
287 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
288 	if (cl == NULL) {
289 		size = sizeof (callout_list_t);
290 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
291 	}
292 	bzero(cl, sizeof (callout_list_t));
293 
294 	mutex_enter(&ct->ct_mutex);
295 	CALLOUT_LIST_FREE(ct, cl);
296 }
297 
298 /*
299  * Find a callout list that corresponds to an expiration and matching flags.
300  */
301 static callout_list_t *
302 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
303 {
304 	callout_list_t *cl;
305 	int clflags;
306 
307 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
308 
309 	if (flags & CALLOUT_LIST_FLAG_NANO) {
310 		/*
311 		 * This is a 1-nanosecond resolution callout. We will rarely
312 		 * find a match for this. So, bail out.
313 		 */
314 		return (NULL);
315 	}
316 
317 	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
318 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
319 		/*
320 		 * If we have reached a 1-nanosecond resolution callout list,
321 		 * we don't have much hope of finding a match in this hash
322 		 * bucket. So, just bail out.
323 		 */
324 		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
325 			return (NULL);
326 
327 		if ((cl->cl_expiration == expiration) &&
328 		    ((cl->cl_flags & clflags) == (flags & clflags)))
329 			return (cl);
330 	}
331 
332 	return (NULL);
333 }
334 
335 /*
336  * Initialize a callout table's heap, if necessary. Preallocate some free
337  * entries so we don't have to check for NULL elsewhere.
338  */
339 static void
340 callout_heap_init(callout_table_t *ct)
341 {
342 	size_t size;
343 
344 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
345 	ASSERT(ct->ct_heap == NULL);
346 
347 	ct->ct_heap_num = 0;
348 	ct->ct_heap_max = CALLOUT_CHUNK;
349 	size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
350 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
351 }
352 
353 /*
354  * Reallocate the heap. We try quite hard because we can't sleep, and if
355  * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
356  * allocation. Note that the heap only expands, it never contracts.
357  */
358 static void
359 callout_heap_expand(callout_table_t *ct)
360 {
361 	size_t max, size, osize;
362 	callout_heap_t *heap;
363 
364 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
365 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
366 
367 	while (ct->ct_heap_num == ct->ct_heap_max) {
368 		max = ct->ct_heap_max;
369 		mutex_exit(&ct->ct_mutex);
370 
371 		osize = sizeof (callout_heap_t) * max;
372 		size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
373 		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
374 
375 		mutex_enter(&ct->ct_mutex);
376 		if (max < ct->ct_heap_max) {
377 			/*
378 			 * Someone beat us to the allocation. Free what we
379 			 * just allocated and proceed.
380 			 */
381 			kmem_free(heap, size);
382 			continue;
383 		}
384 
385 		bcopy(ct->ct_heap, heap, osize);
386 		kmem_free(ct->ct_heap, osize);
387 		ct->ct_heap = heap;
388 		ct->ct_heap_max = size / sizeof (callout_heap_t);
389 	}
390 }
391 
392 /*
393  * Move an expiration from the bottom of the heap to its correct place
394  * in the heap. If we reached the root doing this, return 1. Else,
395  * return 0.
396  */
397 static int
398 callout_upheap(callout_table_t *ct)
399 {
400 	int current, parent;
401 	callout_heap_t *heap, *hcurrent, *hparent;
402 
403 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
404 	ASSERT(ct->ct_heap_num >= 1);
405 
406 	if (ct->ct_heap_num == 1) {
407 		return (1);
408 	}
409 
410 	heap = ct->ct_heap;
411 	current = ct->ct_heap_num - 1;
412 
413 	for (;;) {
414 		parent = CALLOUT_HEAP_PARENT(current);
415 		hparent = &heap[parent];
416 		hcurrent = &heap[current];
417 
418 		/*
419 		 * We have an expiration later than our parent; we're done.
420 		 */
421 		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
422 			return (0);
423 		}
424 
425 		/*
426 		 * We need to swap with our parent, and continue up the heap.
427 		 */
428 		CALLOUT_SWAP(hparent, hcurrent);
429 
430 		/*
431 		 * If we just reached the root, we're done.
432 		 */
433 		if (parent == 0) {
434 			return (1);
435 		}
436 
437 		current = parent;
438 	}
439 	/*NOTREACHED*/
440 }
441 
442 /*
443  * Insert a new heap item into a callout table's heap.
444  */
445 static void
446 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
447 {
448 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
449 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
450 
451 	/*
452 	 * First, copy the expiration and callout list pointer to the bottom
453 	 * of the heap.
454 	 */
455 	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
456 	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
457 	ct->ct_heap_num++;
458 
459 	/*
460 	 * Now, perform an upheap operation. If we reached the root, then
461 	 * the cyclic needs to be reprogrammed as we have an earlier
462 	 * expiration.
463 	 *
464 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
465 	 * We don't want any callout activity. When the CPR resume phase is
466 	 * entered, the cyclic will be programmed for the earliest expiration
467 	 * in the heap.
468 	 */
469 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
470 		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
471 }
472 
473 /*
474  * Move an expiration from the top of the heap to its correct place
475  * in the heap.
476  */
477 static void
478 callout_downheap(callout_table_t *ct)
479 {
480 	int current, left, right, nelems;
481 	callout_heap_t *heap, *hleft, *hright, *hcurrent;
482 
483 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
484 	ASSERT(ct->ct_heap_num >= 1);
485 
486 	heap = ct->ct_heap;
487 	current = 0;
488 	nelems = ct->ct_heap_num;
489 
490 	for (;;) {
491 		/*
492 		 * If we don't have a left child (i.e., we're a leaf), we're
493 		 * done.
494 		 */
495 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
496 			return;
497 
498 		hleft = &heap[left];
499 		hcurrent = &heap[current];
500 
501 		right = CALLOUT_HEAP_RIGHT(current);
502 
503 		/*
504 		 * Even if we don't have a right child, we still need to compare
505 		 * our expiration against that of our left child.
506 		 */
507 		if (right >= nelems)
508 			goto comp_left;
509 
510 		hright = &heap[right];
511 
512 		/*
513 		 * We have both a left and a right child.  We need to compare
514 		 * the expiration of the children to determine which
515 		 * expires earlier.
516 		 */
517 		if (hright->ch_expiration < hleft->ch_expiration) {
518 			/*
519 			 * Our right child is the earlier of our children.
520 			 * We'll now compare our expiration to its expiration.
521 			 * If ours is the earlier one, we're done.
522 			 */
523 			if (hcurrent->ch_expiration <= hright->ch_expiration)
524 				return;
525 
526 			/*
527 			 * Our right child expires earlier than we do; swap
528 			 * with our right child, and descend right.
529 			 */
530 			CALLOUT_SWAP(hright, hcurrent);
531 			current = right;
532 			continue;
533 		}
534 
535 comp_left:
536 		/*
537 		 * Our left child is the earlier of our children (or we have
538 		 * no right child).  We'll now compare our expiration
539 		 * to its expiration. If ours is the earlier one, we're done.
540 		 */
541 		if (hcurrent->ch_expiration <= hleft->ch_expiration)
542 			return;
543 
544 		/*
545 		 * Our left child expires earlier than we do; swap with our
546 		 * left child, and descend left.
547 		 */
548 		CALLOUT_SWAP(hleft, hcurrent);
549 		current = left;
550 	}
551 }
552 
553 /*
554  * Delete and handle all past expirations in a callout table's heap.
555  */
556 static void
557 callout_heap_delete(callout_table_t *ct)
558 {
559 	hrtime_t now, expiration, next;
560 	callout_list_t *cl;
561 	callout_heap_t *heap;
562 	int hash;
563 
564 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
565 
566 	if (CALLOUT_CLEANUP(ct)) {
567 		/*
568 		 * There are too many heap elements pointing to empty callout
569 		 * lists. Clean them out.
570 		 */
571 		(void) callout_heap_process(ct, 0, 0);
572 	}
573 
574 	now = gethrtime();
575 	heap = ct->ct_heap;
576 
577 	while (ct->ct_heap_num > 0) {
578 		expiration = heap->ch_expiration;
579 		hash = CALLOUT_CLHASH(expiration);
580 		cl = heap->ch_list;
581 		ASSERT(expiration == cl->cl_expiration);
582 
583 		if (cl->cl_callouts.ch_head == NULL) {
584 			/*
585 			 * If the callout list is empty, reap it.
586 			 * Decrement the reap count.
587 			 */
588 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
589 			CALLOUT_LIST_FREE(ct, cl);
590 			ct->ct_nreap--;
591 		} else {
592 			/*
593 			 * If the root of the heap expires in the future,
594 			 * bail out.
595 			 */
596 			if (expiration > now)
597 				break;
598 
599 			/*
600 			 * Move the callout list for this expiration to the
601 			 * list of expired callout lists. It will be processed
602 			 * by the callout executor.
603 			 */
604 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
605 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
606 		}
607 
608 		/*
609 		 * Now delete the root. This is done by swapping the root with
610 		 * the last item in the heap and downheaping the item.
611 		 */
612 		ct->ct_heap_num--;
613 		if (ct->ct_heap_num > 0) {
614 			heap[0] = heap[ct->ct_heap_num];
615 			callout_downheap(ct);
616 		}
617 	}
618 
619 	/*
620 	 * If this callout table is empty or callouts have been suspended,
621 	 * just return. The cyclic has already been programmed to
622 	 * infinity by the cyclic subsystem.
623 	 */
624 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
625 		return;
626 
627 	/*
628 	 * If the top expirations are within callout_tolerance of each other,
629 	 * delay the cyclic expire so that they can be processed together.
630 	 * This is to prevent high resolution timers from swamping the system
631 	 * with cyclic activity.
632 	 */
633 	if (ct->ct_heap_num > 2) {
634 		next = expiration + callout_tolerance;
635 		if ((heap[1].ch_expiration < next) ||
636 		    (heap[2].ch_expiration < next))
637 			expiration = next;
638 	}
639 
640 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
641 }
642 
643 /*
644  * There are some situations when the entire heap is walked and processed.
645  * This function is called to do the processing. These are the situations:
646  *
647  * 1. When the reap count reaches its threshold, the heap has to be cleared
648  *    of all empty callout lists.
649  *
650  * 2. When the system enters and exits KMDB/OBP, all entries in the heap
651  *    need to be adjusted by the interval spent in KMDB/OBP.
652  *
653  * 3. When system time is changed, the heap has to be scanned for
654  *    absolute hrestime timers. These need to be removed from the heap
655  *    and expired immediately.
656  *
657  * In cases 2 and 3, it is a good idea to do 1 as well since we are
658  * scanning the heap anyway.
659  *
660  * If the root gets changed and/or callout lists are expired, return the
661  * new expiration to the caller so he can reprogram the cyclic accordingly.
662  */
663 static hrtime_t
664 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
665 {
666 	callout_heap_t *heap;
667 	callout_list_t *cl, *rootcl;
668 	hrtime_t expiration, now;
669 	int i, hash, clflags, expired;
670 	ulong_t num;
671 
672 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
673 
674 	if (ct->ct_heap_num == 0)
675 		return (0);
676 
677 	if (ct->ct_nreap > 0)
678 		ct->ct_cleanups++;
679 
680 	heap = ct->ct_heap;
681 	rootcl = heap->ch_list;
682 
683 	/*
684 	 * We walk the heap from the top to the bottom. If we encounter
685 	 * a heap item that points to an empty callout list, we clean
686 	 * it out. If we encounter a hrestime entry that must be removed,
687 	 * again we clean it out. Otherwise, we apply any adjustments needed
688 	 * to an element.
689 	 *
690 	 * During the walk, we also compact the heap from the bottom and
691 	 * reconstruct the heap using upheap operations. This is very
692 	 * efficient if the number of elements to be cleaned is greater than
693 	 * or equal to half the heap. This is the common case.
694 	 *
695 	 * Even in the non-common case, the upheap operations should be short
696 	 * as the entries below generally tend to be bigger than the entries
697 	 * above.
698 	 */
699 	num = ct->ct_heap_num;
700 	ct->ct_heap_num = 0;
701 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
702 	now = gethrtime();
703 	expired = 0;
704 	for (i = 0; i < num; i++) {
705 		cl = heap[i].ch_list;
706 		/*
707 		 * If the callout list is empty, delete the heap element and
708 		 * free the callout list.
709 		 */
710 		if (cl->cl_callouts.ch_head == NULL) {
711 			hash = CALLOUT_CLHASH(cl->cl_expiration);
712 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
713 			CALLOUT_LIST_FREE(ct, cl);
714 			continue;
715 		}
716 
717 		/*
718 		 * Delete the heap element and expire the callout list, if
719 		 * one of the following is true:
720 		 *	- the callout list has expired
721 		 *	- the callout list is an absolute hrestime one and
722 		 *	  there has been a system time change
723 		 */
724 		if ((cl->cl_expiration <= now) ||
725 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
726 			hash = CALLOUT_CLHASH(cl->cl_expiration);
727 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
728 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
729 			expired = 1;
730 			continue;
731 		}
732 
733 		/*
734 		 * Apply adjustments, if any. Adjustments are applied after
735 		 * the system returns from KMDB or OBP. They are only applied
736 		 * to relative callout lists.
737 		 */
738 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
739 			hash = CALLOUT_CLHASH(cl->cl_expiration);
740 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
741 			expiration = cl->cl_expiration + delta;
742 			if (expiration <= 0)
743 				expiration = CY_INFINITY;
744 			heap[i].ch_expiration = expiration;
745 			cl->cl_expiration = expiration;
746 			hash = CALLOUT_CLHASH(cl->cl_expiration);
747 			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
748 				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
749 			} else {
750 				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
751 			}
752 		}
753 
754 		heap[ct->ct_heap_num] = heap[i];
755 		ct->ct_heap_num++;
756 		(void) callout_upheap(ct);
757 	}
758 
759 	ct->ct_nreap = 0;
760 
761 	if (expired)
762 		expiration = gethrtime();
763 	else if (ct->ct_heap_num == 0)
764 		expiration = CY_INFINITY;
765 	else if (rootcl != heap->ch_list)
766 		expiration = heap->ch_expiration;
767 	else
768 		expiration = 0;
769 
770 	return (expiration);
771 }
772 
773 /*
774  * Common function used to create normal and realtime callouts.
775  *
776  * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
777  * there is one restriction on a realtime callout handler - it should not
778  * directly or indirectly acquire cpu_lock. CPU offline waits for pending
779  * cyclic handlers to complete while holding cpu_lock. So, if a realtime
780  * callout handler were to try to get cpu_lock, there would be a deadlock
781  * during CPU offline.
782  */
783 callout_id_t
784 timeout_generic(int type, void (*func)(void *), void *arg,
785 	hrtime_t expiration, hrtime_t resolution, int flags)
786 {
787 	callout_table_t *ct;
788 	callout_t *cp;
789 	callout_id_t id;
790 	callout_list_t *cl;
791 	hrtime_t now, interval, rexpiration;
792 	int hash, clflags;
793 
794 	ASSERT(resolution > 0);
795 	ASSERT(func != NULL);
796 
797 	/*
798 	 * We get the current hrtime right upfront so that latencies in
799 	 * this function do not affect the accuracy of the callout.
800 	 */
801 	now = gethrtime();
802 
803 	/*
804 	 * We disable kernel preemption so that we remain on the same CPU
805 	 * throughout. If we needed to reprogram the callout table's cyclic,
806 	 * we can avoid X-calls if we are on the same CPU.
807 	 *
808 	 * Note that callout_alloc() releases and reacquires the callout
809 	 * table mutex. While reacquiring the mutex, it is possible for us
810 	 * to go to sleep and later migrate to another CPU. This should be
811 	 * pretty rare, though.
812 	 */
813 	kpreempt_disable();
814 
815 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
816 	mutex_enter(&ct->ct_mutex);
817 
818 	if (ct->ct_cyclic == CYCLIC_NONE) {
819 		mutex_exit(&ct->ct_mutex);
820 		/*
821 		 * The callout table has not yet been initialized fully.
822 		 * So, put this one on the boot callout table which is
823 		 * always initialized.
824 		 */
825 		ct = &callout_boot_ct[type];
826 		mutex_enter(&ct->ct_mutex);
827 	}
828 
829 	if (CALLOUT_CLEANUP(ct)) {
830 		/*
831 		 * There are too many heap elements pointing to empty callout
832 		 * lists. Clean them out.
833 		 */
834 		rexpiration = callout_heap_process(ct, 0, 0);
835 		if ((rexpiration != 0) && (ct->ct_suspend == 0))
836 			(void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
837 	}
838 
839 	if ((cp = ct->ct_free) == NULL)
840 		cp = callout_alloc(ct);
841 	else
842 		ct->ct_free = cp->c_idnext;
843 
844 	cp->c_func = func;
845 	cp->c_arg = arg;
846 
847 	/*
848 	 * Compute the expiration hrtime.
849 	 */
850 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
851 		interval = expiration - now;
852 	} else {
853 		interval = expiration;
854 		expiration += now;
855 	}
856 
857 	if (resolution > 1) {
858 		/*
859 		 * Align expiration to the specified resolution.
860 		 */
861 		if (flags & CALLOUT_FLAG_ROUNDUP)
862 			expiration += resolution - 1;
863 		expiration = (expiration / resolution) * resolution;
864 	}
865 
866 	if (expiration <= 0) {
867 		/*
868 		 * expiration hrtime overflow has occurred. Just set the
869 		 * expiration to infinity.
870 		 */
871 		expiration = CY_INFINITY;
872 	}
873 
874 	/*
875 	 * Assign an ID to this callout
876 	 */
877 	if (flags & CALLOUT_FLAG_32BIT) {
878 		if (interval > callout_longterm) {
879 			id = (ct->ct_long_id - callout_counter_low);
880 			id |= CALLOUT_COUNTER_HIGH;
881 			ct->ct_long_id = id;
882 		} else {
883 			id = (ct->ct_short_id - callout_counter_low);
884 			id |= CALLOUT_COUNTER_HIGH;
885 			ct->ct_short_id = id;
886 		}
887 	} else {
888 		id = (ct->ct_gen_id - callout_counter_low);
889 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
890 			id |= CALLOUT_COUNTER_HIGH;
891 			id += CALLOUT_GENERATION_LOW;
892 		}
893 		ct->ct_gen_id = id;
894 	}
895 
896 	cp->c_xid = id;
897 
898 	clflags = 0;
899 	if (flags & CALLOUT_FLAG_ABSOLUTE)
900 		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
901 	if (flags & CALLOUT_FLAG_HRESTIME)
902 		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
903 	if (resolution == 1)
904 		clflags |= CALLOUT_LIST_FLAG_NANO;
905 	hash = CALLOUT_CLHASH(expiration);
906 
907 again:
908 	/*
909 	 * Try to see if a callout list already exists for this expiration.
910 	 */
911 	cl = callout_list_get(ct, expiration, clflags, hash);
912 	if (cl == NULL) {
913 		/*
914 		 * Check if we have enough space in the heap to insert one
915 		 * expiration. If not, expand the heap.
916 		 */
917 		if (ct->ct_heap_num == ct->ct_heap_max) {
918 			callout_heap_expand(ct);
919 			/*
920 			 * In the above call, we drop the lock, allocate and
921 			 * reacquire the lock. So, we could have been away
922 			 * for a while. In the meantime, someone could have
923 			 * inserted a callout list with the same expiration.
924 			 * So, the best course is to repeat the steps. This
925 			 * should be an infrequent event.
926 			 */
927 			goto again;
928 		}
929 
930 		/*
931 		 * Check the free list. If we don't find one, we have to
932 		 * take the slow path and allocate from kmem.
933 		 */
934 		if ((cl = ct->ct_lfree) == NULL) {
935 			callout_list_alloc(ct);
936 			/*
937 			 * In the above call, we drop the lock, allocate and
938 			 * reacquire the lock. So, we could have been away
939 			 * for a while. In the meantime, someone could have
940 			 * inserted a callout list with the same expiration.
941 			 * Plus, the heap could have become full. So, the best
942 			 * course is to repeat the steps. This should be an
943 			 * infrequent event.
944 			 */
945 			goto again;
946 		}
947 		ct->ct_lfree = cl->cl_next;
948 		cl->cl_expiration = expiration;
949 		cl->cl_flags = clflags;
950 
951 		if (clflags & CALLOUT_LIST_FLAG_NANO) {
952 			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
953 		} else {
954 			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
955 		}
956 
957 		/*
958 		 * This is a new expiration. So, insert it into the heap.
959 		 * This will also reprogram the cyclic, if the expiration
960 		 * propagated to the root of the heap.
961 		 */
962 		callout_heap_insert(ct, cl);
963 	} else {
964 		/*
965 		 * If the callout list was empty, untimeout_generic() would
966 		 * have incremented a reap count. Decrement the reap count
967 		 * as we are going to insert a callout into this list.
968 		 */
969 		if (cl->cl_callouts.ch_head == NULL)
970 			ct->ct_nreap--;
971 	}
972 	cp->c_list = cl;
973 	CALLOUT_APPEND(ct, cp);
974 
975 	ct->ct_timeouts++;
976 	ct->ct_timeouts_pending++;
977 
978 	mutex_exit(&ct->ct_mutex);
979 
980 	kpreempt_enable();
981 
982 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
983 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
984 	    cp);
985 
986 	return (id);
987 }
988 
989 timeout_id_t
990 timeout(void (*func)(void *), void *arg, clock_t delta)
991 {
992 	ulong_t id;
993 
994 	/*
995 	 * Make sure the callout runs at least 1 tick in the future.
996 	 */
997 	if (delta <= 0)
998 		delta = 1;
999 	else if (delta > callout_max_ticks)
1000 		delta = callout_max_ticks;
1001 
1002 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
1003 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1004 
1005 	return ((timeout_id_t)id);
1006 }
1007 
1008 /*
1009  * Convenience function that creates a normal callout with default parameters
1010  * and returns a full ID.
1011  */
1012 callout_id_t
1013 timeout_default(void (*func)(void *), void *arg, clock_t delta)
1014 {
1015 	callout_id_t id;
1016 
1017 	/*
1018 	 * Make sure the callout runs at least 1 tick in the future.
1019 	 */
1020 	if (delta <= 0)
1021 		delta = 1;
1022 	else if (delta > callout_max_ticks)
1023 		delta = callout_max_ticks;
1024 
1025 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
1026 	    nsec_per_tick, 0);
1027 
1028 	return (id);
1029 }
1030 
1031 timeout_id_t
1032 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
1033 {
1034 	ulong_t id;
1035 
1036 	/*
1037 	 * Make sure the callout runs at least 1 tick in the future.
1038 	 */
1039 	if (delta <= 0)
1040 		delta = 1;
1041 	else if (delta > callout_max_ticks)
1042 		delta = callout_max_ticks;
1043 
1044 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
1045 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
1046 
1047 	return ((timeout_id_t)id);
1048 }
1049 
1050 /*
1051  * Convenience function that creates a realtime callout with default parameters
1052  * and returns a full ID.
1053  */
1054 callout_id_t
1055 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
1056 {
1057 	callout_id_t id;
1058 
1059 	/*
1060 	 * Make sure the callout runs at least 1 tick in the future.
1061 	 */
1062 	if (delta <= 0)
1063 		delta = 1;
1064 	else if (delta > callout_max_ticks)
1065 		delta = callout_max_ticks;
1066 
1067 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
1068 	    nsec_per_tick, 0);
1069 
1070 	return (id);
1071 }
1072 
1073 hrtime_t
1074 untimeout_generic(callout_id_t id, int nowait)
1075 {
1076 	callout_table_t *ct;
1077 	callout_t *cp;
1078 	callout_id_t xid;
1079 	callout_list_t *cl;
1080 	int hash;
1081 	callout_id_t bogus;
1082 
1083 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
1084 	hash = CALLOUT_IDHASH(id);
1085 
1086 	mutex_enter(&ct->ct_mutex);
1087 
1088 	/*
1089 	 * Search the ID hash table for the callout.
1090 	 */
1091 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
1092 
1093 		xid = cp->c_xid;
1094 
1095 		/*
1096 		 * Match the ID and generation number.
1097 		 */
1098 		if ((xid & CALLOUT_ID_MASK) != id)
1099 			continue;
1100 
1101 		if ((xid & CALLOUT_EXECUTING) == 0) {
1102 			hrtime_t expiration;
1103 
1104 			/*
1105 			 * Delete the callout. If the callout list becomes
1106 			 * NULL, we don't remove it from the table. This is
1107 			 * so it can be reused. If the empty callout list
1108 			 * corresponds to the top of the the callout heap, we
1109 			 * don't reprogram the table cyclic here. This is in
1110 			 * order to avoid lots of X-calls to the CPU associated
1111 			 * with the callout table.
1112 			 */
1113 			cl = cp->c_list;
1114 			expiration = cl->cl_expiration;
1115 			CALLOUT_DELETE(ct, cp);
1116 			cp->c_idnext = ct->ct_free;
1117 			ct->ct_free = cp;
1118 			cp->c_xid |= CALLOUT_FREE;
1119 			ct->ct_untimeouts_unexpired++;
1120 			ct->ct_timeouts_pending--;
1121 
1122 			/*
1123 			 * If the callout list has become empty, it needs
1124 			 * to be cleaned along with its heap entry. Increment
1125 			 * a reap count.
1126 			 */
1127 			if (cl->cl_callouts.ch_head == NULL)
1128 				ct->ct_nreap++;
1129 			mutex_exit(&ct->ct_mutex);
1130 
1131 			expiration -= gethrtime();
1132 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
1133 			    "untimeout:ID %lx hrtime left %llx", id,
1134 			    expiration);
1135 			return (expiration < 0 ? 0 : expiration);
1136 		}
1137 
1138 		ct->ct_untimeouts_executing++;
1139 		/*
1140 		 * The callout we want to delete is currently executing.
1141 		 * The DDI states that we must wait until the callout
1142 		 * completes before returning, so we block on c_done until the
1143 		 * callout ID changes (to the old ID if it's on the freelist,
1144 		 * or to a new callout ID if it's in use).  This implicitly
1145 		 * assumes that callout structures are persistent (they are).
1146 		 */
1147 		if (cp->c_executor == curthread) {
1148 			/*
1149 			 * The timeout handler called untimeout() on itself.
1150 			 * Stupid, but legal.  We can't wait for the timeout
1151 			 * to complete without deadlocking, so we just return.
1152 			 */
1153 			mutex_exit(&ct->ct_mutex);
1154 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
1155 			    "untimeout_self:ID %x", id);
1156 			return (-1);
1157 		}
1158 		if (nowait == 0) {
1159 			/*
1160 			 * We need to wait. Indicate that we are waiting by
1161 			 * incrementing c_waiting. This prevents the executor
1162 			 * from doing a wakeup on c_done if there are no
1163 			 * waiters.
1164 			 */
1165 			while (cp->c_xid == xid) {
1166 				cp->c_waiting = 1;
1167 				cv_wait(&cp->c_done, &ct->ct_mutex);
1168 			}
1169 		}
1170 		mutex_exit(&ct->ct_mutex);
1171 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
1172 		    "untimeout_executing:ID %lx", id);
1173 		return (-1);
1174 	}
1175 	ct->ct_untimeouts_expired++;
1176 
1177 	mutex_exit(&ct->ct_mutex);
1178 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
1179 	    "untimeout_bogus_id:ID %lx", id);
1180 
1181 	/*
1182 	 * We didn't find the specified callout ID.  This means either
1183 	 * (1) the callout already fired, or (2) the caller passed us
1184 	 * a bogus value.  Perform a sanity check to detect case (2).
1185 	 */
1186 	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
1187 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
1188 		panic("untimeout: impossible timeout id %llx",
1189 		    (unsigned long long)id);
1190 
1191 	return (-1);
1192 }
1193 
1194 clock_t
1195 untimeout(timeout_id_t id_arg)
1196 {
1197 	hrtime_t hleft;
1198 	clock_t tleft;
1199 	callout_id_t id;
1200 
1201 	id = (ulong_t)id_arg;
1202 	hleft = untimeout_generic(id, 0);
1203 	if (hleft < 0)
1204 		tleft = -1;
1205 	else if (hleft == 0)
1206 		tleft = 0;
1207 	else
1208 		tleft = NSEC_TO_TICK(hleft);
1209 
1210 	return (tleft);
1211 }
1212 
1213 /*
1214  * Convenience function to untimeout a timeout with a full ID with default
1215  * parameters.
1216  */
1217 clock_t
1218 untimeout_default(callout_id_t id, int nowait)
1219 {
1220 	hrtime_t hleft;
1221 	clock_t tleft;
1222 
1223 	hleft = untimeout_generic(id, nowait);
1224 	if (hleft < 0)
1225 		tleft = -1;
1226 	else if (hleft == 0)
1227 		tleft = 0;
1228 	else
1229 		tleft = NSEC_TO_TICK(hleft);
1230 
1231 	return (tleft);
1232 }
1233 
1234 /*
1235  * Expire all the callouts queued in the specified callout list.
1236  */
1237 static void
1238 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
1239 {
1240 	callout_t *cp, *cnext;
1241 
1242 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1243 	ASSERT(cl != NULL);
1244 
1245 	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
1246 		/*
1247 		 * Multiple executor threads could be running at the same
1248 		 * time. If this callout is already being executed,
1249 		 * go on to the next one.
1250 		 */
1251 		if (cp->c_xid & CALLOUT_EXECUTING) {
1252 			cnext = cp->c_clnext;
1253 			continue;
1254 		}
1255 
1256 		/*
1257 		 * Indicate to untimeout() that a callout is
1258 		 * being expired by the executor.
1259 		 */
1260 		cp->c_xid |= CALLOUT_EXECUTING;
1261 		cp->c_executor = curthread;
1262 		mutex_exit(&ct->ct_mutex);
1263 
1264 		DTRACE_PROBE1(callout__start, callout_t *, cp);
1265 		(*cp->c_func)(cp->c_arg);
1266 		DTRACE_PROBE1(callout__end, callout_t *, cp);
1267 
1268 		mutex_enter(&ct->ct_mutex);
1269 
1270 		ct->ct_expirations++;
1271 		ct->ct_timeouts_pending--;
1272 		/*
1273 		 * Indicate completion for c_done.
1274 		 */
1275 		cp->c_xid &= ~CALLOUT_EXECUTING;
1276 		cp->c_executor = NULL;
1277 		cnext = cp->c_clnext;
1278 
1279 		/*
1280 		 * Delete callout from ID hash table and the callout
1281 		 * list, return to freelist, and tell any untimeout() that
1282 		 * cares that we're done.
1283 		 */
1284 		CALLOUT_DELETE(ct, cp);
1285 		cp->c_idnext = ct->ct_free;
1286 		ct->ct_free = cp;
1287 		cp->c_xid |= CALLOUT_FREE;
1288 
1289 		if (cp->c_waiting) {
1290 			cp->c_waiting = 0;
1291 			cv_broadcast(&cp->c_done);
1292 		}
1293 	}
1294 }
1295 
1296 /*
1297  * Execute all expired callout lists for a callout table.
1298  */
1299 static void
1300 callout_expire(callout_table_t *ct)
1301 {
1302 	callout_list_t *cl, *clnext;
1303 
1304 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1305 
1306 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1307 		/*
1308 		 * Expire all the callouts in this callout list.
1309 		 */
1310 		callout_list_expire(ct, cl);
1311 
1312 		clnext = cl->cl_next;
1313 		if (cl->cl_callouts.ch_head == NULL) {
1314 			/*
1315 			 * Free the callout list.
1316 			 */
1317 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1318 			CALLOUT_LIST_FREE(ct, cl);
1319 		}
1320 	}
1321 }
1322 
1323 /*
1324  * The cyclic handlers below process callouts in two steps:
1325  *
1326  *	1. Find all expired callout lists and queue them in a separate
1327  *	   list of expired callouts.
1328  *	2. Execute the expired callout lists.
1329  *
1330  * This is done for two reasons:
1331  *
1332  *	1. We want to quickly find the next earliest expiration to program
1333  *	   the cyclic to and reprogram it. We can do this right at the end
1334  *	   of step 1.
1335  *	2. The realtime cyclic handler expires callouts in place. However,
1336  *	   for normal callouts, callouts are expired by a taskq thread.
1337  *	   So, it is simpler and more robust to have the taskq thread just
1338  *	   do step 2.
1339  */
1340 
1341 /*
1342  * Realtime callout cyclic handler.
1343  */
1344 void
1345 callout_realtime(callout_table_t *ct)
1346 {
1347 	mutex_enter(&ct->ct_mutex);
1348 	callout_heap_delete(ct);
1349 	callout_expire(ct);
1350 	mutex_exit(&ct->ct_mutex);
1351 }
1352 
1353 void
1354 callout_execute(callout_table_t *ct)
1355 {
1356 	mutex_enter(&ct->ct_mutex);
1357 	callout_expire(ct);
1358 	mutex_exit(&ct->ct_mutex);
1359 }
1360 
1361 /*
1362  * Normal callout cyclic handler.
1363  */
1364 void
1365 callout_normal(callout_table_t *ct)
1366 {
1367 	int i, exec;
1368 
1369 	mutex_enter(&ct->ct_mutex);
1370 	callout_heap_delete(ct);
1371 	CALLOUT_EXEC_COMPUTE(ct, exec);
1372 	mutex_exit(&ct->ct_mutex);
1373 
1374 	for (i = 0; i < exec; i++) {
1375 		ASSERT(ct->ct_taskq != NULL);
1376 		(void) taskq_dispatch(ct->ct_taskq,
1377 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1378 	}
1379 }
1380 
1381 /*
1382  * Suspend callout processing.
1383  */
1384 static void
1385 callout_suspend(void)
1386 {
1387 	int t, f;
1388 	callout_table_t *ct;
1389 
1390 	/*
1391 	 * Traverse every callout table in the system and suspend callout
1392 	 * processing.
1393 	 *
1394 	 * We need to suspend all the tables (including the inactive ones)
1395 	 * so that if a table is made active while the suspend is still on,
1396 	 * the table remains suspended.
1397 	 */
1398 	for (f = 0; f < max_ncpus; f++) {
1399 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1400 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1401 
1402 			mutex_enter(&ct->ct_mutex);
1403 			ct->ct_suspend++;
1404 			if (ct->ct_cyclic == CYCLIC_NONE) {
1405 				mutex_exit(&ct->ct_mutex);
1406 				continue;
1407 			}
1408 			if (ct->ct_suspend == 1)
1409 				(void) cyclic_reprogram(ct->ct_cyclic,
1410 				    CY_INFINITY);
1411 			mutex_exit(&ct->ct_mutex);
1412 		}
1413 	}
1414 }
1415 
1416 /*
1417  * Resume callout processing.
1418  */
1419 static void
1420 callout_resume(hrtime_t delta, int timechange)
1421 {
1422 	hrtime_t exp;
1423 	int t, f;
1424 	callout_table_t *ct;
1425 
1426 	/*
1427 	 * Traverse every callout table in the system and resume callout
1428 	 * processing. For active tables, perform any hrtime adjustments
1429 	 * necessary.
1430 	 */
1431 	for (f = 0; f < max_ncpus; f++) {
1432 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1433 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1434 
1435 			mutex_enter(&ct->ct_mutex);
1436 			if (ct->ct_cyclic == CYCLIC_NONE) {
1437 				ct->ct_suspend--;
1438 				mutex_exit(&ct->ct_mutex);
1439 				continue;
1440 			}
1441 
1442 			/*
1443 			 * If a delta is specified, adjust the expirations in
1444 			 * the heap by delta. Also, if the caller indicates
1445 			 * a timechange, process that. This step also cleans
1446 			 * out any empty callout lists that might happen to
1447 			 * be there.
1448 			 */
1449 			(void) callout_heap_process(ct, delta, timechange);
1450 
1451 			ct->ct_suspend--;
1452 			if (ct->ct_suspend == 0) {
1453 				/*
1454 				 * If the expired list is non-empty, then have
1455 				 * the cyclic expire immediately. Else, program
1456 				 * the cyclic based on the heap.
1457 				 */
1458 				if (ct->ct_expired.ch_head != NULL)
1459 					exp = gethrtime();
1460 				else if (ct->ct_heap_num > 0)
1461 					exp = ct->ct_heap[0].ch_expiration;
1462 				else
1463 					exp = 0;
1464 				if (exp != 0)
1465 					(void) cyclic_reprogram(ct->ct_cyclic,
1466 					    exp);
1467 			}
1468 
1469 			mutex_exit(&ct->ct_mutex);
1470 		}
1471 	}
1472 }
1473 
1474 /*
1475  * Callback handler used by CPR to stop and resume callouts.
1476  * The cyclic subsystem saves and restores hrtime during CPR.
1477  * That is why callout_resume() is called with a 0 delta.
1478  * Although hrtime is the same, hrestime (system time) has
1479  * progressed during CPR. So, we have to indicate a time change
1480  * to expire the absolute hrestime timers.
1481  */
1482 /*ARGSUSED*/
1483 static boolean_t
1484 callout_cpr_callb(void *arg, int code)
1485 {
1486 	if (code == CB_CODE_CPR_CHKPT)
1487 		callout_suspend();
1488 	else
1489 		callout_resume(0, 1);
1490 
1491 	return (B_TRUE);
1492 }
1493 
1494 /*
1495  * Callback handler invoked when the debugger is entered or exited.
1496  */
1497 /*ARGSUSED*/
1498 static boolean_t
1499 callout_debug_callb(void *arg, int code)
1500 {
1501 	hrtime_t delta;
1502 
1503 	/*
1504 	 * When the system enters the debugger. make a note of the hrtime.
1505 	 * When it is resumed, compute how long the system was in the
1506 	 * debugger. This interval should not be counted for callouts.
1507 	 */
1508 	if (code == 0) {
1509 		callout_suspend();
1510 		callout_debug_hrtime = gethrtime();
1511 	} else {
1512 		delta = gethrtime() - callout_debug_hrtime;
1513 		callout_resume(delta, 0);
1514 	}
1515 
1516 	return (B_TRUE);
1517 }
1518 
1519 /*
1520  * Move the absolute hrestime callouts to the expired list. Then program the
1521  * table's cyclic to expire immediately so that the callouts can be executed
1522  * immediately.
1523  */
1524 static void
1525 callout_hrestime_one(callout_table_t *ct)
1526 {
1527 	hrtime_t expiration;
1528 
1529 	mutex_enter(&ct->ct_mutex);
1530 	if (ct->ct_heap_num == 0) {
1531 		mutex_exit(&ct->ct_mutex);
1532 		return;
1533 	}
1534 
1535 	/*
1536 	 * Walk the heap and process all the absolute hrestime entries.
1537 	 */
1538 	expiration = callout_heap_process(ct, 0, 1);
1539 
1540 	if ((expiration != 0) && (ct->ct_suspend == 0))
1541 		(void) cyclic_reprogram(ct->ct_cyclic, expiration);
1542 
1543 	mutex_exit(&ct->ct_mutex);
1544 }
1545 
1546 /*
1547  * This function is called whenever system time (hrestime) is changed
1548  * explicitly. All the HRESTIME callouts must be expired at once.
1549  */
1550 /*ARGSUSED*/
1551 void
1552 callout_hrestime(void)
1553 {
1554 	int t, f;
1555 	callout_table_t *ct;
1556 
1557 	/*
1558 	 * Traverse every callout table in the system and process the hrestime
1559 	 * callouts therein.
1560 	 *
1561 	 * We look at all the tables because we don't know which ones were
1562 	 * onlined and offlined in the past. The offlined tables may still
1563 	 * have active cyclics processing timers somewhere.
1564 	 */
1565 	for (f = 0; f < max_ncpus; f++) {
1566 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1567 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1568 			callout_hrestime_one(ct);
1569 		}
1570 	}
1571 }
1572 
1573 /*
1574  * Create the hash tables for this callout table.
1575  */
1576 static void
1577 callout_hash_init(callout_table_t *ct)
1578 {
1579 	size_t size;
1580 
1581 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1582 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1583 
1584 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1585 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1586 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1587 }
1588 
1589 /*
1590  * Create per-callout table kstats.
1591  */
1592 static void
1593 callout_kstat_init(callout_table_t *ct)
1594 {
1595 	callout_stat_type_t stat;
1596 	kstat_t *ct_kstats;
1597 	int ndx;
1598 
1599 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1600 	ASSERT(ct->ct_kstats == NULL);
1601 
1602 	ndx = ct - callout_table;
1603 	ct_kstats = kstat_create("unix", ndx, "callout",
1604 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1605 
1606 	if (ct_kstats == NULL) {
1607 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1608 		    (void *)ct);
1609 	} else {
1610 		ct_kstats->ks_data = ct->ct_kstat_data;
1611 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1612 			kstat_named_init(&ct->ct_kstat_data[stat],
1613 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1614 		ct->ct_kstats = ct_kstats;
1615 		kstat_install(ct_kstats);
1616 	}
1617 }
1618 
1619 static void
1620 callout_cyclic_init(callout_table_t *ct)
1621 {
1622 	cyc_handler_t hdlr;
1623 	cyc_time_t when;
1624 	processorid_t seqid;
1625 	int t;
1626 
1627 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1628 
1629 	t = CALLOUT_TABLE_TYPE(ct);
1630 	seqid = CALLOUT_TABLE_SEQID(ct);
1631 
1632 	/*
1633 	 * Create the taskq thread if the table type is normal.
1634 	 * Realtime tables are handled at PIL1 by a softint
1635 	 * handler.
1636 	 */
1637 	if (t == CALLOUT_NORMAL) {
1638 		ASSERT(ct->ct_taskq == NULL);
1639 		/*
1640 		 * Each callout thread consumes exactly one
1641 		 * task structure while active.  Therefore,
1642 		 * prepopulating with 2 * callout_threads tasks
1643 		 * ensures that there's at least one task per
1644 		 * thread that's either scheduled or on the
1645 		 * freelist.  In turn, this guarantees that
1646 		 * taskq_dispatch() will always either succeed
1647 		 * (because there's a free task structure) or
1648 		 * be unnecessary (because "callout_excute(ct)"
1649 		 * has already scheduled).
1650 		 */
1651 		ct->ct_taskq =
1652 		    taskq_create_instance("callout_taskq", seqid,
1653 		    callout_threads, maxclsyspri,
1654 		    2 * callout_threads, 2 * callout_threads,
1655 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1656 	}
1657 
1658 	/*
1659 	 * callouts can only be created in a table whose
1660 	 * cyclic has been initialized.
1661 	 */
1662 	ASSERT(ct->ct_heap_num == 0);
1663 
1664 	/*
1665 	 * Create the callout table cyclics.
1666 	 *
1667 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
1668 	 * handler executes at lock PIL. This is because there are cases
1669 	 * where code can block at PIL > 1 waiting for a normal callout handler
1670 	 * to unblock it directly or indirectly. If the normal cyclic were to
1671 	 * be executed at low PIL, it could get blocked out by the waiter
1672 	 * and cause a deadlock.
1673 	 */
1674 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1675 
1676 	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
1677 	if (ct->ct_type == CALLOUT_REALTIME)
1678 		hdlr.cyh_level = callout_realtime_level;
1679 	else
1680 		hdlr.cyh_level = callout_normal_level;
1681 	hdlr.cyh_arg = ct;
1682 	when.cyt_when = CY_INFINITY;
1683 	when.cyt_interval = CY_INFINITY;
1684 
1685 	ct->ct_cyclic = cyclic_add(&hdlr, &when);
1686 }
1687 
1688 void
1689 callout_cpu_online(cpu_t *cp)
1690 {
1691 	lgrp_handle_t hand;
1692 	callout_cache_t *cache;
1693 	char s[KMEM_CACHE_NAMELEN];
1694 	callout_table_t *ct;
1695 	processorid_t seqid;
1696 	int t;
1697 
1698 	ASSERT(MUTEX_HELD(&cpu_lock));
1699 
1700 	/*
1701 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1702 	 * Note that access to callout_caches is protected by cpu_lock.
1703 	 */
1704 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1705 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1706 		if (cache->cc_hand == hand)
1707 			break;
1708 	}
1709 
1710 	/*
1711 	 * If not found, create one. The caches are never destroyed.
1712 	 */
1713 	if (cache == NULL) {
1714 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1715 		cache->cc_hand = hand;
1716 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1717 		    (long)hand);
1718 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1719 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1720 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1721 		    (long)hand);
1722 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1723 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1724 		cache->cc_next = callout_caches;
1725 		callout_caches = cache;
1726 	}
1727 
1728 	seqid = cp->cpu_seqid;
1729 
1730 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1731 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1732 
1733 		mutex_enter(&ct->ct_mutex);
1734 		/*
1735 		 * Store convinience pointers to the kmem caches
1736 		 * in the callout table. These assignments should always be
1737 		 * done as callout tables can map to different physical
1738 		 * CPUs each time.
1739 		 */
1740 		ct->ct_cache = cache->cc_cache;
1741 		ct->ct_lcache = cache->cc_lcache;
1742 
1743 		/*
1744 		 * We use the heap pointer to check if stuff has been
1745 		 * initialized for this callout table.
1746 		 */
1747 		if (ct->ct_heap == NULL) {
1748 			callout_heap_init(ct);
1749 			callout_hash_init(ct);
1750 			callout_kstat_init(ct);
1751 			callout_cyclic_init(ct);
1752 		}
1753 
1754 		mutex_exit(&ct->ct_mutex);
1755 
1756 		/*
1757 		 * Move the cyclic to this CPU by doing a bind.
1758 		 */
1759 		cyclic_bind(ct->ct_cyclic, cp, NULL);
1760 	}
1761 }
1762 
1763 void
1764 callout_cpu_offline(cpu_t *cp)
1765 {
1766 	callout_table_t *ct;
1767 	processorid_t seqid;
1768 	int t;
1769 
1770 	ASSERT(MUTEX_HELD(&cpu_lock));
1771 
1772 	seqid = cp->cpu_seqid;
1773 
1774 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1775 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1776 
1777 		/*
1778 		 * Unbind the cyclic. This will allow the cyclic subsystem
1779 		 * to juggle the cyclic during CPU offline.
1780 		 */
1781 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
1782 	}
1783 }
1784 
1785 /*
1786  * This is called to perform per-CPU initialization for slave CPUs at
1787  * boot time.
1788  */
1789 void
1790 callout_mp_init(void)
1791 {
1792 	cpu_t *cp;
1793 
1794 	mutex_enter(&cpu_lock);
1795 
1796 	cp = cpu_active;
1797 	do {
1798 		callout_cpu_online(cp);
1799 	} while ((cp = cp->cpu_next_onln) != cpu_active);
1800 
1801 	mutex_exit(&cpu_lock);
1802 }
1803 
1804 /*
1805  * Initialize all callout tables.  Called at boot time just before clkstart().
1806  */
1807 void
1808 callout_init(void)
1809 {
1810 	int f, t;
1811 	size_t size;
1812 	int table_id;
1813 	callout_table_t *ct;
1814 	long bits, fanout;
1815 	uintptr_t buf;
1816 
1817 	/*
1818 	 * Initialize callout globals.
1819 	 */
1820 	bits = 0;
1821 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
1822 		bits++;
1823 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
1824 	callout_table_mask = (1 << callout_table_bits) - 1;
1825 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
1826 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
1827 	callout_max_ticks = CALLOUT_MAX_TICKS;
1828 	if (callout_min_reap == 0)
1829 		callout_min_reap = CALLOUT_MIN_REAP;
1830 
1831 	if (callout_tolerance <= 0)
1832 		callout_tolerance = CALLOUT_TOLERANCE;
1833 	if (callout_threads <= 0)
1834 		callout_threads = CALLOUT_THREADS;
1835 
1836 	/*
1837 	 * Allocate all the callout tables based on max_ncpus. We have chosen
1838 	 * to do boot-time allocation instead of dynamic allocation because:
1839 	 *
1840 	 *	- the size of the callout tables is not too large.
1841 	 *	- there are race conditions involved in making this dynamic.
1842 	 *	- the hash tables that go with the callout tables consume
1843 	 *	  most of the memory and they are only allocated in
1844 	 *	  callout_cpu_online().
1845 	 *
1846 	 * Each CPU has two tables that are consecutive in the array. The first
1847 	 * one is for realtime callouts and the second one is for normal ones.
1848 	 *
1849 	 * We do this alignment dance to make sure that callout table
1850 	 * structures will always be on a cache line boundary.
1851 	 */
1852 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
1853 	size += CALLOUT_ALIGN;
1854 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
1855 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
1856 
1857 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
1858 	/*
1859 	 * Now, initialize the tables for all the CPUs.
1860 	 */
1861 	for (f = 0; f < max_ncpus; f++) {
1862 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1863 			table_id = CALLOUT_TABLE(t, f);
1864 			ct = &callout_table[table_id];
1865 			ct->ct_type = t;
1866 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1867 			/*
1868 			 * Precompute the base IDs for long and short-term
1869 			 * legacy IDs. This makes ID generation during
1870 			 * timeout() fast.
1871 			 */
1872 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
1873 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
1874 			/*
1875 			 * Precompute the base ID for generation-based IDs.
1876 			 * Note that when the first ID gets allocated, the
1877 			 * ID will wrap. This will cause the generation
1878 			 * number to be incremented to 1.
1879 			 */
1880 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
1881 			/*
1882 			 * Initialize the cyclic as NONE. This will get set
1883 			 * during CPU online. This is so that partially
1884 			 * populated systems will only have the required
1885 			 * number of cyclics, not more.
1886 			 */
1887 			ct->ct_cyclic = CYCLIC_NONE;
1888 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
1889 		}
1890 	}
1891 
1892 	/*
1893 	 * Add the callback for CPR. This is called during checkpoint
1894 	 * resume to suspend and resume callouts.
1895 	 */
1896 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
1897 	    "callout_cpr");
1898 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
1899 	    "callout_debug");
1900 
1901 	/*
1902 	 * Call the per-CPU initialization function for the boot CPU. This
1903 	 * is done here because the function is not called automatically for
1904 	 * the boot CPU from the CPU online/offline hooks. Note that the
1905 	 * CPU lock is taken here because of convention.
1906 	 */
1907 	mutex_enter(&cpu_lock);
1908 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
1909 	callout_cpu_online(CPU);
1910 	mutex_exit(&cpu_lock);
1911 }
1912