xref: /freebsd/sys/kern/kern_timeout.c (revision 2e5b60079b7d8c3ca68f1390cd90f305e651f8d3)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	From: @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_callout_profiling.h"
41 #if defined(__arm__)
42 #include "opt_timer.h"
43 #endif
44 #include "opt_rss.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/bus.h>
49 #include <sys/callout.h>
50 #include <sys/file.h>
51 #include <sys/interrupt.h>
52 #include <sys/kernel.h>
53 #include <sys/ktr.h>
54 #include <sys/lock.h>
55 #include <sys/malloc.h>
56 #include <sys/mutex.h>
57 #include <sys/rmlock.h>
58 #include <sys/rwlock.h>
59 #include <sys/proc.h>
60 #include <sys/sdt.h>
61 #include <sys/sleepqueue.h>
62 #include <sys/sysctl.h>
63 #include <sys/smp.h>
64 
65 #ifdef SMP
66 #include <machine/cpu.h>
67 #endif
68 
69 #ifndef NO_EVENTTIMERS
70 DPCPU_DECLARE(sbintime_t, hardclocktime);
71 #endif
72 
73 SDT_PROVIDER_DEFINE(callout_execute);
74 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__start,
75     "struct callout *");
76 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__end,
77     "struct callout *");
78 
79 #ifdef CALLOUT_PROFILING
80 static int avg_depth;
81 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
82     "Average number of items examined per softclock call. Units = 1/1000");
83 static int avg_gcalls;
84 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
85     "Average number of Giant callouts made per softclock call. Units = 1/1000");
86 static int avg_lockcalls;
87 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
88     "Average number of lock callouts made per softclock call. Units = 1/1000");
89 static int avg_mpcalls;
90 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
91     "Average number of MP callouts made per softclock call. Units = 1/1000");
92 static int avg_depth_dir;
93 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
94     "Average number of direct callouts examined per callout_process call. "
95     "Units = 1/1000");
96 static int avg_lockcalls_dir;
97 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
98     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
99     "callout_process call. Units = 1/1000");
100 static int avg_mpcalls_dir;
101 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
102     0, "Average number of MP direct callouts made per callout_process call. "
103     "Units = 1/1000");
104 #endif
105 
106 static int ncallout;
107 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
108     "Number of entries in callwheel and size of timeout() preallocation");
109 
110 #ifdef	RSS
111 static int pin_default_swi = 1;
112 static int pin_pcpu_swi = 1;
113 #else
114 static int pin_default_swi = 0;
115 static int pin_pcpu_swi = 0;
116 #endif
117 
118 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
119     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
120 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
121     0, "Pin the per-CPU swis (except PCPU 0, which is also default");
122 
123 /*
124  * TODO:
125  *	allocate more timeout table slots when table overflows.
126  */
127 u_int callwheelsize, callwheelmask;
128 
129 typedef void callout_mutex_op_t(struct lock_object *);
130 typedef int callout_owned_op_t(struct lock_object *);
131 
132 struct callout_mutex_ops {
133 	callout_mutex_op_t *lock;
134 	callout_mutex_op_t *unlock;
135 	callout_owned_op_t *owned;
136 };
137 
138 enum {
139 	CALLOUT_LC_UNUSED_0,
140 	CALLOUT_LC_UNUSED_1,
141 	CALLOUT_LC_UNUSED_2,
142 	CALLOUT_LC_UNUSED_3,
143 	CALLOUT_LC_SPIN,
144 	CALLOUT_LC_MUTEX,
145 	CALLOUT_LC_RW,
146 	CALLOUT_LC_RM,
147 };
148 
149 static void
150 callout_mutex_op_none(struct lock_object *lock)
151 {
152 }
153 
154 static int
155 callout_owned_op_none(struct lock_object *lock)
156 {
157 	return (0);
158 }
159 
160 static void
161 callout_mutex_lock(struct lock_object *lock)
162 {
163 	mtx_lock((struct mtx *)lock);
164 }
165 
166 static void
167 callout_mutex_unlock(struct lock_object *lock)
168 {
169 	mtx_unlock((struct mtx *)lock);
170 }
171 
172 static void
173 callout_mutex_lock_spin(struct lock_object *lock)
174 {
175 	mtx_lock_spin((struct mtx *)lock);
176 }
177 
178 static void
179 callout_mutex_unlock_spin(struct lock_object *lock)
180 {
181 	mtx_unlock_spin((struct mtx *)lock);
182 }
183 
184 static int
185 callout_mutex_owned(struct lock_object *lock)
186 {
187 	return (mtx_owned((struct mtx *)lock));
188 }
189 
190 static void
191 callout_rm_wlock(struct lock_object *lock)
192 {
193 	rm_wlock((struct rmlock *)lock);
194 }
195 
196 static void
197 callout_rm_wunlock(struct lock_object *lock)
198 {
199 	rm_wunlock((struct rmlock *)lock);
200 }
201 
202 static int
203 callout_rm_owned(struct lock_object *lock)
204 {
205 	return (rm_wowned((struct rmlock *)lock));
206 }
207 
208 static void
209 callout_rw_wlock(struct lock_object *lock)
210 {
211 	rw_wlock((struct rwlock *)lock);
212 }
213 
214 static void
215 callout_rw_wunlock(struct lock_object *lock)
216 {
217 	rw_wunlock((struct rwlock *)lock);
218 }
219 
220 static int
221 callout_rw_owned(struct lock_object *lock)
222 {
223 	return (rw_wowned((struct rwlock *)lock));
224 }
225 
226 static const struct callout_mutex_ops callout_mutex_ops[8] = {
227 	[CALLOUT_LC_UNUSED_0] = {
228 		.lock = callout_mutex_op_none,
229 		.unlock = callout_mutex_op_none,
230 		.owned = callout_owned_op_none,
231 	},
232 	[CALLOUT_LC_UNUSED_1] = {
233 		.lock = callout_mutex_op_none,
234 		.unlock = callout_mutex_op_none,
235 		.owned = callout_owned_op_none,
236 	},
237 	[CALLOUT_LC_UNUSED_2] = {
238 		.lock = callout_mutex_op_none,
239 		.unlock = callout_mutex_op_none,
240 		.owned = callout_owned_op_none,
241 	},
242 	[CALLOUT_LC_UNUSED_3] = {
243 		.lock = callout_mutex_op_none,
244 		.unlock = callout_mutex_op_none,
245 		.owned = callout_owned_op_none,
246 	},
247 	[CALLOUT_LC_SPIN] = {
248 		.lock = callout_mutex_lock_spin,
249 		.unlock = callout_mutex_unlock_spin,
250 		.owned = callout_mutex_owned,
251 	},
252 	[CALLOUT_LC_MUTEX] = {
253 		.lock = callout_mutex_lock,
254 		.unlock = callout_mutex_unlock,
255 		.owned = callout_mutex_owned,
256 	},
257 	[CALLOUT_LC_RW] = {
258 		.lock = callout_rw_wlock,
259 		.unlock = callout_rw_wunlock,
260 		.owned = callout_rw_owned,
261 	},
262 	[CALLOUT_LC_RM] = {
263 		.lock = callout_rm_wlock,
264 		.unlock = callout_rm_wunlock,
265 		.owned = callout_rm_owned,
266 	},
267 };
268 
269 static void
270 callout_lock_client(int c_flags, struct lock_object *c_lock)
271 {
272 	callout_mutex_ops[CALLOUT_GET_LC(c_flags)].lock(c_lock);
273 }
274 
275 static void
276 callout_unlock_client(int c_flags, struct lock_object *c_lock)
277 {
278 	callout_mutex_ops[CALLOUT_GET_LC(c_flags)].unlock(c_lock);
279 }
280 
281 #ifdef SMP
282 static int
283 callout_lock_owned_client(int c_flags, struct lock_object *c_lock)
284 {
285 	return (callout_mutex_ops[CALLOUT_GET_LC(c_flags)].owned(c_lock));
286 }
287 #endif
288 
289 /*
290  * The callout CPU exec structure represent information necessary for
291  * describing the state of callouts currently running on the CPU and
292  * for handling deferred callout restarts.
293  *
294  * In particular, the first entry of the array cc_exec_entity holds
295  * information for callouts running from the SWI thread context, while
296  * the second one holds information for callouts running directly from
297  * the hardware interrupt context.
298  */
299 struct cc_exec {
300 	/*
301 	 * The "cc_curr" points to the currently executing callout and
302 	 * is protected by the "cc_lock" spinlock. If no callback is
303 	 * currently executing it is equal to "NULL".
304 	 */
305 	struct callout		*cc_curr;
306 	/*
307 	 * The "cc_restart_args" structure holds the argument for a
308 	 * deferred callback restart and is protected by the "cc_lock"
309 	 * spinlock. The structure is only valid if "cc_restart" is
310 	 * "true". If "cc_restart" is "false" the information in the
311 	 * "cc_restart_args" structure shall be ignored.
312 	 */
313 	struct callout_args	cc_restart_args;
314 	bool			cc_restart;
315 	/*
316 	 * The "cc_cancel" variable allows the currently pending
317 	 * callback to be atomically cancelled. This field is write
318 	 * protected by the "cc_lock" spinlock.
319 	 */
320 	bool cc_cancel;
321 	/*
322 	 * The "cc_drain_fn" points to a function which shall be
323 	 * called with the argument stored in "cc_drain_arg" when an
324 	 * asynchronous drain is performed. This field is write
325 	 * protected by the "cc_lock" spinlock.
326 	 */
327 	callout_func_t *cc_drain_fn;
328 	void *cc_drain_arg;
329 };
330 
331 /*
332  * There is one "struct callout_cpu" per CPU, holding all relevant
333  * state for the callout processing thread on the individual CPU.
334  */
335 struct callout_cpu {
336 	struct mtx_padalign	cc_lock;
337 	struct cc_exec 		cc_exec_entity[2];
338 	struct callout		*cc_exec_next_dir;
339 	struct callout		*cc_callout;
340 	struct callout_list	*cc_callwheel;
341 	struct callout_tailq	cc_expireq;
342 	struct callout_slist	cc_callfree;
343 	sbintime_t		cc_firstevent;
344 	sbintime_t		cc_lastscan;
345 	void			*cc_cookie;
346 	u_int			cc_bucket;
347 	char			cc_ktr_event_name[20];
348 };
349 
350 #ifdef SMP
351 struct callout_cpu cc_cpu[MAXCPU];
352 #define	CPUBLOCK	MAXCPU
353 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
354 #define	CC_SELF()	CC_CPU(PCPU_GET(cpuid))
355 #else
356 struct callout_cpu cc_cpu;
357 #define	CC_CPU(cpu)	&cc_cpu
358 #define	CC_SELF()	&cc_cpu
359 #endif
360 #define	CC_LOCK(cc)	mtx_lock_spin(&(cc)->cc_lock)
361 #define	CC_UNLOCK(cc)	mtx_unlock_spin(&(cc)->cc_lock)
362 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
363 
364 static int timeout_cpu;
365 
366 static void	callout_cpu_init(struct callout_cpu *cc, int cpu);
367 static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
368 #ifdef CALLOUT_PROFILING
369 		    int *mpcalls, int *lockcalls, int *gcalls,
370 #endif
371 		    int direct);
372 
373 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
374 
375 /*
376  * Kernel low level callwheel initialization called from cpu0 during
377  * kernel startup:
378  */
379 static void
380 callout_callwheel_init(void *dummy)
381 {
382 	struct callout_cpu *cc;
383 
384 	/*
385 	 * Calculate the size of the callout wheel and the preallocated
386 	 * timeout() structures.
387 	 * XXX: Clip callout to result of previous function of maxusers
388 	 * maximum 384.  This is still huge, but acceptable.
389 	 */
390 	ncallout = imin(16 + maxproc + maxfiles, 18508);
391 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
392 
393 	/*
394 	 * Calculate callout wheel size, should be next power of two higher
395 	 * than 'ncallout'.
396 	 */
397 	callwheelsize = 1 << fls(ncallout);
398 	callwheelmask = callwheelsize - 1;
399 
400 	/*
401 	 * Fetch whether we're pinning the swi's or not.
402 	 */
403 	TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
404 	TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
405 
406 	/*
407 	 * Only cpu0 handles timeout(9) and receives a preallocation.
408 	 *
409 	 * XXX: Once all timeout(9) consumers are converted this can
410 	 * be removed.
411 	 */
412 	timeout_cpu = PCPU_GET(cpuid);
413 	cc = CC_CPU(timeout_cpu);
414 	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
415 	    M_CALLOUT, M_WAITOK);
416 	callout_cpu_init(cc, timeout_cpu);
417 }
418 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
419 
420 /*
421  * Initialize the per-cpu callout structures.
422  */
423 static void
424 callout_cpu_init(struct callout_cpu *cc, int cpu)
425 {
426 	struct callout *c;
427 	int i;
428 
429 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
430 	SLIST_INIT(&cc->cc_callfree);
431 	cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
432 	    M_CALLOUT, M_WAITOK);
433 	for (i = 0; i < callwheelsize; i++)
434 		LIST_INIT(&cc->cc_callwheel[i]);
435 	TAILQ_INIT(&cc->cc_expireq);
436 	cc->cc_firstevent = SBT_MAX;
437 	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
438 	    "callwheel cpu %d", cpu);
439 	if (cc->cc_callout == NULL)	/* Only cpu0 handles timeout(9) */
440 		return;
441 	for (i = 0; i < ncallout; i++) {
442 		c = &cc->cc_callout[i];
443 		callout_init(c, 0);
444 		c->c_flags |= CALLOUT_LOCAL_ALLOC;
445 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
446 	}
447 }
448 
449 /*
450  * Start standard softclock thread.
451  */
452 static void
453 start_softclock(void *dummy)
454 {
455 	struct callout_cpu *cc;
456 	char name[MAXCOMLEN];
457 #ifdef SMP
458 	int cpu;
459 	struct intr_event *ie;
460 #endif
461 
462 	cc = CC_CPU(timeout_cpu);
463 	snprintf(name, sizeof(name), "clock (%d)", timeout_cpu);
464 	if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK,
465 	    INTR_MPSAFE, &cc->cc_cookie))
466 		panic("died while creating standard software ithreads");
467 	if (pin_default_swi &&
468 	    (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) {
469 		printf("%s: timeout clock couldn't be pinned to cpu %d\n",
470 		    __func__,
471 		    timeout_cpu);
472 	}
473 
474 #ifdef SMP
475 	CPU_FOREACH(cpu) {
476 		if (cpu == timeout_cpu)
477 			continue;
478 		cc = CC_CPU(cpu);
479 		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(9). */
480 		callout_cpu_init(cc, cpu);
481 		snprintf(name, sizeof(name), "clock (%d)", cpu);
482 		ie = NULL;
483 		if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
484 		    INTR_MPSAFE, &cc->cc_cookie))
485 			panic("died while creating standard software ithreads");
486 		if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) {
487 			printf("%s: per-cpu clock couldn't be pinned to "
488 			    "cpu %d\n",
489 			    __func__,
490 			    cpu);
491 		}
492 	}
493 #endif
494 }
495 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
496 
497 #define	CC_HASH_SHIFT	8
498 
499 static inline u_int
500 callout_hash(sbintime_t sbt)
501 {
502 
503 	return (sbt >> (32 - CC_HASH_SHIFT));
504 }
505 
506 static inline u_int
507 callout_get_bucket(sbintime_t sbt)
508 {
509 
510 	return (callout_hash(sbt) & callwheelmask);
511 }
512 
513 void
514 callout_process(sbintime_t now)
515 {
516 	struct callout *tmp, *tmpn;
517 	struct callout_cpu *cc;
518 	struct callout_list *sc;
519 	sbintime_t first, last, max, tmp_max;
520 	uint32_t lookahead;
521 	u_int firstb, lastb, nowb;
522 #ifdef CALLOUT_PROFILING
523 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
524 #endif
525 	cc = CC_SELF();
526 	CC_LOCK(cc);
527 
528 	/* Compute the buckets of the last scan and present times. */
529 	firstb = callout_hash(cc->cc_lastscan);
530 	cc->cc_lastscan = now;
531 	nowb = callout_hash(now);
532 
533 	/* Compute the last bucket and minimum time of the bucket after it. */
534 	if (nowb == firstb)
535 		lookahead = (SBT_1S / 16);
536 	else if (nowb - firstb == 1)
537 		lookahead = (SBT_1S / 8);
538 	else
539 		lookahead = (SBT_1S / 2);
540 	first = last = now;
541 	first += (lookahead / 2);
542 	last += lookahead;
543 	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
544 	lastb = callout_hash(last) - 1;
545 	max = last;
546 
547 	/*
548 	 * Check if we wrapped around the entire wheel from the last scan.
549 	 * In case, we need to scan entirely the wheel for pending callouts.
550 	 */
551 	if (lastb - firstb >= callwheelsize) {
552 		lastb = firstb + callwheelsize - 1;
553 		if (nowb - firstb >= callwheelsize)
554 			nowb = lastb;
555 	}
556 
557 	/* Iterate callwheel from firstb to nowb and then up to lastb. */
558 	do {
559 		sc = &cc->cc_callwheel[firstb & callwheelmask];
560 		tmp = LIST_FIRST(sc);
561 		while (tmp != NULL) {
562 			/* Run the callout if present time within allowed. */
563 			if (tmp->c_time <= now) {
564 				/*
565 				 * Consumer told us the callout may be run
566 				 * directly from hardware interrupt context.
567 				 */
568 				if (tmp->c_flags & CALLOUT_DIRECT) {
569 #ifdef CALLOUT_PROFILING
570 					++depth_dir;
571 #endif
572 					cc->cc_exec_next_dir =
573 					    LIST_NEXT(tmp, c_links.le);
574 					cc->cc_bucket = firstb & callwheelmask;
575 					LIST_REMOVE(tmp, c_links.le);
576 					softclock_call_cc(tmp, cc,
577 #ifdef CALLOUT_PROFILING
578 					    &mpcalls_dir, &lockcalls_dir, NULL,
579 #endif
580 					    1);
581 					tmp = cc->cc_exec_next_dir;
582 				} else {
583 					tmpn = LIST_NEXT(tmp, c_links.le);
584 					LIST_REMOVE(tmp, c_links.le);
585 					TAILQ_INSERT_TAIL(&cc->cc_expireq,
586 					    tmp, c_links.tqe);
587 					tmp->c_flags |= CALLOUT_PROCESSED;
588 					tmp = tmpn;
589 				}
590 				continue;
591 			}
592 			/* Skip events from distant future. */
593 			if (tmp->c_time >= max)
594 				goto next;
595 			/*
596 			 * Event minimal time is bigger than present maximal
597 			 * time, so it cannot be aggregated.
598 			 */
599 			if (tmp->c_time > last) {
600 				lastb = nowb;
601 				goto next;
602 			}
603 			/* Update first and last time, respecting this event. */
604 			if (tmp->c_time < first)
605 				first = tmp->c_time;
606 			tmp_max = tmp->c_time + tmp->c_precision;
607 			if (tmp_max < last)
608 				last = tmp_max;
609 next:
610 			tmp = LIST_NEXT(tmp, c_links.le);
611 		}
612 		/* Proceed with the next bucket. */
613 		firstb++;
614 		/*
615 		 * Stop if we looked after present time and found
616 		 * some event we can't execute at now.
617 		 * Stop if we looked far enough into the future.
618 		 */
619 	} while (((int)(firstb - lastb)) <= 0);
620 	cc->cc_firstevent = last;
621 #ifndef NO_EVENTTIMERS
622 	cpu_new_callout(curcpu, last, first);
623 #endif
624 #ifdef CALLOUT_PROFILING
625 	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
626 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
627 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
628 #endif
629 	CC_UNLOCK(cc);
630 	/*
631 	 * swi_sched acquires the thread lock, so we don't want to call it
632 	 * with cc_lock held; incorrect locking order.
633 	 */
634 	if (!TAILQ_EMPTY(&cc->cc_expireq))
635 		swi_sched(cc->cc_cookie, 0);
636 }
637 
638 static struct callout_cpu *
639 callout_lock(struct callout *c)
640 {
641 	struct callout_cpu *cc;
642 	cc = CC_CPU(c->c_cpu);
643 	CC_LOCK(cc);
644 	return (cc);
645 }
646 
647 static struct callout_cpu *
648 callout_cc_add_locked(struct callout *c, struct callout_cpu *cc,
649     struct callout_args *coa, bool can_swap_cpu)
650 {
651 #ifndef NO_EVENTTIMERS
652 	sbintime_t sbt;
653 #endif
654 	int bucket;
655 
656 	CC_LOCK_ASSERT(cc);
657 
658 	/* update flags before swapping locks, if any */
659 	c->c_flags &= ~(CALLOUT_PROCESSED | CALLOUT_DIRECT | CALLOUT_DEFRESTART);
660 	if (coa->flags & C_DIRECT_EXEC)
661 		c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_DIRECT);
662 	else
663 		c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
664 
665 #ifdef SMP
666 	/*
667 	 * Check if we are changing the CPU on which the callback
668 	 * should be executed and if we have a lock protecting us:
669 	 */
670 	if (can_swap_cpu != false && coa->cpu != c->c_cpu &&
671 	    callout_lock_owned_client(c->c_flags, c->c_lock) != 0) {
672 		CC_UNLOCK(cc);
673 		c->c_cpu = coa->cpu;
674 		cc = callout_lock(c);
675 	}
676 #endif
677 	if (coa->time < cc->cc_lastscan)
678 		coa->time = cc->cc_lastscan;
679 	c->c_arg = coa->arg;
680 	c->c_func = coa->func;
681 	c->c_time = coa->time;
682 	c->c_precision = coa->precision;
683 
684 	bucket = callout_get_bucket(c->c_time);
685 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
686 	    c, (int)(c->c_precision >> 32),
687 	    (u_int)(c->c_precision & 0xffffffff));
688 	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
689 
690 	/* Ensure we are first to be scanned, if called via a callback */
691 	if (cc->cc_bucket == bucket)
692 		cc->cc_exec_next_dir = c;
693 #ifndef NO_EVENTTIMERS
694 	/*
695 	 * Inform the eventtimers(4) subsystem there's a new callout
696 	 * that has been inserted, but only if really required.
697 	 */
698 	if (SBT_MAX - c->c_time < c->c_precision)
699 		c->c_precision = SBT_MAX - c->c_time;
700 	sbt = c->c_time + c->c_precision;
701 	if (sbt < cc->cc_firstevent) {
702 		cc->cc_firstevent = sbt;
703 		cpu_new_callout(coa->cpu, sbt, c->c_time);
704 	}
705 #endif
706 	return (cc);
707 }
708 
709 static void
710 callout_cc_del(struct callout *c, struct callout_cpu *cc)
711 {
712 
713 	c->c_func = NULL;
714 	SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
715 }
716 
717 static void
718 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
719 #ifdef CALLOUT_PROFILING
720     int *mpcalls, int *lockcalls, int *gcalls,
721 #endif
722     int direct)
723 {
724 	callout_func_t *c_func;
725 	void *c_arg;
726 	struct lock_object *c_lock;
727 	int c_flags;
728 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
729 	sbintime_t sbt1, sbt2;
730 	struct timespec ts2;
731 	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
732 	static timeout_t *lastfunc;
733 #endif
734 
735 	KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
736 	    (CALLOUT_PENDING | CALLOUT_ACTIVE),
737 	    ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
738 	c_lock = c->c_lock;
739 	c_func = c->c_func;
740 	c_arg = c->c_arg;
741 	c_flags = c->c_flags;
742 
743 	/* remove pending bit */
744 	c->c_flags &= ~CALLOUT_PENDING;
745 
746 	/* reset our local state */
747 	cc->cc_exec_entity[direct].cc_curr = c;
748 	cc->cc_exec_entity[direct].cc_restart = false;
749 	cc->cc_exec_entity[direct].cc_drain_fn = NULL;
750 	cc->cc_exec_entity[direct].cc_drain_arg = NULL;
751 
752 	if (c_lock != NULL) {
753 		cc->cc_exec_entity[direct].cc_cancel = false;
754 		CC_UNLOCK(cc);
755 
756 		/* unlocked region for switching locks */
757 
758 		callout_lock_client(c_flags, c_lock);
759 
760 		/*
761 		 * Check if the callout may have been cancelled while
762 		 * we were switching locks. Even though the callout is
763 		 * specifying a lock, it might not be certain this
764 		 * lock is locked when starting and stopping callouts.
765 		 */
766 		CC_LOCK(cc);
767 		if (cc->cc_exec_entity[direct].cc_cancel) {
768 			callout_unlock_client(c_flags, c_lock);
769 			goto skip_cc_locked;
770 		}
771 		if (c_lock == &Giant.lock_object) {
772 #ifdef CALLOUT_PROFILING
773 			(*gcalls)++;
774 #endif
775 			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
776 			    c, c_func, c_arg);
777 		} else {
778 #ifdef CALLOUT_PROFILING
779 			(*lockcalls)++;
780 #endif
781 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
782 			    c, c_func, c_arg);
783 		}
784 	} else {
785 #ifdef CALLOUT_PROFILING
786 		(*mpcalls)++;
787 #endif
788 		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
789 		    c, c_func, c_arg);
790 	}
791 	/* The callout cannot be stopped now! */
792 	cc->cc_exec_entity[direct].cc_cancel = true;
793 	CC_UNLOCK(cc);
794 
795 	/* unlocked region */
796 	KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
797 	    "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
798 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
799 	sbt1 = sbinuptime();
800 #endif
801 	THREAD_NO_SLEEPING();
802 	SDT_PROBE(callout_execute, kernel, , callout__start, c, 0, 0, 0, 0);
803 	c_func(c_arg);
804 	SDT_PROBE(callout_execute, kernel, , callout__end, c, 0, 0, 0, 0);
805 	THREAD_SLEEPING_OK();
806 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
807 	sbt2 = sbinuptime();
808 	sbt2 -= sbt1;
809 	if (sbt2 > maxdt) {
810 		if (lastfunc != c_func || sbt2 > maxdt * 2) {
811 			ts2 = sbttots(sbt2);
812 			printf(
813 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
814 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
815 		}
816 		maxdt = sbt2;
817 		lastfunc = c_func;
818 	}
819 #endif
820 	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
821 	CTR1(KTR_CALLOUT, "callout %p finished", c);
822 
823 	/*
824 	 * At this point the callback structure might have been freed,
825 	 * so we need to check the previously copied value of
826 	 * "c->c_flags":
827 	 */
828 	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
829 		callout_unlock_client(c_flags, c_lock);
830 
831 	CC_LOCK(cc);
832 
833 skip_cc_locked:
834 	KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
835 	cc->cc_exec_entity[direct].cc_curr = NULL;
836 
837 	/* Check if there is anything which needs draining */
838 	if (cc->cc_exec_entity[direct].cc_drain_fn != NULL) {
839 		/*
840 		 * Unlock the CPU callout last, so that any use of
841 		 * structures belonging to the callout are complete:
842 		 */
843 		CC_UNLOCK(cc);
844 		/* call drain function unlocked */
845 		cc->cc_exec_entity[direct].cc_drain_fn(
846 		    cc->cc_exec_entity[direct].cc_drain_arg);
847 		CC_LOCK(cc);
848 	} else if (c_flags & CALLOUT_LOCAL_ALLOC) {
849 		/* return callout back to freelist */
850 		callout_cc_del(c, cc);
851 	} else if (cc->cc_exec_entity[direct].cc_restart) {
852 		/* [re-]schedule callout, if any */
853 		cc = callout_cc_add_locked(c, cc,
854 		    &cc->cc_exec_entity[direct].cc_restart_args, false);
855 	}
856 }
857 
858 /*
859  * The callout mechanism is based on the work of Adam M. Costello and
860  * George Varghese, published in a technical report entitled "Redesigning
861  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
862  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
863  * used in this implementation was published by G. Varghese and T. Lauck in
864  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
865  * the Efficient Implementation of a Timer Facility" in the Proceedings of
866  * the 11th ACM Annual Symposium on Operating Systems Principles,
867  * Austin, Texas Nov 1987.
868  */
869 
870 /*
871  * Software (low priority) clock interrupt.
872  * Run periodic events from timeout queue.
873  */
874 void
875 softclock(void *arg)
876 {
877 	struct callout_cpu *cc;
878 	struct callout *c;
879 #ifdef CALLOUT_PROFILING
880 	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
881 #endif
882 
883 	cc = (struct callout_cpu *)arg;
884 	CC_LOCK(cc);
885 	while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
886 		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
887 		softclock_call_cc(c, cc,
888 #ifdef CALLOUT_PROFILING
889 		    &mpcalls, &lockcalls, &gcalls,
890 #endif
891 		    0);
892 #ifdef CALLOUT_PROFILING
893 		++depth;
894 #endif
895 	}
896 #ifdef CALLOUT_PROFILING
897 	avg_depth += (depth * 1000 - avg_depth) >> 8;
898 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
899 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
900 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
901 #endif
902 	CC_UNLOCK(cc);
903 }
904 
905 /*
906  * timeout --
907  *	Execute a function after a specified length of time.
908  *
909  * untimeout --
910  *	Cancel previous timeout function call.
911  *
912  * callout_handle_init --
913  *	Initialize a handle so that using it with untimeout is benign.
914  *
915  *	See AT&T BCI Driver Reference Manual for specification.  This
916  *	implementation differs from that one in that although an
917  *	identification value is returned from timeout, the original
918  *	arguments to timeout as well as the identifier are used to
919  *	identify entries for untimeout.
920  */
921 struct callout_handle
922 timeout(timeout_t *ftn, void *arg, int to_ticks)
923 {
924 	struct callout_cpu *cc;
925 	struct callout *new;
926 	struct callout_handle handle;
927 
928 	cc = CC_CPU(timeout_cpu);
929 	CC_LOCK(cc);
930 	/* Fill in the next free callout structure. */
931 	new = SLIST_FIRST(&cc->cc_callfree);
932 	if (new == NULL)
933 		/* XXX Attempt to malloc first */
934 		panic("timeout table full");
935 	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
936 	handle.callout = new;
937 	CC_UNLOCK(cc);
938 
939 	callout_reset(new, to_ticks, ftn, arg);
940 
941 	return (handle);
942 }
943 
944 void
945 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
946 {
947 	struct callout_cpu *cc;
948 	bool match;
949 
950 	/*
951 	 * Check for a handle that was initialized
952 	 * by callout_handle_init, but never used
953 	 * for a real timeout.
954 	 */
955 	if (handle.callout == NULL)
956 		return;
957 
958 	cc = callout_lock(handle.callout);
959 	match = (handle.callout->c_func == ftn && handle.callout->c_arg == arg);
960 	CC_UNLOCK(cc);
961 
962 	if (match)
963 		callout_stop(handle.callout);
964 }
965 
966 void
967 callout_handle_init(struct callout_handle *handle)
968 {
969 	handle->callout = NULL;
970 }
971 
972 static int
973 callout_restart_async(struct callout *c, struct callout_args *coa,
974     callout_func_t *drain_fn, void *drain_arg)
975 {
976 	struct callout_cpu *cc;
977 	int cancelled;
978 	int direct;
979 
980 	cc = callout_lock(c);
981 
982 	/* Figure out if the callout is direct or not */
983 	direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
984 
985 	/*
986 	 * Check if the callback is currently scheduled for
987 	 * completion:
988 	 */
989 	if (cc->cc_exec_entity[direct].cc_curr == c) {
990 		/*
991 		 * Try to prevent the callback from running by setting
992 		 * the "cc_cancel" variable to "true". Also check if
993 		 * the callout was previously subject to a deferred
994 		 * callout restart:
995 		 */
996 		if (cc->cc_exec_entity[direct].cc_cancel == false ||
997 		    (c->c_flags & CALLOUT_DEFRESTART) != 0) {
998 			cc->cc_exec_entity[direct].cc_cancel = true;
999 			cancelled = 1;
1000 		} else {
1001 			cancelled = 0;
1002 		}
1003 
1004 		/*
1005 		 * Prevent callback restart if "callout_drain_xxx()"
1006 		 * is being called or we are stopping the callout or
1007 		 * the callback was preallocated by us:
1008 		 */
1009 		if (cc->cc_exec_entity[direct].cc_drain_fn != NULL ||
1010 		    coa == NULL || (c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) {
1011 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
1012 			    cancelled ? "cancelled and draining" : "draining",
1013 			    c, c->c_func, c->c_arg);
1014 
1015 			/* clear old flags, if any */
1016 			c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
1017 			    CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
1018 
1019 			/* clear restart flag, if any */
1020 			cc->cc_exec_entity[direct].cc_restart = false;
1021 
1022 			/* set drain function, if any */
1023 			if (drain_fn != NULL) {
1024 				cc->cc_exec_entity[direct].cc_drain_fn = drain_fn;
1025 				cc->cc_exec_entity[direct].cc_drain_arg = drain_arg;
1026 				cancelled |= 2;		/* XXX define the value */
1027 			}
1028 		} else {
1029 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
1030 			    cancelled ? "cancelled and restarting" : "restarting",
1031 			    c, c->c_func, c->c_arg);
1032 
1033 			/* get us back into the game */
1034 			c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING |
1035 			    CALLOUT_DEFRESTART);
1036 			c->c_flags &= ~CALLOUT_PROCESSED;
1037 
1038 			/* enable deferred restart */
1039 			cc->cc_exec_entity[direct].cc_restart = true;
1040 
1041 			/* store arguments for the deferred restart, if any */
1042 			cc->cc_exec_entity[direct].cc_restart_args = *coa;
1043 		}
1044 	} else {
1045 		/* stop callout */
1046 		if (c->c_flags & CALLOUT_PENDING) {
1047 			/*
1048 			 * The callback has not yet been executed, and
1049 			 * we simply just need to unlink it:
1050 			 */
1051 			if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
1052 				if (cc->cc_exec_next_dir == c)
1053 					cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
1054 				LIST_REMOVE(c, c_links.le);
1055 			} else {
1056 				TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
1057 			}
1058 			cancelled = 1;
1059 		} else {
1060 			cancelled = 0;
1061 		}
1062 
1063 		CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
1064 		    cancelled ? "rescheduled" : "scheduled",
1065 		    c, c->c_func, c->c_arg);
1066 
1067 		/* [re-]schedule callout, if any */
1068 		if (coa != NULL) {
1069 			cc = callout_cc_add_locked(c, cc, coa, true);
1070 		} else {
1071 			/* clear old flags, if any */
1072 			c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING |
1073 			    CALLOUT_DEFRESTART | CALLOUT_PROCESSED);
1074 
1075 			/* return callback to pre-allocated list, if any */
1076 			if ((c->c_flags & CALLOUT_LOCAL_ALLOC) && cancelled != 0) {
1077 				callout_cc_del(c, cc);
1078 			}
1079 		}
1080 	}
1081 	CC_UNLOCK(cc);
1082 	return (cancelled);
1083 }
1084 
1085 /*
1086  * New interface; clients allocate their own callout structures.
1087  *
1088  * callout_reset() - establish or change a timeout
1089  * callout_stop() - disestablish a timeout
1090  * callout_init() - initialize a callout structure so that it can
1091  *	safely be passed to callout_reset() and callout_stop()
1092  *
1093  * <sys/callout.h> defines three convenience macros:
1094  *
1095  * callout_active() - returns truth if callout has not been stopped,
1096  *	drained, or deactivated since the last time the callout was
1097  *	reset.
1098  * callout_pending() - returns truth if callout is still waiting for timeout
1099  * callout_deactivate() - marks the callout as having been serviced
1100  */
1101 int
1102 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
1103     callout_func_t *ftn, void *arg, int cpu, int flags)
1104 {
1105 	struct callout_args coa;
1106 
1107 	/* store arguments for callout add function */
1108 	coa.func = ftn;
1109 	coa.arg = arg;
1110 	coa.precision = precision;
1111 	coa.flags = flags;
1112 	coa.cpu = cpu;
1113 
1114 	/* compute the rest of the arguments needed */
1115 	if (coa.flags & C_ABSOLUTE) {
1116 		coa.time = sbt;
1117 	} else {
1118 		sbintime_t pr;
1119 
1120 		if ((coa.flags & C_HARDCLOCK) && (sbt < tick_sbt))
1121 			sbt = tick_sbt;
1122 		if ((coa.flags & C_HARDCLOCK) ||
1123 #ifdef NO_EVENTTIMERS
1124 		    sbt >= sbt_timethreshold) {
1125 			coa.time = getsbinuptime();
1126 
1127 			/* Add safety belt for the case of hz > 1000. */
1128 			coa.time += tc_tick_sbt - tick_sbt;
1129 #else
1130 		    sbt >= sbt_tickthreshold) {
1131 			/*
1132 			 * Obtain the time of the last hardclock() call on
1133 			 * this CPU directly from the kern_clocksource.c.
1134 			 * This value is per-CPU, but it is equal for all
1135 			 * active ones.
1136 			 */
1137 #ifdef __LP64__
1138 			coa.time = DPCPU_GET(hardclocktime);
1139 #else
1140 			spinlock_enter();
1141 			coa.time = DPCPU_GET(hardclocktime);
1142 			spinlock_exit();
1143 #endif
1144 #endif
1145 			if ((coa.flags & C_HARDCLOCK) == 0)
1146 				coa.time += tick_sbt;
1147 		} else
1148 			coa.time = sbinuptime();
1149 		if (SBT_MAX - coa.time < sbt)
1150 			coa.time = SBT_MAX;
1151 		else
1152 			coa.time += sbt;
1153 		pr = ((C_PRELGET(coa.flags) < 0) ? sbt >> tc_precexp :
1154 		    sbt >> C_PRELGET(coa.flags));
1155 		if (pr > coa.precision)
1156 			coa.precision = pr;
1157 	}
1158 
1159 	/* get callback started, if any */
1160 	return (callout_restart_async(c, &coa, NULL, NULL));
1161 }
1162 
1163 /*
1164  * Common idioms that can be optimized in the future.
1165  */
1166 int
1167 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
1168 {
1169 	return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
1170 }
1171 
1172 int
1173 callout_schedule(struct callout *c, int to_ticks)
1174 {
1175 	return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
1176 }
1177 
1178 int
1179 callout_stop(struct callout *c)
1180 {
1181 	/* get callback stopped, if any */
1182 	return (callout_restart_async(c, NULL, NULL, NULL));
1183 }
1184 
1185 static void
1186 callout_drain_function(void *arg)
1187 {
1188 	wakeup(arg);
1189 }
1190 
1191 int
1192 callout_drain_async(struct callout *c, callout_func_t *fn, void *arg)
1193 {
1194 	/* get callback stopped, if any */
1195 	return (callout_restart_async(c, NULL, fn, arg) & 2);
1196 }
1197 
1198 int
1199 callout_drain(struct callout *c)
1200 {
1201 	int cancelled;
1202 
1203 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1204 	    "Draining callout");
1205 
1206 	callout_lock_client(c->c_flags, c->c_lock);
1207 
1208 	/* at this point the "c->c_cpu" field is not changing */
1209 
1210 	cancelled = callout_drain_async(c, &callout_drain_function, c);
1211 
1212 	if (cancelled != 0) {
1213 		struct callout_cpu *cc;
1214 		int direct;
1215 
1216 		CTR3(KTR_CALLOUT, "need to drain %p func %p arg %p",
1217 		    c, c->c_func, c->c_arg);
1218 
1219 		cc = callout_lock(c);
1220 		direct = ((c->c_flags & CALLOUT_DIRECT) != 0);
1221 
1222 		/*
1223 		 * We've gotten our callout CPU lock, it is safe to
1224 		 * drop the initial lock:
1225 		 */
1226 		callout_unlock_client(c->c_flags, c->c_lock);
1227 
1228 		/* Wait for drain to complete */
1229 
1230 		while (cc->cc_exec_entity[direct].cc_curr == c)
1231 			msleep_spin(c, (struct mtx *)&cc->cc_lock, "codrain", 0);
1232 
1233 		CC_UNLOCK(cc);
1234 	} else {
1235 		callout_unlock_client(c->c_flags, c->c_lock);
1236 	}
1237 
1238 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
1239 	    c, c->c_func, c->c_arg);
1240 
1241 	return (cancelled & 1);
1242 }
1243 
1244 void
1245 callout_init(struct callout *c, int mpsafe)
1246 {
1247 	if (mpsafe) {
1248 		_callout_init_lock(c, NULL, CALLOUT_RETURNUNLOCKED);
1249 	} else {
1250 		_callout_init_lock(c, &Giant.lock_object, 0);
1251 	}
1252 }
1253 
1254 void
1255 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
1256 {
1257 	bzero(c, sizeof *c);
1258 	KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0,
1259 	    ("callout_init_lock: bad flags 0x%08x", flags));
1260 	flags &= CALLOUT_RETURNUNLOCKED;
1261 	if (lock != NULL) {
1262 		struct lock_class *class = LOCK_CLASS(lock);
1263 		if (class == &lock_class_mtx_sleep)
1264 			flags |= CALLOUT_SET_LC(CALLOUT_LC_MUTEX);
1265 		else if (class == &lock_class_mtx_spin)
1266 			flags |= CALLOUT_SET_LC(CALLOUT_LC_SPIN);
1267 		else if (class == &lock_class_rm)
1268 			flags |= CALLOUT_SET_LC(CALLOUT_LC_RM);
1269 		else if (class == &lock_class_rw)
1270 			flags |= CALLOUT_SET_LC(CALLOUT_LC_RW);
1271 		else
1272 			panic("callout_init_lock: Unsupported lock class '%s'\n", class->lc_name);
1273 	} else {
1274 		flags |= CALLOUT_SET_LC(CALLOUT_LC_UNUSED_0);
1275 	}
1276 	c->c_lock = lock;
1277 	c->c_flags = flags;
1278 	c->c_cpu = timeout_cpu;
1279 }
1280 
1281 #ifdef APM_FIXUP_CALLTODO
1282 /*
1283  * Adjust the kernel calltodo timeout list.  This routine is used after
1284  * an APM resume to recalculate the calltodo timer list values with the
1285  * number of hz's we have been sleeping.  The next hardclock() will detect
1286  * that there are fired timers and run softclock() to execute them.
1287  *
1288  * Please note, I have not done an exhaustive analysis of what code this
1289  * might break.  I am motivated to have my select()'s and alarm()'s that
1290  * have expired during suspend firing upon resume so that the applications
1291  * which set the timer can do the maintanence the timer was for as close
1292  * as possible to the originally intended time.  Testing this code for a
1293  * week showed that resuming from a suspend resulted in 22 to 25 timers
1294  * firing, which seemed independant on whether the suspend was 2 hours or
1295  * 2 days.  Your milage may vary.   - Ken Key <key@cs.utk.edu>
1296  */
1297 void
1298 adjust_timeout_calltodo(struct timeval *time_change)
1299 {
1300 	register struct callout *p;
1301 	unsigned long delta_ticks;
1302 
1303 	/*
1304 	 * How many ticks were we asleep?
1305 	 * (stolen from tvtohz()).
1306 	 */
1307 
1308 	/* Don't do anything */
1309 	if (time_change->tv_sec < 0)
1310 		return;
1311 	else if (time_change->tv_sec <= LONG_MAX / 1000000)
1312 		delta_ticks = (time_change->tv_sec * 1000000 +
1313 			       time_change->tv_usec + (tick - 1)) / tick + 1;
1314 	else if (time_change->tv_sec <= LONG_MAX / hz)
1315 		delta_ticks = time_change->tv_sec * hz +
1316 			      (time_change->tv_usec + (tick - 1)) / tick + 1;
1317 	else
1318 		delta_ticks = LONG_MAX;
1319 
1320 	if (delta_ticks > INT_MAX)
1321 		delta_ticks = INT_MAX;
1322 
1323 	/*
1324 	 * Now rip through the timer calltodo list looking for timers
1325 	 * to expire.
1326 	 */
1327 
1328 	/* don't collide with softclock() */
1329 	CC_LOCK(cc);
1330 	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
1331 		p->c_time -= delta_ticks;
1332 
1333 		/* Break if the timer had more time on it than delta_ticks */
1334 		if (p->c_time > 0)
1335 			break;
1336 
1337 		/* take back the ticks the timer didn't use (p->c_time <= 0) */
1338 		delta_ticks = -p->c_time;
1339 	}
1340 	CC_UNLOCK(cc);
1341 
1342 	return;
1343 }
1344 #endif /* APM_FIXUP_CALLTODO */
1345 
1346 static int
1347 flssbt(sbintime_t sbt)
1348 {
1349 
1350 	sbt += (uint64_t)sbt >> 1;
1351 	if (sizeof(long) >= sizeof(sbintime_t))
1352 		return (flsl(sbt));
1353 	if (sbt >= SBT_1S)
1354 		return (flsl(((uint64_t)sbt) >> 32) + 32);
1355 	return (flsl(sbt));
1356 }
1357 
1358 /*
1359  * Dump immediate statistic snapshot of the scheduled callouts.
1360  */
1361 static int
1362 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
1363 {
1364 	struct callout *tmp;
1365 	struct callout_cpu *cc;
1366 	struct callout_list *sc;
1367 	sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
1368 	int ct[64], cpr[64], ccpbk[32];
1369 	int error, val, i, count, tcum, pcum, maxc, c, medc;
1370 #ifdef SMP
1371 	int cpu;
1372 #endif
1373 
1374 	val = 0;
1375 	error = sysctl_handle_int(oidp, &val, 0, req);
1376 	if (error != 0 || req->newptr == NULL)
1377 		return (error);
1378 	count = maxc = 0;
1379 	st = spr = maxt = maxpr = 0;
1380 	bzero(ccpbk, sizeof(ccpbk));
1381 	bzero(ct, sizeof(ct));
1382 	bzero(cpr, sizeof(cpr));
1383 	now = sbinuptime();
1384 #ifdef SMP
1385 	CPU_FOREACH(cpu) {
1386 		cc = CC_CPU(cpu);
1387 #else
1388 		cc = CC_CPU(timeout_cpu);
1389 #endif
1390 		CC_LOCK(cc);
1391 		for (i = 0; i < callwheelsize; i++) {
1392 			sc = &cc->cc_callwheel[i];
1393 			c = 0;
1394 			LIST_FOREACH(tmp, sc, c_links.le) {
1395 				c++;
1396 				t = tmp->c_time - now;
1397 				if (t < 0)
1398 					t = 0;
1399 				st += t / SBT_1US;
1400 				spr += tmp->c_precision / SBT_1US;
1401 				if (t > maxt)
1402 					maxt = t;
1403 				if (tmp->c_precision > maxpr)
1404 					maxpr = tmp->c_precision;
1405 				ct[flssbt(t)]++;
1406 				cpr[flssbt(tmp->c_precision)]++;
1407 			}
1408 			if (c > maxc)
1409 				maxc = c;
1410 			ccpbk[fls(c + c / 2)]++;
1411 			count += c;
1412 		}
1413 		CC_UNLOCK(cc);
1414 #ifdef SMP
1415 	}
1416 #endif
1417 
1418 	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
1419 		tcum += ct[i];
1420 	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1421 	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
1422 		pcum += cpr[i];
1423 	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1424 	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
1425 		c += ccpbk[i];
1426 	medc = (i >= 2) ? (1 << (i - 2)) : 0;
1427 
1428 	printf("Scheduled callouts statistic snapshot:\n");
1429 	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
1430 	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
1431 	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
1432 	    medc,
1433 	    count / callwheelsize / mp_ncpus,
1434 	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
1435 	    maxc);
1436 	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
1437 	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
1438 	    (st / count) / 1000000, (st / count) % 1000000,
1439 	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
1440 	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
1441 	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
1442 	    (spr / count) / 1000000, (spr / count) % 1000000,
1443 	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
1444 	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
1445 	    "   prec\t   pcum\n");
1446 	for (i = 0, tcum = pcum = 0; i < 64; i++) {
1447 		if (ct[i] == 0 && cpr[i] == 0)
1448 			continue;
1449 		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
1450 		tcum += ct[i];
1451 		pcum += cpr[i];
1452 		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
1453 		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
1454 		    i - 1 - (32 - CC_HASH_SHIFT),
1455 		    ct[i], tcum, cpr[i], pcum);
1456 	}
1457 	return (error);
1458 }
1459 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
1460     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1461     0, 0, sysctl_kern_callout_stat, "I",
1462     "Dump immediate statistic snapshot of the scheduled callouts");
1463