xref: /illumos-gate/usr/src/uts/common/sys/callo.h (revision d88e498a7e760a60ae266eb725566f1f7ed86ad5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	  All Rights Reserved  	*/
23 
24 
25 /*
26  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #ifndef _SYS_CALLO_H
31 #define	_SYS_CALLO_H
32 
33 #include <sys/t_lock.h>
34 #include <sys/taskq.h>
35 #include <sys/lgrp.h>
36 #include <sys/processor.h>
37 #include <sys/cyclic.h>
38 #include <sys/kstat.h>
39 #include <sys/systm.h>
40 
41 #ifdef	__cplusplus
42 extern "C" {
43 #endif
44 
45 #ifdef	_KERNEL
46 
47 typedef struct callout_list	callout_list_t;
48 
49 /*
50  * The callout mechanism provides general-purpose event scheduling:
51  * an arbitrary function is called in a specified amount of time.
52  * The expiration time for a callout is kept in its callout list
53  * structure.
54  */
55 typedef struct callout {
56 	struct callout	*c_idnext;	/* next in ID hash, or on freelist */
57 	struct callout	*c_idprev;	/* prev in ID hash */
58 	struct callout	*c_clnext;	/* next in callout list */
59 	struct callout	*c_clprev;	/* prev in callout list */
60 	callout_id_t	c_xid;		/* extended callout ID; see below */
61 	callout_list_t	*c_list;	/* callout list */
62 	void		(*c_func)(void *); /* function to call */
63 	void		*c_arg;		/* argument to function */
64 	kthread_t	*c_executor;	/* executing thread */
65 	kcondvar_t	c_done;		/* signal callout completion */
66 	ushort_t	c_waiting;	/* untimeout waiting flag */
67 } callout_t;
68 
/*
 * The callout ID (callout_id_t) uniquely identifies a callout. The callout
 * ID is always 64 bits internally. The lower 32 bits contain an ID value.
 * The upper 32 bits contain a generation number and flags. When the ID value
 * wraps the generation number is incremented during ID generation. This
 * protects callers from ID collisions that can happen as a result of the wrap.
 *
 * The kernel internal interface, timeout_generic(), always returns a
 * callout_id_t. But the legacy interfaces, timeout() and realtime_timeout()
 * return a timeout_id_t. On a 64-bit system, timeout_id_t is also 64 bits.
 * So, the full 64-bit ID (sans the flags) can be returned. However, on 32-bit
 * systems, timeout_id_t is 32 bits. So, only the lower 32 bits can be
 * returned. In such cases, a default generation number of 0 is assigned to
 * the legacy IDs.
 *
 * The lower 32-bit ID space is partitioned into two spaces - one for 32-bit
 * IDs and the other for 64-bit IDs. The 32-bit ID space is further divided
 * into two spaces - one for short-term callouts and one for long-term.
 *
 * Here is the bit layout for the callout ID:
 *
 *      63   62  ...  32   31      30     29 .. X+1  X ... 1   0
 *  ----------------------------------------------------------------
 *  | Exec | Generation | Long | Counter | ID bits | Table  | Type |
 *  |      | number     | term | High    |         | number |      |
 *  ----------------------------------------------------------------
 *
 * Exec(uting):
 *    This is the executing bit which is only set in the extended callout
 *    ID. This bit indicates that the callout handler is currently being
 *    executed.
 *
 * Generation number:
 *    This is the generation part of the ID.
 *
 * Long term:
 *    This bit indicates whether this is a short-term or a long-term callout.
 *    The long-term bit exists to address the problem of callout ID collision
 *    on 32-bit systems. This is an issue because the system typically
 *    generates a large number of timeout() requests, which means that callout
 *    IDs eventually get recycled. Most timeouts are very short-lived, so that
 *    ID recycling isn't a problem; but there are a handful of timeouts which
 *    are sufficiently long-lived to see their own IDs reused. We use the
 *    long-term bit to partition the ID namespace into pieces; the short-term
 *    space gets all the heavy traffic and can wrap frequently (i.e., on the
 *    order of a day) with no ill effects; the long-term space gets very little
 *    traffic and thus never wraps. That said, we need to future proof callouts
 *    in case 32-bit systems grow in size and are able to consume callout IDs
 *    at faster rates. So, we should make all the kernel clients that use
 *    callouts to use the internal interface so that they can use IDs outside
 *    of the legacy space with a proper generation number.
 *
 * Counter High + ID counter bits:
 *    These bits represent the actual ID bits in the callout ID.
 *    The highest bit of the running counter is always set; this ensures that
 *    the callout ID is always non-zero, thus eliminating the need for an
 *    explicit wrap-around test during ID generation.
 *
 * Table number:
 *    These bits carry the table number for the callout table where the callout
 *    is queued. Each CPU has its own callout table. So, the callout tables are
 *    numbered from 0 - (max_ncpus - 1). Because max_ncpus is different on
 *    different systems, the actual number of table number bits will vary
 *    accordingly. And so will the ID counter bits.
 *
 * Type:
 *    This bit represents the callout (table) type. Each CPU has one realtime
 *    and one normal callout table.
 */

/*
 * Bit 63 (Exec) and the mask that strips it. The mask expansion is fully
 * parenthesized so that it behaves as a single operand wherever it is used.
 */
#define	CALLOUT_EXECUTING	0x8000000000000000ULL
#define	CALLOUT_ID_MASK		(~CALLOUT_EXECUTING)

/* Lowest bit of the generation number (bit 32). */
#define	CALLOUT_GENERATION_LOW	0x100000000ULL

/* Bit 31 (Long term) and bit 30 (Counter High). */
#define	CALLOUT_LONGTERM	0x80000000
#define	CALLOUT_COUNTER_HIGH	0x40000000

/* Bit 0 (Type): one type bit, hence two table types. */
#define	CALLOUT_TYPE_BITS	1
#define	CALLOUT_NTYPES		(1 << CALLOUT_TYPE_BITS)
#define	CALLOUT_TYPE_MASK	(CALLOUT_NTYPES - 1)

/*
 * The ID counter starts above the table number and type bits. The shift is
 * the variable callout_table_bits, which is not declared in this header.
 */
#define	CALLOUT_COUNTER_SHIFT	callout_table_bits

/*
 * CALLOUT_TABLE(t, f) builds a table index from a type (t) and a value (f)
 * placed above the type bit. The remaining macros go the other way: given a
 * pointer (ct) into the callout_table array (also not declared in this
 * header) they recover the index, its type bit, and the part above the
 * type bit.
 */
#define	CALLOUT_TABLE(t, f)	(((f) << CALLOUT_TYPE_BITS) | (t))
#define	CALLOUT_TABLE_NUM(ct)	((ct) - callout_table)
#define	CALLOUT_TABLE_TYPE(ct)	(CALLOUT_TABLE_NUM(ct) & CALLOUT_TYPE_MASK)
#define	CALLOUT_TABLE_SEQID(ct)	(CALLOUT_TABLE_NUM(ct) >> CALLOUT_TYPE_BITS)
151 
/*
 * We assume that during any period of CALLOUT_LONGTERM_TICKS ticks, at most
 * (CALLOUT_COUNTER_HIGH / callout_counter_low) callouts will be generated.
 */
#define	CALLOUT_LONGTERM_TICKS	0x4000UL

/*
 * Hash table geometry: 2^CALLOUT_BUCKET_SHIFT buckets. CALLOUT_HASH() keeps
 * the low CALLOUT_BUCKET_SHIFT bits of its argument; CALLOUT_IDHASH() first
 * discards the bits below CALLOUT_COUNTER_SHIFT and then hashes the rest.
 */
#define	CALLOUT_BUCKET_SHIFT	9
#define	CALLOUT_BUCKETS		(1 << CALLOUT_BUCKET_SHIFT)
#define	CALLOUT_BUCKET_MASK	(CALLOUT_BUCKETS - 1)
#define	CALLOUT_HASH(x)		((x) & CALLOUT_BUCKET_MASK)
#define	CALLOUT_IDHASH(x)	CALLOUT_HASH((x) >> CALLOUT_COUNTER_SHIFT)

/*
 * CALLOUT_CLHASH() XORs together four successive CALLOUT_BUCKET_SHIFT-bit
 * fields of x and reduces the result to a bucket index.
 *
 * The multiplications by 0 and 1 below are cosmetic; they are there only to
 * align the four terms and make them more readable. The multiplications
 * will be done at compile time.
 */
#define	CALLOUT_CLHASH(x)			\
	CALLOUT_HASH(				\
	    ((x)>>(CALLOUT_BUCKET_SHIFT*0)) ^	\
	    ((x)>>(CALLOUT_BUCKET_SHIFT*1)) ^	\
	    ((x)>>(CALLOUT_BUCKET_SHIFT*2)) ^	\
	    ((x)>>(CALLOUT_BUCKET_SHIFT*3)))
173 
/*
 * Extract the table bits of an ID by masking with callout_table_mask
 * (a variable that is not declared in this header).
 */
#define	CALLOUT_ID_TO_TABLE(id)		((id) & callout_table_mask)

/*
 * Build an ID value from a table number: the short form sets the
 * Counter High bit on top of the table number; the long form additionally
 * sets the Long term bit.
 */
#define	CALLOUT_SHORT_ID(table)		\
		((callout_id_t)(table) | CALLOUT_COUNTER_HIGH)
#define	CALLOUT_LONG_ID(table)		\
		(CALLOUT_SHORT_ID(table) | CALLOUT_LONGTERM)

#define	CALLOUT_THREADS		2		/* keep it simple for now */

#define	CALLOUT_REALTIME	0		/* realtime callout type */
#define	CALLOUT_NORMAL		1		/* normal callout type */
185 
186 /*
187  * callout_t's are cache-aligned structures allocated from kmem caches. One kmem
188  * cache is created per lgrp and is shared by all CPUs in that lgrp. Benefits:
189  *	- cache pages are mapped only in the TLBs of the CPUs of the lgrp
190  *	- data in cache pages is present only in those CPU caches
191  *	- memory access performance improves with locality-awareness in kmem
192  *
193  * The following structure is used to manage per-lgroup kmem caches.
194  *
195  * NOTE: Free callout_t's go to a callout table's freelist. CPUs map to callout
196  * tables via their sequence IDs, not CPU IDs. DR operations can cause a
197  * free list to have callouts from multiple lgrp caches. This takes away some
198  * performance, but is no worse than if we did not use lgrp caches at all.
199  */
200 typedef struct callout_cache {
201 	struct callout_cache	*cc_next;	/* link in the global list */
202 	lgrp_handle_t		cc_hand;	/* lgroup handle */
203 	kmem_cache_t		*cc_cache;	/* kmem cache pointer */
204 	kmem_cache_t		*cc_lcache;	/* kmem cache pointer */
205 } callout_cache_t;
206 
/*
 * The callout hash structure is used for queueing both callouts and
 * callout lists. That is why the fields are declared as void *.
 * Users must cast ch_head/ch_tail to the element type they queued.
 */
typedef struct callout_hash {
	void	*ch_head;	/* first element of the queue */
	void	*ch_tail;	/* last element of the queue */
} callout_hash_t;
215 
216 struct callout_list {
217 	callout_list_t	*cl_next;	/* next in clhash */
218 	callout_list_t	*cl_prev;	/* prev in clhash */
219 	hrtime_t	cl_expiration;	/* expiration for callouts in list */
220 	callout_hash_t	cl_callouts;	/* list of callouts */
221 	int		cl_flags;	/* callout flags */
222 };
223 
/*
 * Per-callout table kstats.
 *
 * CALLOUT_TIMEOUTS
 *	Callouts created since boot.
 * CALLOUT_TIMEOUTS_PENDING
 *	Number of outstanding callouts.
 * CALLOUT_UNTIMEOUTS_UNEXPIRED
 *	Number of cancelled callouts that have not expired.
 * CALLOUT_UNTIMEOUTS_EXECUTING
 *	Number of cancelled callouts that were executing at the time of
 *	cancellation.
 * CALLOUT_UNTIMEOUTS_EXPIRED
 *	Number of cancelled callouts that had already expired at the time
 *	of cancellation.
 * CALLOUT_EXPIRATIONS
 *	Number of callouts that expired.
 * CALLOUT_ALLOCATIONS
 *	Number of callout structures allocated.
 *
 * CALLOUT_NUM_STATS is not a statistic; it is the number of entries above
 * and must stay last.
 */
typedef enum callout_stat_type {
	CALLOUT_TIMEOUTS,
	CALLOUT_TIMEOUTS_PENDING,
	CALLOUT_UNTIMEOUTS_UNEXPIRED,
	CALLOUT_UNTIMEOUTS_EXECUTING,
	CALLOUT_UNTIMEOUTS_EXPIRED,
	CALLOUT_EXPIRATIONS,
	CALLOUT_ALLOCATIONS,
	CALLOUT_NUM_STATS
} callout_stat_type_t;
254 
/*
 * Callout flags:
 *
 * CALLOUT_FLAG_ROUNDUP
 *	Roundup the expiration time to the next resolution boundary.
 *	If this flag is not specified, the expiration time is rounded down.
 * CALLOUT_FLAG_ABSOLUTE
 *	Normally, the expiration passed to the timeout API functions is an
 *	expiration interval. If this flag is specified, then it is
 *	interpreted as the expiration time itself.
 * CALLOUT_FLAG_HRESTIME
 *	Normally, callouts are not affected by changes to system time
 *	(hrestime). This flag is used to create a callout that is affected
 *	by system time. If system time changes, these timers must be
 *	handled in a special way (see callout.c). These are used by condition
 *	variables and LWP timers that need this behavior.
 * CALLOUT_FLAG_32BIT
 *	Legacy interfaces timeout() and realtime_timeout() pass this flag
 *	to timeout_generic() to indicate that a 32-bit ID should be allocated.
 */
#define	CALLOUT_FLAG_ROUNDUP		0x1
#define	CALLOUT_FLAG_ABSOLUTE		0x2
#define	CALLOUT_FLAG_HRESTIME		0x4
#define	CALLOUT_FLAG_32BIT		0x8

/* The subset of the flags above combined into a single mask. */
#define	CALLOUT_LIST_FLAGS	(CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_HRESTIME)

/*
 * On 32-bit systems, the legacy interfaces, timeout() and realtime_timeout(),
 * must pass CALLOUT_FLAG_32BIT to timeout_generic() so that a 32-bit ID
 * can be generated. On LP64 systems no extra flag is needed, so
 * CALLOUT_LEGACY is 0 there.
 */
#ifdef _LP64
#define	CALLOUT_LEGACY		0
#else
#define	CALLOUT_LEGACY		CALLOUT_FLAG_32BIT
#endif
291 
292 /*
293  * All of the state information associated with a callout table.
294  * The fields are ordered with cache performance in mind.
295  */
296 typedef struct callout_table {
297 	kmutex_t	ct_mutex;	/* protects all callout state */
298 	callout_t	*ct_free;	/* free callout structures */
299 	callout_list_t	*ct_lfree;	/* free callout list structures */
300 	callout_id_t	ct_short_id;	/* most recently issued short-term ID */
301 	callout_id_t	ct_long_id;	/* most recently issued long-term ID */
302 	callout_hash_t 	*ct_idhash;	/* ID hash chains */
303 	callout_hash_t 	*ct_clhash;	/* callout list hash */
304 	kstat_named_t	*ct_kstat_data;	/* callout kstat data */
305 
306 	uint_t		ct_type;	/* callout table type */
307 	uint_t		ct_suspend;	/* suspend count */
308 	cyclic_id_t	ct_cyclic;	/* cyclic for this table */
309 	hrtime_t	*ct_heap;	/* callout expiration heap */
310 	ulong_t		ct_heap_num;	/* occupied slots in the heap */
311 	ulong_t		ct_heap_max;	/* end of the heap */
312 	kmem_cache_t	*ct_cache;	/* callout kmem cache */
313 	kmem_cache_t	*ct_lcache;	/* callout list kmem cache */
314 	callout_id_t	ct_gen_id;	/* generation based ID */
315 
316 	callout_hash_t	ct_expired;	/* list of expired callout lists */
317 	taskq_t		*ct_taskq;	/* taskq to execute normal callouts */
318 	kstat_t		*ct_kstats;	/* callout kstats */
319 #ifdef _LP64
320 	ulong_t		ct_pad[4];	/* cache alignment */
321 #else
322 	ulong_t		ct_pad[7];	/* cache alignment */
323 #endif
324 } callout_table_t;
325 
/*
 * Short hand definitions for the callout kstats.
 *
 * Each name expands to the ui64 value of one ct_kstat_data[] entry, indexed
 * by the matching callout_stat_type_t constant. Because the expansion begins
 * with the member name ct_kstat_data, these are only meaningful as member
 * accesses on a callout table, e.g. ct->ct_timeouts.
 */
#define	ct_timeouts							\
		ct_kstat_data[CALLOUT_TIMEOUTS].value.ui64
#define	ct_timeouts_pending						\
		ct_kstat_data[CALLOUT_TIMEOUTS_PENDING].value.ui64
#define	ct_untimeouts_unexpired						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_UNEXPIRED].value.ui64
#define	ct_untimeouts_executing						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXECUTING].value.ui64
#define	ct_untimeouts_expired						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXPIRED].value.ui64
#define	ct_expirations							\
		ct_kstat_data[CALLOUT_EXPIRATIONS].value.ui64
#define	ct_allocations							\
		ct_kstat_data[CALLOUT_ALLOCATIONS].value.ui64
343 
/* NOTE(review): the use of CALLOUT_CHUNK is not visible in this header. */
#define	CALLOUT_CHUNK	128

/*
 * Index arithmetic for a binary heap stored in a zero-based array:
 * the children of slot i are 2i + 1 (left) and 2i + 2 (right), and the
 * parent of slot i is (i - 1) / 2. CALLOUT_HEAP_PARENT() is only meaningful
 * for index > 0; slot 0 is the root and has no parent.
 */
#define	CALLOUT_HEAP_PARENT(index)	(((index) - 1) >> 1)
#define	CALLOUT_HEAP_RIGHT(index)	(((index) + 1) << 1)
#define	CALLOUT_HEAP_LEFT(index)	((((index) + 1) << 1) - 1)
349 
/*
 * Select the handler for a callout table type: callout_realtime for
 * CALLOUT_REALTIME, callout_normal for anything else. Both handlers are
 * defined outside this header.
 *
 * The argument is parenthesized so that an expression argument (for example
 * one using & or |, which bind more loosely than ==) is compared as a whole.
 */
#define	CALLOUT_CYCLIC_HANDLER(t)					\
	(((t) == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
352 
/*
 * We define a blanket minimum resolution for callouts of 1 millisecond.
 * 1 millisecond is a safe value as it is already supported when the clock
 * resolution is set to high.
 *
 * Both values are expressed in nanoseconds: 1000000 ns is the 1 millisecond
 * minimum described above, and CALLOUT_TCP_RESOLUTION is ten times that.
 */
#define	CALLOUT_MIN_RESOLUTION		1000000ULL
#define	CALLOUT_TCP_RESOLUTION		10000000ULL

#define	CALLOUT_ALIGN	64	/* cache line size */
362 
/*
 * Largest tick count used for callouts: on LP64 it is CY_INFINITY converted
 * to ticks by NSEC_TO_TICK(); otherwise it is LONG_MAX.
 *
 * The definition must not end in a semicolon: the macro is an expression and
 * has to remain usable inside larger expressions, comparisons and argument
 * lists.
 */
#ifdef _LP64
#define	CALLOUT_MAX_TICKS	NSEC_TO_TICK(CY_INFINITY)
#else
#define	CALLOUT_MAX_TICKS	LONG_MAX
#endif
368 
369 extern void		callout_init(void);
370 extern void		membar_sync(void);
371 extern void		callout_cpu_online(cpu_t *);
372 extern void		callout_cpu_offline(cpu_t *);
373 extern void		callout_hrestime(void);
374 
375 #endif
376 
377 #ifdef	__cplusplus
378 }
379 #endif
380 
381 #endif	/* _SYS_CALLO_H */
382