xref: /illumos-gate/usr/src/uts/common/os/clock_highres.c (revision 1fa2a66491e7d8ae0be84e7da4da8e812480c710)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2016, Joyent Inc.
29  */
30 
31 #include <sys/timer.h>
32 #include <sys/systm.h>
33 #include <sys/param.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cyclic.h>
37 #include <sys/cmn_err.h>
38 #include <sys/pset.h>
39 #include <sys/atomic.h>
40 #include <sys/policy.h>
41 
42 static clock_backend_t clock_highres;
43 
44 /* minimum non-privileged interval (200us) */
45 long clock_highres_interval_min = 200000;
46 
47 /*ARGSUSED*/
48 static int
49 clock_highres_settime(timespec_t *ts)
50 {
51 	return (EINVAL);
52 }
53 
54 static int
55 clock_highres_gettime(timespec_t *ts)
56 {
57 	hrt2ts(gethrtime(), (timestruc_t *)ts);
58 
59 	return (0);
60 }
61 
62 static int
63 clock_highres_getres(timespec_t *ts)
64 {
65 	hrt2ts(cyclic_getres(), (timestruc_t *)ts);
66 
67 	return (0);
68 }
69 
70 /*ARGSUSED*/
71 static int
72 clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))
73 {
74 	it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);
75 	it->it_fire = fire;
76 
77 	return (0);
78 }
79 
80 static void
81 clock_highres_fire(void *arg)
82 {
83 	itimer_t *it = (itimer_t *)arg;
84 	hrtime_t *addr = &it->it_hrtime;
85 	hrtime_t old = *addr, new = gethrtime();
86 
87 	do {
88 		old = *addr;
89 	} while (atomic_cas_64((uint64_t *)addr, old, new) != old);
90 
91 	it->it_fire(it);
92 }
93 
94 static int
95 clock_highres_timer_settime(itimer_t *it, int flags,
96     const struct itimerspec *when)
97 {
98 	cyclic_id_t cyc, *cycp = it->it_arg;
99 	proc_t *p = curproc;
100 	kthread_t *t = curthread;
101 	cyc_time_t cyctime;
102 	cyc_handler_t hdlr;
103 	cpu_t *cpu;
104 	cpupart_t *cpupart;
105 	int pset;
106 	boolean_t value_need_clamp = B_FALSE;
107 	boolean_t intval_need_clamp = B_FALSE;
108 	cred_t *cr = CRED();
109 	struct itimerspec clamped;
110 
111 	/*
112 	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny
113 	 * service; only allow privileged users to create such timers.
114 	 * Non-privileged users (those without the "proc_clock_highres"
115 	 * privilege) can create timers with lower resolution but if they
116 	 * attempt to use a very low time value (< 200us) then their
117 	 * timer will be clamped at 200us.
118 	 */
119 	if (when->it_value.tv_sec == 0 &&
120 	    when->it_value.tv_nsec > 0 &&
121 	    when->it_value.tv_nsec < clock_highres_interval_min)
122 		value_need_clamp = B_TRUE;
123 
124 	if (when->it_interval.tv_sec == 0 &&
125 	    when->it_interval.tv_nsec > 0 &&
126 	    when->it_interval.tv_nsec < clock_highres_interval_min)
127 		intval_need_clamp = B_TRUE;
128 
129 	if ((value_need_clamp || intval_need_clamp) &&
130 	    secpolicy_clock_highres(cr) != 0) {
131 		clamped.it_value.tv_sec = when->it_value.tv_sec;
132 		clamped.it_interval.tv_sec = when->it_interval.tv_sec;
133 
134 		if (value_need_clamp) {
135 			clamped.it_value.tv_nsec = clock_highres_interval_min;
136 		} else {
137 			clamped.it_value.tv_nsec = when->it_value.tv_nsec;
138 		}
139 
140 		if (intval_need_clamp) {
141 			clamped.it_interval.tv_nsec =
142 			    clock_highres_interval_min;
143 		} else {
144 			clamped.it_interval.tv_nsec = when->it_interval.tv_nsec;
145 		}
146 
147 		when = &clamped;
148 	}
149 
150 	cyctime.cyt_when = ts2hrt(&when->it_value);
151 	cyctime.cyt_interval = ts2hrt(&when->it_interval);
152 
153 	if (cyctime.cyt_when != 0 && cyctime.cyt_interval == 0 &&
154 	    it->it_itime.it_interval.tv_sec == 0 &&
155 	    it->it_itime.it_interval.tv_nsec == 0 &&
156 	    (cyc = *cycp) != CYCLIC_NONE) {
157 		/*
158 		 * If our existing timer is a one-shot and our new timer is a
159 		 * one-shot, we'll save ourselves a world of grief and just
160 		 * reprogram the cyclic.
161 		 */
162 		it->it_itime = *when;
163 
164 		if (!(flags & TIMER_ABSTIME))
165 			cyctime.cyt_when += gethrtime();
166 
167 		hrt2ts(cyctime.cyt_when, &it->it_itime.it_value);
168 		(void) cyclic_reprogram(cyc, cyctime.cyt_when);
169 		return (0);
170 	}
171 
172 	mutex_enter(&cpu_lock);
173 	if ((cyc = *cycp) != CYCLIC_NONE) {
174 		cyclic_remove(cyc);
175 		*cycp = CYCLIC_NONE;
176 	}
177 
178 	if (cyctime.cyt_when == 0) {
179 		mutex_exit(&cpu_lock);
180 		return (0);
181 	}
182 
183 	if (!(flags & TIMER_ABSTIME))
184 		cyctime.cyt_when += gethrtime();
185 
186 	/*
187 	 * Now we will check for overflow (that is, we will check to see
188 	 * that the start time plus the interval time doesn't exceed
189 	 * INT64_MAX).  The astute code reviewer will observe that this
190 	 * one-time check doesn't guarantee that a future expiration
191 	 * will not wrap.  We wish to prove, then, that if a future
192 	 * expiration does wrap, the earliest the problem can be encountered
193 	 * is (INT64_MAX / 2) nanoseconds (191 years) after boot.  Formally:
194 	 *
195 	 *  Given:	s + i < m	s > 0	i > 0
196 	 *		s + ni > m	n > 1
197 	 *
198 	 *    (where "s" is the start time, "i" is the interval, "n" is the
199 	 *    number of times the cyclic has fired and "m" is INT64_MAX)
200 	 *
201 	 *  Prove:
202 	 *		(a)  s + (n - 1)i > (m / 2)
203 	 *		(b)  s + (n - 1)i < m
204 	 *
205 	 * That is, prove that we must have fired at least once 191 years
206 	 * after boot.  The proof is very straightforward; since the left
207 	 * side of (a) is minimized when i is small, it is sufficient to show
208 	 * that the statement is true for i's smallest possible value
209 	 * (((m - s) / n) + epsilon).  The same goes for (b); showing that the
210 	 * statement is true for i's largest possible value (m - s + epsilon)
211 	 * is sufficient to prove the statement.
212 	 *
213 	 * The actual arithmetic manipulation is left up to reader.
214 	 */
215 	if (cyctime.cyt_when > INT64_MAX - cyctime.cyt_interval) {
216 		mutex_exit(&cpu_lock);
217 		return (EOVERFLOW);
218 	}
219 
220 	if (cyctime.cyt_interval == 0) {
221 		/*
222 		 * If this is a one-shot, then we set the interval to be
223 		 * inifinite.  If this timer is never touched, this cyclic will
224 		 * simply consume space in the cyclic subsystem.  As soon as
225 		 * timer_settime() or timer_delete() is called, the cyclic is
226 		 * removed (so it's not possible to run the machine out
227 		 * of resources by creating one-shots).
228 		 */
229 		cyctime.cyt_interval = CY_INFINITY;
230 	}
231 
232 	it->it_itime = *when;
233 
234 	hrt2ts(cyctime.cyt_when, &it->it_itime.it_value);
235 
236 	hdlr.cyh_func = (cyc_func_t)clock_highres_fire;
237 	hdlr.cyh_arg = it;
238 	hdlr.cyh_level = CY_LOW_LEVEL;
239 
240 	if (cyctime.cyt_when != 0)
241 		*cycp = cyc = cyclic_add(&hdlr, &cyctime);
242 
243 	/*
244 	 * Now that we have the cyclic created, we need to bind it to our
245 	 * bound CPU and processor set (if any).
246 	 */
247 	mutex_enter(&p->p_lock);
248 	cpu = t->t_bound_cpu;
249 	cpupart = t->t_cpupart;
250 	pset = t->t_bind_pset;
251 
252 	mutex_exit(&p->p_lock);
253 
254 	cyclic_bind(cyc, cpu, pset == PS_NONE ? NULL : cpupart);
255 
256 	mutex_exit(&cpu_lock);
257 
258 	return (0);
259 }
260 
261 static int
262 clock_highres_timer_gettime(itimer_t *it, struct itimerspec *when)
263 {
264 	/*
265 	 * CLOCK_HIGHRES doesn't update it_itime.
266 	 */
267 	hrtime_t start = ts2hrt(&it->it_itime.it_value);
268 	hrtime_t interval = ts2hrt(&it->it_itime.it_interval);
269 	hrtime_t diff, now = gethrtime();
270 	hrtime_t *addr = &it->it_hrtime;
271 	hrtime_t last;
272 
273 	/*
274 	 * We're using atomic_cas_64() here only to assure that we slurp the
275 	 * entire timestamp atomically.
276 	 */
277 	last = atomic_cas_64((uint64_t *)addr, 0, 0);
278 
279 	*when = it->it_itime;
280 
281 	if (!timerspecisset(&when->it_value))
282 		return (0);
283 
284 	if (start > now) {
285 		/*
286 		 * We haven't gone off yet...
287 		 */
288 		diff = start - now;
289 	} else {
290 		if (interval == 0) {
291 			/*
292 			 * This is a one-shot which should have already
293 			 * fired; set it_value to 0.
294 			 */
295 			timerspecclear(&when->it_value);
296 			return (0);
297 		}
298 
299 		/*
300 		 * Calculate how far we are into this interval.
301 		 */
302 		diff = (now - start) % interval;
303 
304 		/*
305 		 * Now check to see if we've dealt with the last interval
306 		 * yet.
307 		 */
308 		if (now - diff > last) {
309 			/*
310 			 * The last interval hasn't fired; set it_value to 0.
311 			 */
312 			timerspecclear(&when->it_value);
313 			return (0);
314 		}
315 
316 		/*
317 		 * The last interval _has_ fired; we can return the amount
318 		 * of time left in this interval.
319 		 */
320 		diff = interval - diff;
321 	}
322 
323 	hrt2ts(diff, &when->it_value);
324 
325 	return (0);
326 }
327 
328 static int
329 clock_highres_timer_delete(itimer_t *it)
330 {
331 	cyclic_id_t cyc;
332 
333 	if (it->it_arg == NULL) {
334 		/*
335 		 * This timer was never fully created; we must have failed
336 		 * in the clock_highres_timer_create() routine.
337 		 */
338 		return (0);
339 	}
340 
341 	mutex_enter(&cpu_lock);
342 
343 	if ((cyc = *((cyclic_id_t *)it->it_arg)) != CYCLIC_NONE)
344 		cyclic_remove(cyc);
345 
346 	mutex_exit(&cpu_lock);
347 
348 	kmem_free(it->it_arg, sizeof (cyclic_id_t));
349 
350 	return (0);
351 }
352 
353 static void
354 clock_highres_timer_lwpbind(itimer_t *it)
355 {
356 	proc_t *p = curproc;
357 	kthread_t *t = curthread;
358 	cyclic_id_t cyc = *((cyclic_id_t *)it->it_arg);
359 	cpu_t *cpu;
360 	cpupart_t *cpupart;
361 	int pset;
362 
363 	if (cyc == CYCLIC_NONE)
364 		return;
365 
366 	mutex_enter(&cpu_lock);
367 	mutex_enter(&p->p_lock);
368 
369 	/*
370 	 * Okay, now we can safely look at the bindings.
371 	 */
372 	cpu = t->t_bound_cpu;
373 	cpupart = t->t_cpupart;
374 	pset = t->t_bind_pset;
375 
376 	/*
377 	 * Now we drop p_lock.  We haven't dropped cpu_lock; we're guaranteed
378 	 * that even if the bindings change, the CPU and/or processor set
379 	 * that this timer was bound to remain valid (and the combination
380 	 * remains self-consistent).
381 	 */
382 	mutex_exit(&p->p_lock);
383 
384 	cyclic_bind(cyc, cpu, pset == PS_NONE ? NULL : cpupart);
385 
386 	mutex_exit(&cpu_lock);
387 }
388 
389 void
390 clock_highres_init()
391 {
392 	clock_backend_t *be = &clock_highres;
393 	struct sigevent *ev = &be->clk_default;
394 
395 	ev->sigev_signo = SIGALRM;
396 	ev->sigev_notify = SIGEV_SIGNAL;
397 	ev->sigev_value.sival_ptr = NULL;
398 
399 	be->clk_clock_settime = clock_highres_settime;
400 	be->clk_clock_gettime = clock_highres_gettime;
401 	be->clk_clock_getres = clock_highres_getres;
402 	be->clk_timer_create = clock_highres_timer_create;
403 	be->clk_timer_gettime = clock_highres_timer_gettime;
404 	be->clk_timer_settime = clock_highres_timer_settime;
405 	be->clk_timer_delete = clock_highres_timer_delete;
406 	be->clk_timer_lwpbind = clock_highres_timer_lwpbind;
407 
408 	clock_add_backend(CLOCK_HIGHRES, &clock_highres);
409 }
410