xref: /linux/kernel/time/namespace.c (revision 163b099146b85d1b05bd2eaa045acbeee25c29e4)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Author: Andrei Vagin <avagin@openvz.org>
4   * Author: Dmitry Safonov <dima@arista.com>
5   */
6  
7  #include <linux/time_namespace.h>
8  #include <linux/user_namespace.h>
9  #include <linux/sched/signal.h>
10  #include <linux/sched/task.h>
11  #include <linux/clocksource.h>
12  #include <linux/seq_file.h>
13  #include <linux/proc_ns.h>
14  #include <linux/export.h>
15  #include <linux/time.h>
16  #include <linux/slab.h>
17  #include <linux/cred.h>
18  #include <linux/err.h>
19  #include <linux/mm.h>
20  
21  #include <vdso/datapage.h>
22  
23  ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
24  				struct timens_offsets *ns_offsets)
25  {
26  	ktime_t offset;
27  
28  	switch (clockid) {
29  	case CLOCK_MONOTONIC:
30  		offset = timespec64_to_ktime(ns_offsets->monotonic);
31  		break;
32  	case CLOCK_BOOTTIME:
33  	case CLOCK_BOOTTIME_ALARM:
34  		offset = timespec64_to_ktime(ns_offsets->boottime);
35  		break;
36  	default:
37  		return tim;
38  	}
39  
40  	/*
41  	 * Check that @tim value is in [offset, KTIME_MAX + offset]
42  	 * and subtract offset.
43  	 */
44  	if (tim < offset) {
45  		/*
46  		 * User can specify @tim *absolute* value - if it's lesser than
47  		 * the time namespace's offset - it's already expired.
48  		 */
49  		tim = 0;
50  	} else {
51  		tim = ktime_sub(tim, offset);
52  		if (unlikely(tim > KTIME_MAX))
53  			tim = KTIME_MAX;
54  	}
55  
56  	return tim;
57  }
58  
59  static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
60  {
61  	return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
62  }
63  
64  static void dec_time_namespaces(struct ucounts *ucounts)
65  {
66  	dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
67  }
68  
69  /**
70   * clone_time_ns - Clone a time namespace
71   * @user_ns:	User namespace which owns a new namespace.
72   * @old_ns:	Namespace to clone
73   *
74   * Clone @old_ns and set the clone refcount to 1
75   *
76   * Return: The new namespace or ERR_PTR.
77   */
78  static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
79  					  struct time_namespace *old_ns)
80  {
81  	struct time_namespace *ns;
82  	struct ucounts *ucounts;
83  	int err;
84  
85  	err = -ENOSPC;
86  	ucounts = inc_time_namespaces(user_ns);
87  	if (!ucounts)
88  		goto fail;
89  
90  	err = -ENOMEM;
91  	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
92  	if (!ns)
93  		goto fail_dec;
94  
95  	refcount_set(&ns->ns.count, 1);
96  
97  	ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
98  	if (!ns->vvar_page)
99  		goto fail_free;
100  
101  	err = ns_alloc_inum(&ns->ns);
102  	if (err)
103  		goto fail_free_page;
104  
105  	ns->ucounts = ucounts;
106  	ns->ns.ops = &timens_operations;
107  	ns->user_ns = get_user_ns(user_ns);
108  	ns->offsets = old_ns->offsets;
109  	ns->frozen_offsets = false;
110  	return ns;
111  
112  fail_free_page:
113  	__free_page(ns->vvar_page);
114  fail_free:
115  	kfree(ns);
116  fail_dec:
117  	dec_time_namespaces(ucounts);
118  fail:
119  	return ERR_PTR(err);
120  }
121  
122  /**
123   * copy_time_ns - Create timens_for_children from @old_ns
124   * @flags:	Cloning flags
125   * @user_ns:	User namespace which owns a new namespace.
126   * @old_ns:	Namespace to clone
127   *
128   * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
129   * adds a refcounter to @old_ns otherwise.
130   *
131   * Return: timens_for_children namespace or ERR_PTR.
132   */
133  struct time_namespace *copy_time_ns(unsigned long flags,
134  	struct user_namespace *user_ns, struct time_namespace *old_ns)
135  {
136  	if (!(flags & CLONE_NEWTIME))
137  		return get_time_ns(old_ns);
138  
139  	return clone_time_ns(user_ns, old_ns);
140  }
141  
142  static struct timens_offset offset_from_ts(struct timespec64 off)
143  {
144  	struct timens_offset ret;
145  
146  	ret.sec = off.tv_sec;
147  	ret.nsec = off.tv_nsec;
148  
149  	return ret;
150  }
151  
152  /*
153   * A time namespace VVAR page has the same layout as the VVAR page which
154   * contains the system wide VDSO data.
155   *
156   * For a normal task the VVAR pages are installed in the normal ordering:
157   *     VVAR
158   *     PVCLOCK
159   *     HVCLOCK
160   *     TIMENS   <- Not really required
161   *
162   * Now for a timens task the pages are installed in the following order:
163   *     TIMENS
164   *     PVCLOCK
165   *     HVCLOCK
166   *     VVAR
167   *
168   * The check for vdso_data->clock_mode is in the unlikely path of
169   * the seq begin magic. So for the non-timens case most of the time
170   * 'seq' is even, so the branch is not taken.
171   *
172   * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
173   * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
174   * update to finish and for 'seq' to become even anyway.
175   *
176   * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
177   * enforces the time namespace handling path.
178   */
179  static void timens_setup_vdso_data(struct vdso_data *vdata,
180  				   struct time_namespace *ns)
181  {
182  	struct timens_offset *offset = vdata->offset;
183  	struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
184  	struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
185  
186  	vdata->seq			= 1;
187  	vdata->clock_mode		= VDSO_CLOCKMODE_TIMENS;
188  	offset[CLOCK_MONOTONIC]		= monotonic;
189  	offset[CLOCK_MONOTONIC_RAW]	= monotonic;
190  	offset[CLOCK_MONOTONIC_COARSE]	= monotonic;
191  	offset[CLOCK_BOOTTIME]		= boottime;
192  	offset[CLOCK_BOOTTIME_ALARM]	= boottime;
193  }
194  
/*
 * Serializes writers of a namespace's offsets against each other and
 * against tasks entering the namespace, which freeze those offsets.
 */
static DEFINE_MUTEX(offset_lock);
200  
201  static void timens_set_vvar_page(struct task_struct *task,
202  				struct time_namespace *ns)
203  {
204  	struct vdso_data *vdata;
205  	unsigned int i;
206  
207  	if (ns == &init_time_ns)
208  		return;
209  
210  	/* Fast-path, taken by every task in namespace except the first. */
211  	if (likely(ns->frozen_offsets))
212  		return;
213  
214  	mutex_lock(&offset_lock);
215  	/* Nothing to-do: vvar_page has been already initialized. */
216  	if (ns->frozen_offsets)
217  		goto out;
218  
219  	ns->frozen_offsets = true;
220  	vdata = arch_get_vdso_data(page_address(ns->vvar_page));
221  
222  	for (i = 0; i < CS_BASES; i++)
223  		timens_setup_vdso_data(&vdata[i], ns);
224  
225  out:
226  	mutex_unlock(&offset_lock);
227  }
228  
229  void free_time_ns(struct time_namespace *ns)
230  {
231  	dec_time_namespaces(ns->ucounts);
232  	put_user_ns(ns->user_ns);
233  	ns_free_inum(&ns->ns);
234  	__free_page(ns->vvar_page);
235  	kfree(ns);
236  }
237  
238  static struct time_namespace *to_time_ns(struct ns_common *ns)
239  {
240  	return container_of(ns, struct time_namespace, ns);
241  }
242  
243  static struct ns_common *timens_get(struct task_struct *task)
244  {
245  	struct time_namespace *ns = NULL;
246  	struct nsproxy *nsproxy;
247  
248  	task_lock(task);
249  	nsproxy = task->nsproxy;
250  	if (nsproxy) {
251  		ns = nsproxy->time_ns;
252  		get_time_ns(ns);
253  	}
254  	task_unlock(task);
255  
256  	return ns ? &ns->ns : NULL;
257  }
258  
259  static struct ns_common *timens_for_children_get(struct task_struct *task)
260  {
261  	struct time_namespace *ns = NULL;
262  	struct nsproxy *nsproxy;
263  
264  	task_lock(task);
265  	nsproxy = task->nsproxy;
266  	if (nsproxy) {
267  		ns = nsproxy->time_ns_for_children;
268  		get_time_ns(ns);
269  	}
270  	task_unlock(task);
271  
272  	return ns ? &ns->ns : NULL;
273  }
274  
/* Drop one reference on the time namespace that embeds @ns. */
static void timens_put(struct ns_common *ns)
{
	struct time_namespace *time_ns = to_time_ns(ns);

	put_time_ns(time_ns);
}
279  
/*
 * Make @ns effective for @tsk: first make sure the namespace's VVAR page is
 * set up (which freezes its offsets), then have the vDSO of @tsk join the
 * namespace.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	/* The page must be populated before the task is switched to it. */
	timens_set_vvar_page(tsk, ns);

	vdso_join_timens(tsk, ns);
}
285  
286  static int timens_install(struct nsset *nsset, struct ns_common *new)
287  {
288  	struct nsproxy *nsproxy = nsset->nsproxy;
289  	struct time_namespace *ns = to_time_ns(new);
290  
291  	if (!current_is_single_threaded())
292  		return -EUSERS;
293  
294  	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
295  	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
296  		return -EPERM;
297  
298  	get_time_ns(ns);
299  	put_time_ns(nsproxy->time_ns);
300  	nsproxy->time_ns = ns;
301  
302  	get_time_ns(ns);
303  	put_time_ns(nsproxy->time_ns_for_children);
304  	nsproxy->time_ns_for_children = ns;
305  	return 0;
306  }
307  
308  void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
309  {
310  	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
311  	struct time_namespace *ns = to_time_ns(nsc);
312  
313  	/* create_new_namespaces() already incremented the ref counter */
314  	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
315  		return;
316  
317  	get_time_ns(ns);
318  	put_time_ns(nsproxy->time_ns);
319  	nsproxy->time_ns = ns;
320  
321  	timens_commit(tsk, ns);
322  }
323  
324  static struct user_namespace *timens_owner(struct ns_common *ns)
325  {
326  	return to_time_ns(ns)->user_ns;
327  }
328  
329  static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
330  {
331  	char *clock;
332  
333  	switch (clockid) {
334  	case CLOCK_BOOTTIME:
335  		clock = "boottime";
336  		break;
337  	case CLOCK_MONOTONIC:
338  		clock = "monotonic";
339  		break;
340  	default:
341  		clock = "unknown";
342  		break;
343  	}
344  	seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec);
345  }
346  
347  void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
348  {
349  	struct ns_common *ns;
350  	struct time_namespace *time_ns;
351  
352  	ns = timens_for_children_get(p);
353  	if (!ns)
354  		return;
355  	time_ns = to_time_ns(ns);
356  
357  	show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
358  	show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
359  	put_time_ns(time_ns);
360  }
361  
362  int proc_timens_set_offset(struct file *file, struct task_struct *p,
363  			   struct proc_timens_offset *offsets, int noffsets)
364  {
365  	struct ns_common *ns;
366  	struct time_namespace *time_ns;
367  	struct timespec64 tp;
368  	int i, err;
369  
370  	ns = timens_for_children_get(p);
371  	if (!ns)
372  		return -ESRCH;
373  	time_ns = to_time_ns(ns);
374  
375  	if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
376  		put_time_ns(time_ns);
377  		return -EPERM;
378  	}
379  
380  	for (i = 0; i < noffsets; i++) {
381  		struct proc_timens_offset *off = &offsets[i];
382  
383  		switch (off->clockid) {
384  		case CLOCK_MONOTONIC:
385  			ktime_get_ts64(&tp);
386  			break;
387  		case CLOCK_BOOTTIME:
388  			ktime_get_boottime_ts64(&tp);
389  			break;
390  		default:
391  			err = -EINVAL;
392  			goto out;
393  		}
394  
395  		err = -ERANGE;
396  
397  		if (off->val.tv_sec > KTIME_SEC_MAX ||
398  		    off->val.tv_sec < -KTIME_SEC_MAX)
399  			goto out;
400  
401  		tp = timespec64_add(tp, off->val);
402  		/*
403  		 * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
404  		 * still unreachable.
405  		 */
406  		if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
407  			goto out;
408  	}
409  
410  	mutex_lock(&offset_lock);
411  	if (time_ns->frozen_offsets) {
412  		err = -EACCES;
413  		goto out_unlock;
414  	}
415  
416  	err = 0;
417  	/* Don't report errors after this line */
418  	for (i = 0; i < noffsets; i++) {
419  		struct proc_timens_offset *off = &offsets[i];
420  		struct timespec64 *offset = NULL;
421  
422  		switch (off->clockid) {
423  		case CLOCK_MONOTONIC:
424  			offset = &time_ns->offsets.monotonic;
425  			break;
426  		case CLOCK_BOOTTIME:
427  			offset = &time_ns->offsets.boottime;
428  			break;
429  		}
430  
431  		*offset = off->val;
432  	}
433  
434  out_unlock:
435  	mutex_unlock(&offset_lock);
436  out:
437  	put_time_ns(time_ns);
438  
439  	return err;
440  }
441  
442  const struct proc_ns_operations timens_operations = {
443  	.name		= "time",
444  	.type		= CLONE_NEWTIME,
445  	.get		= timens_get,
446  	.put		= timens_put,
447  	.install	= timens_install,
448  	.owner		= timens_owner,
449  };
450  
451  const struct proc_ns_operations timens_for_children_operations = {
452  	.name		= "time_for_children",
453  	.real_ns_name	= "time",
454  	.type		= CLONE_NEWTIME,
455  	.get		= timens_for_children_get,
456  	.put		= timens_put,
457  	.install	= timens_install,
458  	.owner		= timens_owner,
459  };
460  
461  struct time_namespace init_time_ns = {
462  	.ns.count	= REFCOUNT_INIT(3),
463  	.user_ns	= &init_user_ns,
464  	.ns.inum	= PROC_TIME_INIT_INO,
465  	.ns.ops		= &timens_operations,
466  	.frozen_offsets	= true,
467  };
468