// SPDX-License-Identifier: GPL-2.0
/*
 * Author: Andrei Vagin <avagin@openvz.org>
 * Author: Dmitry Safonov <dima@arista.com>
 */

#include <linux/time_namespace.h>
#include <linux/user_namespace.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/clocksource.h>
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>

#include <vdso/datapage.h>

/**
 * do_timens_ktime_to_host - Subtract a time namespace offset from a time value
 * @clockid:	Clock which @tim refers to
 * @tim:	Absolute time value to translate
 * @ns_offsets:	Offsets of the time namespace
 *
 * Only CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_BOOTTIME_ALARM are
 * translated. For any other clock id @tim is returned unchanged.
 *
 * Return: @tim minus the matching offset, 0 if @tim is smaller than that
 * offset; the result is clamped to KTIME_MAX.
 */
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
				struct timens_offsets *ns_offsets)
{
	ktime_t offset;

	switch (clockid) {
	case CLOCK_MONOTONIC:
		offset = timespec64_to_ktime(ns_offsets->monotonic);
		break;
	case CLOCK_BOOTTIME:
	case CLOCK_BOOTTIME_ALARM:
		offset = timespec64_to_ktime(ns_offsets->boottime);
		break;
	default:
		/* No namespace offset is applied to this clock. */
		return tim;
	}

	/*
	 * Check that @tim value is in [offset, KTIME_MAX + offset]
	 * and subtract offset.
	 */
	if (tim < offset) {
		/*
		 * The user can specify @tim as an *absolute* value. If it is
		 * less than the time namespace's offset, it has already
		 * expired.
		 */
		tim = 0;
	} else {
		tim = ktime_sub(tim, offset);
		if (unlikely(tim > KTIME_MAX))
			tim = KTIME_MAX;
	}

	return tim;
}

/*
 * Account one more time namespace (UCOUNT_TIME_NAMESPACES) to the current
 * effective uid in @ns. Returns the result of inc_ucount(); the caller in
 * this file treats NULL as failure.
 */
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
	return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
}

/* Undo the UCOUNT_TIME_NAMESPACES accounting held by @ucounts. */
static void dec_time_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
}

/**
 * clone_time_ns - Clone a time namespace
 * @user_ns:	User namespace which owns a new namespace.
 * @old_ns:	Namespace to clone
 *
 * Clone @old_ns and set the clone refcount to 1
 *
 * The clone starts with a copy of the offsets of @old_ns and with
 * frozen_offsets cleared.
 *
 * Return: The new namespace or ERR_PTR: -ENOSPC when the ucount cannot be
 * incremented, -ENOMEM when the namespace or its vvar page cannot be
 * allocated, or the error returned by ns_common_init().
 */
static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
					    struct time_namespace *old_ns)
{
	struct time_namespace *ns;
	struct ucounts *ucounts;
	int err;

	/* err is preset before each step so the unwind labels can share it. */
	err = -ENOSPC;
	ucounts = inc_time_namespaces(user_ns);
	if (!ucounts)
		goto fail;

	err = -ENOMEM;
	ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
	if (!ns)
		goto fail_dec;

	ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!ns->vvar_page)
		goto fail_free;

	err = ns_common_init(ns);
	if (err)
		goto fail_free_page;

	ns->ucounts = ucounts;
	ns->user_ns = get_user_ns(user_ns);
	ns->offsets = old_ns->offsets;
	ns->frozen_offsets = false;
	ns_tree_add(ns);
	return ns;

	/* Unwind in the reverse order of acquisition. */
fail_free_page:
	__free_page(ns->vvar_page);
fail_free:
	kfree(ns);
fail_dec:
	dec_time_namespaces(ucounts);
fail:
	return ERR_PTR(err);
}

/**
 * copy_time_ns - Create timens_for_children from @old_ns
 * @flags:	Cloning flags
 * @user_ns:	User namespace which owns a new namespace.
 * @old_ns:	Namespace to clone
 *
 * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
 * adds a refcounter to @old_ns otherwise.
 *
 * Return: timens_for_children namespace or ERR_PTR.
 */
struct time_namespace *copy_time_ns(u64 flags,
	struct user_namespace *user_ns, struct time_namespace *old_ns)
{
	if (!(flags & CLONE_NEWTIME))
		return get_time_ns(old_ns);

	return clone_time_ns(user_ns, old_ns);
}

/* Copy the seconds and nanoseconds of @off into a struct timens_offset. */
static struct timens_offset offset_from_ts(struct timespec64 off)
{
	struct timens_offset ret;

	ret.sec = off.tv_sec;
	ret.nsec = off.tv_nsec;

	return ret;
}

/*
 * A time namespace VVAR page has the same layout as the VVAR page which
 * contains the system wide VDSO data.
 *
 * For a normal task the VVAR pages are installed in the normal ordering:
 *     VVAR
 *     PVCLOCK
 *     HVCLOCK
 *     TIMENS   <- Not really required
 *
 * Now for a timens task the pages are installed in the following order:
 *     TIMENS
 *     PVCLOCK
 *     HVCLOCK
 *     VVAR
 *
 * The check for vdso_clock->clock_mode is in the unlikely path of
 * the seq begin magic. So for the non-timens case most of the time
 * 'seq' is even, so the branch is not taken.
 *
 * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
 * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
 * update to finish and for 'seq' to become even anyway.
 *
 * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
 * enforces the time namespace handling path.
 *
 * This helper fills one vdso_clock of such a page: it sets seq to 1 (odd),
 * sets the clock mode and stores the namespace's monotonic offset for the
 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW and CLOCK_MONOTONIC_COARSE slots and
 * its boottime offset for the CLOCK_BOOTTIME and CLOCK_BOOTTIME_ALARM slots.
 */
static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
					 struct time_namespace *ns)
{
	struct timens_offset *offset = vc->offset;
	struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
	struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);

	vc->seq				= 1;
	vc->clock_mode			= VDSO_CLOCKMODE_TIMENS;
	offset[CLOCK_MONOTONIC]		= monotonic;
	offset[CLOCK_MONOTONIC_RAW]	= monotonic;
	offset[CLOCK_MONOTONIC_COARSE]	= monotonic;
	offset[CLOCK_BOOTTIME]		= boottime;
	offset[CLOCK_BOOTTIME_ALARM]	= boottime;
}

/*
 * Return the vvar page of current's time namespace when @vma belongs to
 * current's mm. For any other mm a warning is emitted and NULL is returned.
 */
struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	if (likely(vma->vm_mm == current->mm))
		return current->nsproxy->time_ns->vvar_page;

	/*
	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
	 * through interfaces like /proc/$pid/mem or
	 * process_vm_{readv,writev}() as long as there's no .access()
	 * in special_mapping_vmops().
	 * For more details check_vma_flags() and __access_remote_vm()
	 */

	WARN(1, "vvar_page accessed remotely");

	return NULL;
}

/*
 * Protects possibly multiple offsets writers racing each other
 * and tasks entering the namespace.
 */
static DEFINE_MUTEX(offset_lock);

/*
 * Freeze the offsets of @ns and fill its vvar page from them, once.
 *
 * Nothing is done for init_time_ns or when the offsets are already frozen.
 * @task is not used by this function.
 */
static void timens_set_vvar_page(struct task_struct *task,
				 struct time_namespace *ns)
{
	struct vdso_time_data *vdata;
	struct vdso_clock *vc;
	unsigned int i;

	if (ns == &init_time_ns)
		return;

	/* Fast-path, taken by every task in namespace except the first. */
	if (likely(ns->frozen_offsets))
		return;

	mutex_lock(&offset_lock);
	/* Nothing to do: vvar_page has already been initialized. */
	if (ns->frozen_offsets)
		goto out;

	ns->frozen_offsets = true;
	vdata = page_address(ns->vvar_page);
	vc = vdata->clock_data;

	for (i = 0; i < CS_BASES; i++)
		timens_setup_vdso_clock_data(&vc[i], ns);

	if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
		for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
			timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
	}

out:
	mutex_unlock(&offset_lock);
}

/*
 * Tear down @ns: remove it from the namespace tree, drop its ucount and its
 * user namespace reference, free the common namespace part and the vvar page
 * and finally free the structure itself via kfree_rcu().
 */
void free_time_ns(struct time_namespace *ns)
{
	ns_tree_remove(ns);
	dec_time_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);
	ns_common_free(ns);
	__free_page(ns->vvar_page);
	/* Concurrent nstree traversal depends on a grace period. */
	kfree_rcu(ns, ns.ns_rcu);
}

/*
 * Take a reference on the time namespace of @task. The nsproxy is read
 * under task_lock().
 *
 * Return: the ns_common of the namespace, or NULL if @task has no nsproxy.
 */
static struct ns_common *timens_get(struct task_struct *task)
{
	struct time_namespace *ns = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = nsproxy->time_ns;
		get_time_ns(ns);
	}
	task_unlock(task);

	return ns ? &ns->ns : NULL;
}

/*
 * Same as timens_get(), but for the time_ns_for_children namespace of @task.
 *
 * Return: the ns_common of the namespace, or NULL if @task has no nsproxy.
 */
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
	struct time_namespace *ns = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = nsproxy->time_ns_for_children;
		get_time_ns(ns);
	}
	task_unlock(task);

	return ns ? &ns->ns : NULL;
}

/* Drop a reference on the time namespace which embeds @ns. */
static void timens_put(struct ns_common *ns)
{
	put_time_ns(to_time_ns(ns));
}

/*
 * Make sure the vvar page of @ns is set up (which freezes its offsets) and
 * then call vdso_join_timens() for @tsk.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	timens_set_vvar_page(tsk, ns);
	vdso_join_timens(tsk, ns);
}

/*
 * Install the time namespace @new as both time_ns and time_ns_for_children
 * of the nsproxy in @nsset.
 *
 * Return: 0 on success, -EUSERS if current is not single threaded, -EPERM
 * if CAP_SYS_ADMIN is missing in the user namespace of either the target
 * namespace or the credentials in @nsset.
 */
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
	struct nsproxy *nsproxy = nsset->nsproxy;
	struct time_namespace *ns = to_time_ns(new);

	if (!current_is_single_threaded())
		return -EUSERS;

	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * Each nsproxy slot holds its own reference. The new reference is
	 * taken before the old one is dropped.
	 */
	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns);
	nsproxy->time_ns = ns;

	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns_for_children);
	nsproxy->time_ns_for_children = ns;
	return 0;
}

/*
 * If time_ns and time_ns_for_children of @nsproxy differ, switch time_ns
 * over to time_ns_for_children and commit that namespace for @tsk.
 */
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
	struct time_namespace *ns = to_time_ns(nsc);

	/* create_new_namespaces() already incremented the ref counter */
	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
		return;

	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns);
	nsproxy->time_ns = ns;

	timens_commit(tsk, ns);
}

/* Return the user namespace stored in the time namespace embedding @ns. */
static struct user_namespace *timens_owner(struct ns_common *ns)
{
	return to_time_ns(ns)->user_ns;
}

/*
 * Print one offset line to @m: the clock name followed by the seconds and
 * nanoseconds of @ts. Clock ids other than CLOCK_BOOTTIME and
 * CLOCK_MONOTONIC are printed as "unknown".
 */
static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
{
	char *clock;

	switch (clockid) {
	case CLOCK_BOOTTIME:
		clock = "boottime";
		break;
	case CLOCK_MONOTONIC:
		clock = "monotonic";
		break;
	default:
		clock = "unknown";
		break;
	}
	seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec);
}

/*
 * Print the monotonic and boottime offsets of the time_ns_for_children
 * namespace of @p to @m. Nothing is printed if @p has no nsproxy.
 */
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
	struct ns_common *ns;
	struct time_namespace *time_ns;

	ns = timens_for_children_get(p);
	if (!ns)
		return;
	time_ns = to_time_ns(ns);

	show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
	show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
	/* Drop the reference taken by timens_for_children_get(). */
	put_time_ns(time_ns);
}

/**
 * proc_timens_set_offset - Set clock offsets of a task's timens_for_children
 * @file:	File whose opener credentials are checked for CAP_SYS_TIME
 * @p:		Task whose time_ns_for_children namespace is modified
 * @offsets:	Array of clock id / offset pairs
 * @noffsets:	Number of entries in @offsets
 *
 * All entries are validated before any offset is written, so either all
 * offsets are applied or none.
 *
 * Return: 0 on success, -ESRCH if @p has no nsproxy, -EPERM if
 * CAP_SYS_TIME is missing in the namespace's user namespace, -EINVAL for a
 * clock id other than CLOCK_MONOTONIC or CLOCK_BOOTTIME, -ERANGE if an
 * offset or the resulting clock value is out of range, -EACCES if the
 * offsets of the namespace are already frozen.
 */
int proc_timens_set_offset(struct file *file, struct task_struct *p,
			   struct proc_timens_offset *offsets, int noffsets)
{
	struct ns_common *ns;
	struct time_namespace *time_ns;
	struct timespec64 tp;
	int i, err;

	ns = timens_for_children_get(p);
	if (!ns)
		return -ESRCH;
	time_ns = to_time_ns(ns);

	if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
		put_time_ns(time_ns);
		return -EPERM;
	}

	/* First pass: validate every entry, nothing is modified yet. */
	for (i = 0; i < noffsets; i++) {
		struct proc_timens_offset *off = &offsets[i];

		/* Read the current value of the clock the offset applies to. */
		switch (off->clockid) {
		case CLOCK_MONOTONIC:
			ktime_get_ts64(&tp);
			break;
		case CLOCK_BOOTTIME:
			ktime_get_boottime_ts64(&tp);
			break;
		default:
			err = -EINVAL;
			goto out;
		}

		err = -ERANGE;

		if (off->val.tv_sec > KTIME_SEC_MAX ||
		    off->val.tv_sec < -KTIME_SEC_MAX)
			goto out;

		tp = timespec64_add(tp, off->val);
		/*
		 * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
		 * still unreachable.
		 */
		if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
			goto out;
	}

	mutex_lock(&offset_lock);
	if (time_ns->frozen_offsets) {
		err = -EACCES;
		goto out_unlock;
	}

	err = 0;
	/* Don't report errors after this line */
	for (i = 0; i < noffsets; i++) {
		struct proc_timens_offset *off = &offsets[i];
		struct timespec64 *offset = NULL;

		/*
		 * The validation loop above rejected every other clock id,
		 * so one of the cases below always sets 'offset'.
		 */
		switch (off->clockid) {
		case CLOCK_MONOTONIC:
			offset = &time_ns->offsets.monotonic;
			break;
		case CLOCK_BOOTTIME:
			offset = &time_ns->offsets.boottime;
			break;
		}

		*offset = off->val;
	}

out_unlock:
	mutex_unlock(&offset_lock);
out:
	/* Drop the reference taken by timens_for_children_get(). */
	put_time_ns(time_ns);

	return err;
}

/* proc namespace operations for a task's time namespace ("time"). */
const struct proc_ns_operations timens_operations = {
	.name		= "time",
	.get		= timens_get,
	.put		= timens_put,
	.install	= timens_install,
	.owner		= timens_owner,
};

/*
 * proc namespace operations for a task's time_ns_for_children namespace.
 * Differs from timens_operations in the name, the real_ns_name and the
 * ->get callback only.
 */
const struct proc_ns_operations timens_for_children_operations = {
	.name		= "time_for_children",
	.real_ns_name	= "time",
	.get		= timens_for_children_get,
	.put		= timens_put,
	.install	= timens_install,
	.owner		= timens_owner,
};

/*
 * The initial time namespace. It is owned by init_user_ns and its offsets
 * are frozen from the start.
 */
struct time_namespace init_time_ns = {
	.ns.ns_type	= ns_common_type(&init_time_ns),
	.ns.__ns_ref	= REFCOUNT_INIT(3),
	.user_ns	= &init_user_ns,
	.ns.inum	= ns_init_inum(&init_time_ns),
	.ns.ops		= &timens_operations,
	.frozen_offsets	= true,
};

/* Add the initial time namespace to the namespace tree. */
void __init time_ns_init(void)
{
	ns_tree_add(&init_time_ns);
}