1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/kernel/sys.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 #include <linux/export.h> 9 #include <linux/mm.h> 10 #include <linux/utsname.h> 11 #include <linux/mman.h> 12 #include <linux/reboot.h> 13 #include <linux/prctl.h> 14 #include <linux/highuid.h> 15 #include <linux/fs.h> 16 #include <linux/kmod.h> 17 #include <linux/perf_event.h> 18 #include <linux/resource.h> 19 #include <linux/kernel.h> 20 #include <linux/workqueue.h> 21 #include <linux/capability.h> 22 #include <linux/device.h> 23 #include <linux/key.h> 24 #include <linux/times.h> 25 #include <linux/posix-timers.h> 26 #include <linux/security.h> 27 #include <linux/suspend.h> 28 #include <linux/tty.h> 29 #include <linux/signal.h> 30 #include <linux/cn_proc.h> 31 #include <linux/getcpu.h> 32 #include <linux/task_io_accounting_ops.h> 33 #include <linux/seccomp.h> 34 #include <linux/cpu.h> 35 #include <linux/personality.h> 36 #include <linux/ptrace.h> 37 #include <linux/fs_struct.h> 38 #include <linux/file.h> 39 #include <linux/mount.h> 40 #include <linux/gfp.h> 41 #include <linux/syscore_ops.h> 42 #include <linux/version.h> 43 #include <linux/ctype.h> 44 #include <linux/syscall_user_dispatch.h> 45 46 #include <linux/compat.h> 47 #include <linux/syscalls.h> 48 #include <linux/kprobes.h> 49 #include <linux/user_namespace.h> 50 #include <linux/time_namespace.h> 51 #include <linux/binfmts.h> 52 53 #include <linux/sched.h> 54 #include <linux/sched/autogroup.h> 55 #include <linux/sched/loadavg.h> 56 #include <linux/sched/stat.h> 57 #include <linux/sched/mm.h> 58 #include <linux/sched/coredump.h> 59 #include <linux/sched/task.h> 60 #include <linux/sched/cputime.h> 61 #include <linux/rcupdate.h> 62 #include <linux/uidgid.h> 63 #include <linux/cred.h> 64 65 #include <linux/nospec.h> 66 67 #include <linux/kmsg_dump.h> 68 /* Move somewhere else to avoid recompiling? 
*/ 69 #include <generated/utsrelease.h> 70 71 #include <linux/uaccess.h> 72 #include <asm/io.h> 73 #include <asm/unistd.h> 74 75 #include "uid16.h" 76 77 #ifndef SET_UNALIGN_CTL 78 # define SET_UNALIGN_CTL(a, b) (-EINVAL) 79 #endif 80 #ifndef GET_UNALIGN_CTL 81 # define GET_UNALIGN_CTL(a, b) (-EINVAL) 82 #endif 83 #ifndef SET_FPEMU_CTL 84 # define SET_FPEMU_CTL(a, b) (-EINVAL) 85 #endif 86 #ifndef GET_FPEMU_CTL 87 # define GET_FPEMU_CTL(a, b) (-EINVAL) 88 #endif 89 #ifndef SET_FPEXC_CTL 90 # define SET_FPEXC_CTL(a, b) (-EINVAL) 91 #endif 92 #ifndef GET_FPEXC_CTL 93 # define GET_FPEXC_CTL(a, b) (-EINVAL) 94 #endif 95 #ifndef GET_ENDIAN 96 # define GET_ENDIAN(a, b) (-EINVAL) 97 #endif 98 #ifndef SET_ENDIAN 99 # define SET_ENDIAN(a, b) (-EINVAL) 100 #endif 101 #ifndef GET_TSC_CTL 102 # define GET_TSC_CTL(a) (-EINVAL) 103 #endif 104 #ifndef SET_TSC_CTL 105 # define SET_TSC_CTL(a) (-EINVAL) 106 #endif 107 #ifndef GET_FP_MODE 108 # define GET_FP_MODE(a) (-EINVAL) 109 #endif 110 #ifndef SET_FP_MODE 111 # define SET_FP_MODE(a,b) (-EINVAL) 112 #endif 113 #ifndef SVE_SET_VL 114 # define SVE_SET_VL(a) (-EINVAL) 115 #endif 116 #ifndef SVE_GET_VL 117 # define SVE_GET_VL() (-EINVAL) 118 #endif 119 #ifndef PAC_RESET_KEYS 120 # define PAC_RESET_KEYS(a, b) (-EINVAL) 121 #endif 122 #ifndef PAC_SET_ENABLED_KEYS 123 # define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) 124 #endif 125 #ifndef PAC_GET_ENABLED_KEYS 126 # define PAC_GET_ENABLED_KEYS(a) (-EINVAL) 127 #endif 128 #ifndef SET_TAGGED_ADDR_CTRL 129 # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) 130 #endif 131 #ifndef GET_TAGGED_ADDR_CTRL 132 # define GET_TAGGED_ADDR_CTRL() (-EINVAL) 133 #endif 134 135 /* 136 * this is where the system-wide overflow UID and GID are defined, for 137 * architectures that now have 32-bit UID/GID but didn't in the past 138 */ 139 140 int overflowuid = DEFAULT_OVERFLOWUID; 141 int overflowgid = DEFAULT_OVERFLOWGID; 142 143 EXPORT_SYMBOL(overflowuid); 144 EXPORT_SYMBOL(overflowgid); 145 146 /* 147 * the same as above, but for filesystems which can only store a 16-bit 148 * UID and GID. as such, this is needed on all architectures 149 */ 150 151 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 152 int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; 153 154 EXPORT_SYMBOL(fs_overflowuid); 155 EXPORT_SYMBOL(fs_overflowgid); 156 157 /* 158 * Returns true if current's euid is same as p's uid or euid, 159 * or has CAP_SYS_NICE to p's user_ns. 
160 * 161 * Called with rcu_read_lock, creds are safe 162 */ 163 static bool set_one_prio_perm(struct task_struct *p) 164 { 165 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 166 167 if (uid_eq(pcred->uid, cred->euid) || 168 uid_eq(pcred->euid, cred->euid)) 169 return true; 170 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 171 return true; 172 return false; 173 } 174 175 /* 176 * set the priority of a task 177 * - the caller must hold the RCU read lock 178 */ 179 static int set_one_prio(struct task_struct *p, int niceval, int error) 180 { 181 int no_nice; 182 183 if (!set_one_prio_perm(p)) { 184 error = -EPERM; 185 goto out; 186 } 187 if (niceval < task_nice(p) && !can_nice(p, niceval)) { 188 error = -EACCES; 189 goto out; 190 } 191 no_nice = security_task_setnice(p, niceval); 192 if (no_nice) { 193 error = no_nice; 194 goto out; 195 } 196 if (error == -ESRCH) 197 error = 0; 198 set_user_nice(p, niceval); 199 out: 200 return error; 201 } 202 203 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) 204 { 205 struct task_struct *g, *p; 206 struct user_struct *user; 207 const struct cred *cred = current_cred(); 208 int error = -EINVAL; 209 struct pid *pgrp; 210 kuid_t uid; 211 212 if (which > PRIO_USER || which < PRIO_PROCESS) 213 goto out; 214 215 /* normalize: avoid signed division (rounding problems) */ 216 error = -ESRCH; 217 if (niceval < MIN_NICE) 218 niceval = MIN_NICE; 219 if (niceval > MAX_NICE) 220 niceval = MAX_NICE; 221 222 rcu_read_lock(); 223 read_lock(&tasklist_lock); 224 switch (which) { 225 case PRIO_PROCESS: 226 if (who) 227 p = find_task_by_vpid(who); 228 else 229 p = current; 230 if (p) 231 error = set_one_prio(p, niceval, error); 232 break; 233 case PRIO_PGRP: 234 if (who) 235 pgrp = find_vpid(who); 236 else 237 pgrp = task_pgrp(current); 238 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 239 error = set_one_prio(p, niceval, error); 240 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 241 break; 242 case PRIO_USER: 243 uid = make_kuid(cred->user_ns, who); 244 user = cred->user; 245 if (!who) 246 uid = cred->uid; 247 else if (!uid_eq(uid, cred->uid)) { 248 user = find_user(uid); 249 if (!user) 250 goto out_unlock; /* No processes for this user */ 251 } 252 do_each_thread(g, p) { 253 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) 254 error = set_one_prio(p, niceval, error); 255 } while_each_thread(g, p); 256 if (!uid_eq(uid, cred->uid)) 257 free_uid(user); /* For find_user() */ 258 break; 259 } 260 out_unlock: 261 read_unlock(&tasklist_lock); 262 rcu_read_unlock(); 263 out: 264 return error; 265 } 266 267 /* 268 * Ugh. To avoid negative return values, "getpriority()" will 269 * not return the normal nice-value, but a negated value that 270 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 271 * to stay compatible. 
272 */ 273 SYSCALL_DEFINE2(getpriority, int, which, int, who) 274 { 275 struct task_struct *g, *p; 276 struct user_struct *user; 277 const struct cred *cred = current_cred(); 278 long niceval, retval = -ESRCH; 279 struct pid *pgrp; 280 kuid_t uid; 281 282 if (which > PRIO_USER || which < PRIO_PROCESS) 283 return -EINVAL; 284 285 rcu_read_lock(); 286 read_lock(&tasklist_lock); 287 switch (which) { 288 case PRIO_PROCESS: 289 if (who) 290 p = find_task_by_vpid(who); 291 else 292 p = current; 293 if (p) { 294 niceval = nice_to_rlimit(task_nice(p)); 295 if (niceval > retval) 296 retval = niceval; 297 } 298 break; 299 case PRIO_PGRP: 300 if (who) 301 pgrp = find_vpid(who); 302 else 303 pgrp = task_pgrp(current); 304 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 305 niceval = nice_to_rlimit(task_nice(p)); 306 if (niceval > retval) 307 retval = niceval; 308 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 309 break; 310 case PRIO_USER: 311 uid = make_kuid(cred->user_ns, who); 312 user = cred->user; 313 if (!who) 314 uid = cred->uid; 315 else if (!uid_eq(uid, cred->uid)) { 316 user = find_user(uid); 317 if (!user) 318 goto out_unlock; /* No processes for this user */ 319 } 320 do_each_thread(g, p) { 321 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { 322 niceval = nice_to_rlimit(task_nice(p)); 323 if (niceval > retval) 324 retval = niceval; 325 } 326 } while_each_thread(g, p); 327 if (!uid_eq(uid, cred->uid)) 328 free_uid(user); /* for find_user() */ 329 break; 330 } 331 out_unlock: 332 read_unlock(&tasklist_lock); 333 rcu_read_unlock(); 334 335 return retval; 336 } 337 338 /* 339 * Unprivileged users may change the real gid to the effective gid 340 * or vice versa. (BSD-style) 341 * 342 * If you set the real gid at all, or set the effective gid to a value not 343 * equal to the real gid, then the saved gid is set to the new effective gid. 344 * 345 * This makes it possible for a setgid program to completely drop its 346 * privileges, which is often a useful assertion to make when you are doing 347 * a security audit over a program. 348 * 349 * The general idea is that a program which uses just setregid() will be 350 * 100% compatible with BSD. A program which uses just setgid() will be 351 * 100% compatible with POSIX with saved IDs. 352 * 353 * SMP: There are not races, the GIDs are checked only by filesystem 354 * operations (as far as semantic preservation is concerned). 
355 */ 356 #ifdef CONFIG_MULTIUSER 357 long __sys_setregid(gid_t rgid, gid_t egid) 358 { 359 struct user_namespace *ns = current_user_ns(); 360 const struct cred *old; 361 struct cred *new; 362 int retval; 363 kgid_t krgid, kegid; 364 365 krgid = make_kgid(ns, rgid); 366 kegid = make_kgid(ns, egid); 367 368 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 369 return -EINVAL; 370 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 371 return -EINVAL; 372 373 new = prepare_creds(); 374 if (!new) 375 return -ENOMEM; 376 old = current_cred(); 377 378 retval = -EPERM; 379 if (rgid != (gid_t) -1) { 380 if (gid_eq(old->gid, krgid) || 381 gid_eq(old->egid, krgid) || 382 ns_capable_setid(old->user_ns, CAP_SETGID)) 383 new->gid = krgid; 384 else 385 goto error; 386 } 387 if (egid != (gid_t) -1) { 388 if (gid_eq(old->gid, kegid) || 389 gid_eq(old->egid, kegid) || 390 gid_eq(old->sgid, kegid) || 391 ns_capable_setid(old->user_ns, CAP_SETGID)) 392 new->egid = kegid; 393 else 394 goto error; 395 } 396 397 if (rgid != (gid_t) -1 || 398 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 399 new->sgid = new->egid; 400 new->fsgid = new->egid; 401 402 retval = security_task_fix_setgid(new, old, LSM_SETID_RE); 403 if (retval < 0) 404 goto error; 405 406 return commit_creds(new); 407 408 error: 409 abort_creds(new); 410 return retval; 411 } 412 413 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 414 { 415 return __sys_setregid(rgid, egid); 416 } 417 418 /* 419 * setgid() is implemented like SysV w/ SAVED_IDS 420 * 421 * SMP: Same implicit races as above. 422 */ 423 long __sys_setgid(gid_t gid) 424 { 425 struct user_namespace *ns = current_user_ns(); 426 const struct cred *old; 427 struct cred *new; 428 int retval; 429 kgid_t kgid; 430 431 kgid = make_kgid(ns, gid); 432 if (!gid_valid(kgid)) 433 return -EINVAL; 434 435 new = prepare_creds(); 436 if (!new) 437 return -ENOMEM; 438 old = current_cred(); 439 440 retval = -EPERM; 441 if (ns_capable_setid(old->user_ns, CAP_SETGID)) 442 new->gid = new->egid = new->sgid = new->fsgid = kgid; 443 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 444 new->egid = new->fsgid = kgid; 445 else 446 goto error; 447 448 retval = security_task_fix_setgid(new, old, LSM_SETID_ID); 449 if (retval < 0) 450 goto error; 451 452 return commit_creds(new); 453 454 error: 455 abort_creds(new); 456 return retval; 457 } 458 459 SYSCALL_DEFINE1(setgid, gid_t, gid) 460 { 461 return __sys_setgid(gid); 462 } 463 464 /* 465 * change the user struct in a credentials set to match the new UID 466 */ 467 static int set_user(struct cred *new) 468 { 469 struct user_struct *new_user; 470 471 new_user = alloc_uid(new->uid); 472 if (!new_user) 473 return -EAGAIN; 474 475 /* 476 * We don't fail in case of NPROC limit excess here because too many 477 * poorly written programs don't check set*uid() return code, assuming 478 * it never fails if called by root. We may still enforce NPROC limit 479 * for programs doing set*uid()+execve() by harmlessly deferring the 480 * failure to the execve() stage. 481 */ 482 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 483 new_user != INIT_USER) 484 current->flags |= PF_NPROC_EXCEEDED; 485 else 486 current->flags &= ~PF_NPROC_EXCEEDED; 487 488 free_uid(new->user); 489 new->user = new_user; 490 return 0; 491 } 492 493 /* 494 * Unprivileged users may change the real uid to the effective uid 495 * or vice versa. 
(BSD-style) 496 * 497 * If you set the real uid at all, or set the effective uid to a value not 498 * equal to the real uid, then the saved uid is set to the new effective uid. 499 * 500 * This makes it possible for a setuid program to completely drop its 501 * privileges, which is often a useful assertion to make when you are doing 502 * a security audit over a program. 503 * 504 * The general idea is that a program which uses just setreuid() will be 505 * 100% compatible with BSD. A program which uses just setuid() will be 506 * 100% compatible with POSIX with saved IDs. 507 */ 508 long __sys_setreuid(uid_t ruid, uid_t euid) 509 { 510 struct user_namespace *ns = current_user_ns(); 511 const struct cred *old; 512 struct cred *new; 513 int retval; 514 kuid_t kruid, keuid; 515 516 kruid = make_kuid(ns, ruid); 517 keuid = make_kuid(ns, euid); 518 519 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 520 return -EINVAL; 521 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 522 return -EINVAL; 523 524 new = prepare_creds(); 525 if (!new) 526 return -ENOMEM; 527 old = current_cred(); 528 529 retval = -EPERM; 530 if (ruid != (uid_t) -1) { 531 new->uid = kruid; 532 if (!uid_eq(old->uid, kruid) && 533 !uid_eq(old->euid, kruid) && 534 !ns_capable_setid(old->user_ns, CAP_SETUID)) 535 goto error; 536 } 537 538 if (euid != (uid_t) -1) { 539 new->euid = keuid; 540 if (!uid_eq(old->uid, keuid) && 541 !uid_eq(old->euid, keuid) && 542 !uid_eq(old->suid, keuid) && 543 !ns_capable_setid(old->user_ns, CAP_SETUID)) 544 goto error; 545 } 546 547 if (!uid_eq(new->uid, old->uid)) { 548 retval = set_user(new); 549 if (retval < 0) 550 goto error; 551 } 552 if (ruid != (uid_t) -1 || 553 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 554 new->suid = new->euid; 555 new->fsuid = new->euid; 556 557 retval = security_task_fix_setuid(new, old, LSM_SETID_RE); 558 if (retval < 0) 559 goto error; 560 561 return commit_creds(new); 562 563 error: 564 abort_creds(new); 565 return retval; 566 } 567 568 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 569 { 570 return __sys_setreuid(ruid, euid); 571 } 572 573 /* 574 * setuid() is implemented like SysV with SAVED_IDS 575 * 576 * Note that SAVED_ID's is deficient in that a setuid root program 577 * like sendmail, for example, cannot set its uid to be a normal 578 * user and then switch back, because if you're root, setuid() sets 579 * the saved uid too. If you don't like this, blame the bright people 580 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 581 * will allow a root program to temporarily drop privileges and be able to 582 * regain them by swapping the real and effective uid. 
583 */ 584 long __sys_setuid(uid_t uid) 585 { 586 struct user_namespace *ns = current_user_ns(); 587 const struct cred *old; 588 struct cred *new; 589 int retval; 590 kuid_t kuid; 591 592 kuid = make_kuid(ns, uid); 593 if (!uid_valid(kuid)) 594 return -EINVAL; 595 596 new = prepare_creds(); 597 if (!new) 598 return -ENOMEM; 599 old = current_cred(); 600 601 retval = -EPERM; 602 if (ns_capable_setid(old->user_ns, CAP_SETUID)) { 603 new->suid = new->uid = kuid; 604 if (!uid_eq(kuid, old->uid)) { 605 retval = set_user(new); 606 if (retval < 0) 607 goto error; 608 } 609 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 610 goto error; 611 } 612 613 new->fsuid = new->euid = kuid; 614 615 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 616 if (retval < 0) 617 goto error; 618 619 return commit_creds(new); 620 621 error: 622 abort_creds(new); 623 return retval; 624 } 625 626 SYSCALL_DEFINE1(setuid, uid_t, uid) 627 { 628 return __sys_setuid(uid); 629 } 630 631 632 /* 633 * This function implements a generic ability to update ruid, euid, 634 * and suid. This allows you to implement the 4.4 compatible seteuid(). 635 */ 636 long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 637 { 638 struct user_namespace *ns = current_user_ns(); 639 const struct cred *old; 640 struct cred *new; 641 int retval; 642 kuid_t kruid, keuid, ksuid; 643 644 kruid = make_kuid(ns, ruid); 645 keuid = make_kuid(ns, euid); 646 ksuid = make_kuid(ns, suid); 647 648 if ((ruid != (uid_t) -1) && !uid_valid(kruid)) 649 return -EINVAL; 650 651 if ((euid != (uid_t) -1) && !uid_valid(keuid)) 652 return -EINVAL; 653 654 if ((suid != (uid_t) -1) && !uid_valid(ksuid)) 655 return -EINVAL; 656 657 new = prepare_creds(); 658 if (!new) 659 return -ENOMEM; 660 661 old = current_cred(); 662 663 retval = -EPERM; 664 if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { 665 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 666 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 667 goto error; 668 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 669 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 670 goto error; 671 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 672 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 673 goto error; 674 } 675 676 if (ruid != (uid_t) -1) { 677 new->uid = kruid; 678 if (!uid_eq(kruid, old->uid)) { 679 retval = set_user(new); 680 if (retval < 0) 681 goto error; 682 } 683 } 684 if (euid != (uid_t) -1) 685 new->euid = keuid; 686 if (suid != (uid_t) -1) 687 new->suid = ksuid; 688 new->fsuid = new->euid; 689 690 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 691 if (retval < 0) 692 goto error; 693 694 return commit_creds(new); 695 696 error: 697 abort_creds(new); 698 return retval; 699 } 700 701 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 702 { 703 return __sys_setresuid(ruid, euid, suid); 704 } 705 706 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 707 { 708 const struct cred *cred = current_cred(); 709 int retval; 710 uid_t ruid, euid, suid; 711 712 ruid = from_kuid_munged(cred->user_ns, cred->uid); 713 euid = from_kuid_munged(cred->user_ns, cred->euid); 714 suid = from_kuid_munged(cred->user_ns, cred->suid); 715 716 retval = put_user(ruid, ruidp); 717 if (!retval) { 718 retval = put_user(euid, euidp); 719 if (!retval) 720 return put_user(suid, suidp); 721 } 722 return retval; 723 } 724 725 /* 726 * Same as above, but for rgid, egid, sgid. 
727 */ 728 long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 729 { 730 struct user_namespace *ns = current_user_ns(); 731 const struct cred *old; 732 struct cred *new; 733 int retval; 734 kgid_t krgid, kegid, ksgid; 735 736 krgid = make_kgid(ns, rgid); 737 kegid = make_kgid(ns, egid); 738 ksgid = make_kgid(ns, sgid); 739 740 if ((rgid != (gid_t) -1) && !gid_valid(krgid)) 741 return -EINVAL; 742 if ((egid != (gid_t) -1) && !gid_valid(kegid)) 743 return -EINVAL; 744 if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) 745 return -EINVAL; 746 747 new = prepare_creds(); 748 if (!new) 749 return -ENOMEM; 750 old = current_cred(); 751 752 retval = -EPERM; 753 if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { 754 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 755 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 756 goto error; 757 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 758 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 759 goto error; 760 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 761 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 762 goto error; 763 } 764 765 if (rgid != (gid_t) -1) 766 new->gid = krgid; 767 if (egid != (gid_t) -1) 768 new->egid = kegid; 769 if (sgid != (gid_t) -1) 770 new->sgid = ksgid; 771 new->fsgid = new->egid; 772 773 retval = security_task_fix_setgid(new, old, LSM_SETID_RES); 774 if (retval < 0) 775 goto error; 776 777 return commit_creds(new); 778 779 error: 780 abort_creds(new); 781 return retval; 782 } 783 784 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 785 { 786 return __sys_setresgid(rgid, egid, sgid); 787 } 788 789 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 790 { 791 const struct cred *cred = current_cred(); 792 int retval; 793 gid_t rgid, egid, sgid; 794 795 rgid = from_kgid_munged(cred->user_ns, cred->gid); 796 egid = from_kgid_munged(cred->user_ns, cred->egid); 797 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 798 799 retval = put_user(rgid, rgidp); 800 if (!retval) { 801 retval = put_user(egid, egidp); 802 if (!retval) 803 retval = put_user(sgid, sgidp); 804 } 805 806 return retval; 807 } 808 809 810 /* 811 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This 812 * is used for "access()" and for the NFS daemon (letting nfsd stay at 813 * whatever uid it wants to). It normally shadows "euid", except when 814 * explicitly set by setfsuid() or for access.. 815 */ 816 long __sys_setfsuid(uid_t uid) 817 { 818 const struct cred *old; 819 struct cred *new; 820 uid_t old_fsuid; 821 kuid_t kuid; 822 823 old = current_cred(); 824 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); 825 826 kuid = make_kuid(old->user_ns, uid); 827 if (!uid_valid(kuid)) 828 return old_fsuid; 829 830 new = prepare_creds(); 831 if (!new) 832 return old_fsuid; 833 834 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 835 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 836 ns_capable_setid(old->user_ns, CAP_SETUID)) { 837 if (!uid_eq(kuid, old->fsuid)) { 838 new->fsuid = kuid; 839 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 840 goto change_okay; 841 } 842 } 843 844 abort_creds(new); 845 return old_fsuid; 846 847 change_okay: 848 commit_creds(new); 849 return old_fsuid; 850 } 851 852 SYSCALL_DEFINE1(setfsuid, uid_t, uid) 853 { 854 return __sys_setfsuid(uid); 855 } 856 857 /* 858 * Samma på svenska.. 
859 */ 860 long __sys_setfsgid(gid_t gid) 861 { 862 const struct cred *old; 863 struct cred *new; 864 gid_t old_fsgid; 865 kgid_t kgid; 866 867 old = current_cred(); 868 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); 869 870 kgid = make_kgid(old->user_ns, gid); 871 if (!gid_valid(kgid)) 872 return old_fsgid; 873 874 new = prepare_creds(); 875 if (!new) 876 return old_fsgid; 877 878 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 879 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 880 ns_capable_setid(old->user_ns, CAP_SETGID)) { 881 if (!gid_eq(kgid, old->fsgid)) { 882 new->fsgid = kgid; 883 if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) 884 goto change_okay; 885 } 886 } 887 888 abort_creds(new); 889 return old_fsgid; 890 891 change_okay: 892 commit_creds(new); 893 return old_fsgid; 894 } 895 896 SYSCALL_DEFINE1(setfsgid, gid_t, gid) 897 { 898 return __sys_setfsgid(gid); 899 } 900 #endif /* CONFIG_MULTIUSER */ 901 902 /** 903 * sys_getpid - return the thread group id of the current process 904 * 905 * Note, despite the name, this returns the tgid not the pid. The tgid and 906 * the pid are identical unless CLONE_THREAD was specified on clone() in 907 * which case the tgid is the same in all threads of the same group. 908 * 909 * This is SMP safe as current->tgid does not change. 910 */ 911 SYSCALL_DEFINE0(getpid) 912 { 913 return task_tgid_vnr(current); 914 } 915 916 /* Thread ID - the internal kernel "pid" */ 917 SYSCALL_DEFINE0(gettid) 918 { 919 return task_pid_vnr(current); 920 } 921 922 /* 923 * Accessing ->real_parent is not SMP-safe, it could 924 * change from under us. However, we can use a stale 925 * value of ->real_parent under rcu_read_lock(), see 926 * release_task()->call_rcu(delayed_put_task_struct). 
927 */ 928 SYSCALL_DEFINE0(getppid) 929 { 930 int pid; 931 932 rcu_read_lock(); 933 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 934 rcu_read_unlock(); 935 936 return pid; 937 } 938 939 SYSCALL_DEFINE0(getuid) 940 { 941 /* Only we change this so SMP safe */ 942 return from_kuid_munged(current_user_ns(), current_uid()); 943 } 944 945 SYSCALL_DEFINE0(geteuid) 946 { 947 /* Only we change this so SMP safe */ 948 return from_kuid_munged(current_user_ns(), current_euid()); 949 } 950 951 SYSCALL_DEFINE0(getgid) 952 { 953 /* Only we change this so SMP safe */ 954 return from_kgid_munged(current_user_ns(), current_gid()); 955 } 956 957 SYSCALL_DEFINE0(getegid) 958 { 959 /* Only we change this so SMP safe */ 960 return from_kgid_munged(current_user_ns(), current_egid()); 961 } 962 963 static void do_sys_times(struct tms *tms) 964 { 965 u64 tgutime, tgstime, cutime, cstime; 966 967 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 968 cutime = current->signal->cutime; 969 cstime = current->signal->cstime; 970 tms->tms_utime = nsec_to_clock_t(tgutime); 971 tms->tms_stime = nsec_to_clock_t(tgstime); 972 tms->tms_cutime = nsec_to_clock_t(cutime); 973 tms->tms_cstime = nsec_to_clock_t(cstime); 974 } 975 976 SYSCALL_DEFINE1(times, struct tms __user *, tbuf) 977 { 978 if (tbuf) { 979 struct tms tmp; 980 981 do_sys_times(&tmp); 982 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 983 return -EFAULT; 984 } 985 force_successful_syscall_return(); 986 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 987 } 988 989 #ifdef CONFIG_COMPAT 990 static compat_clock_t clock_t_to_compat_clock_t(clock_t x) 991 { 992 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); 993 } 994 995 COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) 996 { 997 if (tbuf) { 998 struct tms tms; 999 struct compat_tms tmp; 1000 1001 do_sys_times(&tms); 1002 /* Convert our struct tms to the compat version. */ 1003 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); 1004 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); 1005 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); 1006 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); 1007 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 1008 return -EFAULT; 1009 } 1010 force_successful_syscall_return(); 1011 return compat_jiffies_to_clock_t(jiffies); 1012 } 1013 #endif 1014 1015 /* 1016 * This needs some heavy checking ... 1017 * I just haven't the stomach for it. I also don't fully 1018 * understand sessions/pgrp etc. Let somebody who does explain it. 1019 * 1020 * OK, I think I have the protection semantics right.... this is really 1021 * only important on a multi-user system anyway, to make sure one user 1022 * can't send a signal to a process owned by another. -TYT, 12/12/91 1023 * 1024 * !PF_FORKNOEXEC check to conform completely to POSIX. 1025 */ 1026 SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) 1027 { 1028 struct task_struct *p; 1029 struct task_struct *group_leader = current->group_leader; 1030 struct pid *pgrp; 1031 int err; 1032 1033 if (!pid) 1034 pid = task_pid_vnr(group_leader); 1035 if (!pgid) 1036 pgid = pid; 1037 if (pgid < 0) 1038 return -EINVAL; 1039 rcu_read_lock(); 1040 1041 /* From this point forward we keep holding onto the tasklist lock 1042 * so that our parent does not change from under us. 
-DaveM 1043 */ 1044 write_lock_irq(&tasklist_lock); 1045 1046 err = -ESRCH; 1047 p = find_task_by_vpid(pid); 1048 if (!p) 1049 goto out; 1050 1051 err = -EINVAL; 1052 if (!thread_group_leader(p)) 1053 goto out; 1054 1055 if (same_thread_group(p->real_parent, group_leader)) { 1056 err = -EPERM; 1057 if (task_session(p) != task_session(group_leader)) 1058 goto out; 1059 err = -EACCES; 1060 if (!(p->flags & PF_FORKNOEXEC)) 1061 goto out; 1062 } else { 1063 err = -ESRCH; 1064 if (p != group_leader) 1065 goto out; 1066 } 1067 1068 err = -EPERM; 1069 if (p->signal->leader) 1070 goto out; 1071 1072 pgrp = task_pid(p); 1073 if (pgid != pid) { 1074 struct task_struct *g; 1075 1076 pgrp = find_vpid(pgid); 1077 g = pid_task(pgrp, PIDTYPE_PGID); 1078 if (!g || task_session(g) != task_session(group_leader)) 1079 goto out; 1080 } 1081 1082 err = security_task_setpgid(p, pgid); 1083 if (err) 1084 goto out; 1085 1086 if (task_pgrp(p) != pgrp) 1087 change_pid(p, PIDTYPE_PGID, pgrp); 1088 1089 err = 0; 1090 out: 1091 /* All paths lead to here, thus we are safe. -DaveM */ 1092 write_unlock_irq(&tasklist_lock); 1093 rcu_read_unlock(); 1094 return err; 1095 } 1096 1097 static int do_getpgid(pid_t pid) 1098 { 1099 struct task_struct *p; 1100 struct pid *grp; 1101 int retval; 1102 1103 rcu_read_lock(); 1104 if (!pid) 1105 grp = task_pgrp(current); 1106 else { 1107 retval = -ESRCH; 1108 p = find_task_by_vpid(pid); 1109 if (!p) 1110 goto out; 1111 grp = task_pgrp(p); 1112 if (!grp) 1113 goto out; 1114 1115 retval = security_task_getpgid(p); 1116 if (retval) 1117 goto out; 1118 } 1119 retval = pid_vnr(grp); 1120 out: 1121 rcu_read_unlock(); 1122 return retval; 1123 } 1124 1125 SYSCALL_DEFINE1(getpgid, pid_t, pid) 1126 { 1127 return do_getpgid(pid); 1128 } 1129 1130 #ifdef __ARCH_WANT_SYS_GETPGRP 1131 1132 SYSCALL_DEFINE0(getpgrp) 1133 { 1134 return do_getpgid(0); 1135 } 1136 1137 #endif 1138 1139 SYSCALL_DEFINE1(getsid, pid_t, pid) 1140 { 1141 struct task_struct *p; 1142 struct pid *sid; 1143 int retval; 1144 1145 rcu_read_lock(); 1146 if (!pid) 1147 sid = task_session(current); 1148 else { 1149 retval = -ESRCH; 1150 p = find_task_by_vpid(pid); 1151 if (!p) 1152 goto out; 1153 sid = task_session(p); 1154 if (!sid) 1155 goto out; 1156 1157 retval = security_task_getsid(p); 1158 if (retval) 1159 goto out; 1160 } 1161 retval = pid_vnr(sid); 1162 out: 1163 rcu_read_unlock(); 1164 return retval; 1165 } 1166 1167 static void set_special_pids(struct pid *pid) 1168 { 1169 struct task_struct *curr = current->group_leader; 1170 1171 if (task_session(curr) != pid) 1172 change_pid(curr, PIDTYPE_SID, pid); 1173 1174 if (task_pgrp(curr) != pid) 1175 change_pid(curr, PIDTYPE_PGID, pid); 1176 } 1177 1178 int ksys_setsid(void) 1179 { 1180 struct task_struct *group_leader = current->group_leader; 1181 struct pid *sid = task_pid(group_leader); 1182 pid_t session = pid_vnr(sid); 1183 int err = -EPERM; 1184 1185 write_lock_irq(&tasklist_lock); 1186 /* Fail if I am already a session leader */ 1187 if (group_leader->signal->leader) 1188 goto out; 1189 1190 /* Fail if a process group id already exists that equals the 1191 * proposed session id. 
1192 */ 1193 if (pid_task(sid, PIDTYPE_PGID)) 1194 goto out; 1195 1196 group_leader->signal->leader = 1; 1197 set_special_pids(sid); 1198 1199 proc_clear_tty(group_leader); 1200 1201 err = session; 1202 out: 1203 write_unlock_irq(&tasklist_lock); 1204 if (err > 0) { 1205 proc_sid_connector(group_leader); 1206 sched_autogroup_create_attach(group_leader); 1207 } 1208 return err; 1209 } 1210 1211 SYSCALL_DEFINE0(setsid) 1212 { 1213 return ksys_setsid(); 1214 } 1215 1216 DECLARE_RWSEM(uts_sem); 1217 1218 #ifdef COMPAT_UTS_MACHINE 1219 #define override_architecture(name) \ 1220 (personality(current->personality) == PER_LINUX32 && \ 1221 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1222 sizeof(COMPAT_UTS_MACHINE))) 1223 #else 1224 #define override_architecture(name) 0 1225 #endif 1226 1227 /* 1228 * Work around broken programs that cannot handle "Linux 3.0". 1229 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1230 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be 1231 * 2.6.60. 1232 */ 1233 static int override_release(char __user *release, size_t len) 1234 { 1235 int ret = 0; 1236 1237 if (current->personality & UNAME26) { 1238 const char *rest = UTS_RELEASE; 1239 char buf[65] = { 0 }; 1240 int ndots = 0; 1241 unsigned v; 1242 size_t copy; 1243 1244 while (*rest) { 1245 if (*rest == '.' && ++ndots >= 3) 1246 break; 1247 if (!isdigit(*rest) && *rest != '.') 1248 break; 1249 rest++; 1250 } 1251 v = LINUX_VERSION_PATCHLEVEL + 60; 1252 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1253 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1254 ret = copy_to_user(release, buf, copy + 1); 1255 } 1256 return ret; 1257 } 1258 1259 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1260 { 1261 struct new_utsname tmp; 1262 1263 down_read(&uts_sem); 1264 memcpy(&tmp, utsname(), sizeof(tmp)); 1265 up_read(&uts_sem); 1266 if (copy_to_user(name, &tmp, sizeof(tmp))) 1267 return -EFAULT; 1268 1269 if (override_release(name->release, sizeof(name->release))) 1270 return -EFAULT; 1271 if (override_architecture(name)) 1272 return -EFAULT; 1273 return 0; 1274 } 1275 1276 #ifdef __ARCH_WANT_SYS_OLD_UNAME 1277 /* 1278 * Old cruft 1279 */ 1280 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) 1281 { 1282 struct old_utsname tmp; 1283 1284 if (!name) 1285 return -EFAULT; 1286 1287 down_read(&uts_sem); 1288 memcpy(&tmp, utsname(), sizeof(tmp)); 1289 up_read(&uts_sem); 1290 if (copy_to_user(name, &tmp, sizeof(tmp))) 1291 return -EFAULT; 1292 1293 if (override_release(name->release, sizeof(name->release))) 1294 return -EFAULT; 1295 if (override_architecture(name)) 1296 return -EFAULT; 1297 return 0; 1298 } 1299 1300 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) 1301 { 1302 struct oldold_utsname tmp; 1303 1304 if (!name) 1305 return -EFAULT; 1306 1307 memset(&tmp, 0, sizeof(tmp)); 1308 1309 down_read(&uts_sem); 1310 memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); 1311 memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); 1312 memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); 1313 memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); 1314 memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); 1315 up_read(&uts_sem); 1316 if (copy_to_user(name, &tmp, sizeof(tmp))) 1317 return -EFAULT; 1318 1319 if (override_architecture(name)) 1320 return -EFAULT; 1321 if (override_release(name->release, sizeof(name->release))) 1322 return -EFAULT; 1323 return 0; 1324 } 1325 #endif 1326 1327 SYSCALL_DEFINE2(sethostname, char __user *, 
name, int, len) 1328 { 1329 int errno; 1330 char tmp[__NEW_UTS_LEN]; 1331 1332 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1333 return -EPERM; 1334 1335 if (len < 0 || len > __NEW_UTS_LEN) 1336 return -EINVAL; 1337 errno = -EFAULT; 1338 if (!copy_from_user(tmp, name, len)) { 1339 struct new_utsname *u; 1340 1341 down_write(&uts_sem); 1342 u = utsname(); 1343 memcpy(u->nodename, tmp, len); 1344 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1345 errno = 0; 1346 uts_proc_notify(UTS_PROC_HOSTNAME); 1347 up_write(&uts_sem); 1348 } 1349 return errno; 1350 } 1351 1352 #ifdef __ARCH_WANT_SYS_GETHOSTNAME 1353 1354 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) 1355 { 1356 int i; 1357 struct new_utsname *u; 1358 char tmp[__NEW_UTS_LEN + 1]; 1359 1360 if (len < 0) 1361 return -EINVAL; 1362 down_read(&uts_sem); 1363 u = utsname(); 1364 i = 1 + strlen(u->nodename); 1365 if (i > len) 1366 i = len; 1367 memcpy(tmp, u->nodename, i); 1368 up_read(&uts_sem); 1369 if (copy_to_user(name, tmp, i)) 1370 return -EFAULT; 1371 return 0; 1372 } 1373 1374 #endif 1375 1376 /* 1377 * Only setdomainname; getdomainname can be implemented by calling 1378 * uname() 1379 */ 1380 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) 1381 { 1382 int errno; 1383 char tmp[__NEW_UTS_LEN]; 1384 1385 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) 1386 return -EPERM; 1387 if (len < 0 || len > __NEW_UTS_LEN) 1388 return -EINVAL; 1389 1390 errno = -EFAULT; 1391 if (!copy_from_user(tmp, name, len)) { 1392 struct new_utsname *u; 1393 1394 down_write(&uts_sem); 1395 u = utsname(); 1396 memcpy(u->domainname, tmp, len); 1397 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1398 errno = 0; 1399 uts_proc_notify(UTS_PROC_DOMAINNAME); 1400 up_write(&uts_sem); 1401 } 1402 return errno; 1403 } 1404 1405 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1406 { 1407 struct rlimit value; 1408 int ret; 1409 1410 ret = do_prlimit(current, resource, NULL, &value); 1411 if (!ret) 1412 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1413 1414 return ret; 1415 } 1416 1417 #ifdef CONFIG_COMPAT 1418 1419 COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, 1420 struct compat_rlimit __user *, rlim) 1421 { 1422 struct rlimit r; 1423 struct compat_rlimit r32; 1424 1425 if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) 1426 return -EFAULT; 1427 1428 if (r32.rlim_cur == COMPAT_RLIM_INFINITY) 1429 r.rlim_cur = RLIM_INFINITY; 1430 else 1431 r.rlim_cur = r32.rlim_cur; 1432 if (r32.rlim_max == COMPAT_RLIM_INFINITY) 1433 r.rlim_max = RLIM_INFINITY; 1434 else 1435 r.rlim_max = r32.rlim_max; 1436 return do_prlimit(current, resource, &r, NULL); 1437 } 1438 1439 COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, 1440 struct compat_rlimit __user *, rlim) 1441 { 1442 struct rlimit r; 1443 int ret; 1444 1445 ret = do_prlimit(current, resource, NULL, &r); 1446 if (!ret) { 1447 struct compat_rlimit r32; 1448 if (r.rlim_cur > COMPAT_RLIM_INFINITY) 1449 r32.rlim_cur = COMPAT_RLIM_INFINITY; 1450 else 1451 r32.rlim_cur = r.rlim_cur; 1452 if (r.rlim_max > COMPAT_RLIM_INFINITY) 1453 r32.rlim_max = COMPAT_RLIM_INFINITY; 1454 else 1455 r32.rlim_max = r.rlim_max; 1456 1457 if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) 1458 return -EFAULT; 1459 } 1460 return ret; 1461 } 1462 1463 #endif 1464 1465 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1466 1467 /* 1468 * Back compatibility for getrlimit. Needed for some apps. 
1469 */ 1470 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1471 struct rlimit __user *, rlim) 1472 { 1473 struct rlimit x; 1474 if (resource >= RLIM_NLIMITS) 1475 return -EINVAL; 1476 1477 resource = array_index_nospec(resource, RLIM_NLIMITS); 1478 task_lock(current->group_leader); 1479 x = current->signal->rlim[resource]; 1480 task_unlock(current->group_leader); 1481 if (x.rlim_cur > 0x7FFFFFFF) 1482 x.rlim_cur = 0x7FFFFFFF; 1483 if (x.rlim_max > 0x7FFFFFFF) 1484 x.rlim_max = 0x7FFFFFFF; 1485 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; 1486 } 1487 1488 #ifdef CONFIG_COMPAT 1489 COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1490 struct compat_rlimit __user *, rlim) 1491 { 1492 struct rlimit r; 1493 1494 if (resource >= RLIM_NLIMITS) 1495 return -EINVAL; 1496 1497 resource = array_index_nospec(resource, RLIM_NLIMITS); 1498 task_lock(current->group_leader); 1499 r = current->signal->rlim[resource]; 1500 task_unlock(current->group_leader); 1501 if (r.rlim_cur > 0x7FFFFFFF) 1502 r.rlim_cur = 0x7FFFFFFF; 1503 if (r.rlim_max > 0x7FFFFFFF) 1504 r.rlim_max = 0x7FFFFFFF; 1505 1506 if (put_user(r.rlim_cur, &rlim->rlim_cur) || 1507 put_user(r.rlim_max, &rlim->rlim_max)) 1508 return -EFAULT; 1509 return 0; 1510 } 1511 #endif 1512 1513 #endif 1514 1515 static inline bool rlim64_is_infinity(__u64 rlim64) 1516 { 1517 #if BITS_PER_LONG < 64 1518 return rlim64 >= ULONG_MAX; 1519 #else 1520 return rlim64 == RLIM64_INFINITY; 1521 #endif 1522 } 1523 1524 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) 1525 { 1526 if (rlim->rlim_cur == RLIM_INFINITY) 1527 rlim64->rlim_cur = RLIM64_INFINITY; 1528 else 1529 rlim64->rlim_cur = rlim->rlim_cur; 1530 if (rlim->rlim_max == RLIM_INFINITY) 1531 rlim64->rlim_max = RLIM64_INFINITY; 1532 else 1533 rlim64->rlim_max = rlim->rlim_max; 1534 } 1535 1536 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) 1537 { 1538 if (rlim64_is_infinity(rlim64->rlim_cur)) 1539 rlim->rlim_cur = RLIM_INFINITY; 1540 else 1541 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; 1542 if (rlim64_is_infinity(rlim64->rlim_max)) 1543 rlim->rlim_max = RLIM_INFINITY; 1544 else 1545 rlim->rlim_max = (unsigned long)rlim64->rlim_max; 1546 } 1547 1548 /* make sure you are allowed to change @tsk limits before calling this */ 1549 int do_prlimit(struct task_struct *tsk, unsigned int resource, 1550 struct rlimit *new_rlim, struct rlimit *old_rlim) 1551 { 1552 struct rlimit *rlim; 1553 int retval = 0; 1554 1555 if (resource >= RLIM_NLIMITS) 1556 return -EINVAL; 1557 if (new_rlim) { 1558 if (new_rlim->rlim_cur > new_rlim->rlim_max) 1559 return -EINVAL; 1560 if (resource == RLIMIT_NOFILE && 1561 new_rlim->rlim_max > sysctl_nr_open) 1562 return -EPERM; 1563 } 1564 1565 /* protect tsk->signal and tsk->sighand from disappearing */ 1566 read_lock(&tasklist_lock); 1567 if (!tsk->sighand) { 1568 retval = -ESRCH; 1569 goto out; 1570 } 1571 1572 rlim = tsk->signal->rlim + resource; 1573 task_lock(tsk->group_leader); 1574 if (new_rlim) { 1575 /* Keep the capable check against init_user_ns until 1576 cgroups can contain all limits */ 1577 if (new_rlim->rlim_max > rlim->rlim_max && 1578 !capable(CAP_SYS_RESOURCE)) 1579 retval = -EPERM; 1580 if (!retval) 1581 retval = security_task_setrlimit(tsk, resource, new_rlim); 1582 } 1583 if (!retval) { 1584 if (old_rlim) 1585 *old_rlim = *rlim; 1586 if (new_rlim) 1587 *rlim = *new_rlim; 1588 } 1589 task_unlock(tsk->group_leader); 1590 1591 /* 1592 * RLIMIT_CPU handling. 
Arm the posix CPU timer if the limit is not 1593 * infinite. In case of RLIM_INFINITY the posix CPU timer code 1594 * ignores the rlimit. 1595 */ 1596 if (!retval && new_rlim && resource == RLIMIT_CPU && 1597 new_rlim->rlim_cur != RLIM_INFINITY && 1598 IS_ENABLED(CONFIG_POSIX_TIMERS)) 1599 update_rlimit_cpu(tsk, new_rlim->rlim_cur); 1600 out: 1601 read_unlock(&tasklist_lock); 1602 return retval; 1603 } 1604 1605 /* rcu lock must be held */ 1606 static int check_prlimit_permission(struct task_struct *task, 1607 unsigned int flags) 1608 { 1609 const struct cred *cred = current_cred(), *tcred; 1610 bool id_match; 1611 1612 if (current == task) 1613 return 0; 1614 1615 tcred = __task_cred(task); 1616 id_match = (uid_eq(cred->uid, tcred->euid) && 1617 uid_eq(cred->uid, tcred->suid) && 1618 uid_eq(cred->uid, tcred->uid) && 1619 gid_eq(cred->gid, tcred->egid) && 1620 gid_eq(cred->gid, tcred->sgid) && 1621 gid_eq(cred->gid, tcred->gid)); 1622 if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1623 return -EPERM; 1624 1625 return security_task_prlimit(cred, tcred, flags); 1626 } 1627 1628 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1629 const struct rlimit64 __user *, new_rlim, 1630 struct rlimit64 __user *, old_rlim) 1631 { 1632 struct rlimit64 old64, new64; 1633 struct rlimit old, new; 1634 struct task_struct *tsk; 1635 unsigned int checkflags = 0; 1636 int ret; 1637 1638 if (old_rlim) 1639 checkflags |= LSM_PRLIMIT_READ; 1640 1641 if (new_rlim) { 1642 if (copy_from_user(&new64, new_rlim, sizeof(new64))) 1643 return -EFAULT; 1644 rlim64_to_rlim(&new64, &new); 1645 checkflags |= LSM_PRLIMIT_WRITE; 1646 } 1647 1648 rcu_read_lock(); 1649 tsk = pid ? find_task_by_vpid(pid) : current; 1650 if (!tsk) { 1651 rcu_read_unlock(); 1652 return -ESRCH; 1653 } 1654 ret = check_prlimit_permission(tsk, checkflags); 1655 if (ret) { 1656 rcu_read_unlock(); 1657 return ret; 1658 } 1659 get_task_struct(tsk); 1660 rcu_read_unlock(); 1661 1662 ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1663 old_rlim ? &old : NULL); 1664 1665 if (!ret && old_rlim) { 1666 rlim_to_rlim64(&old, &old64); 1667 if (copy_to_user(old_rlim, &old64, sizeof(old64))) 1668 ret = -EFAULT; 1669 } 1670 1671 put_task_struct(tsk); 1672 return ret; 1673 } 1674 1675 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1676 { 1677 struct rlimit new_rlim; 1678 1679 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1680 return -EFAULT; 1681 return do_prlimit(current, resource, &new_rlim, NULL); 1682 } 1683 1684 /* 1685 * It would make sense to put struct rusage in the task_struct, 1686 * except that would make the task_struct be *really big*. After 1687 * task_struct gets moved into malloc'ed memory, it would 1688 * make sense to do this. It will make moving the rest of the information 1689 * a lot simpler! (Which we're not doing right now because we're not 1690 * measuring them yet). 1691 * 1692 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1693 * races with threads incrementing their own counters. But since word 1694 * reads are atomic, we either get new values or old values and we don't 1695 * care which for the sums. We always take the siglock to protect reading 1696 * the c* fields from p->signal from races with exit.c updating those 1697 * fields when reaping, so a sample either gets all the additions of a 1698 * given child after it's reaped, or none so this sample is before reaping. 
1699 * 1700 * Locking: 1701 * We need to take the siglock for CHILDEREN, SELF and BOTH 1702 * for the cases current multithreaded, non-current single threaded 1703 * non-current multithreaded. Thread traversal is now safe with 1704 * the siglock held. 1705 * Strictly speaking, we donot need to take the siglock if we are current and 1706 * single threaded, as no one else can take our signal_struct away, no one 1707 * else can reap the children to update signal->c* counters, and no one else 1708 * can race with the signal-> fields. If we do not take any lock, the 1709 * signal-> fields could be read out of order while another thread was just 1710 * exiting. So we should place a read memory barrier when we avoid the lock. 1711 * On the writer side, write memory barrier is implied in __exit_signal 1712 * as __exit_signal releases the siglock spinlock after updating the signal-> 1713 * fields. But we don't do this yet to keep things simple. 1714 * 1715 */ 1716 1717 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) 1718 { 1719 r->ru_nvcsw += t->nvcsw; 1720 r->ru_nivcsw += t->nivcsw; 1721 r->ru_minflt += t->min_flt; 1722 r->ru_majflt += t->maj_flt; 1723 r->ru_inblock += task_io_get_inblock(t); 1724 r->ru_oublock += task_io_get_oublock(t); 1725 } 1726 1727 void getrusage(struct task_struct *p, int who, struct rusage *r) 1728 { 1729 struct task_struct *t; 1730 unsigned long flags; 1731 u64 tgutime, tgstime, utime, stime; 1732 unsigned long maxrss = 0; 1733 1734 memset((char *)r, 0, sizeof (*r)); 1735 utime = stime = 0; 1736 1737 if (who == RUSAGE_THREAD) { 1738 task_cputime_adjusted(current, &utime, &stime); 1739 accumulate_thread_rusage(p, r); 1740 maxrss = p->signal->maxrss; 1741 goto out; 1742 } 1743 1744 if (!lock_task_sighand(p, &flags)) 1745 return; 1746 1747 switch (who) { 1748 case RUSAGE_BOTH: 1749 case RUSAGE_CHILDREN: 1750 utime = p->signal->cutime; 1751 stime = p->signal->cstime; 1752 r->ru_nvcsw = p->signal->cnvcsw; 1753 r->ru_nivcsw = p->signal->cnivcsw; 1754 r->ru_minflt = p->signal->cmin_flt; 1755 r->ru_majflt = p->signal->cmaj_flt; 1756 r->ru_inblock = p->signal->cinblock; 1757 r->ru_oublock = p->signal->coublock; 1758 maxrss = p->signal->cmaxrss; 1759 1760 if (who == RUSAGE_CHILDREN) 1761 break; 1762 fallthrough; 1763 1764 case RUSAGE_SELF: 1765 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1766 utime += tgutime; 1767 stime += tgstime; 1768 r->ru_nvcsw += p->signal->nvcsw; 1769 r->ru_nivcsw += p->signal->nivcsw; 1770 r->ru_minflt += p->signal->min_flt; 1771 r->ru_majflt += p->signal->maj_flt; 1772 r->ru_inblock += p->signal->inblock; 1773 r->ru_oublock += p->signal->oublock; 1774 if (maxrss < p->signal->maxrss) 1775 maxrss = p->signal->maxrss; 1776 t = p; 1777 do { 1778 accumulate_thread_rusage(t, r); 1779 } while_each_thread(p, t); 1780 break; 1781 1782 default: 1783 BUG(); 1784 } 1785 unlock_task_sighand(p, &flags); 1786 1787 out: 1788 r->ru_utime = ns_to_kernel_old_timeval(utime); 1789 r->ru_stime = ns_to_kernel_old_timeval(stime); 1790 1791 if (who != RUSAGE_CHILDREN) { 1792 struct mm_struct *mm = get_task_mm(p); 1793 1794 if (mm) { 1795 setmax_mm_hiwater_rss(&maxrss, mm); 1796 mmput(mm); 1797 } 1798 } 1799 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1800 } 1801 1802 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) 1803 { 1804 struct rusage r; 1805 1806 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1807 who != RUSAGE_THREAD) 1808 return -EINVAL; 1809 1810 getrusage(current, who, &r); 1811 
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1812 } 1813 1814 #ifdef CONFIG_COMPAT 1815 COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) 1816 { 1817 struct rusage r; 1818 1819 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1820 who != RUSAGE_THREAD) 1821 return -EINVAL; 1822 1823 getrusage(current, who, &r); 1824 return put_compat_rusage(&r, ru); 1825 } 1826 #endif 1827 1828 SYSCALL_DEFINE1(umask, int, mask) 1829 { 1830 mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); 1831 return mask; 1832 } 1833 1834 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1835 { 1836 struct fd exe; 1837 struct file *old_exe, *exe_file; 1838 struct inode *inode; 1839 int err; 1840 1841 exe = fdget(fd); 1842 if (!exe.file) 1843 return -EBADF; 1844 1845 inode = file_inode(exe.file); 1846 1847 /* 1848 * Because the original mm->exe_file points to executable file, make 1849 * sure that this one is executable as well, to avoid breaking an 1850 * overall picture. 1851 */ 1852 err = -EACCES; 1853 if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) 1854 goto exit; 1855 1856 err = file_permission(exe.file, MAY_EXEC); 1857 if (err) 1858 goto exit; 1859 1860 /* 1861 * Forbid mm->exe_file change if old file still mapped. 1862 */ 1863 exe_file = get_mm_exe_file(mm); 1864 err = -EBUSY; 1865 if (exe_file) { 1866 struct vm_area_struct *vma; 1867 1868 mmap_read_lock(mm); 1869 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1870 if (!vma->vm_file) 1871 continue; 1872 if (path_equal(&vma->vm_file->f_path, 1873 &exe_file->f_path)) 1874 goto exit_err; 1875 } 1876 1877 mmap_read_unlock(mm); 1878 fput(exe_file); 1879 } 1880 1881 err = 0; 1882 /* set the new file, lockless */ 1883 get_file(exe.file); 1884 old_exe = xchg(&mm->exe_file, exe.file); 1885 if (old_exe) 1886 fput(old_exe); 1887 exit: 1888 fdput(exe); 1889 return err; 1890 exit_err: 1891 mmap_read_unlock(mm); 1892 fput(exe_file); 1893 goto exit; 1894 } 1895 1896 /* 1897 * Check arithmetic relations of passed addresses. 1898 * 1899 * WARNING: we don't require any capability here so be very careful 1900 * in what is allowed for modification from userspace. 1901 */ 1902 static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) 1903 { 1904 unsigned long mmap_max_addr = TASK_SIZE; 1905 int error = -EINVAL, i; 1906 1907 static const unsigned char offsets[] = { 1908 offsetof(struct prctl_mm_map, start_code), 1909 offsetof(struct prctl_mm_map, end_code), 1910 offsetof(struct prctl_mm_map, start_data), 1911 offsetof(struct prctl_mm_map, end_data), 1912 offsetof(struct prctl_mm_map, start_brk), 1913 offsetof(struct prctl_mm_map, brk), 1914 offsetof(struct prctl_mm_map, start_stack), 1915 offsetof(struct prctl_mm_map, arg_start), 1916 offsetof(struct prctl_mm_map, arg_end), 1917 offsetof(struct prctl_mm_map, env_start), 1918 offsetof(struct prctl_mm_map, env_end), 1919 }; 1920 1921 /* 1922 * Make sure the members are not somewhere outside 1923 * of allowed address space. 1924 */ 1925 for (i = 0; i < ARRAY_SIZE(offsets); i++) { 1926 u64 val = *(u64 *)((char *)prctl_map + offsets[i]); 1927 1928 if ((unsigned long)val >= mmap_max_addr || 1929 (unsigned long)val < mmap_min_addr) 1930 goto out; 1931 } 1932 1933 /* 1934 * Make sure the pairs are ordered. 1935 */ 1936 #define __prctl_check_order(__m1, __op, __m2) \ 1937 ((unsigned long)prctl_map->__m1 __op \ 1938 (unsigned long)prctl_map->__m2) ? 
0 : -EINVAL 1939 error = __prctl_check_order(start_code, <, end_code); 1940 error |= __prctl_check_order(start_data,<=, end_data); 1941 error |= __prctl_check_order(start_brk, <=, brk); 1942 error |= __prctl_check_order(arg_start, <=, arg_end); 1943 error |= __prctl_check_order(env_start, <=, env_end); 1944 if (error) 1945 goto out; 1946 #undef __prctl_check_order 1947 1948 error = -EINVAL; 1949 1950 /* 1951 * @brk should be after @end_data in traditional maps. 1952 */ 1953 if (prctl_map->start_brk <= prctl_map->end_data || 1954 prctl_map->brk <= prctl_map->end_data) 1955 goto out; 1956 1957 /* 1958 * Neither we should allow to override limits if they set. 1959 */ 1960 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, 1961 prctl_map->start_brk, prctl_map->end_data, 1962 prctl_map->start_data)) 1963 goto out; 1964 1965 error = 0; 1966 out: 1967 return error; 1968 } 1969 1970 #ifdef CONFIG_CHECKPOINT_RESTORE 1971 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) 1972 { 1973 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; 1974 unsigned long user_auxv[AT_VECTOR_SIZE]; 1975 struct mm_struct *mm = current->mm; 1976 int error; 1977 1978 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); 1979 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); 1980 1981 if (opt == PR_SET_MM_MAP_SIZE) 1982 return put_user((unsigned int)sizeof(prctl_map), 1983 (unsigned int __user *)addr); 1984 1985 if (data_size != sizeof(prctl_map)) 1986 return -EINVAL; 1987 1988 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) 1989 return -EFAULT; 1990 1991 error = validate_prctl_map_addr(&prctl_map); 1992 if (error) 1993 return error; 1994 1995 if (prctl_map.auxv_size) { 1996 /* 1997 * Someone is trying to cheat the auxv vector. 1998 */ 1999 if (!prctl_map.auxv || 2000 prctl_map.auxv_size > sizeof(mm->saved_auxv)) 2001 return -EINVAL; 2002 2003 memset(user_auxv, 0, sizeof(user_auxv)); 2004 if (copy_from_user(user_auxv, 2005 (const void __user *)prctl_map.auxv, 2006 prctl_map.auxv_size)) 2007 return -EFAULT; 2008 2009 /* Last entry must be AT_NULL as specification requires */ 2010 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; 2011 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; 2012 } 2013 2014 if (prctl_map.exe_fd != (u32)-1) { 2015 /* 2016 * Check if the current user is checkpoint/restore capable. 2017 * At the time of this writing, it checks for CAP_SYS_ADMIN 2018 * or CAP_CHECKPOINT_RESTORE. 2019 * Note that a user with access to ptrace can masquerade an 2020 * arbitrary program as any executable, even setuid ones. 2021 * This may have implications in the tomoyo subsystem. 2022 */ 2023 if (!checkpoint_restore_ns_capable(current_user_ns())) 2024 return -EPERM; 2025 2026 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); 2027 if (error) 2028 return error; 2029 } 2030 2031 /* 2032 * arg_lock protects concurrent updates but we still need mmap_lock for 2033 * read to exclude races with sys_brk. 
2034 */ 2035 mmap_read_lock(mm); 2036 2037 /* 2038 * We don't validate if these members are pointing to 2039 * real present VMAs because application may have correspond 2040 * VMAs already unmapped and kernel uses these members for statistics 2041 * output in procfs mostly, except 2042 * 2043 * - @start_brk/@brk which are used in do_brk_flags but kernel lookups 2044 * for VMAs when updating these members so anything wrong written 2045 * here cause kernel to swear at userspace program but won't lead 2046 * to any problem in kernel itself 2047 */ 2048 2049 spin_lock(&mm->arg_lock); 2050 mm->start_code = prctl_map.start_code; 2051 mm->end_code = prctl_map.end_code; 2052 mm->start_data = prctl_map.start_data; 2053 mm->end_data = prctl_map.end_data; 2054 mm->start_brk = prctl_map.start_brk; 2055 mm->brk = prctl_map.brk; 2056 mm->start_stack = prctl_map.start_stack; 2057 mm->arg_start = prctl_map.arg_start; 2058 mm->arg_end = prctl_map.arg_end; 2059 mm->env_start = prctl_map.env_start; 2060 mm->env_end = prctl_map.env_end; 2061 spin_unlock(&mm->arg_lock); 2062 2063 /* 2064 * Note this update of @saved_auxv is lockless thus 2065 * if someone reads this member in procfs while we're 2066 * updating -- it may get partly updated results. It's 2067 * known and acceptable trade off: we leave it as is to 2068 * not introduce additional locks here making the kernel 2069 * more complex. 2070 */ 2071 if (prctl_map.auxv_size) 2072 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); 2073 2074 mmap_read_unlock(mm); 2075 return 0; 2076 } 2077 #endif /* CONFIG_CHECKPOINT_RESTORE */ 2078 2079 static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, 2080 unsigned long len) 2081 { 2082 /* 2083 * This doesn't move the auxiliary vector itself since it's pinned to 2084 * mm_struct, but it permits filling the vector with new values. It's 2085 * up to the caller to provide sane values here, otherwise userspace 2086 * tools which use this vector might be unhappy. 
static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
			  unsigned long len)
{
	/*
	 * This doesn't move the auxiliary vector itself since it's pinned to
	 * mm_struct, but it permits filling the vector with new values. It's
	 * up to the caller to provide sane values here, otherwise userspace
	 * tools which use this vector might be unhappy.
	 */
	unsigned long user_auxv[AT_VECTOR_SIZE] = {};

	if (len > sizeof(user_auxv))
		return -EINVAL;

	if (copy_from_user(user_auxv, (const void __user *)addr, len))
		return -EFAULT;

	/* Make sure the last entry is always AT_NULL */
	user_auxv[AT_VECTOR_SIZE - 2] = 0;
	user_auxv[AT_VECTOR_SIZE - 1] = 0;

	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));

	task_lock(current);
	memcpy(mm->saved_auxv, user_auxv, len);
	task_unlock(current);

	return 0;
}

static int prctl_set_mm(int opt, unsigned long addr,
			unsigned long arg4, unsigned long arg5)
{
	struct mm_struct *mm = current->mm;
	struct prctl_mm_map prctl_map = {
		.auxv = NULL,
		.auxv_size = 0,
		.exe_fd = -1,
	};
	struct vm_area_struct *vma;
	int error;

	if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
			      opt != PR_SET_MM_MAP &&
			      opt != PR_SET_MM_MAP_SIZE)))
		return -EINVAL;

#ifdef CONFIG_CHECKPOINT_RESTORE
	if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
		return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif

	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (opt == PR_SET_MM_EXE_FILE)
		return prctl_set_mm_exe_file(mm, (unsigned int)addr);

	if (opt == PR_SET_MM_AUXV)
		return prctl_set_auxv(mm, addr, arg4);

	if (addr >= TASK_SIZE || addr < mmap_min_addr)
		return -EINVAL;

	error = -EINVAL;

	/*
	 * arg_lock protects concurrent updates of the arg boundaries; we
	 * need mmap_lock for a) concurrent sys_brk, b) finding the VMA for
	 * addr validation.
	 */
	mmap_read_lock(mm);
	vma = find_vma(mm, addr);

	spin_lock(&mm->arg_lock);
	prctl_map.start_code = mm->start_code;
	prctl_map.end_code = mm->end_code;
	prctl_map.start_data = mm->start_data;
	prctl_map.end_data = mm->end_data;
	prctl_map.start_brk = mm->start_brk;
	prctl_map.brk = mm->brk;
	prctl_map.start_stack = mm->start_stack;
	prctl_map.arg_start = mm->arg_start;
	prctl_map.arg_end = mm->arg_end;
	prctl_map.env_start = mm->env_start;
	prctl_map.env_end = mm->env_end;

	switch (opt) {
	case PR_SET_MM_START_CODE:
		prctl_map.start_code = addr;
		break;
	case PR_SET_MM_END_CODE:
		prctl_map.end_code = addr;
		break;
	case PR_SET_MM_START_DATA:
		prctl_map.start_data = addr;
		break;
	case PR_SET_MM_END_DATA:
		prctl_map.end_data = addr;
		break;
	case PR_SET_MM_START_STACK:
		prctl_map.start_stack = addr;
		break;
	case PR_SET_MM_START_BRK:
		prctl_map.start_brk = addr;
		break;
	case PR_SET_MM_BRK:
		prctl_map.brk = addr;
		break;
	case PR_SET_MM_ARG_START:
		prctl_map.arg_start = addr;
		break;
	case PR_SET_MM_ARG_END:
		prctl_map.arg_end = addr;
		break;
	case PR_SET_MM_ENV_START:
		prctl_map.env_start = addr;
		break;
	case PR_SET_MM_ENV_END:
		prctl_map.env_end = addr;
		break;
	default:
		goto out;
	}

	error = validate_prctl_map_addr(&prctl_map);
	if (error)
		goto out;

	switch (opt) {
	/*
	 * If command line arguments and environment are placed somewhere
	 * else on the stack, we can set them up here: ARG_START/END to set
	 * up the command line arguments and ENV_START/END for the
	 * environment.
	 */
	case PR_SET_MM_START_STACK:
	case PR_SET_MM_ARG_START:
	case PR_SET_MM_ARG_END:
	case PR_SET_MM_ENV_START:
	case PR_SET_MM_ENV_END:
		if (!vma) {
			error = -EFAULT;
			goto out;
		}
	}

	mm->start_code = prctl_map.start_code;
	mm->end_code = prctl_map.end_code;
	mm->start_data = prctl_map.start_data;
	mm->end_data = prctl_map.end_data;
	mm->start_brk = prctl_map.start_brk;
	mm->brk = prctl_map.brk;
	mm->start_stack = prctl_map.start_stack;
	mm->arg_start = prctl_map.arg_start;
	mm->arg_end = prctl_map.arg_end;
	mm->env_start = prctl_map.env_start;
	mm->env_end = prctl_map.env_end;

	error = 0;
out:
	spin_unlock(&mm->arg_lock);
	mmap_read_unlock(mm);
	return error;
}
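
/*
 * Usage sketch (userspace side; buf, len, auxv and auxv_bytes are
 * hypothetical caller-provided values): outside of the map interface,
 * individual fields are updated one prctl() call at a time, which in this
 * path requires CAP_SYS_RESOURCE:
 *
 *	// point the reported argv range at a freshly written buffer
 *	prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long)buf, 0, 0);
 *	prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long)buf + len, 0, 0);
 *
 *	// replace the saved auxiliary vector; length in bytes goes in arg4
 *	prctl(PR_SET_MM, PR_SET_MM_AUXV, (unsigned long)auxv, auxv_bytes, 0);
 *
 * The stack/arg/env addresses are validated against an existing VMA via
 * the find_vma() check above, and /proc/<pid>/cmdline output follows the
 * new arg_start/arg_end range.
 */
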
#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
	return put_user(me->clear_child_tid, tid_addr);
}
#else
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
	return -EINVAL;
}
#endif

static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
	/*
	 * If the task already has has_child_subreaper set, all its
	 * descendants already have the flag too and new descendants will
	 * inherit it on fork, so skip them.
	 *
	 * If we've found a child_reaper, skip the descendants in its
	 * subtree, as they will never get out of that pidns.
	 */
	if (p->signal->has_child_subreaper ||
	    is_child_reaper(task_pid(p)))
		return 0;

	p->signal->has_child_subreaper = 1;
	return 1;
}

int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
	return -EINVAL;
}

int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
				    unsigned long ctrl)
{
	return -EINVAL;
}

#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
		unsigned long, arg4, unsigned long, arg5)
{
	struct task_struct *me = current;
	unsigned char comm[sizeof(me->comm)];
	long error;

	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
	if (error != -ENOSYS)
		return error;

	error = 0;
	switch (option) {
	case PR_SET_PDEATHSIG:
		if (!valid_signal(arg2)) {
			error = -EINVAL;
			break;
		}
		me->pdeath_signal = arg2;
		break;
	case PR_GET_PDEATHSIG:
		error = put_user(me->pdeath_signal, (int __user *)arg2);
		break;
	case PR_GET_DUMPABLE:
		error = get_dumpable(me->mm);
		break;
	case PR_SET_DUMPABLE:
		if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
			error = -EINVAL;
			break;
		}
		set_dumpable(me->mm, arg2);
		break;

	case PR_SET_UNALIGN:
		error = SET_UNALIGN_CTL(me, arg2);
		break;
	case PR_GET_UNALIGN:
		error = GET_UNALIGN_CTL(me, arg2);
		break;
	case PR_SET_FPEMU:
		error = SET_FPEMU_CTL(me, arg2);
		break;
	case PR_GET_FPEMU:
		error = GET_FPEMU_CTL(me, arg2);
		break;
	case PR_SET_FPEXC:
		error = SET_FPEXC_CTL(me, arg2);
		break;
	case PR_GET_FPEXC:
		error = GET_FPEXC_CTL(me, arg2);
		break;
	case PR_GET_TIMING:
		error = PR_TIMING_STATISTICAL;
		break;
	case PR_SET_TIMING:
		if (arg2 != PR_TIMING_STATISTICAL)
			error = -EINVAL;
		break;
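
	/*
	 * Illustrative note: the two cases below operate on the fixed
	 * TASK_COMM_LEN-sized comm buffer.  A typical userspace use is,
	 * roughly,
	 *
	 *	prctl(PR_SET_NAME, (unsigned long)"worker-1", 0, 0, 0);
	 *
	 *	char name[16];
	 *	prctl(PR_GET_NAME, (unsigned long)name, 0, 0, 0);
	 *
	 * ("worker-1" and name are hypothetical).  pthread_setname_np() is
	 * commonly implemented on top of the same PR_SET_NAME operation for
	 * the calling thread.
	 */
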
	case PR_SET_NAME:
		comm[sizeof(me->comm) - 1] = 0;
		if (strncpy_from_user(comm, (char __user *)arg2,
				      sizeof(me->comm) - 1) < 0)
			return -EFAULT;
		set_task_comm(me, comm);
		proc_comm_connector(me);
		break;
	case PR_GET_NAME:
		get_task_comm(comm, me);
		if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
			return -EFAULT;
		break;
	case PR_GET_ENDIAN:
		error = GET_ENDIAN(me, arg2);
		break;
	case PR_SET_ENDIAN:
		error = SET_ENDIAN(me, arg2);
		break;
	case PR_GET_SECCOMP:
		error = prctl_get_seccomp();
		break;
	case PR_SET_SECCOMP:
		error = prctl_set_seccomp(arg2, (char __user *)arg3);
		break;
	case PR_GET_TSC:
		error = GET_TSC_CTL(arg2);
		break;
	case PR_SET_TSC:
		error = SET_TSC_CTL(arg2);
		break;
	case PR_TASK_PERF_EVENTS_DISABLE:
		error = perf_event_task_disable();
		break;
	case PR_TASK_PERF_EVENTS_ENABLE:
		error = perf_event_task_enable();
		break;
	case PR_GET_TIMERSLACK:
		if (current->timer_slack_ns > ULONG_MAX)
			error = ULONG_MAX;
		else
			error = current->timer_slack_ns;
		break;
	case PR_SET_TIMERSLACK:
		if (arg2 <= 0)
			current->timer_slack_ns =
					current->default_timer_slack_ns;
		else
			current->timer_slack_ns = arg2;
		break;
	case PR_MCE_KILL:
		if (arg4 | arg5)
			return -EINVAL;
		switch (arg2) {
		case PR_MCE_KILL_CLEAR:
			if (arg3 != 0)
				return -EINVAL;
			current->flags &= ~PF_MCE_PROCESS;
			break;
		case PR_MCE_KILL_SET:
			current->flags |= PF_MCE_PROCESS;
			if (arg3 == PR_MCE_KILL_EARLY)
				current->flags |= PF_MCE_EARLY;
			else if (arg3 == PR_MCE_KILL_LATE)
				current->flags &= ~PF_MCE_EARLY;
			else if (arg3 == PR_MCE_KILL_DEFAULT)
				current->flags &=
						~(PF_MCE_EARLY|PF_MCE_PROCESS);
			else
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
		break;
	case PR_MCE_KILL_GET:
		if (arg2 | arg3 | arg4 | arg5)
			return -EINVAL;
		if (current->flags & PF_MCE_PROCESS)
			error = (current->flags & PF_MCE_EARLY) ?
				PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
		else
			error = PR_MCE_KILL_DEFAULT;
		break;
	case PR_SET_MM:
		error = prctl_set_mm(arg2, arg3, arg4, arg5);
		break;
	case PR_GET_TID_ADDRESS:
		error = prctl_get_tid_address(me, (int __user * __user *)arg2);
		break;
	case PR_SET_CHILD_SUBREAPER:
		me->signal->is_child_subreaper = !!arg2;
		if (!arg2)
			break;

		walk_process_tree(me, propagate_has_child_subreaper, NULL);
		break;
	case PR_GET_CHILD_SUBREAPER:
		error = put_user(me->signal->is_child_subreaper,
				 (int __user *)arg2);
		break;
	case PR_SET_NO_NEW_PRIVS:
		if (arg2 != 1 || arg3 || arg4 || arg5)
			return -EINVAL;

		task_set_no_new_privs(current);
		break;
	case PR_GET_NO_NEW_PRIVS:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		return task_no_new_privs(current) ? 1 : 0;
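
	/*
	 * Illustrative note: no_new_privs is a one-way latch, inherited
	 * across fork() and execve(), and is what allows an unprivileged
	 * task to install a seccomp filter.  A sandbox would typically do,
	 * roughly,
	 *
	 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	 *	// ... then e.g. seccomp(SECCOMP_SET_MODE_FILTER, ...)
	 *
	 * which matches the strict argument checking above: arg2 must be
	 * exactly 1 and the remaining arguments must be zero.
	 */
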
	case PR_GET_THP_DISABLE:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
		break;
	case PR_SET_THP_DISABLE:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		if (mmap_write_lock_killable(me->mm))
			return -EINTR;
		if (arg2)
			set_bit(MMF_DISABLE_THP, &me->mm->flags);
		else
			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
		mmap_write_unlock(me->mm);
		break;
	case PR_MPX_ENABLE_MANAGEMENT:
	case PR_MPX_DISABLE_MANAGEMENT:
		/* No longer implemented: */
		return -EINVAL;
	case PR_SET_FP_MODE:
		error = SET_FP_MODE(me, arg2);
		break;
	case PR_GET_FP_MODE:
		error = GET_FP_MODE(me);
		break;
	case PR_SVE_SET_VL:
		error = SVE_SET_VL(arg2);
		break;
	case PR_SVE_GET_VL:
		error = SVE_GET_VL();
		break;
	case PR_GET_SPECULATION_CTRL:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = arch_prctl_spec_ctrl_get(me, arg2);
		break;
	case PR_SET_SPECULATION_CTRL:
		if (arg4 || arg5)
			return -EINVAL;
		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
		break;
	case PR_PAC_RESET_KEYS:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = PAC_RESET_KEYS(me, arg2);
		break;
	case PR_PAC_SET_ENABLED_KEYS:
		if (arg4 || arg5)
			return -EINVAL;
		error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
		break;
	case PR_PAC_GET_ENABLED_KEYS:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = PAC_GET_ENABLED_KEYS(me);
		break;
	case PR_SET_TAGGED_ADDR_CTRL:
		if (arg3 || arg4 || arg5)
			return -EINVAL;
		error = SET_TAGGED_ADDR_CTRL(arg2);
		break;
	case PR_GET_TAGGED_ADDR_CTRL:
		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;
		error = GET_TAGGED_ADDR_CTRL();
		break;
	case PR_SET_IO_FLUSHER:
		if (!capable(CAP_SYS_RESOURCE))
			return -EPERM;

		if (arg3 || arg4 || arg5)
			return -EINVAL;

		if (arg2 == 1)
			current->flags |= PR_IO_FLUSHER;
		else if (!arg2)
			current->flags &= ~PR_IO_FLUSHER;
		else
			return -EINVAL;
		break;
	case PR_GET_IO_FLUSHER:
		if (!capable(CAP_SYS_RESOURCE))
			return -EPERM;

		if (arg2 || arg3 || arg4 || arg5)
			return -EINVAL;

		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
		break;
	case PR_SET_SYSCALL_USER_DISPATCH:
		error = set_syscall_user_dispatch(arg2, arg3, arg4,
						  (char __user *) arg5);
		break;
	default:
		error = -EINVAL;
		break;
	}
	return error;
}

SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
		struct getcpu_cache __user *, unused)
{
	int err = 0;
	int cpu = raw_smp_processor_id();

	if (cpup)
		err |= put_user(cpu, cpup);
	if (nodep)
		err |= put_user(cpu_to_node(cpu), nodep);
	return err ? -EFAULT : 0;
}
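
/*
 * Usage sketch (userspace side; illustrative only): the result is just a
 * snapshot, since the task may migrate to another CPU right after the
 * call returns.  Userspace normally goes through the vDSO/glibc wrappers:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	unsigned int cpu, node;
 *	if (getcpu(&cpu, &node) == 0)
 *		printf("cpu %u, node %u\n", cpu, node);
 *
 * sched_getcpu() returns only the CPU number; the third (cache) argument
 * of the raw system call is unused, as the parameter name above suggests.
 */
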
/**
 * do_sysinfo - fill in sysinfo struct
 * @info: pointer to buffer to fill
 */
static int do_sysinfo(struct sysinfo *info)
{
	unsigned long mem_total, sav_total;
	unsigned int mem_unit, bitcount;
	struct timespec64 tp;

	memset(info, 0, sizeof(struct sysinfo));

	ktime_get_boottime_ts64(&tp);
	timens_add_boottime(&tp);
	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);

	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);

	info->procs = nr_threads;

	si_meminfo(info);
	si_swapinfo(info);

	/*
	 * If the sum of all the available memory (i.e. ram + swap)
	 * is less than can be stored in a 32 bit unsigned long then
	 * we can be binary compatible with 2.2.x kernels.  If not,
	 * well, in that case 2.2.x was broken anyways...
	 *
	 *  -Erik Andersen <andersee@debian.org>
	 */

	mem_total = info->totalram + info->totalswap;
	if (mem_total < info->totalram || mem_total < info->totalswap)
		goto out;
	bitcount = 0;
	mem_unit = info->mem_unit;
	while (mem_unit > 1) {
		bitcount++;
		mem_unit >>= 1;
		sav_total = mem_total;
		mem_total <<= 1;
		if (mem_total < sav_total)
			goto out;
	}

	/*
	 * If mem_total did not overflow, multiply all memory values by
	 * info->mem_unit and set it to 1.  This leaves things compatible
	 * with 2.2.x, and also retains compatibility with earlier 2.4.x
	 * kernels...
	 */

	info->mem_unit = 1;
	info->totalram <<= bitcount;
	info->freeram <<= bitcount;
	info->sharedram <<= bitcount;
	info->bufferram <<= bitcount;
	info->totalswap <<= bitcount;
	info->freeswap <<= bitcount;
	info->totalhigh <<= bitcount;
	info->freehigh <<= bitcount;

out:
	return 0;
}

SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
{
	struct sysinfo val;

	do_sysinfo(&val);

	if (copy_to_user(info, &val, sizeof(struct sysinfo)))
		return -EFAULT;

	return 0;
}

#ifdef CONFIG_COMPAT
struct compat_sysinfo {
	s32 uptime;
	u32 loads[3];
	u32 totalram;
	u32 freeram;
	u32 sharedram;
	u32 bufferram;
	u32 totalswap;
	u32 freeswap;
	u16 procs;
	u16 pad;
	u32 totalhigh;
	u32 freehigh;
	u32 mem_unit;
	char _f[20 - 2 * sizeof(u32) - sizeof(int)];
};

COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
	struct sysinfo s;
	struct compat_sysinfo s_32;

	do_sysinfo(&s);

	/*
	 * Check to see if any memory value is too large for 32-bit and
	 * scale down if needed.
	 */
	if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
		int bitcount = 0;

		while (s.mem_unit < PAGE_SIZE) {
			s.mem_unit <<= 1;
			bitcount++;
		}

		s.totalram >>= bitcount;
		s.freeram >>= bitcount;
		s.sharedram >>= bitcount;
		s.bufferram >>= bitcount;
		s.totalswap >>= bitcount;
		s.freeswap >>= bitcount;
		s.totalhigh >>= bitcount;
		s.freehigh >>= bitcount;
	}

	memset(&s_32, 0, sizeof(s_32));
	s_32.uptime = s.uptime;
	s_32.loads[0] = s.loads[0];
	s_32.loads[1] = s.loads[1];
	s_32.loads[2] = s.loads[2];
	s_32.totalram = s.totalram;
	s_32.freeram = s.freeram;
	s_32.sharedram = s.sharedram;
	s_32.bufferram = s.bufferram;
	s_32.totalswap = s.totalswap;
	s_32.freeswap = s.freeswap;
	s_32.procs = s.procs;
	s_32.totalhigh = s.totalhigh;
	s_32.freehigh = s.freehigh;
	s_32.mem_unit = s.mem_unit;
	if (copy_to_user(info, &s_32, sizeof(s_32)))
		return -EFAULT;
	return 0;
}
#endif /* CONFIG_COMPAT */
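
/*
 * Worked example (illustrative; assumes a 4096-byte mem_unit and 8 GiB of
 * RAM): si_meminfo() would report totalram = 2097152 units.  If the
 * ram+swap total fits, do_sysinfo() shifts the counters left by
 * log2(4096) = 12 and sets mem_unit to 1, so userspace sees
 * totalram = 8589934592 bytes.  The compat path goes the other way: when
 * a counter would not fit in 32 bits, it scales mem_unit up towards
 * PAGE_SIZE and shifts the counters right by the same amount, so
 *
 *	total_bytes = (u64)s_32.totalram * s_32.mem_unit
 *
 * recovers the same quantity (modulo rounding).
 */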