// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <linux/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <fcntl.h>
#include <errno.h>
#include <sched.h>
#include <syscall.h>
#include <sys/socket.h>

#include <sys/list.h>

#ifndef UINT_MAX
#define UINT_MAX 4294967295U
#endif

#ifndef __NR_Linux
#if defined __alpha__
#define __NR_Linux 110
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32
#define __NR_Linux 4000
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32
#define __NR_Linux 6000
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64
#define __NR_Linux 5000
#endif
#elif defined __ia64__
#define __NR_Linux 1024
#else
#define __NR_Linux 0
#endif
#endif

#ifndef __NR_mount_setattr
#define __NR_mount_setattr (442 + __NR_Linux)
#endif

#ifndef __NR_open_tree
#define __NR_open_tree (428 + __NR_Linux)
#endif

#ifndef __NR_move_mount 77 #define __NR_move_mount (429 + __NR_Linux) 78 #endif 79 80 #ifndef MNT_DETACH 81 #define MNT_DETACH 2 82 #endif 83 84 #ifndef MOVE_MOUNT_F_EMPTY_PATH 85 #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 86 #endif 87 88 #ifndef MOUNT_ATTR_IDMAP 89 #define MOUNT_ATTR_IDMAP 0x00100000 90 #endif 91 92 #ifndef OPEN_TREE_CLONE 93 #define OPEN_TREE_CLONE 1 94 #endif 95 96 #ifndef OPEN_TREE_CLOEXEC 97 #define OPEN_TREE_CLOEXEC O_CLOEXEC 98 #endif 99 100 #ifndef AT_RECURSIVE 101 #define AT_RECURSIVE 0x8000 102 #endif 103 104 typedef struct { 105 __u64 attr_set; 106 __u64 attr_clr; 107 __u64 propagation; 108 __u64 userns_fd; 109 } mount_attr_t; 110 111 static inline int 112 sys_mount_setattr(int dfd, const char *path, unsigned int flags, 113 mount_attr_t *attr, size_t size) 114 { 115 return (syscall(__NR_mount_setattr, dfd, path, flags, attr, size)); 116 } 117 118 static inline int 119 sys_open_tree(int dfd, const char *filename, unsigned int flags) 120 { 121 return (syscall(__NR_open_tree, dfd, filename, flags)); 122 } 123 124 static inline int sys_move_mount(int from_dfd, const char *from_pathname, 125 int to_dfd, const char *to_pathname, unsigned int flags) 126 { 127 return (syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, 128 to_pathname, flags)); 129 } 130 131 typedef enum idmap_type_t { 132 TYPE_UID, 133 TYPE_GID, 134 TYPE_BOTH 135 } idmap_type_t; 136 137 struct idmap_entry { 138 __u32 first; 139 __u32 lower_first; 140 __u32 count; 141 idmap_type_t type; 142 list_node_t node; 143 }; 144 145 static void 146 log_msg(const char *msg, ...) 147 { 148 va_list ap; 149 150 va_start(ap, msg); 151 vfprintf(stderr, msg, ap); 152 fputc('\n', stderr); 153 va_end(ap); 154 } 155 156 #define log_errno(msg, args...) 
\ 157 do { \ 158 log_msg("%s:%d:%s: [%m] " msg, __FILE__, __LINE__,\ 159 __FUNCTION__, ##args); \ 160 } while (0) 161 162 /* 163 * Parse the idmapping in the following format 164 * and add to the list: 165 * 166 * u:nsid_first:hostid_first:count 167 * g:nsid_first:hostid_first:count 168 * b:nsid_first:hostid_first:count 169 * 170 * The delimiter can be : or space character. 171 * 172 * Return: 173 * 0 if success 174 * ENOMEM if out of memory 175 * EINVAL if wrong arg or input 176 */ 177 static int 178 parse_idmap_entry(list_t *head, char *input) 179 { 180 char *token, *savedptr = NULL; 181 struct idmap_entry *entry; 182 unsigned long ul; 183 char *delimiter = (char *)": "; 184 char c; 185 186 if (!input || !head) 187 return (EINVAL); 188 entry = malloc(sizeof (*entry)); 189 if (!entry) 190 return (ENOMEM); 191 192 token = strtok_r(input, delimiter, &savedptr); 193 if (token) 194 c = token[0]; 195 if (!token || (c != 'b' && c != 'u' && c != 'g')) 196 goto errout; 197 entry->type = (c == 'b') ? TYPE_BOTH : 198 ((c == 'u') ? 
TYPE_UID : TYPE_GID); 199 200 token = strtok_r(NULL, delimiter, &savedptr); 201 if (!token) 202 goto errout; 203 ul = strtoul(token, NULL, 10); 204 if (ul > UINT_MAX || errno != 0) 205 goto errout; 206 entry->first = (__u32)ul; 207 208 token = strtok_r(NULL, delimiter, &savedptr); 209 if (!token) 210 goto errout; 211 ul = strtoul(token, NULL, 10); 212 if (ul > UINT_MAX || errno != 0) 213 goto errout; 214 entry->lower_first = (__u32)ul; 215 216 token = strtok_r(NULL, delimiter, &savedptr); 217 if (!token) 218 goto errout; 219 ul = strtoul(token, NULL, 10); 220 if (ul > UINT_MAX || errno != 0) 221 goto errout; 222 entry->count = (__u32)ul; 223 224 list_insert_tail(head, entry); 225 226 return (0); 227 228 errout: 229 free(entry); 230 return (EINVAL); 231 } 232 233 /* 234 * Release all the entries in the list 235 */ 236 static void 237 free_idmap(list_t *head) 238 { 239 struct idmap_entry *entry; 240 241 while ((entry = list_remove_head(head)) != NULL) 242 free(entry); 243 /* list_destroy() to be done by the caller */ 244 } 245 246 /* 247 * Write all bytes in the buffer to fd 248 */ 249 static ssize_t 250 write_buf(int fd, const char *buf, size_t buf_size) 251 { 252 ssize_t written, total_written = 0; 253 size_t remaining = buf_size; 254 char *position = (char *)buf; 255 256 for (;;) { 257 written = write(fd, position, remaining); 258 if (written < 0 && errno == EINTR) 259 continue; 260 if (written < 0) { 261 log_errno("write"); 262 return (written); 263 } 264 total_written += written; 265 if (total_written == buf_size) 266 break; 267 remaining -= written; 268 position += written; 269 } 270 271 return (total_written); 272 } 273 274 /* 275 * Read data from file into buffer 276 */ 277 static ssize_t 278 read_buf(int fd, char *buf, size_t buf_size) 279 { 280 int ret; 281 for (;;) { 282 ret = read(fd, buf, buf_size); 283 if (ret < 0 && errno == EINTR) 284 continue; 285 break; 286 } 287 if (ret < 0) 288 log_errno("read"); 289 return (ret); 290 } 291 292 /* 293 * Write 
idmap of the given type in the buffer to the 294 * process' uid_map or gid_map proc file. 295 * 296 * Return: 297 * 0 if success 298 * errno if there's any error 299 */ 300 static int 301 write_idmap(pid_t pid, char *buf, size_t buf_size, idmap_type_t type) 302 { 303 char path[PATH_MAX]; 304 int fd = -EBADF; 305 int ret; 306 307 (void) snprintf(path, sizeof (path), "/proc/%d/%cid_map", 308 pid, type == TYPE_UID ? 'u' : 'g'); 309 fd = open(path, O_WRONLY | O_CLOEXEC); 310 if (fd < 0) { 311 ret = errno; 312 log_errno("open(%s)", path); 313 goto out; 314 } 315 ret = write_buf(fd, buf, buf_size); 316 if (ret < 0) 317 ret = errno; 318 else 319 ret = 0; 320 out: 321 if (fd >= 0) 322 close(fd); 323 return (ret); 324 } 325 326 /* 327 * Write idmap info in the list to the process 328 * user namespace, i.e. its /proc/<pid>/uid_map 329 * and /proc/<pid>/gid_map file. 330 * 331 * Return: 332 * 0 if success 333 * errno if it fails 334 */ 335 static int 336 write_pid_idmaps(pid_t pid, list_t *head) 337 { 338 char *buf_uids, *buf_gids; 339 char *curr_bufu, *curr_bufg; 340 /* max 4k to be allowed for each map */ 341 int size_buf_uids = 4096, size_buf_gids = 4096; 342 struct idmap_entry *entry; 343 int uid_filled, gid_filled; 344 int ret = 0; 345 int has_uids = 0, has_gids = 0; 346 size_t buf_size; 347 348 buf_uids = malloc(size_buf_uids); 349 if (!buf_uids) 350 return (ENOMEM); 351 buf_gids = malloc(size_buf_gids); 352 if (!buf_gids) { 353 free(buf_uids); 354 return (ENOMEM); 355 } 356 curr_bufu = buf_uids; 357 curr_bufg = buf_gids; 358 359 for (entry = list_head(head); entry; entry = list_next(head, entry)) { 360 if (entry->type == TYPE_UID || entry->type == TYPE_BOTH) { 361 uid_filled = snprintf(curr_bufu, size_buf_uids, 362 "%u %u %u\n", entry->first, entry->lower_first, 363 entry->count); 364 if (uid_filled <= 0 || uid_filled >= size_buf_uids) { 365 ret = E2BIG; 366 goto out; 367 } 368 curr_bufu += uid_filled; 369 size_buf_uids -= uid_filled; 370 has_uids = 1; 371 } 372 if 
(entry->type == TYPE_GID || entry->type == TYPE_BOTH) { 373 gid_filled = snprintf(curr_bufg, size_buf_gids, 374 "%u %u %u\n", entry->first, entry->lower_first, 375 entry->count); 376 if (gid_filled <= 0 || gid_filled >= size_buf_gids) { 377 ret = E2BIG; 378 goto out; 379 } 380 curr_bufg += gid_filled; 381 size_buf_gids -= gid_filled; 382 has_gids = 1; 383 } 384 } 385 if (has_uids) { 386 buf_size = curr_bufu - buf_uids; 387 ret = write_idmap(pid, buf_uids, buf_size, TYPE_UID); 388 if (ret) 389 goto out; 390 } 391 if (has_gids) { 392 buf_size = curr_bufg - buf_gids; 393 ret = write_idmap(pid, buf_gids, buf_size, TYPE_GID); 394 } 395 396 out: 397 free(buf_uids); 398 free(buf_gids); 399 return (ret); 400 } 401 402 /* 403 * Wait for the child process to exit 404 * and reap it. 405 * 406 * Return: 407 * process exit code if available 408 */ 409 static int 410 wait_for_pid(pid_t pid) 411 { 412 int status; 413 int ret; 414 415 for (;;) { 416 ret = waitpid(pid, &status, 0); 417 if (ret < 0) { 418 if (errno == EINTR) 419 continue; 420 return (EXIT_FAILURE); 421 } 422 break; 423 } 424 if (!WIFEXITED(status)) 425 return (EXIT_FAILURE); 426 return (WEXITSTATUS(status)); 427 } 428 429 /* 430 * Get the file descriptor of the process user namespace 431 * given its pid. 432 * 433 * Return: 434 * fd if success 435 * -1 if it fails 436 */ 437 static int 438 userns_fd_from_pid(pid_t pid) 439 { 440 int fd; 441 char path[PATH_MAX]; 442 443 (void) snprintf(path, sizeof (path), "/proc/%d/ns/user", pid); 444 fd = open(path, O_RDONLY | O_CLOEXEC); 445 if (fd < 0) 446 log_errno("open(%s)", path); 447 return (fd); 448 } 449 450 /* 451 * Get the user namespace file descriptor given a list 452 * of idmap info. 
453 * 454 * Return: 455 * fd if success 456 * -errno if it fails 457 */ 458 static int 459 userns_fd_from_idmap(list_t *head) 460 { 461 pid_t pid; 462 int ret, fd; 463 int fds[2]; 464 char c; 465 int saved_errno = 0; 466 467 /* socketpair for bidirectional communication */ 468 ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fds); 469 if (ret) { 470 log_errno("socketpair"); 471 return (-errno); 472 } 473 474 pid = fork(); 475 if (pid < 0) { 476 log_errno("fork"); 477 fd = -errno; 478 goto out; 479 } 480 481 if (pid == 0) { 482 /* child process */ 483 ret = unshare(CLONE_NEWUSER); 484 if (ret == 0) { 485 /* notify the parent of success */ 486 ret = write_buf(fds[1], "1", 1); 487 if (ret < 0) 488 saved_errno = errno; 489 else { 490 /* 491 * Until the parent has written to idmap, 492 * we cannot exit, otherwise the defunct 493 * process is owned by the real root, writing 494 * to its idmap ends up with EPERM in the 495 * context of a user ns 496 */ 497 ret = read_buf(fds[1], &c, 1); 498 if (ret < 0) 499 saved_errno = errno; 500 } 501 } else { 502 saved_errno = errno; 503 log_errno("unshare"); 504 ret = write_buf(fds[1], "0", 1); 505 if (ret < 0) 506 saved_errno = errno; 507 } 508 exit(saved_errno); 509 } 510 511 /* parent process */ 512 ret = read_buf(fds[0], &c, 1); 513 if (ret == 1 && c == '1') { 514 ret = write_pid_idmaps(pid, head); 515 if (!ret) { 516 fd = userns_fd_from_pid(pid); 517 if (fd < 0) 518 fd = -errno; 519 } else { 520 fd = -ret; 521 } 522 /* Let child know it can exit */ 523 (void) write_buf(fds[0], "1", 1); 524 } else { 525 fd = -EBADF; 526 } 527 (void) wait_for_pid(pid); 528 out: 529 close(fds[0]); 530 close(fds[1]); 531 return (fd); 532 } 533 534 /* 535 * Check if the operating system supports idmapped mount on the 536 * given path or not. 
537 * 538 * Return: 539 * true if supported 540 * false if not supported 541 */ 542 static bool 543 is_idmap_supported(char *path) 544 { 545 list_t head; 546 int ret; 547 int tree_fd = -EBADF, path_fd = -EBADF; 548 mount_attr_t attr = { 549 .attr_set = MOUNT_ATTR_IDMAP, 550 .userns_fd = -EBADF, 551 }; 552 553 /* strtok_r() won't be happy with a const string */ 554 /* To check if idmapped mount can be done in a user ns, map 0 to 0 */ 555 char *input = strdup("b:0:0:1"); 556 557 if (!input) { 558 errno = ENOMEM; 559 log_errno("strdup"); 560 return (false); 561 } 562 563 list_create(&head, sizeof (struct idmap_entry), 564 offsetof(struct idmap_entry, node)); 565 ret = parse_idmap_entry(&head, input); 566 if (ret) { 567 errno = ret; 568 log_errno("parse_idmap_entry(%s)", input); 569 goto out1; 570 } 571 ret = userns_fd_from_idmap(&head); 572 if (ret < 0) 573 goto out1; 574 attr.userns_fd = ret; 575 ret = openat(-EBADF, path, O_DIRECTORY | O_CLOEXEC); 576 if (ret < 0) { 577 log_errno("openat(%s)", path); 578 goto out; 579 } 580 path_fd = ret; 581 ret = sys_open_tree(path_fd, "", AT_EMPTY_PATH | AT_NO_AUTOMOUNT | 582 AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); 583 if (ret < 0) { 584 log_errno("sys_open_tree"); 585 goto out; 586 } 587 tree_fd = ret; 588 ret = sys_mount_setattr(tree_fd, "", AT_EMPTY_PATH, &attr, 589 sizeof (attr)); 590 if (ret < 0) { 591 log_errno("sys_mount_setattr"); 592 } 593 out: 594 close(attr.userns_fd); 595 out1: 596 free_idmap(&head); 597 list_destroy(&head); 598 if (tree_fd >= 0) 599 close(tree_fd); 600 if (path_fd >= 0) 601 close(path_fd); 602 free(input); 603 return (ret == 0); 604 } 605 606 /* 607 * Check if the given path is a mount point or not. 
608 * 609 * Return: 610 * true if it is 611 * false otherwise 612 */ 613 static bool 614 is_mountpoint(char *path) 615 { 616 char *parent; 617 struct stat st_me, st_parent; 618 bool ret; 619 620 parent = malloc(strlen(path)+4); 621 if (!parent) { 622 errno = ENOMEM; 623 log_errno("malloc"); 624 return (false); 625 } 626 strcat(strcpy(parent, path), "/.."); 627 if (lstat(path, &st_me) != 0 || 628 lstat(parent, &st_parent) != 0) 629 ret = false; 630 else 631 if (st_me.st_dev != st_parent.st_dev || 632 st_me.st_ino == st_parent.st_ino) 633 ret = true; 634 else 635 ret = false; 636 free(parent); 637 return (ret); 638 } 639 640 /* 641 * Remount the source on the new target folder with the given 642 * list of idmap info. If target is NULL, the source will be 643 * unmounted and then remounted if it is a mountpoint, otherwise 644 * no unmount is done, the source is simply idmap remounted. 645 * 646 * Return: 647 * 0 if success 648 * -errno otherwise 649 */ 650 static int 651 do_idmap_mount(list_t *idmap, char *source, char *target, int flags) 652 { 653 int ret; 654 int tree_fd = -EBADF, source_fd = -EBADF; 655 mount_attr_t attr = { 656 .attr_set = MOUNT_ATTR_IDMAP, 657 .userns_fd = -EBADF, 658 }; 659 660 ret = userns_fd_from_idmap(idmap); 661 if (ret < 0) 662 goto out1; 663 attr.userns_fd = ret; 664 ret = openat(-EBADF, source, O_DIRECTORY | O_CLOEXEC); 665 if (ret < 0) { 666 ret = -errno; 667 log_errno("openat(%s)", source); 668 goto out; 669 } 670 source_fd = ret; 671 ret = sys_open_tree(source_fd, "", AT_EMPTY_PATH | AT_NO_AUTOMOUNT | 672 AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE | flags); 673 if (ret < 0) { 674 ret = -errno; 675 log_errno("sys_open_tree"); 676 goto out; 677 } 678 tree_fd = ret; 679 ret = sys_mount_setattr(tree_fd, "", AT_EMPTY_PATH | flags, &attr, 680 sizeof (attr)); 681 if (ret < 0) { 682 ret = -errno; 683 log_errno("sys_mount_setattr"); 684 goto out; 685 } 686 if (target == NULL && is_mountpoint(source)) { 687 ret = umount2(source, 
MNT_DETACH); 688 if (ret < 0) { 689 ret = -errno; 690 log_errno("umount2(%s)", source); 691 goto out; 692 } 693 } 694 ret = sys_move_mount(tree_fd, "", -EBADF, target == NULL ? 695 source : target, MOVE_MOUNT_F_EMPTY_PATH); 696 if (ret < 0) { 697 ret = -errno; 698 log_errno("sys_move_mount(%s)", target == NULL ? 699 source : target); 700 } 701 out: 702 close(attr.userns_fd); 703 out1: 704 if (tree_fd >= 0) 705 close(tree_fd); 706 if (source_fd >= 0) 707 close(source_fd); 708 return (ret); 709 } 710 711 static void 712 print_usage(char *argv[]) 713 { 714 fprintf(stderr, "Usage: %s [-r] [-c] [-m <idmap1>] [-m <idmap2>]" \ 715 " ... [<source>] [<target>]\n", argv[0]); 716 fprintf(stderr, "\n"); 717 fprintf(stderr, " -r Recursively do idmapped mount.\n"); 718 fprintf(stderr, "\n"); 719 fprintf(stderr, " -c Checks if idmapped mount is supported " \ 720 "on the <source> by the operating system or not.\n"); 721 fprintf(stderr, "\n"); 722 fprintf(stderr, " -m <idmap> to specify the idmap info, " \ 723 "in the following format:\n"); 724 fprintf(stderr, " <id_type>:<nsid_first>:<hostid_first>:<count>\n"); 725 fprintf(stderr, "\n"); 726 fprintf(stderr, " <id_type> can be either of 'b', 'u', and 'g'.\n"); 727 fprintf(stderr, "\n"); 728 fprintf(stderr, "The <source> folder will be mounted at <target> " \ 729 "with the provided idmap information.\nIf no <target> is " \ 730 "specified, and <source> is a mount point, " \ 731 "then <source> will be unmounted and then remounted.\n"); 732 } 733 734 int 735 main(int argc, char *argv[]) 736 { 737 int opt; 738 list_t idmap_head; 739 int check_supported = 0; 740 int ret = EXIT_SUCCESS; 741 char *source = NULL, *target = NULL; 742 int flags = 0; 743 744 list_create(&idmap_head, sizeof (struct idmap_entry), 745 offsetof(struct idmap_entry, node)); 746 747 while ((opt = getopt(argc, argv, "rcm:")) != -1) { 748 switch (opt) { 749 case 'r': 750 flags |= AT_RECURSIVE; 751 break; 752 case 'c': 753 check_supported = 1; 754 break; 755 case 'm': 
756 ret = parse_idmap_entry(&idmap_head, optarg); 757 if (ret) { 758 errno = ret; 759 log_errno("parse_idmap_entry(%s)", optarg); 760 ret = EXIT_FAILURE; 761 goto out; 762 } 763 break; 764 default: 765 print_usage(argv); 766 exit(EXIT_FAILURE); 767 } 768 } 769 770 if (check_supported == 0 && list_is_empty(&idmap_head)) { 771 print_usage(argv); 772 ret = EXIT_FAILURE; 773 goto out; 774 } 775 776 if (optind >= argc) { 777 fprintf(stderr, "Expected to have <source>, <target>.\n"); 778 print_usage(argv); 779 ret = EXIT_FAILURE; 780 goto out; 781 } 782 783 source = argv[optind]; 784 if (optind < (argc - 1)) { 785 target = argv[optind + 1]; 786 } 787 788 if (check_supported) { 789 free_idmap(&idmap_head); 790 list_destroy(&idmap_head); 791 if (is_idmap_supported(source)) { 792 printf("idmapped mount is supported on [%s].\n", 793 source); 794 return (EXIT_SUCCESS); 795 } else { 796 printf("idmapped mount is NOT supported.\n"); 797 return (EXIT_FAILURE); 798 } 799 } 800 801 ret = do_idmap_mount(&idmap_head, source, target, flags); 802 if (ret) 803 ret = EXIT_FAILURE; 804 out: 805 free_idmap(&idmap_head); 806 list_destroy(&idmap_head); 807 808 exit(ret); 809 } 810