1 /* 2 * Copyright (c) 2021 Klara Systems, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/sysmacros.h> 29 #include <sys/kmem.h> 30 #include <linux/file.h> 31 #include <linux/magic.h> 32 #include <sys/zone.h> 33 #include <sys/string.h> 34 35 #if defined(CONFIG_USER_NS) 36 #include <linux/statfs.h> 37 #include <linux/proc_ns.h> 38 #endif 39 40 #include <sys/mutex.h> 41 42 static kmutex_t zone_datasets_lock; 43 static struct list_head zone_datasets; 44 45 typedef struct zone_datasets { 46 struct list_head zds_list; /* zone_datasets linkage */ 47 struct user_namespace *zds_userns; /* namespace reference */ 48 struct list_head zds_datasets; /* datasets for the namespace */ 49 } zone_datasets_t; 50 51 typedef struct zone_dataset { 52 struct list_head zd_list; /* zone_dataset linkage */ 53 size_t zd_dsnamelen; /* length of name */ 54 char zd_dsname[]; /* name of the member dataset */ 55 } zone_dataset_t; 56 57 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) 58 /* 59 * Returns: 60 * - 0 on success 61 * - EBADF if it cannot open the provided file descriptor 62 * - ENOTTY if the file itself is a not a user namespace file. We want to 63 * intercept this error in the ZFS layer. We cannot just return one of the 64 * ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS 65 * and the SPL layers. 66 */ 67 static int 68 user_ns_get(int fd, struct user_namespace **userns) 69 { 70 struct kstatfs st; 71 struct file *nsfile; 72 struct ns_common *ns; 73 int error; 74 75 if ((nsfile = fget(fd)) == NULL) 76 return (EBADF); 77 if (vfs_statfs(&nsfile->f_path, &st) != 0) { 78 error = ENOTTY; 79 goto done; 80 } 81 if (st.f_type != NSFS_MAGIC) { 82 error = ENOTTY; 83 goto done; 84 } 85 ns = get_proc_ns(file_inode(nsfile)); 86 if (ns->ops->type != CLONE_NEWUSER) { 87 error = ENOTTY; 88 goto done; 89 } 90 *userns = container_of(ns, struct user_namespace, ns); 91 92 error = 0; 93 done: 94 fput(nsfile); 95 96 return (error); 97 } 98 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ 99 100 static unsigned int 101 user_ns_zoneid(struct user_namespace *user_ns) 102 { 103 unsigned int r; 104 105 #if defined(HAVE_USER_NS_COMMON_INUM) 106 r = user_ns->ns.inum; 107 #else 108 r = user_ns->proc_inum; 109 #endif 110 111 return (r); 112 } 113 114 static struct zone_datasets * 115 zone_datasets_lookup(unsigned int nsinum) 116 { 117 zone_datasets_t *zds; 118 119 list_for_each_entry(zds, &zone_datasets, zds_list) { 120 if (user_ns_zoneid(zds->zds_userns) == nsinum) 121 return (zds); 122 } 123 return (NULL); 124 } 125 126 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) 127 static struct zone_dataset * 128 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen) 129 { 130 zone_dataset_t *zd; 131 132 list_for_each_entry(zd, &zds->zds_datasets, zd_list) { 133 if (zd->zd_dsnamelen != dsnamelen) 134 continue; 135 if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) 136 return (zd); 137 } 138 139 return (NULL); 140 } 141 142 static int 143 zone_dataset_cred_check(cred_t *cred) 144 { 145 146 if (!uid_eq(cred->uid, GLOBAL_ROOT_UID)) 147 return (EPERM); 148 149 return (0); 150 } 151 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ 152 153 static int 154 zone_dataset_name_check(const char *dataset, size_t *dsnamelen) 155 { 156 157 if (dataset[0] == '\0' || dataset[0] == '/') 158 return (ENOENT); 159 160 *dsnamelen = strlen(dataset); 161 /* Ignore trailing slash, if supplied. */ 162 if (dataset[*dsnamelen - 1] == '/') 163 (*dsnamelen)--; 164 165 return (0); 166 } 167 168 int 169 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd) 170 { 171 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) 172 struct user_namespace *userns; 173 zone_datasets_t *zds; 174 zone_dataset_t *zd; 175 int error; 176 size_t dsnamelen; 177 178 if ((error = zone_dataset_cred_check(cred)) != 0) 179 return (error); 180 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) 181 return (error); 182 if ((error = user_ns_get(userns_fd, &userns)) != 0) 183 return (error); 184 185 mutex_enter(&zone_datasets_lock); 186 zds = zone_datasets_lookup(user_ns_zoneid(userns)); 187 if (zds == NULL) { 188 zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP); 189 INIT_LIST_HEAD(&zds->zds_list); 190 INIT_LIST_HEAD(&zds->zds_datasets); 191 zds->zds_userns = userns; 192 /* 193 * Lock the namespace by incresing its refcount to prevent 194 * the namespace ID from being reused. 195 */ 196 get_user_ns(userns); 197 list_add_tail(&zds->zds_list, &zone_datasets); 198 } else { 199 zd = zone_dataset_lookup(zds, dataset, dsnamelen); 200 if (zd != NULL) { 201 mutex_exit(&zone_datasets_lock); 202 return (EEXIST); 203 } 204 } 205 206 zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); 207 zd->zd_dsnamelen = dsnamelen; 208 strlcpy(zd->zd_dsname, dataset, dsnamelen + 1); 209 INIT_LIST_HEAD(&zd->zd_list); 210 list_add_tail(&zd->zd_list, &zds->zds_datasets); 211 212 mutex_exit(&zone_datasets_lock); 213 return (0); 214 #else 215 return (ENXIO); 216 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ 217 } 218 EXPORT_SYMBOL(zone_dataset_attach); 219 220 int 221 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd) 222 { 223 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) 224 struct user_namespace *userns; 225 zone_datasets_t *zds; 226 zone_dataset_t *zd; 227 int error; 228 size_t dsnamelen; 229 230 if ((error = zone_dataset_cred_check(cred)) != 0) 231 return (error); 232 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) 233 return (error); 234 if ((error = user_ns_get(userns_fd, &userns)) != 0) 235 return (error); 236 237 mutex_enter(&zone_datasets_lock); 238 zds = zone_datasets_lookup(user_ns_zoneid(userns)); 239 if (zds != NULL) 240 zd = zone_dataset_lookup(zds, dataset, dsnamelen); 241 if (zds == NULL || zd == NULL) { 242 mutex_exit(&zone_datasets_lock); 243 return (ENOENT); 244 } 245 246 list_del(&zd->zd_list); 247 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); 248 249 /* Prune the namespace entry if it has no more delegations. */ 250 if (list_empty(&zds->zds_datasets)) { 251 /* 252 * Decrease the refcount now that the namespace is no longer 253 * used. It is no longer necessary to prevent the namespace ID 254 * from being reused. 255 */ 256 put_user_ns(userns); 257 list_del(&zds->zds_list); 258 kmem_free(zds, sizeof (*zds)); 259 } 260 261 mutex_exit(&zone_datasets_lock); 262 return (0); 263 #else 264 return (ENXIO); 265 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */ 266 } 267 EXPORT_SYMBOL(zone_dataset_detach); 268 269 /* 270 * A dataset is visible if: 271 * - It is a parent of a namespace entry. 272 * - It is one of the namespace entries. 273 * - It is a child of a namespace entry. 274 * 275 * A dataset is writable if: 276 * - It is one of the namespace entries. 277 * - It is a child of a namespace entry. 278 * 279 * The parent datasets of namespace entries are visible and 280 * read-only to provide a path back to the root of the pool. 281 */ 282 int 283 zone_dataset_visible(const char *dataset, int *write) 284 { 285 zone_datasets_t *zds; 286 zone_dataset_t *zd; 287 size_t dsnamelen, zd_len; 288 int visible; 289 290 /* Default to read-only, in case visible is returned. */ 291 if (write != NULL) 292 *write = 0; 293 if (zone_dataset_name_check(dataset, &dsnamelen) != 0) 294 return (0); 295 if (INGLOBALZONE(curproc)) { 296 if (write != NULL) 297 *write = 1; 298 return (1); 299 } 300 301 mutex_enter(&zone_datasets_lock); 302 zds = zone_datasets_lookup(crgetzoneid(curproc->cred)); 303 if (zds == NULL) { 304 mutex_exit(&zone_datasets_lock); 305 return (0); 306 } 307 308 visible = 0; 309 list_for_each_entry(zd, &zds->zds_datasets, zd_list) { 310 zd_len = strlen(zd->zd_dsname); 311 if (zd_len > dsnamelen) { 312 /* 313 * The name of the namespace entry is longer than that 314 * of the dataset, so it could be that the dataset is a 315 * parent of the namespace entry. 316 */ 317 visible = memcmp(zd->zd_dsname, dataset, 318 dsnamelen) == 0 && 319 zd->zd_dsname[dsnamelen] == '/'; 320 if (visible) 321 break; 322 } else if (zd_len == dsnamelen) { 323 /* 324 * The name of the namespace entry is as long as that 325 * of the dataset, so perhaps the dataset itself is the 326 * namespace entry. 327 */ 328 visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0; 329 if (visible) { 330 if (write != NULL) 331 *write = 1; 332 break; 333 } 334 } else { 335 /* 336 * The name of the namespace entry is shorter than that 337 * of the dataset, so perhaps the dataset is a child of 338 * the namespace entry. 339 */ 340 visible = memcmp(zd->zd_dsname, dataset, 341 zd_len) == 0 && dataset[zd_len] == '/'; 342 if (visible) { 343 if (write != NULL) 344 *write = 1; 345 break; 346 } 347 } 348 } 349 350 mutex_exit(&zone_datasets_lock); 351 return (visible); 352 } 353 EXPORT_SYMBOL(zone_dataset_visible); 354 355 unsigned int 356 global_zoneid(void) 357 { 358 unsigned int z = 0; 359 360 #if defined(CONFIG_USER_NS) 361 z = user_ns_zoneid(&init_user_ns); 362 #endif 363 364 return (z); 365 } 366 EXPORT_SYMBOL(global_zoneid); 367 368 unsigned int 369 crgetzoneid(const cred_t *cr) 370 { 371 unsigned int r = 0; 372 373 #if defined(CONFIG_USER_NS) 374 r = user_ns_zoneid(cr->user_ns); 375 #endif 376 377 return (r); 378 } 379 EXPORT_SYMBOL(crgetzoneid); 380 381 boolean_t 382 inglobalzone(proc_t *proc) 383 { 384 #if defined(CONFIG_USER_NS) 385 return (proc->cred->user_ns == &init_user_ns); 386 #else 387 return (B_TRUE); 388 #endif 389 } 390 EXPORT_SYMBOL(inglobalzone); 391 392 int 393 spl_zone_init(void) 394 { 395 mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL); 396 INIT_LIST_HEAD(&zone_datasets); 397 return (0); 398 } 399 400 void 401 spl_zone_fini(void) 402 { 403 zone_datasets_t *zds; 404 zone_dataset_t *zd; 405 406 /* 407 * It would be better to assert an empty zone_datasets, but since 408 * there's no automatic mechanism for cleaning them up if the user 409 * namespace is destroyed, just do it here, since spl is about to go 410 * out of context. 411 */ 412 while (!list_empty(&zone_datasets)) { 413 zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list); 414 while (!list_empty(&zds->zds_datasets)) { 415 zd = list_entry(zds->zds_datasets.next, 416 zone_dataset_t, zd_list); 417 list_del(&zd->zd_list); 418 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); 419 } 420 put_user_ns(zds->zds_userns); 421 list_del(&zds->zds_list); 422 kmem_free(zds, sizeof (*zds)); 423 } 424 mutex_destroy(&zone_datasets_lock); 425 } 426