1 // SPDX-License-Identifier: BSD-2-Clause 2 /* 3 * Copyright (c) 2021 Klara Systems, Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 /* 29 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> 30 */ 31 32 #include <sys/types.h> 33 #include <sys/sysmacros.h> 34 #include <sys/kmem.h> 35 #include <linux/file.h> 36 #include <linux/magic.h> 37 #include <sys/zone.h> 38 #include <sys/string.h> 39 40 #if defined(CONFIG_USER_NS) 41 #include <linux/statfs.h> 42 #include <linux/proc_ns.h> 43 #endif 44 45 #include <sys/mutex.h> 46 47 static kmutex_t zone_datasets_lock; 48 static struct list_head zone_datasets; 49 50 typedef struct zone_datasets { 51 struct list_head zds_list; /* zone_datasets linkage */ 52 struct user_namespace *zds_userns; /* namespace reference */ 53 struct list_head zds_datasets; /* datasets for the namespace */ 54 } zone_datasets_t; 55 56 typedef struct zone_dataset { 57 struct list_head zd_list; /* zone_dataset linkage */ 58 size_t zd_dsnamelen; /* length of name */ 59 char zd_dsname[]; /* name of the member dataset */ 60 } zone_dataset_t; 61 62 #ifdef CONFIG_USER_NS 63 64 /* 65 * Linux 6.18 moved the generic namespace type away from ns->ops->type onto 66 * ns_common itself. 67 */ 68 #ifdef HAVE_NS_COMMON_TYPE 69 #define ns_is_newuser(ns) \ 70 ((ns)->ns_type == CLONE_NEWUSER) 71 #else 72 #define ns_is_newuser(ns) \ 73 ((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER) 74 #endif 75 76 /* 77 * Returns: 78 * - 0 on success 79 * - EBADF if it cannot open the provided file descriptor 80 * - ENOTTY if the file itself is a not a user namespace file. We want to 81 * intercept this error in the ZFS layer. We cannot just return one of the 82 * ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS 83 * and the SPL layers. 84 */ 85 static int 86 user_ns_get(int fd, struct user_namespace **userns) 87 { 88 struct kstatfs st; 89 struct file *nsfile; 90 struct ns_common *ns; 91 int error; 92 93 if ((nsfile = fget(fd)) == NULL) 94 return (EBADF); 95 if (vfs_statfs(&nsfile->f_path, &st) != 0) { 96 error = ENOTTY; 97 goto done; 98 } 99 if (st.f_type != NSFS_MAGIC) { 100 error = ENOTTY; 101 goto done; 102 } 103 ns = get_proc_ns(file_inode(nsfile)); 104 if (!ns_is_newuser(ns)) { 105 error = ENOTTY; 106 goto done; 107 } 108 *userns = container_of(ns, struct user_namespace, ns); 109 110 error = 0; 111 done: 112 fput(nsfile); 113 114 return (error); 115 } 116 #endif /* CONFIG_USER_NS */ 117 118 static unsigned int 119 user_ns_zoneid(struct user_namespace *user_ns) 120 { 121 unsigned int r; 122 123 r = user_ns->ns.inum; 124 125 return (r); 126 } 127 128 static struct zone_datasets * 129 zone_datasets_lookup(unsigned int nsinum) 130 { 131 zone_datasets_t *zds; 132 133 list_for_each_entry(zds, &zone_datasets, zds_list) { 134 if (user_ns_zoneid(zds->zds_userns) == nsinum) 135 return (zds); 136 } 137 return (NULL); 138 } 139 140 #ifdef CONFIG_USER_NS 141 static struct zone_dataset * 142 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen) 143 { 144 zone_dataset_t *zd; 145 146 list_for_each_entry(zd, &zds->zds_datasets, zd_list) { 147 if (zd->zd_dsnamelen != dsnamelen) 148 continue; 149 if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) 150 return (zd); 151 } 152 153 return (NULL); 154 } 155 156 static int 157 zone_dataset_cred_check(cred_t *cred) 158 { 159 160 if (!uid_eq(cred->uid, GLOBAL_ROOT_UID)) 161 return (EPERM); 162 163 return (0); 164 } 165 #endif /* CONFIG_USER_NS */ 166 167 static int 168 zone_dataset_name_check(const char *dataset, size_t *dsnamelen) 169 { 170 171 if (dataset[0] == '\0' || dataset[0] == '/') 172 return (ENOENT); 173 174 *dsnamelen = strlen(dataset); 175 /* Ignore trailing slash, if supplied. */ 176 if (dataset[*dsnamelen - 1] == '/') 177 (*dsnamelen)--; 178 179 return (0); 180 } 181 182 int 183 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd) 184 { 185 #ifdef CONFIG_USER_NS 186 struct user_namespace *userns; 187 zone_datasets_t *zds; 188 zone_dataset_t *zd; 189 int error; 190 size_t dsnamelen; 191 192 if ((error = zone_dataset_cred_check(cred)) != 0) 193 return (error); 194 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) 195 return (error); 196 if ((error = user_ns_get(userns_fd, &userns)) != 0) 197 return (error); 198 199 mutex_enter(&zone_datasets_lock); 200 zds = zone_datasets_lookup(user_ns_zoneid(userns)); 201 if (zds == NULL) { 202 zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP); 203 INIT_LIST_HEAD(&zds->zds_list); 204 INIT_LIST_HEAD(&zds->zds_datasets); 205 zds->zds_userns = userns; 206 /* 207 * Lock the namespace by incresing its refcount to prevent 208 * the namespace ID from being reused. 209 */ 210 get_user_ns(userns); 211 list_add_tail(&zds->zds_list, &zone_datasets); 212 } else { 213 zd = zone_dataset_lookup(zds, dataset, dsnamelen); 214 if (zd != NULL) { 215 mutex_exit(&zone_datasets_lock); 216 return (EEXIST); 217 } 218 } 219 220 zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); 221 zd->zd_dsnamelen = dsnamelen; 222 strlcpy(zd->zd_dsname, dataset, dsnamelen + 1); 223 INIT_LIST_HEAD(&zd->zd_list); 224 list_add_tail(&zd->zd_list, &zds->zds_datasets); 225 226 mutex_exit(&zone_datasets_lock); 227 return (0); 228 #else 229 return (ENXIO); 230 #endif /* CONFIG_USER_NS */ 231 } 232 EXPORT_SYMBOL(zone_dataset_attach); 233 234 int 235 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd) 236 { 237 #ifdef CONFIG_USER_NS 238 struct user_namespace *userns; 239 zone_datasets_t *zds; 240 zone_dataset_t *zd; 241 int error; 242 size_t dsnamelen; 243 244 if ((error = zone_dataset_cred_check(cred)) != 0) 245 return (error); 246 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) 247 return (error); 248 if ((error = user_ns_get(userns_fd, &userns)) != 0) 249 return (error); 250 251 mutex_enter(&zone_datasets_lock); 252 zds = zone_datasets_lookup(user_ns_zoneid(userns)); 253 if (zds != NULL) 254 zd = zone_dataset_lookup(zds, dataset, dsnamelen); 255 if (zds == NULL || zd == NULL) { 256 mutex_exit(&zone_datasets_lock); 257 return (ENOENT); 258 } 259 260 list_del(&zd->zd_list); 261 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); 262 263 /* Prune the namespace entry if it has no more delegations. */ 264 if (list_empty(&zds->zds_datasets)) { 265 /* 266 * Decrease the refcount now that the namespace is no longer 267 * used. It is no longer necessary to prevent the namespace ID 268 * from being reused. 269 */ 270 put_user_ns(userns); 271 list_del(&zds->zds_list); 272 kmem_free(zds, sizeof (*zds)); 273 } 274 275 mutex_exit(&zone_datasets_lock); 276 return (0); 277 #else 278 return (ENXIO); 279 #endif /* CONFIG_USER_NS */ 280 } 281 EXPORT_SYMBOL(zone_dataset_detach); 282 283 /* 284 * A dataset is visible if: 285 * - It is a parent of a namespace entry. 286 * - It is one of the namespace entries. 287 * - It is a child of a namespace entry. 288 * 289 * A dataset is writable if: 290 * - It is one of the namespace entries. 291 * - It is a child of a namespace entry. 292 * 293 * The parent datasets of namespace entries are visible and 294 * read-only to provide a path back to the root of the pool. 295 */ 296 int 297 zone_dataset_visible(const char *dataset, int *write) 298 { 299 zone_datasets_t *zds; 300 zone_dataset_t *zd; 301 size_t dsnamelen, zd_len; 302 int visible; 303 304 /* Default to read-only, in case visible is returned. */ 305 if (write != NULL) 306 *write = 0; 307 if (zone_dataset_name_check(dataset, &dsnamelen) != 0) 308 return (0); 309 if (INGLOBALZONE(curproc)) { 310 if (write != NULL) 311 *write = 1; 312 return (1); 313 } 314 315 mutex_enter(&zone_datasets_lock); 316 zds = zone_datasets_lookup(crgetzoneid(curproc->cred)); 317 if (zds == NULL) { 318 mutex_exit(&zone_datasets_lock); 319 return (0); 320 } 321 322 visible = 0; 323 list_for_each_entry(zd, &zds->zds_datasets, zd_list) { 324 zd_len = strlen(zd->zd_dsname); 325 if (zd_len > dsnamelen) { 326 /* 327 * The name of the namespace entry is longer than that 328 * of the dataset, so it could be that the dataset is a 329 * parent of the namespace entry. 330 */ 331 visible = memcmp(zd->zd_dsname, dataset, 332 dsnamelen) == 0 && 333 zd->zd_dsname[dsnamelen] == '/'; 334 if (visible) 335 break; 336 } else if (zd_len == dsnamelen) { 337 /* 338 * The name of the namespace entry is as long as that 339 * of the dataset, so perhaps the dataset itself is the 340 * namespace entry. 341 */ 342 visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0; 343 if (visible) { 344 if (write != NULL) 345 *write = 1; 346 break; 347 } 348 } else { 349 /* 350 * The name of the namespace entry is shorter than that 351 * of the dataset, so perhaps the dataset is a child of 352 * the namespace entry. 353 */ 354 visible = memcmp(zd->zd_dsname, dataset, 355 zd_len) == 0 && dataset[zd_len] == '/'; 356 if (visible) { 357 if (write != NULL) 358 *write = 1; 359 break; 360 } 361 } 362 } 363 364 mutex_exit(&zone_datasets_lock); 365 return (visible); 366 } 367 EXPORT_SYMBOL(zone_dataset_visible); 368 369 unsigned int 370 global_zoneid(void) 371 { 372 unsigned int z = 0; 373 374 #if defined(CONFIG_USER_NS) 375 z = user_ns_zoneid(&init_user_ns); 376 #endif 377 378 return (z); 379 } 380 EXPORT_SYMBOL(global_zoneid); 381 382 unsigned int 383 crgetzoneid(const cred_t *cr) 384 { 385 unsigned int r = 0; 386 387 #if defined(CONFIG_USER_NS) 388 r = user_ns_zoneid(cr->user_ns); 389 #endif 390 391 return (r); 392 } 393 EXPORT_SYMBOL(crgetzoneid); 394 395 boolean_t 396 inglobalzone(proc_t *proc) 397 { 398 #if defined(CONFIG_USER_NS) 399 return (proc->cred->user_ns == &init_user_ns); 400 #else 401 return (B_TRUE); 402 #endif 403 } 404 EXPORT_SYMBOL(inglobalzone); 405 406 int 407 spl_zone_init(void) 408 { 409 mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL); 410 INIT_LIST_HEAD(&zone_datasets); 411 return (0); 412 } 413 414 void 415 spl_zone_fini(void) 416 { 417 zone_datasets_t *zds; 418 zone_dataset_t *zd; 419 420 /* 421 * It would be better to assert an empty zone_datasets, but since 422 * there's no automatic mechanism for cleaning them up if the user 423 * namespace is destroyed, just do it here, since spl is about to go 424 * out of context. 425 */ 426 while (!list_empty(&zone_datasets)) { 427 zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list); 428 while (!list_empty(&zds->zds_datasets)) { 429 zd = list_entry(zds->zds_datasets.next, 430 zone_dataset_t, zd_list); 431 list_del(&zd->zd_list); 432 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); 433 } 434 put_user_ns(zds->zds_userns); 435 list_del(&zds->zds_list); 436 kmem_free(zds, sizeof (*zds)); 437 } 438 mutex_destroy(&zone_datasets_lock); 439 } 440