xref: /freebsd/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c (revision e6e941e659ab7b3db6786103c1cdc30735a82e32)
1 // SPDX-License-Identifier: BSD-2-Clause
2 /*
3  * Copyright (c) 2021 Klara Systems, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
30  */
31 
32 #include <sys/types.h>
33 #include <sys/sysmacros.h>
34 #include <sys/kmem.h>
35 #include <linux/file.h>
36 #include <linux/magic.h>
37 #include <sys/zone.h>
38 #include <sys/string.h>
39 
40 #if defined(CONFIG_USER_NS)
41 #include <linux/statfs.h>
42 #include <linux/proc_ns.h>
43 #endif
44 
45 #include <sys/mutex.h>
46 
47 static kmutex_t zone_datasets_lock;
48 static struct list_head zone_datasets;
49 
50 typedef struct zone_datasets {
51 	struct list_head zds_list;	/* zone_datasets linkage */
52 	struct user_namespace *zds_userns; /* namespace reference */
53 	struct list_head zds_datasets;	/* datasets for the namespace */
54 } zone_datasets_t;
55 
56 typedef struct zone_dataset {
57 	struct list_head zd_list;	/* zone_dataset linkage */
58 	size_t zd_dsnamelen;		/* length of name */
59 	char zd_dsname[];		/* name of the member dataset */
60 } zone_dataset_t;
61 
62 #ifdef CONFIG_USER_NS
63 
/*
 * ns_is_newuser(ns): true when the given ns_common describes a user
 * namespace.
 *
 * Before Linux 6.18 the generic namespace type is reached through
 * ns->ops->type; from 6.18 on it lives on ns_common itself as ns_type.
 * HAVE_NS_COMMON_TYPE selects the newer layout.
 */
#ifndef HAVE_NS_COMMON_TYPE
#define	ns_is_newuser(ns)	\
	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)
#else
#define	ns_is_newuser(ns)	\
	((ns)->ns_type == CLONE_NEWUSER)
#endif
75 
76 /*
77  * Returns:
78  * - 0 on success
79  * - EBADF if it cannot open the provided file descriptor
80  * - ENOTTY if the file itself is a not a user namespace file. We want to
81  *   intercept this error in the ZFS layer. We cannot just return one of the
82  *   ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
83  *   and the SPL layers.
84  */
85 static int
user_ns_get(int fd,struct user_namespace ** userns)86 user_ns_get(int fd, struct user_namespace **userns)
87 {
88 	struct kstatfs st;
89 	struct file *nsfile;
90 	struct ns_common *ns;
91 	int error;
92 
93 	if ((nsfile = fget(fd)) == NULL)
94 		return (EBADF);
95 	if (vfs_statfs(&nsfile->f_path, &st) != 0) {
96 		error = ENOTTY;
97 		goto done;
98 	}
99 	if (st.f_type != NSFS_MAGIC) {
100 		error = ENOTTY;
101 		goto done;
102 	}
103 	ns = get_proc_ns(file_inode(nsfile));
104 	if (!ns_is_newuser(ns)) {
105 		error = ENOTTY;
106 		goto done;
107 	}
108 	*userns = container_of(ns, struct user_namespace, ns);
109 
110 	error = 0;
111 done:
112 	fput(nsfile);
113 
114 	return (error);
115 }
116 #endif /* CONFIG_USER_NS */
117 
118 static unsigned int
user_ns_zoneid(struct user_namespace * user_ns)119 user_ns_zoneid(struct user_namespace *user_ns)
120 {
121 	unsigned int r;
122 
123 	r = user_ns->ns.inum;
124 
125 	return (r);
126 }
127 
128 static struct zone_datasets *
zone_datasets_lookup(unsigned int nsinum)129 zone_datasets_lookup(unsigned int nsinum)
130 {
131 	zone_datasets_t *zds;
132 
133 	list_for_each_entry(zds, &zone_datasets, zds_list) {
134 		if (user_ns_zoneid(zds->zds_userns) == nsinum)
135 			return (zds);
136 	}
137 	return (NULL);
138 }
139 
140 #ifdef CONFIG_USER_NS
141 static struct zone_dataset *
zone_dataset_lookup(zone_datasets_t * zds,const char * dataset,size_t dsnamelen)142 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
143 {
144 	zone_dataset_t *zd;
145 
146 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
147 		if (zd->zd_dsnamelen != dsnamelen)
148 			continue;
149 		if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
150 			return (zd);
151 	}
152 
153 	return (NULL);
154 }
155 
156 static int
zone_dataset_cred_check(cred_t * cred)157 zone_dataset_cred_check(cred_t *cred)
158 {
159 
160 	if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
161 		return (EPERM);
162 
163 	return (0);
164 }
165 #endif /* CONFIG_USER_NS */
166 
/*
 * Validate a dataset name and compute its effective length.
 *
 * On success returns 0 and stores in *dsnamelen the length of the name,
 * not counting one trailing '/' if the caller supplied it. Returns ENOENT,
 * leaving *dsnamelen untouched, when the name is NULL, empty, or begins
 * with '/'.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	if (dataset == NULL || dataset[0] == '\0' || dataset[0] == '/')
		return (ENOENT);

	/* The name is non-empty here, so len >= 1 and len - 1 is valid. */
	len = strlen(dataset);
	/* Ignore trailing slash, if supplied. */
	if (dataset[len - 1] == '/')
		len--;

	*dsnamelen = len;
	return (0);
}
181 
182 int
zone_dataset_attach(cred_t * cred,const char * dataset,int userns_fd)183 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
184 {
185 #ifdef CONFIG_USER_NS
186 	struct user_namespace *userns;
187 	zone_datasets_t *zds;
188 	zone_dataset_t *zd;
189 	int error;
190 	size_t dsnamelen;
191 
192 	if ((error = zone_dataset_cred_check(cred)) != 0)
193 		return (error);
194 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
195 		return (error);
196 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
197 		return (error);
198 
199 	mutex_enter(&zone_datasets_lock);
200 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
201 	if (zds == NULL) {
202 		zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
203 		INIT_LIST_HEAD(&zds->zds_list);
204 		INIT_LIST_HEAD(&zds->zds_datasets);
205 		zds->zds_userns = userns;
206 		/*
207 		 * Lock the namespace by incresing its refcount to prevent
208 		 * the namespace ID from being reused.
209 		 */
210 		get_user_ns(userns);
211 		list_add_tail(&zds->zds_list, &zone_datasets);
212 	} else {
213 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
214 		if (zd != NULL) {
215 			mutex_exit(&zone_datasets_lock);
216 			return (EEXIST);
217 		}
218 	}
219 
220 	zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
221 	zd->zd_dsnamelen = dsnamelen;
222 	strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
223 	INIT_LIST_HEAD(&zd->zd_list);
224 	list_add_tail(&zd->zd_list, &zds->zds_datasets);
225 
226 	mutex_exit(&zone_datasets_lock);
227 	return (0);
228 #else
229 	return (ENXIO);
230 #endif /* CONFIG_USER_NS */
231 }
232 EXPORT_SYMBOL(zone_dataset_attach);
233 
234 int
zone_dataset_detach(cred_t * cred,const char * dataset,int userns_fd)235 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
236 {
237 #ifdef CONFIG_USER_NS
238 	struct user_namespace *userns;
239 	zone_datasets_t *zds;
240 	zone_dataset_t *zd;
241 	int error;
242 	size_t dsnamelen;
243 
244 	if ((error = zone_dataset_cred_check(cred)) != 0)
245 		return (error);
246 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
247 		return (error);
248 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
249 		return (error);
250 
251 	mutex_enter(&zone_datasets_lock);
252 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
253 	if (zds != NULL)
254 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
255 	if (zds == NULL || zd == NULL) {
256 		mutex_exit(&zone_datasets_lock);
257 		return (ENOENT);
258 	}
259 
260 	list_del(&zd->zd_list);
261 	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
262 
263 	/* Prune the namespace entry if it has no more delegations. */
264 	if (list_empty(&zds->zds_datasets)) {
265 		/*
266 		 * Decrease the refcount now that the namespace is no longer
267 		 * used. It is no longer necessary to prevent the namespace ID
268 		 * from being reused.
269 		 */
270 		put_user_ns(userns);
271 		list_del(&zds->zds_list);
272 		kmem_free(zds, sizeof (*zds));
273 	}
274 
275 	mutex_exit(&zone_datasets_lock);
276 	return (0);
277 #else
278 	return (ENXIO);
279 #endif /* CONFIG_USER_NS */
280 }
281 EXPORT_SYMBOL(zone_dataset_detach);
282 
283 /*
284  * A dataset is visible if:
285  * - It is a parent of a namespace entry.
286  * - It is one of the namespace entries.
287  * - It is a child of a namespace entry.
288  *
289  * A dataset is writable if:
290  * - It is one of the namespace entries.
291  * - It is a child of a namespace entry.
292  *
293  * The parent datasets of namespace entries are visible and
294  * read-only to provide a path back to the root of the pool.
295  */
296 int
zone_dataset_visible(const char * dataset,int * write)297 zone_dataset_visible(const char *dataset, int *write)
298 {
299 	zone_datasets_t *zds;
300 	zone_dataset_t *zd;
301 	size_t dsnamelen, zd_len;
302 	int visible;
303 
304 	/* Default to read-only, in case visible is returned. */
305 	if (write != NULL)
306 		*write = 0;
307 	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
308 		return (0);
309 	if (INGLOBALZONE(curproc)) {
310 		if (write != NULL)
311 			*write = 1;
312 		return (1);
313 	}
314 
315 	mutex_enter(&zone_datasets_lock);
316 	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
317 	if (zds == NULL) {
318 		mutex_exit(&zone_datasets_lock);
319 		return (0);
320 	}
321 
322 	visible = 0;
323 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
324 		zd_len = strlen(zd->zd_dsname);
325 		if (zd_len > dsnamelen) {
326 			/*
327 			 * The name of the namespace entry is longer than that
328 			 * of the dataset, so it could be that the dataset is a
329 			 * parent of the namespace entry.
330 			 */
331 			visible = memcmp(zd->zd_dsname, dataset,
332 			    dsnamelen) == 0 &&
333 			    zd->zd_dsname[dsnamelen] == '/';
334 			if (visible)
335 				break;
336 		} else if (zd_len == dsnamelen) {
337 			/*
338 			 * The name of the namespace entry is as long as that
339 			 * of the dataset, so perhaps the dataset itself is the
340 			 * namespace entry.
341 			 */
342 			visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
343 			if (visible) {
344 				if (write != NULL)
345 					*write = 1;
346 				break;
347 			}
348 		} else {
349 			/*
350 			 * The name of the namespace entry is shorter than that
351 			 * of the dataset, so perhaps the dataset is a child of
352 			 * the namespace entry.
353 			 */
354 			visible = memcmp(zd->zd_dsname, dataset,
355 			    zd_len) == 0 && dataset[zd_len] == '/';
356 			if (visible) {
357 				if (write != NULL)
358 					*write = 1;
359 				break;
360 			}
361 		}
362 	}
363 
364 	mutex_exit(&zone_datasets_lock);
365 	return (visible);
366 }
367 EXPORT_SYMBOL(zone_dataset_visible);
368 
/*
 * Zone id of the global zone: the id of init_user_ns, or 0 when built
 * without CONFIG_USER_NS.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
EXPORT_SYMBOL(global_zoneid);
381 
382 unsigned int
crgetzoneid(const cred_t * cr)383 crgetzoneid(const cred_t *cr)
384 {
385 	unsigned int r = 0;
386 
387 #if defined(CONFIG_USER_NS)
388 	r = user_ns_zoneid(cr->user_ns);
389 #endif
390 
391 	return (r);
392 }
393 EXPORT_SYMBOL(crgetzoneid);
394 
395 boolean_t
inglobalzone(proc_t * proc)396 inglobalzone(proc_t *proc)
397 {
398 #if defined(CONFIG_USER_NS)
399 	return (proc->cred->user_ns == &init_user_ns);
400 #else
401 	return (B_TRUE);
402 #endif
403 }
404 EXPORT_SYMBOL(inglobalzone);
405 
406 int
spl_zone_init(void)407 spl_zone_init(void)
408 {
409 	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
410 	INIT_LIST_HEAD(&zone_datasets);
411 	return (0);
412 }
413 
414 void
spl_zone_fini(void)415 spl_zone_fini(void)
416 {
417 	zone_datasets_t *zds;
418 	zone_dataset_t *zd;
419 
420 	/*
421 	 * It would be better to assert an empty zone_datasets, but since
422 	 * there's no automatic mechanism for cleaning them up if the user
423 	 * namespace is destroyed, just do it here, since spl is about to go
424 	 * out of context.
425 	 */
426 	while (!list_empty(&zone_datasets)) {
427 		zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
428 		while (!list_empty(&zds->zds_datasets)) {
429 			zd = list_entry(zds->zds_datasets.next,
430 			    zone_dataset_t, zd_list);
431 			list_del(&zd->zd_list);
432 			kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
433 		}
434 		put_user_ns(zds->zds_userns);
435 		list_del(&zds->zds_list);
436 		kmem_free(zds, sizeof (*zds));
437 	}
438 	mutex_destroy(&zone_datasets_lock);
439 }
440